Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1370-port-in-all-files-amdkfd-source-files-snapshot-at.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1370-port-in-all-files-amdkfd-source-files-snapshot-at.patch | 16828
1 file changed, 16828 insertions(+), 0 deletions(-)
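Most of the patch below is a snapshot of the amdkfd sources, but the one genuinely new mechanism is the eviction bookkeeping added to amdgpu_amdkfd.c: a per-buffer counter (mem->data2.evicted) where the first eviction jumps the count to 2, so that a value of exactly 1 can mean "restore scheduled but not yet run". A minimal user-space sketch of that counting protocol follows, with the locking, delayed work and real GPUVM calls stubbed out; every name besides "evicted" is illustrative, not taken from the patch.

/*
 * evicted == 0  buffer is resident
 * evicted == 1  restore is scheduled but has not run yet
 * evicted >= 2  one or more evictions are outstanding
 */
#include <stdio.h>

struct mem { int evicted; };

static int do_evict(struct mem *m)    { printf("evict %p\n", (void *)m); return 0; }
static void do_restore(struct mem *m) { printf("restore %p\n", (void *)m); }

/* First eviction counts as 2 so that "1" stays free to mean "restore pending". */
static int evict(struct mem *m)
{
	if (++m->evicted > 1)
		return 0;		/* already evicted, just count it */
	if (do_evict(m)) {
		m->evicted = 0;		/* failed: let the matching restore fail gracefully */
		return -1;
	}
	m->evicted = 2;
	return 0;
}

/* Stands in for the delayed-work handler in the patch. */
static void restore_worker(struct mem *m)
{
	if (m->evicted == 1) {		/* not re-evicted in the meantime */
		do_restore(m);
		m->evicted = 0;
	}
}

static int schedule_restore(struct mem *m)
{
	if (m->evicted <= 1)
		return -1;		/* unbalanced restore of a resident buffer */
	if (--m->evicted > 1)
		return 0;		/* other evictions still outstanding */
	restore_worker(m);		/* the real code schedules delayed work here */
	return 0;
}

int main(void)
{
	struct mem m = { 0 };

	evict(&m);			/* first eviction: counter goes to 2 */
	evict(&m);			/* nested eviction: counter goes to 3 */
	schedule_restore(&m);		/* back to 2, nothing restored yet */
	schedule_restore(&m);		/* reaches 1: restore runs, counter returns to 0 */
	return 0;
}

The same rules can be read directly out of amdgpu_amdkfd_evict_mem() and amdgpu_amdkfd_schedule_restore_mem() in the hunks below.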
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1370-port-in-all-files-amdkfd-source-files-snapshot-at.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1370-port-in-all-files-amdkfd-source-files-snapshot-at.patch new file mode 100644 index 00000000..c037b8f2 --- /dev/null +++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1370-port-in-all-files-amdkfd-source-files-snapshot-at.patch @@ -0,0 +1,16828 @@ +From 879030b8b91026fde404c0ab73293655d0684333 Mon Sep 17 00:00:00 2001 +From: Chaudhary Amit Kumar <chaudharyamit.kumar@amd.com> +Date: Thu, 18 Oct 2018 18:30:48 +0530 +Subject: [PATCH 1370/4131] port in all files amdkfd source files snapshot at + +commit 9918a8f15a957dff68d8bb7d88a2e6485368b626 +Author: shaoyunl <Shaoyun.Liu@amd.com> +Date: Mon Mar 28 16:13:27 2016 -0400 + + drm/amdkfd: Assign SDMA engine in an alternative order when creating + sdma queues + +Change-Id: I705be5e2d78cfe8c4035eb9493432f466aefb007 +Signed-off-by: Chaudhary Amit Kumar <chaudharyamit.kumar@amd.com> +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 271 +++- + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 104 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 307 +++- + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 241 ++- + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 1619 ++++++++++++++++++++ + drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 2 +- + drivers/gpu/drm/amd/amdkfd/Kconfig | 1 + + drivers/gpu/drm/amd/amdkfd/Makefile | 2 +- + drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 43 +- + drivers/gpu/drm/amd/amdkfd/cik_int.h | 22 +- + drivers/gpu/drm/amd/amdkfd/cik_regs.h | 175 ++- + .../gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h | 1377 +++++++++++++++++ + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1357 +++++++++++++--- + drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 1163 ++++++++++++++ + drivers/gpu/drm/amd/amdkfd/kfd_crat.h | 40 +- + drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c | 972 ++++++------ + drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h | 66 +- + drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c | 247 ++- + drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h | 313 ++-- + drivers/gpu/drm/amd/amdkfd/kfd_device.c | 283 +++- + .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 497 +++++- + .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 29 +- + .../drm/amd/amdkfd/kfd_device_queue_manager_cik.c | 2 + + .../drm/amd/amdkfd/kfd_device_queue_manager_vi.c | 106 ++ + drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 3 +- + drivers/gpu/drm/amd/amdkfd/kfd_events.c | 522 ++++--- + drivers/gpu/drm/amd/amdkfd/kfd_events.h | 3 +- + drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 79 +- + drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 6 +- + drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 26 +- + drivers/gpu/drm/amd/amdkfd/kfd_module.c | 30 +- + drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 3 + + drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h | 3 +- + drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 92 +- + drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 227 ++- + drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 212 ++- + drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h | 120 +- + drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 269 +++- + drivers/gpu/drm/amd/amdkfd/kfd_process.c | 542 ++++++- + .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 102 +- + drivers/gpu/drm/amd/amdkfd/kfd_rdma.c | 296 ++++ + drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 841 +++++----- + drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 23 +- + drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 144 +- + 44 files changed, 10790 
insertions(+), 1992 deletions(-) + create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c + create mode 100644 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h + create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_crat.c + create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_rdma.c + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +index ef56352..daeb85f 100755 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +@@ -21,12 +21,14 @@ + */ + + #include "amdgpu_amdkfd.h" +-#include "amd_shared.h" ++#include <linux/dma-buf.h> + #include <drm/drmP.h> + #include "amdgpu.h" + #include "amdgpu_gfx.h" + #include <linux/module.h> + ++#define AMDKFD_SKIP_UNCOMPILED_CODE 1 ++ + const struct kfd2kgd_calls *kfd2kgd; + const struct kgd2kfd_calls *kgd2kfd; + bool (*kgd2kfd_init_p)(unsigned, const struct kgd2kfd_calls**); +@@ -64,12 +66,12 @@ int amdgpu_amdkfd_init(void) + bool amdgpu_amdkfd_load_interface(struct amdgpu_device *adev) + { + switch (adev->asic_type) { +-#ifdef CONFIG_DRM_AMDGPU_CIK + case CHIP_KAVERI: + kfd2kgd = amdgpu_amdkfd_gfx_7_get_functions(); + break; +-#endif + case CHIP_CARRIZO: ++ case CHIP_TONGA: ++ case CHIP_FIJI: + kfd2kgd = amdgpu_amdkfd_gfx_8_0_get_functions(); + break; + default: +@@ -102,7 +104,8 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev) + struct kgd2kfd_shared_resources gpu_resources = { + .compute_vmid_bitmap = 0xFF00, + .num_pipe_per_mec = adev->gfx.mec.num_pipe_per_mec, +- .num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe ++ .num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe, ++ .gpuvm_size = (uint64_t)amdgpu_vm_size << 30 + }; + + /* this is going to have a few of the MSBs set that we need to +@@ -167,6 +170,115 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev) + return r; + } + ++int amdgpu_amdkfd_evict_mem(struct amdgpu_device *adev, struct kgd_mem *mem, ++ struct mm_struct *mm) ++{ ++ int r; ++ ++ if (!adev->kfd) ++ return -ENODEV; ++ ++ mutex_lock(&mem->data2.lock); ++ ++ if (mem->data2.evicted == 1 && delayed_work_pending(&mem->data2.work)) ++ /* Cancelling a scheduled restoration */ ++ cancel_delayed_work(&mem->data2.work); ++ ++ if (++mem->data2.evicted > 1) { ++ mutex_unlock(&mem->data2.lock); ++ return 0; ++ } ++ ++ r = amdgpu_amdkfd_gpuvm_evict_mem(mem, mm); ++ ++ if (r != 0) ++ /* First eviction failed, setting count back to 0 will ++ * make the corresponding restore fail gracefully */ ++ mem->data2.evicted = 0; ++ else ++ /* First eviction counts as 2. Eviction counter == 1 ++ * means that restoration is scheduled. 
*/ ++ mem->data2.evicted = 2; ++ ++ mutex_unlock(&mem->data2.lock); ++ ++ return r; ++} ++ ++static void amdgdu_amdkfd_restore_mem_worker(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct kgd_mem *mem = container_of(dwork, struct kgd_mem, data2.work); ++ struct amdgpu_device *adev; ++ struct mm_struct *mm; ++ ++ mutex_lock(&mem->data2.lock); ++ ++ adev = mem->data2.bo->adev; ++ mm = mem->data2.mm; ++ ++ /* Restoration may have been canceled by another eviction or ++ * could already be done by a restore scheduled earlier */ ++ if (mem->data2.evicted == 1) { ++ amdgpu_amdkfd_gpuvm_restore_mem(mem, mm); ++ mem->data2.evicted = 0; ++ } ++ ++ mutex_unlock(&mem->data2.lock); ++} ++ ++int amdgpu_amdkfd_schedule_restore_mem(struct amdgpu_device *adev, ++ struct kgd_mem *mem, ++ struct mm_struct *mm, ++ unsigned long delay) ++{ ++ int r = 0; ++ ++ if (!adev->kfd) ++ return -ENODEV; ++ ++ mutex_lock(&mem->data2.lock); ++ ++ if (mem->data2.evicted <= 1) { ++ /* Buffer is not evicted (== 0) or its restoration is ++ * already scheduled (== 1) */ ++ pr_err("Unbalanced restore of evicted buffer %p\n", mem); ++ mutex_unlock(&mem->data2.lock); ++ return -EFAULT; ++ } else if (--mem->data2.evicted > 1) { ++ mutex_unlock(&mem->data2.lock); ++ return 0; ++ } ++ ++ /* mem->data2.evicted is 1 after decrememting. Schedule ++ * restoration. */ ++ if (delayed_work_pending(&mem->data2.work)) ++ cancel_delayed_work(&mem->data2.work); ++ mem->data2.mm = mm; ++ INIT_DELAYED_WORK(&mem->data2.work, ++ amdgdu_amdkfd_restore_mem_worker); ++ schedule_delayed_work(&mem->data2.work, delay); ++ ++ mutex_unlock(&mem->data2.lock); ++ ++ return r; ++} ++ ++void amdgpu_amdkfd_cancel_restore_mem(struct amdgpu_device *adev, ++ struct kgd_mem *mem) ++{ ++ if (delayed_work_pending(&mem->data2.work)) ++ cancel_delayed_work_sync(&mem->data2.work); ++} ++ ++u32 pool_to_domain(enum kgd_memory_pool p) ++{ ++ switch (p) { ++ case KGD_POOL_FRAMEBUFFER: return AMDGPU_GEM_DOMAIN_VRAM; ++ default: return AMDGPU_GEM_DOMAIN_GTT; ++ } ++} ++ + int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, + void **mem_obj, uint64_t *gpu_addr, + void **cpu_ptr) +@@ -192,38 +304,38 @@ int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, + } + + /* map the buffer */ +- r = amdgpu_bo_reserve((*mem)->bo, true); ++ r = amdgpu_bo_reserve((*mem)->data1.bo, true); + if (r) { + dev_err(adev->dev, "(%d) failed to reserve bo for amdkfd\n", r); + goto allocate_mem_reserve_bo_failed; + } + +- r = amdgpu_bo_pin((*mem)->bo, AMDGPU_GEM_DOMAIN_GTT, +- &(*mem)->gpu_addr); ++ r = amdgpu_bo_pin((*mem)->data1.bo, AMDGPU_GEM_DOMAIN_GTT, ++ &(*mem)->data1.gpu_addr); + if (r) { + dev_err(adev->dev, "(%d) failed to pin bo for amdkfd\n", r); + goto allocate_mem_pin_bo_failed; + } +- *gpu_addr = (*mem)->gpu_addr; ++ *gpu_addr = (*mem)->data1.gpu_addr; + +- r = amdgpu_bo_kmap((*mem)->bo, &(*mem)->cpu_ptr); ++ r = amdgpu_bo_kmap((*mem)->data1.bo, &(*mem)->data1.cpu_ptr); + if (r) { + dev_err(adev->dev, + "(%d) failed to map bo to kernel for amdkfd\n", r); + goto allocate_mem_kmap_bo_failed; + } +- *cpu_ptr = (*mem)->cpu_ptr; ++ *cpu_ptr = (*mem)->data1.cpu_ptr; + +- amdgpu_bo_unreserve((*mem)->bo); ++ amdgpu_bo_unreserve((*mem)->data1.bo); + + return 0; + + allocate_mem_kmap_bo_failed: +- amdgpu_bo_unpin((*mem)->bo); ++ amdgpu_bo_unpin((*mem)->data1.bo); + allocate_mem_pin_bo_failed: +- amdgpu_bo_unreserve((*mem)->bo); ++ amdgpu_bo_unreserve((*mem)->data1.bo); + allocate_mem_reserve_bo_failed: +- amdgpu_bo_unref(&(*mem)->bo); ++ 
amdgpu_bo_unref(&(*mem)->data1.bo); + + return r; + } +@@ -234,22 +346,44 @@ void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj) + + BUG_ON(mem == NULL); + +- amdgpu_bo_reserve(mem->bo, true); +- amdgpu_bo_kunmap(mem->bo); +- amdgpu_bo_unpin(mem->bo); +- amdgpu_bo_unreserve(mem->bo); +- amdgpu_bo_unref(&(mem->bo)); ++ amdgpu_bo_reserve(mem->data1.bo, true); ++ amdgpu_bo_kunmap(mem->data1.bo); ++ amdgpu_bo_unpin(mem->data1.bo); ++ amdgpu_bo_unreserve(mem->data1.bo); ++ amdgpu_bo_unref(&(mem->data1.bo)); + kfree(mem); + } + +-uint64_t get_vmem_size(struct kgd_dev *kgd) ++void get_local_mem_info(struct kgd_dev *kgd, ++ struct kfd_local_mem_info *mem_info) + { +- struct amdgpu_device *adev = +- (struct amdgpu_device *)kgd; ++ uint64_t address_mask; ++ resource_size_t aper_limit; ++ struct amdgpu_device *adev = (struct amdgpu_device *)kgd; + + BUG_ON(kgd == NULL); + +- return adev->mc.real_vram_size; ++ address_mask = ~((1UL << 40) - 1); ++ aper_limit = adev->mc.aper_base + adev->mc.aper_size; ++ memset(mem_info, 0, sizeof(*mem_info)); ++ if (!(adev->mc.aper_base & address_mask || ++ aper_limit & address_mask)) { ++ mem_info->local_mem_size_public = adev->mc.visible_vram_size; ++ mem_info->local_mem_size_private = adev->mc.real_vram_size - ++ adev->mc.visible_vram_size; ++ mem_info->vram_width = adev->mc.vram_width; ++ } else { ++ pr_err("amdgpu: vram aperture is out of 40bit address base: 0x%llx limit 0x%llx\n", ++ adev->mc.aper_base, aper_limit); ++ } ++ ++ pr_debug("amdgpu: address base: 0x%llx limit 0x%llx public 0x%llx private 0x%llx\n", ++ adev->mc.aper_base, aper_limit, ++ mem_info->local_mem_size_public, ++ mem_info->local_mem_size_private); ++ ++ if (amdgpu_powerplay || adev->pm.funcs->get_mclk) ++ mem_info->mem_clk_max = amdgpu_dpm_get_mclk(adev, false) / 100; + } + + uint64_t get_gpu_clock_counter(struct kgd_dev *kgd) +@@ -265,5 +399,94 @@ uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd) + { + struct amdgpu_device *adev = (struct amdgpu_device *)kgd; + /* The sclk is in quantas of 10kHz */ +- return adev->pm.dpm.dyn_state.max_clock_voltage_on_ac.sclk / 100; ++ if (amdgpu_powerplay) ++ return amdgpu_dpm_get_sclk(adev, false) / 100; ++ else ++ return adev->pm.dpm.dyn_state.max_clock_voltage_on_ac.sclk / 100; ++} ++ ++void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info) ++{ ++ struct amdgpu_device *adev = (struct amdgpu_device *)kgd; ++ struct amdgpu_cu_info acu_info; ++ ++ memset(cu_info, 0, sizeof(*cu_info)); ++ if (sizeof(cu_info->cu_bitmap) != sizeof(acu_info.bitmap)) ++ return; ++ ++ memset(&acu_info, 0, sizeof(acu_info)); ++ ++ cu_info->cu_active_number = acu_info.number; ++ cu_info->cu_ao_mask = acu_info.ao_cu_mask; ++ memcpy(&cu_info->cu_bitmap[0], &acu_info.bitmap[0], sizeof(acu_info.bitmap)); ++ cu_info->num_shader_engines = adev->gfx.config.max_shader_engines; ++ cu_info->num_shader_arrays_per_engine = adev->gfx.config.max_sh_per_se; ++ cu_info->num_cu_per_sh = adev->gfx.config.max_cu_per_sh; ++ cu_info->simd_per_cu = acu_info.simd_per_cu; ++ cu_info->max_waves_per_simd = acu_info.max_waves_per_simd; ++ cu_info->wave_front_size = acu_info.wave_front_size; ++ cu_info->max_scratch_slots_per_cu = acu_info.max_scratch_slots_per_cu; ++ cu_info->lds_size = acu_info.lds_size; ++} ++ ++int map_gtt_bo_to_kernel(struct kgd_dev *kgd, ++ struct kgd_mem *mem, void **kptr) ++{ ++ return 0; ++} ++ ++int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd, ++ struct kgd_dev **dma_buf_kgd, ++ uint64_t *bo_size, void *metadata_buffer, ++ size_t 
buffer_size, uint32_t *metadata_size, ++ uint32_t *flags) ++{ ++ struct amdgpu_device *adev = (struct amdgpu_device *)kgd; ++ struct dma_buf *dma_buf; ++ struct drm_gem_object *obj; ++ struct amdgpu_bo *bo; ++ uint64_t metadata_flags; ++ int r = -EINVAL; ++ ++ dma_buf = dma_buf_get(dma_buf_fd); ++ if (IS_ERR(dma_buf)) ++ return PTR_ERR(dma_buf); ++ ++ if (dma_buf->ops != &drm_gem_prime_dmabuf_ops) ++ /* Can't handle non-graphics buffers */ ++ goto out_put; ++ ++ obj = dma_buf->priv; ++ if (obj->dev->driver != adev->ddev->driver) ++ /* Can't handle buffers from different drivers */ ++ goto out_put; ++ ++ adev = obj->dev->dev_private; ++ bo = gem_to_amdgpu_bo(obj); ++ if (!(bo->initial_domain & (AMDGPU_GEM_DOMAIN_VRAM | ++ AMDGPU_GEM_DOMAIN_GTT))) ++ /* Only VRAM and GTT BOs are supported */ ++ goto out_put; ++ ++ r = 0; ++ if (dma_buf_kgd) ++ *dma_buf_kgd = (struct kgd_dev *)adev; ++ if (bo_size) ++ *bo_size = amdgpu_bo_size(bo); ++ if (metadata_size) ++ *metadata_size = bo->metadata_size; ++ if (metadata_buffer) ++ r = amdgpu_bo_get_metadata(bo, metadata_buffer, buffer_size, ++ metadata_size, &metadata_flags); ++ if (flags) { ++ *flags = (bo->initial_domain & AMDGPU_GEM_DOMAIN_VRAM) ? ++ ALLOC_MEM_FLAGS_VRAM : ALLOC_MEM_FLAGS_GTT; ++ ++ if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) ++ *flags |= ALLOC_MEM_FLAGS_PUBLIC; ++ } ++ ++out_put: ++ dma_buf_put(dma_buf); ++ return r; + } +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +index 8e8c10e..5fa506d 100755 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +@@ -27,14 +27,46 @@ + + #include <linux/types.h> + #include <linux/mm.h> ++#include <linux/workqueue.h> + #include <kgd_kfd_interface.h> + ++extern const struct kgd2kfd_calls *kgd2kfd; ++ + struct amdgpu_device; + ++struct kfd_bo_va_list { ++ struct list_head bo_list; ++ struct amdgpu_bo_va *bo_va; ++ void *kgd_dev; ++ bool is_mapped; ++}; ++ + struct kgd_mem { +- struct amdgpu_bo *bo; +- uint64_t gpu_addr; +- void *cpu_ptr; ++ union { ++ struct { ++ struct amdgpu_bo *bo; ++ uint64_t gpu_addr; ++ void *cpu_ptr; ++ } data1; ++ struct { ++ struct mutex lock; ++ struct amdgpu_bo *bo; ++ struct list_head bo_va_list; ++ uint32_t domain; ++ unsigned int mapped_to_gpu_memory; ++ void *kptr; ++ uint64_t va; ++ unsigned evicted; /* eviction counter */ ++ struct delayed_work work; /* for restore evicted mem */ ++ struct mm_struct *mm; /* for restore */ ++ /* flags bitfield */ ++ bool readonly : 1; ++ bool execute : 1; ++ bool no_substitute : 1; ++ bool aql_queue : 1; ++ } data2; ++ }; ++ + }; + + +@@ -51,17 +83,81 @@ void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev); + void amdgpu_amdkfd_device_init(struct amdgpu_device *adev); + void amdgpu_amdkfd_device_fini(struct amdgpu_device *adev); + ++int amdgpu_amdkfd_evict_mem(struct amdgpu_device *adev, struct kgd_mem *mem, ++ struct mm_struct *mm); ++int amdgpu_amdkfd_schedule_restore_mem(struct amdgpu_device *adev, ++ struct kgd_mem *mem, ++ struct mm_struct *mm, ++ unsigned long delay); ++void amdgpu_amdkfd_cancel_restore_mem(struct amdgpu_device *adev, ++ struct kgd_mem *mem); ++ + struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void); + struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void); + + /* Shared API */ ++int map_bo(struct amdgpu_device *rdev, uint64_t va, void *vm, ++ struct amdgpu_bo *bo, struct amdgpu_bo_va **bo_va); + int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, + void **mem_obj, uint64_t 
*gpu_addr, + void **cpu_ptr); + void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj); +-uint64_t get_vmem_size(struct kgd_dev *kgd); ++void get_local_mem_info(struct kgd_dev *kgd, ++ struct kfd_local_mem_info *mem_info); + uint64_t get_gpu_clock_counter(struct kgd_dev *kgd); + + uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd); ++void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info); ++int map_gtt_bo_to_kernel(struct kgd_dev *kgd, ++ struct kgd_mem *mem, void **kptr); ++int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd, ++ struct kgd_dev **dmabuf_kgd, ++ uint64_t *bo_size, void *metadata_buffer, ++ size_t buffer_size, uint32_t *metadata_size, ++ uint32_t *flags); ++ ++/* GPUVM API */ ++int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( ++ struct kgd_dev *kgd, uint64_t va, size_t size, ++ void *vm, struct kgd_mem **mem, ++ uint64_t *offset, void **kptr, ++ struct kfd_process_device *pdd, uint32_t flags); ++int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( ++ struct kgd_dev *kgd, struct kgd_mem *mem); ++int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( ++ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm); ++int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( ++ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm); ++ ++int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm); ++void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm); ++ ++uint32_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm); ++ ++int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd, ++ struct kfd_vm_fault_info *info); ++ ++int amdgpu_amdkfd_gpuvm_mmap_bo( ++ struct kgd_dev *kgd, struct vm_area_struct *vma); ++ ++int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd, ++ struct kgd_mem *mem, void **kptr); ++ ++struct kfd_process_device *amdgpu_amdkfd_gpuvm_get_pdd_from_buffer_object( ++ struct kgd_dev *kgd, struct kgd_mem *mem); ++int amdgpu_amdkfd_gpuvm_return_bo_size( ++ struct kgd_dev *kgd, struct kgd_mem *mem); ++ ++int amdgpu_amdkfd_gpuvm_pin_get_sg_table(struct kgd_dev *kgd, ++ struct kgd_mem *mem, uint64_t offset, ++ uint64_t size, struct sg_table **ret_sg); ++void amdgpu_amdkfd_gpuvm_unpin_put_sg_table( ++ struct kgd_mem *mem, struct sg_table *sg); ++int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd, int dma_buf_fd, ++ uint64_t va, void *vm, ++ struct kgd_mem **mem, uint64_t *size); ++int amdgpu_amdkfd_gpuvm_evict_mem(struct kgd_mem *mem, struct mm_struct *mm); ++int amdgpu_amdkfd_gpuvm_restore_mem(struct kgd_mem *mem, struct mm_struct *mm); + + #endif /* AMDGPU_AMDKFD_H_INCLUDED */ ++ +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +index e283d31..873e2b7 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +@@ -38,6 +38,9 @@ + #include "gmc/gmc_7_1_sh_mask.h" + #include "cik_structs.h" + ++ ++#define AMDKFD_SKIP_UNCOMPILED_CODE 1 ++ + enum { + MAX_TRAPID = 8, /* 3 bits in the bitfield. */ + MAX_WATCH_ADDRESSES = 4 +@@ -54,8 +57,8 @@ enum { + enum { + ADDRESS_WATCH_REG_CNTL_ATC_BIT = 0x10000000UL, + ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK = 0x00FFFFFF, +- ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENSION = 0x03000000, +- /* extend the mask to 26 bits to match the low address field */ ++ ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENTION = 0x03000000, ++ /* extend the mask to 26 bits in order to match the low address field. 
*/ + ADDRESS_WATCH_REG_ADDLOW_SHIFT = 6, + ADDRESS_WATCH_REG_ADDHIGH_MASK = 0xFFFF + }; +@@ -80,30 +83,43 @@ union TCP_WATCH_CNTL_BITS { + float f32All; + }; + ++static int create_process_vm(struct kgd_dev *kgd, void **vm); ++static void destroy_process_vm(struct kgd_dev *kgd, void *vm); ++ ++static uint32_t get_process_page_dir(void *vm); ++ ++static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, int fd, uint32_t handle, struct kgd_mem **mem); ++static int map_memory_to_gpu(struct kgd_dev *kgd, struct kgd_mem *mem, ++ void *vm); ++static int unmap_memory_from_gpu(struct kgd_dev *kgd, struct kgd_mem *mem, ++ void *vm); ++static int alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va, size_t size, ++ void *vm, struct kgd_mem **mem, ++ uint64_t *offset, void **kptr, struct kfd_process_device *pdd, ++ uint32_t flags); ++static int free_memory_of_gpu(struct kgd_dev *kgd, struct kgd_mem *mem); ++ ++static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); ++ + /* + * Register access functions + */ + +-static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, +- uint32_t sh_mem_config, uint32_t sh_mem_ape1_base, +- uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases); +- +-static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, +- unsigned int vmid); +- +-static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, +- uint32_t hpd_size, uint64_t hpd_gpu_addr); ++static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, uint32_t sh_mem_config, ++ uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases); ++static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, unsigned int vmid); ++static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, uint32_t hpd_size, uint64_t hpd_gpu_addr); + static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); + static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, +- uint32_t queue_id, uint32_t __user *wptr); ++ uint32_t queue_id, uint32_t __user *wptr, ++ uint32_t page_table_base); + static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd); + static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, +- uint32_t pipe_id, uint32_t queue_id); +- ++ uint32_t pipe_id, uint32_t queue_id); ++static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); + static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, + unsigned int utimeout, uint32_t pipe_id, + uint32_t queue_id); +-static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); + static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, + unsigned int utimeout); + static int kgd_address_watch_disable(struct kgd_dev *kgd); +@@ -123,15 +139,25 @@ static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, uint8_t vmid); + static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, + uint8_t vmid); + static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); +- +-static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); ++static void set_num_of_requests(struct kgd_dev *dev, uint8_t num_of_req); ++static int alloc_memory_of_scratch(struct kgd_dev *kgd, ++ uint64_t va, uint32_t vmid); ++static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, ++ uint8_t element_size, uint8_t index_stride, uint8_t mtype); ++static int mmap_bo(struct kgd_dev *kgd, struct vm_area_struct *vma); ++static void 
set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, ++ uint32_t page_table_base); + + static const struct kfd2kgd_calls kfd2kgd = { + .init_gtt_mem_allocation = alloc_gtt_mem, + .free_gtt_mem = free_gtt_mem, +- .get_vmem_size = get_vmem_size, ++ .get_local_mem_info = get_local_mem_info, + .get_gpu_clock_counter = get_gpu_clock_counter, + .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, ++ .create_process_vm = create_process_vm, ++ .destroy_process_vm = destroy_process_vm, ++ .get_process_page_dir = get_process_page_dir, ++ .open_graphic_handle = open_graphic_handle, + .program_sh_mem_settings = kgd_program_sh_mem_settings, + .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, + .init_pipeline = kgd_init_pipeline, +@@ -149,14 +175,103 @@ static const struct kfd2kgd_calls kfd2kgd = { + .get_atc_vmid_pasid_mapping_pasid = get_atc_vmid_pasid_mapping_pasid, + .get_atc_vmid_pasid_mapping_valid = get_atc_vmid_pasid_mapping_valid, + .write_vmid_invalidate_request = write_vmid_invalidate_request, +- .get_fw_version = get_fw_version ++ .alloc_memory_of_gpu = alloc_memory_of_gpu, ++ .free_memory_of_gpu = free_memory_of_gpu, ++ .map_memory_to_gpu = map_memory_to_gpu, ++ .unmap_memory_to_gpu = unmap_memory_from_gpu, ++ .get_fw_version = get_fw_version, ++ .set_num_of_requests = set_num_of_requests, ++ .get_cu_info = get_cu_info, ++ .alloc_memory_of_scratch = alloc_memory_of_scratch, ++ .write_config_static_mem = write_config_static_mem, ++ .mmap_bo = mmap_bo, ++ .map_gtt_bo_to_kernel = map_gtt_bo_to_kernel, ++ .set_vm_context_page_table_base = set_vm_context_page_table_base, ++ .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info, ++ .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info + }; + +-struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void) ++struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions() + { + return (struct kfd2kgd_calls *)&kfd2kgd; + } + ++/* ++ * Creates a VM context for HSA process ++ */ ++static int create_process_vm(struct kgd_dev *kgd, void **vm) ++{ ++ int ret; ++ struct amdgpu_vm *new_vm; ++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; ++ ++ BUG_ON(kgd == NULL); ++ BUG_ON(vm == NULL); ++ ++ new_vm = kzalloc(sizeof(struct amdgpu_vm), GFP_KERNEL); ++ if (new_vm == NULL) ++ return -ENOMEM; ++ ++ /* Initialize the VM context, allocate the page directory and zero it */ ++ ret = amdgpu_vm_init(adev, new_vm); ++ if (ret != 0) { ++ /* Undo everything related to the new VM context */ ++ amdgpu_vm_fini(adev, new_vm); ++ kfree(new_vm); ++ new_vm = NULL; ++ } ++ ++ /* Pin the PD directory*/ ++ amdgpu_bo_reserve(new_vm->page_directory, true); ++ amdgpu_bo_pin(new_vm->page_directory, AMDGPU_GEM_DOMAIN_VRAM, NULL); ++ amdgpu_bo_unreserve(new_vm->page_directory); ++#if 0 ++ new_vm->pd_gpu_addr = amdgpu_bo_gpu_offset(new_vm->page_directory); ++#endif ++ *vm = (void *) new_vm; ++ ++ return ret; ++} ++ ++/* ++ * Destroys a VM context of HSA process ++ */ ++static void destroy_process_vm(struct kgd_dev *kgd, void *vm) ++{ ++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; ++ struct amdgpu_vm *rvm = (struct amdgpu_vm *) vm; ++ ++ BUG_ON(kgd == NULL); ++ BUG_ON(vm == NULL); ++ ++ /* Unpin the PD directory*/ ++ amdgpu_bo_reserve(rvm->page_directory, true); ++ amdgpu_bo_unpin(rvm->page_directory); ++ amdgpu_bo_unreserve(rvm->page_directory); ++ ++ /* Release the VM context */ ++ amdgpu_vm_fini(adev, rvm); ++ kfree(vm); ++} ++ ++static uint32_t get_process_page_dir(void *vm) ++{ ++#if 0 ++ struct amdgpu_vm *rvm = (struct amdgpu_vm *) vm; 
++ ++ BUG_ON(vm == NULL); ++ ++ return rvm->pd_gpu_addr >> AMDGPU_GPU_PAGE_SHIFT; ++#endif ++ return 0; ++} ++ ++static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, ++ int fd, uint32_t handle, struct kgd_mem **mem) ++{ ++ return 0; ++} ++ + static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) + { + return (struct amdgpu_device *)kgd; +@@ -221,12 +336,11 @@ static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, + + /* + * We have to assume that there is no outstanding mapping. +- * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because +- * a mapping is in progress or because a mapping finished and the +- * SW cleared it. So the protocol is to always wait & clear. ++ * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because a mapping ++ * is in progress or because a mapping finished and the SW cleared it. ++ * So the protocol is to always wait & clear. + */ +- uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | +- ATC_VMID0_PASID_MAPPING__VALID_MASK; ++ uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | ATC_VMID0_PASID_MAPPING__VALID_MASK; + + WREG32(mmATC_VMID0_PASID_MAPPING + vmid, pasid_mapping); + +@@ -253,7 +367,7 @@ static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) + uint32_t mec; + uint32_t pipe; + +- mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; ++ mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; + pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); + + lock_srbm(kgd, mec, pipe, 0, 0); +@@ -272,8 +386,7 @@ static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m) + + retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET + + m->sdma_queue_id * KFD_CIK_SDMA_QUEUE_OFFSET; +- +- pr_debug("kfd: sdma base address: 0x%x\n", retval); ++ pr_err("kfd: sdma base address: 0x%x\n", retval); + + return retval; + } +@@ -289,7 +402,8 @@ static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) + } + + static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, +- uint32_t queue_id, uint32_t __user *wptr) ++ uint32_t queue_id, uint32_t __user *wptr, ++ uint32_t page_table_base) + { + struct amdgpu_device *adev = get_amdgpu_device(kgd); + uint32_t wptr_shadow, is_wptr_shadow_valid; +@@ -363,24 +477,13 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) + m = get_sdma_mqd(mqd); + sdma_base_addr = get_sdma_base_addr(m); + +- WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, +- m->sdma_rlc_virtual_addr); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, +- m->sdma_rlc_rb_base); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, +- m->sdma_rlc_virtual_addr); +- +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, +- m->sdma_rlc_rb_base_hi); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, +- m->sdma_rlc_rb_rptr_addr_lo); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, +- m->sdma_rlc_rb_rptr_addr_hi); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, +- m->sdma_rlc_doorbell); +- +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, +- m->sdma_rlc_rb_cntl); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, m->sdma_rlc_virtual_addr); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdma_rlc_rb_base); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, m->sdma_rlc_rb_base_hi); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, m->sdma_rlc_rb_rptr_addr_lo); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, m->sdma_rlc_rb_rptr_addr_hi); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 
m->sdma_rlc_doorbell); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, m->sdma_rlc_rb_cntl); + + return 0; + } +@@ -440,10 +543,11 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, + + while (true) { + temp = RREG32(mmCP_HQD_ACTIVE); +- if (temp & CP_HQD_ACTIVE__ACTIVE_MASK) ++ if (temp & CP_HQD_ACTIVE__ACTIVE__SHIFT) + break; + if (timeout <= 0) { +- pr_err("kfd: cp queue preemption time out.\n"); ++ pr_err("kfd: cp queue preemption time out (%dms)\n", ++ temp); + release_queue(kgd); + return -ETIME; + } +@@ -503,8 +607,8 @@ static int kgd_address_watch_disable(struct kgd_dev *kgd) + + /* Turning off this address until we set all the registers */ + for (i = 0; i < MAX_WATCH_ADDRESSES; i++) +- WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX + +- ADDRESS_WATCH_REG_CNTL], cntl.u32All); ++ WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], ++ cntl.u32All); + + return 0; + } +@@ -522,20 +626,20 @@ static int kgd_address_watch_execute(struct kgd_dev *kgd, + + /* Turning off this watch point until we set all the registers */ + cntl.bitfields.valid = 0; +- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + +- ADDRESS_WATCH_REG_CNTL], cntl.u32All); ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], ++ cntl.u32All); + +- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + +- ADDRESS_WATCH_REG_ADDR_HI], addr_hi); ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_HI], ++ addr_hi); + +- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + +- ADDRESS_WATCH_REG_ADDR_LO], addr_lo); ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_LO], ++ addr_lo); + + /* Enable the watch point */ + cntl.bitfields.valid = 1; + +- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + +- ADDRESS_WATCH_REG_CNTL], cntl.u32All); ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], ++ cntl.u32All); + + return 0; + } +@@ -589,7 +693,7 @@ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + + reg = RREG32(mmATC_VMID0_PASID_MAPPING + vmid); +- return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; ++ return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; + } + + static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) +@@ -599,6 +703,56 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) + WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); + } + ++static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, ++ uint8_t element_size, uint8_t index_stride, uint8_t mtype) ++{ ++ uint32_t reg; ++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; ++ ++ reg = swizzle_enable << SH_STATIC_MEM_CONFIG__SWIZZLE_ENABLE__SHIFT | ++ element_size << SH_STATIC_MEM_CONFIG__ELEMENT_SIZE__SHIFT | ++ index_stride << SH_STATIC_MEM_CONFIG__INDEX_STRIDE__SHIFT | ++ mtype << SH_STATIC_MEM_CONFIG__PRIVATE_MTYPE__SHIFT; ++ ++ WREG32(mmSH_STATIC_MEM_CONFIG, reg); ++ return 0; ++} ++static int alloc_memory_of_scratch(struct kgd_dev *kgd, ++ uint64_t va, uint32_t vmid) ++{ ++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; ++ ++ lock_srbm(kgd, 0, 0, 0, vmid); ++ WREG32(mmSH_HIDDEN_PRIVATE_BASE_VMID, va); ++ unlock_srbm(kgd); ++ ++ return 0; ++} ++ ++ ++static int alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va, size_t size, ++ void *vm, struct kgd_mem **mem, uint64_t *offset, ++ void **kptr, struct kfd_process_device 
*pdd, uint32_t flags) ++{ ++ return -EFAULT; ++} ++ ++static int free_memory_of_gpu(struct kgd_dev *kgd, struct kgd_mem *mem) ++{ ++ return -EFAULT; ++} ++ ++static int map_memory_to_gpu(struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) ++{ ++ return -EFAULT; ++} ++ ++static int unmap_memory_from_gpu(struct kgd_dev *kgd, struct kgd_mem *mem, ++ void *vm) ++{ ++ return -EFAULT; ++} ++ + static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) + { + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; +@@ -639,12 +793,12 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) + + case KGD_ENGINE_SDMA1: + hdr = (const union amdgpu_firmware_header *) +- adev->sdma.instance[0].fw->data; ++ adev->sdma[0].fw->data; + break; + + case KGD_ENGINE_SDMA2: + hdr = (const union amdgpu_firmware_header *) +- adev->sdma.instance[1].fw->data; ++ adev->sdma[1].fw->data; + break; + + default: +@@ -658,3 +812,32 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) + return hdr->common.ucode_version; + } + ++static void set_num_of_requests(struct kgd_dev *dev, uint8_t num_of_req) ++{ ++ uint32_t value; ++ struct amdgpu_device *adev = get_amdgpu_device(dev); ++ ++ value = RREG32(mmATC_ATS_DEBUG); ++ value &= ~ATC_ATS_DEBUG__NUM_REQUESTS_AT_ERR_MASK; ++ value |= (num_of_req << ATC_ATS_DEBUG__NUM_REQUESTS_AT_ERR__SHIFT); ++ ++ WREG32(mmATC_ATS_DEBUG, value); ++} ++ ++static int mmap_bo(struct kgd_dev *kgd, struct vm_area_struct *vma) ++{ ++ return 0; ++} ++ ++static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, ++ uint32_t page_table_base) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ /* TODO: Don't use hardcoded VMIDs */ ++ if (vmid < 8 || vmid > 15) { ++ pr_err("amdkfd: trying to set page table base for wrong VMID\n"); ++ return; ++ } ++ WREG32(mmVM_CONTEXT8_PAGE_TABLE_BASE_ADDR + vmid - 8, page_table_base); ++} ++ +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +index e00fadd..aeca2b6 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +@@ -28,6 +28,7 @@ + #include "amdgpu.h" + #include "amdgpu_amdkfd.h" + #include "amdgpu_ucode.h" ++#include "amdgpu_amdkfd_gfx_v8.h" + #include "gca/gfx_8_0_sh_mask.h" + #include "gca/gfx_8_0_d.h" + #include "gca/gfx_8_0_enum.h" +@@ -38,7 +39,24 @@ + #include "vi_structs.h" + #include "vid.h" + +-struct cik_sdma_rlc_registers; ++static const uint32_t watchRegs[MAX_WATCH_ADDRESSES * ADDRESS_WATCH_REG_MAX] = { ++ mmTCP_WATCH0_ADDR_H, mmTCP_WATCH0_ADDR_L, mmTCP_WATCH0_CNTL, ++ mmTCP_WATCH1_ADDR_H, mmTCP_WATCH1_ADDR_L, mmTCP_WATCH1_CNTL, ++ mmTCP_WATCH2_ADDR_H, mmTCP_WATCH2_ADDR_L, mmTCP_WATCH2_CNTL, ++ mmTCP_WATCH3_ADDR_H, mmTCP_WATCH3_ADDR_L, mmTCP_WATCH3_CNTL ++}; ++ ++ ++struct vi_sdma_mqd; ++ ++static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, ++ void *vm, struct kgd_mem **mem); ++static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem); ++ ++static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, ++ int fd, uint32_t handle, struct kgd_mem **mem); ++ ++static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); + + /* + * Register access functions +@@ -54,7 +72,8 @@ static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, + uint32_t hpd_size, uint64_t hpd_gpu_addr); + static int kgd_init_interrupts(struct kgd_dev *kgd, 
uint32_t pipe_id); + static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, +- uint32_t queue_id, uint32_t __user *wptr); ++ uint32_t queue_id, uint32_t __user *wptr, ++ uint32_t page_table_base); + static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd); + static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, + uint32_t pipe_id, uint32_t queue_id); +@@ -83,14 +102,27 @@ static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, + static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, + uint8_t vmid); + static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); +-static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); ++static void set_num_of_requests(struct kgd_dev *kgd, ++ uint8_t num_of_requests); ++static int alloc_memory_of_scratch(struct kgd_dev *kgd, ++ uint64_t va, uint32_t vmid); ++static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, ++ uint8_t element_size, uint8_t index_stride, uint8_t mtype); ++static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, ++ uint32_t page_table_base); + + static const struct kfd2kgd_calls kfd2kgd = { + .init_gtt_mem_allocation = alloc_gtt_mem, + .free_gtt_mem = free_gtt_mem, +- .get_vmem_size = get_vmem_size, ++ .get_local_mem_info = get_local_mem_info, + .get_gpu_clock_counter = get_gpu_clock_counter, + .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, ++ .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm, ++ .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm, ++ .create_process_gpumem = create_process_gpumem, ++ .destroy_process_gpumem = destroy_process_gpumem, ++ .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir, ++ .open_graphic_handle = open_graphic_handle, + .program_sh_mem_settings = kgd_program_sh_mem_settings, + .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, + .init_pipeline = kgd_init_pipeline, +@@ -110,14 +142,52 @@ static const struct kfd2kgd_calls kfd2kgd = { + .get_atc_vmid_pasid_mapping_valid = + get_atc_vmid_pasid_mapping_valid, + .write_vmid_invalidate_request = write_vmid_invalidate_request, +- .get_fw_version = get_fw_version ++ .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, ++ .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, ++ .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, ++ .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu, ++ .get_fw_version = get_fw_version, ++ .set_num_of_requests = set_num_of_requests, ++ .get_cu_info = get_cu_info, ++ .set_num_of_requests = set_num_of_requests, ++ .alloc_memory_of_scratch = alloc_memory_of_scratch, ++ .write_config_static_mem = write_config_static_mem, ++ .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo, ++ .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel, ++ .set_vm_context_page_table_base = set_vm_context_page_table_base, ++ .get_pdd_from_buffer_object = ++ amdgpu_amdkfd_gpuvm_get_pdd_from_buffer_object, ++ .return_bo_size = amdgpu_amdkfd_gpuvm_return_bo_size, ++ .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table, ++ .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table, ++ .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info, ++ .import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf, ++ .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info + }; + +-struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void) ++struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions() + { + return (struct 
kfd2kgd_calls *)&kfd2kgd; + } + ++static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, ++ void *vm, struct kgd_mem **mem) ++{ ++ return 0; ++} ++ ++/* Destroys the GPU allocation and frees the kgd_mem structure */ ++static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem) ++{ ++ ++} ++ ++static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, ++ int fd, uint32_t handle, struct kgd_mem **mem) ++{ ++ return 0; ++} ++ + static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) + { + return (struct amdgpu_device *)kgd; +@@ -227,9 +297,15 @@ static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) + return 0; + } + +-static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m) ++static inline uint32_t get_sdma_base_addr(struct vi_sdma_mqd *m) + { +- return 0; ++ uint32_t retval; ++ ++ retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET + ++ m->sdma_queue_id * KFD_VI_SDMA_QUEUE_OFFSET; ++ pr_debug("kfd: sdma base address: 0x%x\n", retval); ++ ++ return retval; + } + + static inline struct vi_mqd *get_mqd(void *mqd) +@@ -237,13 +313,14 @@ static inline struct vi_mqd *get_mqd(void *mqd) + return (struct vi_mqd *)mqd; + } + +-static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) ++static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) + { +- return (struct cik_sdma_rlc_registers *)mqd; ++ return (struct vi_sdma_mqd *)mqd; + } + + static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, +- uint32_t queue_id, uint32_t __user *wptr) ++ uint32_t queue_id, uint32_t __user *wptr, ++ uint32_t page_table_base) + { + struct vi_mqd *m; + uint32_t shadow_wptr, valid_wptr; +@@ -306,6 +383,49 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, + + static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) + { ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ struct vi_sdma_mqd *m; ++ uint32_t sdma_base_addr; ++ uint32_t temp, timeout = 2000; ++ uint32_t data; ++ ++ ++ m = get_sdma_mqd(mqd); ++ sdma_base_addr = get_sdma_base_addr(m); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, ++ m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); ++ ++ while (true) { ++ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); ++ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) ++ break; ++ if (timeout == 0) ++ return -ETIME; ++ msleep(10); ++ timeout -= 10; ++ } ++ if (m->sdma_engine_id) { ++ data = RREG32(mmSDMA1_GFX_CONTEXT_CNTL); ++ data = REG_SET_FIELD(data, SDMA1_GFX_CONTEXT_CNTL, ++ RESUME_CTX, 0); ++ WREG32(mmSDMA1_GFX_CONTEXT_CNTL, data); ++ } else { ++ data = RREG32(mmSDMA0_GFX_CONTEXT_CNTL); ++ data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, ++ RESUME_CTX, 0); ++ WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data); ++ } ++ ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, m->sdmax_rlcx_doorbell); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, m->sdmax_rlcx_virtual_addr); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, m->sdmax_rlcx_rb_base_hi); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, m->sdmax_rlcx_rb_rptr_addr_lo); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, m->sdmax_rlcx_rb_rptr_addr_hi); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, m->sdmax_rlcx_rb_cntl); ++ + return 0; + } + +@@ -334,7 +454,7 @@ static bool 
kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, + static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) + { + struct amdgpu_device *adev = get_amdgpu_device(kgd); +- struct cik_sdma_rlc_registers *m; ++ struct vi_sdma_mqd *m; + uint32_t sdma_base_addr; + uint32_t sdma_rlc_rb_cntl; + +@@ -382,7 +502,7 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, + unsigned int utimeout) + { + struct amdgpu_device *adev = get_amdgpu_device(kgd); +- struct cik_sdma_rlc_registers *m; ++ struct vi_sdma_mqd *m; + uint32_t sdma_base_addr; + uint32_t temp; + int timeout = utimeout; +@@ -396,7 +516,7 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, + + while (true) { + temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); +- if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT) ++ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) + break; + if (timeout <= 0) + return -ETIME; +@@ -405,9 +525,9 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, + } + + WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, 0); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, ++ RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | ++ SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); + + return 0; + } +@@ -429,7 +549,7 @@ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + + reg = RREG32(mmATC_VMID0_PASID_MAPPING + vmid); +- return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; ++ return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; + } + + static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) +@@ -441,6 +561,21 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) + + static int kgd_address_watch_disable(struct kgd_dev *kgd) + { ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ union TCP_WATCH_CNTL_BITS cntl; ++ unsigned int i; ++ ++ cntl.u32All = 0; ++ ++ cntl.bitfields.valid = 0; ++ cntl.bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK; ++ cntl.bitfields.atc = 1; ++ ++ /* Turning off this address until we set all the registers */ ++ for (i = 0; i < MAX_WATCH_ADDRESSES; i++) ++ WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], ++ cntl.u32All); ++ + return 0; + } + +@@ -450,6 +585,28 @@ static int kgd_address_watch_execute(struct kgd_dev *kgd, + uint32_t addr_hi, + uint32_t addr_lo) + { ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ union TCP_WATCH_CNTL_BITS cntl; ++ ++ cntl.u32All = cntl_val; ++ ++ /* Turning off this watch point until we set all the registers */ ++ cntl.bitfields.valid = 0; ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], ++ cntl.u32All); ++ ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_HI], ++ addr_hi); ++ ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_LO], ++ addr_lo); ++ ++ /* Enable the watch point */ ++ cntl.bitfields.valid = 1; ++ ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], ++ cntl.u32All); ++ + return 0; + } + +@@ -482,6 +639,32 @@ static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, + unsigned int watch_point_id, + unsigned int reg_offset) + { ++ return watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + reg_offset]; ++} ++ ++static int 
write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, ++ uint8_t element_size, uint8_t index_stride, uint8_t mtype) ++{ ++ uint32_t reg; ++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; ++ ++ reg = swizzle_enable << SH_STATIC_MEM_CONFIG__SWIZZLE_ENABLE__SHIFT | ++ element_size << SH_STATIC_MEM_CONFIG__ELEMENT_SIZE__SHIFT | ++ index_stride << SH_STATIC_MEM_CONFIG__INDEX_STRIDE__SHIFT | ++ mtype << SH_STATIC_MEM_CONFIG__PRIVATE_MTYPE__SHIFT; ++ ++ WREG32(mmSH_STATIC_MEM_CONFIG, reg); ++ return 0; ++} ++static int alloc_memory_of_scratch(struct kgd_dev *kgd, ++ uint64_t va, uint32_t vmid) ++{ ++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; ++ ++ lock_srbm(kgd, 0, 0, 0, vmid); ++ WREG32(mmSH_HIDDEN_PRIVATE_BASE_VMID, va); ++ unlock_srbm(kgd); ++ + return 0; + } + +@@ -525,12 +708,12 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) + + case KGD_ENGINE_SDMA1: + hdr = (const union amdgpu_firmware_header *) +- adev->sdma.instance[0].fw->data; ++ adev->sdma[0].fw->data; + break; + + case KGD_ENGINE_SDMA2: + hdr = (const union amdgpu_firmware_header *) +- adev->sdma.instance[1].fw->data; ++ adev->sdma[1].fw->data; + break; + + default: +@@ -543,3 +726,21 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) + /* Only 12 bit in use*/ + return hdr->common.ucode_version; + } ++ ++static void set_num_of_requests(struct kgd_dev *kgd, ++ uint8_t num_of_requests) ++{ ++ pr_debug("in %s this is a stub\n", __func__); ++} ++ ++static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, ++ uint32_t page_table_base) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ /* TODO: Don't use hardcoded VMIDs */ ++ if (vmid < 8 || vmid > 15) { ++ pr_err("amdkfd: trying to set page table base for wrong VMID\n"); ++ return; ++ } ++ WREG32(mmVM_CONTEXT8_PAGE_TABLE_BASE_ADDR + vmid - 8, page_table_base); ++} +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +new file mode 100644 +index 0000000..454c247 +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +@@ -0,0 +1,1619 @@ ++/* ++ * Copyright 2014 Advanced Micro Devices, Inc. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. 
++ */ ++ ++#include <linux/module.h> ++#include <linux/fdtable.h> ++#include <linux/uaccess.h> ++#include <linux/firmware.h> ++#include <linux/list.h> ++#include <drm/drmP.h> ++#include <linux/dma-buf.h> ++#include "amdgpu.h" ++#include "amdgpu_amdkfd.h" ++#include "amdgpu_ucode.h" ++#include "gca/gfx_8_0_sh_mask.h" ++#include "gca/gfx_8_0_d.h" ++#include "gca/gfx_8_0_enum.h" ++#include "oss/oss_3_0_sh_mask.h" ++#include "oss/oss_3_0_d.h" ++#include "gmc/gmc_8_1_sh_mask.h" ++#include "gmc/gmc_8_1_d.h" ++#include "vi_structs.h" ++#include "vid.h" ++ ++/* Special VM and GART address alignment needed for VI pre-Fiji due to ++ * a HW bug. */ ++#define VI_BO_SIZE_ALIGN (0x8000) ++ ++static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) ++{ ++ return (struct amdgpu_device *)kgd; ++} ++ ++struct kfd_process_device *amdgpu_amdkfd_gpuvm_get_pdd_from_buffer_object( ++ struct kgd_dev *kgd, struct kgd_mem *mem) ++{ ++ return mem->data2.bo->pdd; ++} ++ ++static bool check_if_add_bo_to_vm(struct amdgpu_vm *avm, ++ struct list_head *list_bo_va) ++{ ++ struct kfd_bo_va_list *entry; ++ ++ list_for_each_entry(entry, list_bo_va, bo_list) ++ if (entry->bo_va->vm == avm) ++ return false; ++ ++ return true; ++} ++ ++static int add_bo_to_vm(struct amdgpu_device *adev, uint64_t va, ++ struct amdgpu_vm *avm, struct amdgpu_bo *bo, ++ struct list_head *list_bo_va, ++ bool readonly, bool execute) ++{ ++ int ret; ++ struct kfd_bo_va_list *bo_va_entry; ++ uint32_t flags; ++ ++ bo_va_entry = kzalloc(sizeof(*bo_va_entry), GFP_KERNEL); ++ if (!bo_va_entry) ++ return -ENOMEM; ++ ++ BUG_ON(va == 0); ++ ++ pr_debug("amdkfd: adding bo_va to bo %p and va 0x%llx id 0x%x\n", ++ bo, va, adev->dev->id); ++ ++ amdgpu_bo_reserve(bo, true); ++ ++ /* Add BO to VM internal data structures*/ ++ bo_va_entry->bo_va = amdgpu_vm_bo_add(adev, avm, bo); ++ if (bo_va_entry->bo_va == NULL) { ++ ret = -EINVAL; ++ pr_err("amdkfd: Failed to add BO object to VM. ret == %d\n", ++ ret); ++ goto err_vmadd; ++ } ++ ++ flags = AMDGPU_PTE_READABLE | AMDGPU_PTE_WRITEABLE; ++ if (readonly) ++ flags = AMDGPU_PTE_READABLE; ++ if (execute) ++ flags |= AMDGPU_PTE_EXECUTABLE; ++ ++ /* Set virtual address for the allocation, allocate PTs, ++ * if needed, and zero them */ ++ ret = amdgpu_vm_bo_map(adev, bo_va_entry->bo_va, ++ va, 0, amdgpu_bo_size(bo), ++ flags | AMDGPU_PTE_VALID); ++ if (ret != 0) { ++ pr_err("amdkfd: Failed to set virtual address for BO. ret == %d (0x%llx)\n", ++ ret, va); ++ goto err_vmsetaddr; ++ } ++ ++ bo_va_entry->kgd_dev = (void *)adev; ++ bo_va_entry->is_mapped = false; ++ list_add(&bo_va_entry->bo_list, list_bo_va); ++ ++ return 0; ++ ++err_vmsetaddr: ++ amdgpu_bo_reserve(bo, true); ++ amdgpu_vm_bo_rmv(adev, bo_va_entry->bo_va); ++ /* This will put the bo_va_mapping on the vm->freed ++ * list. amdgpu_vm_clear_freed needs the PTs to be reserved so ++ * we don't call it here. That can wait until the next time ++ * the page tables are updated for a map or unmap. 
*/ ++ kfree(bo_va_entry); ++err_vmadd: ++ amdgpu_bo_unreserve(bo); ++ return ret; ++} ++ ++static void remove_bo_from_vm(struct amdgpu_device *adev, ++ struct amdgpu_bo *bo, struct amdgpu_bo_va *bo_va) ++{ ++ amdgpu_bo_reserve(bo, true); ++ amdgpu_vm_bo_rmv(adev, bo_va); ++ amdgpu_bo_unreserve(bo); ++} ++ ++ ++static int try_pin_bo(struct amdgpu_bo *bo, uint64_t *mc_address, bool resv, ++ uint32_t domain) ++{ ++ int ret = 0; ++ uint64_t temp; ++ ++ if (resv) { ++ ret = amdgpu_bo_reserve(bo, true); ++ if (ret != 0) ++ return ret; ++ } ++ ++ if (!amdgpu_ttm_tt_has_userptr(bo->tbo.ttm)) { ++ ret = amdgpu_bo_pin(bo, domain, &temp); ++ if (mc_address) ++ *mc_address = temp; ++ if (ret != 0) ++ goto error; ++ if (domain == AMDGPU_GEM_DOMAIN_GTT) { ++ ret = amdgpu_bo_kmap(bo, NULL); ++ if (ret != 0) { ++ pr_err("amdgpu: failed kmap GTT BO\n"); ++ goto error; ++ } ++ } ++ } else { ++ /* amdgpu_bo_pin doesn't support userptr. Therefore we ++ * can use the bo->pin_count for our version of ++ * pinning without conflict. */ ++ if (bo->pin_count == 0) { ++ amdgpu_ttm_placement_from_domain(bo, domain); ++ ret = ttm_bo_validate(&bo->tbo, &bo->placement, ++ true, false); ++ if (ret != 0) { ++ pr_err("amdgpu: failed to validate BO\n"); ++ goto error; ++ } ++ } ++ bo->pin_count++; ++ } ++ ++error: ++ if (resv) ++ amdgpu_bo_unreserve(bo); ++ ++ return ret; ++} ++ ++static int unpin_bo(struct amdgpu_bo *bo, bool resv) ++{ ++ int ret = 0; ++ ++ if (resv) { ++ ret = amdgpu_bo_reserve(bo, true); ++ if (ret != 0) ++ return ret; ++ } ++ ++ amdgpu_bo_kunmap(bo); ++ ++ if (!amdgpu_ttm_tt_has_userptr(bo->tbo.ttm)) { ++ ret = amdgpu_bo_unpin(bo); ++ if (ret != 0) ++ goto error; ++ } else if (--bo->pin_count == 0) { ++ amdgpu_ttm_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU); ++ ret = ttm_bo_validate(&bo->tbo, &bo->placement, true, false); ++ if (ret != 0) { ++ pr_err("amdgpu: failed to validate BO\n"); ++ goto error; ++ } ++ } ++ ++error: ++ if (resv) ++ amdgpu_bo_unreserve(bo); ++ ++ return ret; ++} ++ ++ ++static int try_pin_pts(struct amdgpu_bo_va *bo_va, bool resv) ++{ ++ int ret; ++ uint64_t pt_idx, start, last, failed; ++ struct amdgpu_vm *vm; ++ struct amdgpu_bo_va_mapping *mapping; ++ ++ vm = bo_va->vm; ++ list_for_each_entry(mapping, &bo_va->valids, list) { ++ start = mapping->it.start >> amdgpu_vm_block_size; ++ last = mapping->it.last >> amdgpu_vm_block_size; ++ ++ pr_debug("start PT index %llu last PT index %llu\n", start, last); ++ ++ /* walk over the address space and pin the page tables BOs*/ ++ for (pt_idx = start; pt_idx <= last; pt_idx++) { ++ ret = try_pin_bo(vm->page_tables[pt_idx].bo, NULL, resv, ++ AMDGPU_GEM_DOMAIN_VRAM); ++ if (ret != 0) { ++ failed = pt_idx; ++ goto err; ++ } ++ } ++ } ++ ++ list_for_each_entry(mapping, &bo_va->invalids, list) { ++ start = mapping->it.start >> amdgpu_vm_block_size; ++ last = mapping->it.last >> amdgpu_vm_block_size; ++ ++ pr_debug("start PT index %llu last PT index %llu\n", start, last); ++ ++ /* walk over the address space and pin the page tables BOs*/ ++ for (pt_idx = start; pt_idx <= last; pt_idx++) { ++ ret = try_pin_bo(vm->page_tables[pt_idx].bo, NULL, resv, ++ AMDGPU_GEM_DOMAIN_VRAM); ++ if (ret != 0) { ++ failed = pt_idx; ++ goto err; ++ } ++ } ++ } ++ ++ return 0; ++ ++err: ++ pr_err("amdgpu: Failed to pin BO's PTEs\n"); ++ /* Unpin all already pinned BOs*/ ++ if (failed > 0) { ++ for (pt_idx = start; pt_idx <= failed - 1; pt_idx++) ++ unpin_bo(vm->page_tables[pt_idx].bo, resv); ++ } ++ return ret; ++} ++ ++static void unpin_pts(struct amdgpu_bo_va 
*bo_va, struct amdgpu_vm *vm, ++ bool resv) ++{ ++ uint64_t pt_idx, start, last; ++ struct amdgpu_bo_va_mapping *mapping; ++ ++ list_for_each_entry(mapping, &bo_va->valids, list) { ++ start = mapping->it.start >> amdgpu_vm_block_size; ++ last = mapping->it.last >> amdgpu_vm_block_size; ++ ++ pr_debug("start PT index %llu last PT index %llu\n", start, last); ++ ++ /* walk over the address space and unpin the page tables BOs*/ ++ for (pt_idx = start; pt_idx <= last; pt_idx++) ++ unpin_bo(vm->page_tables[pt_idx].bo, resv); ++ } ++ ++ list_for_each_entry(mapping, &bo_va->invalids, list) { ++ start = mapping->it.start >> amdgpu_vm_block_size; ++ last = mapping->it.last >> amdgpu_vm_block_size; ++ ++ pr_debug("start PT index %llu last PT index %llu\n", start, last); ++ ++ /* walk over the address space and unpin the page tables BOs*/ ++ for (pt_idx = start; pt_idx <= last; pt_idx++) ++ unpin_bo(vm->page_tables[pt_idx].bo, resv); ++ } ++} ++ ++static int __alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va, ++ size_t size, void *vm, struct kgd_mem **mem, ++ uint64_t *offset, void **kptr, struct kfd_process_device *pdd, ++ u32 domain, u64 flags, bool aql_queue, ++ bool readonly, bool execute, bool no_sub, bool userptr) ++{ ++ struct amdgpu_device *adev; ++ int ret; ++ struct amdgpu_bo *bo; ++ uint64_t user_addr = 0; ++ int byte_align; ++ ++ BUG_ON(kgd == NULL); ++ BUG_ON(size == 0); ++ BUG_ON(mem == NULL); ++ BUG_ON(vm == NULL); ++ ++ if (aql_queue) ++ size = size >> 1; ++ if (userptr) { ++ if (!offset || !*offset) ++ return -EINVAL; ++ user_addr = *offset; ++ } ++ ++ adev = get_amdgpu_device(kgd); ++ byte_align = adev->asic_type != CHIP_FIJI ? VI_BO_SIZE_ALIGN : 1; ++ ++ *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); ++ if (*mem == NULL) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ INIT_LIST_HEAD(&(*mem)->data2.bo_va_list); ++ mutex_init(&(*mem)->data2.lock); ++ (*mem)->data2.readonly = readonly; ++ (*mem)->data2.execute = execute; ++ (*mem)->data2.no_substitute = no_sub; ++ (*mem)->data2.aql_queue = aql_queue; ++ ++ pr_debug("amdkfd: allocating GTT BO size %lu\n", size); ++ ++ /* Allocate buffer object. Userptr objects need to start out ++ * in the CPU domain, get moved to GTT when pinned. */ ++ ret = amdgpu_bo_create(adev, size, byte_align, false, ++ userptr ? AMDGPU_GEM_DOMAIN_CPU : domain, ++ flags, NULL, NULL, &bo); ++ if (ret != 0) { ++ pr_err("amdkfd: Failed to create BO object on GTT. 
ret == %d\n", ++ ret); ++ goto err_bo_create; ++ } ++ bo->kfd_bo = *mem; ++ bo->pdd = pdd; ++ (*mem)->data2.bo = bo; ++ ++ pr_debug("Created BO on GTT with size %zu bytes\n", size); ++ ++ if (userptr) { ++ ret = amdgpu_ttm_tt_set_userptr(bo->tbo.ttm, user_addr, ++ AMDGPU_GEM_USERPTR_ANONONLY); ++ if (ret) { ++ dev_err(adev->dev, ++ "(%d) failed to set userptr\n", ret); ++ goto allocate_mem_set_userptr_failed; ++ } ++ ++ ret = amdgpu_mn_register(bo, user_addr); ++ if (ret) { ++ dev_err(adev->dev, ++ "(%d) failed to register MMU notifier\n", ret); ++ goto allocate_mem_set_userptr_failed; ++ } ++ } ++ ++ ret = add_bo_to_vm(adev, va, vm, bo, &(*mem)->data2.bo_va_list, ++ (*mem)->data2.readonly, (*mem)->data2.execute); ++ if (ret != 0) ++ goto err_map; ++ ++ if (aql_queue) { ++ ret = add_bo_to_vm(adev, va + size, ++ vm, bo, &(*mem)->data2.bo_va_list, ++ (*mem)->data2.readonly, (*mem)->data2.execute); ++ if (ret != 0) ++ goto err_map; ++ } ++ ++ pr_debug("Set BO to VA %p\n", (void *) va); ++ ++ if (kptr) { ++ ret = amdgpu_bo_reserve(bo, true); ++ if (ret) { ++ dev_err(adev->dev, "(%d) failed to reserve bo for amdkfd\n", ret); ++ goto allocate_mem_reserve_bo_failed; ++ } ++ ++ ret = amdgpu_bo_pin(bo, domain, ++ NULL); ++ if (ret) { ++ dev_err(adev->dev, "(%d) failed to pin bo for amdkfd\n", ret); ++ goto allocate_mem_pin_bo_failed; ++ } ++ ++ ret = amdgpu_bo_kmap(bo, kptr); ++ if (ret) { ++ dev_err(adev->dev, ++ "(%d) failed to map bo to kernel for amdkfd\n", ret); ++ goto allocate_mem_kmap_bo_failed; ++ } ++ (*mem)->data2.kptr = *kptr; ++ ++ amdgpu_bo_unreserve(bo); ++ } ++ ++ (*mem)->data2.va = va; ++ (*mem)->data2.domain = domain; ++ (*mem)->data2.mapped_to_gpu_memory = 0; ++ ++ if (offset) ++ *offset = amdgpu_bo_mmap_offset(bo); ++ ++ return 0; ++ ++allocate_mem_kmap_bo_failed: ++ amdgpu_bo_unpin(bo); ++allocate_mem_pin_bo_failed: ++ amdgpu_bo_unreserve(bo); ++allocate_mem_reserve_bo_failed: ++err_map: ++ if (userptr) ++ amdgpu_mn_unregister(bo); ++allocate_mem_set_userptr_failed: ++ amdgpu_bo_unref(&bo); ++err_bo_create: ++ kfree(*mem); ++err: ++ return ret; ++} ++ ++/* Reserving a BO and its page table BOs must happen atomically to ++ * avoid deadlocks. When updating userptrs we need to temporarily ++ * back-off the reservation and then reacquire it. Track all the ++ * reservation info in a context structure. Buffers can be mapped to ++ * multiple VMs simultaneously (buffers being restored on multiple ++ * GPUs). 
*/ ++struct bo_vm_reservation_context { ++ struct amdgpu_bo_list_entry kfd_bo; ++ unsigned n_vms; ++ struct amdgpu_bo_list_entry **vm_bos; ++ struct ww_acquire_ctx ticket; ++ struct list_head list, duplicates; ++ bool reserved; ++}; ++ ++static int reserve_bo_and_vms(struct amdgpu_device *adev, struct amdgpu_bo *bo, ++ struct list_head *bo_va_list, ++ struct amdgpu_vm *vm, bool is_mapped, ++ struct bo_vm_reservation_context *ctx) ++{ ++ struct kfd_bo_va_list *entry; ++ unsigned i; ++ int ret; ++ ++ INIT_LIST_HEAD(&ctx->list); ++ INIT_LIST_HEAD(&ctx->duplicates); ++ ++ ctx->kfd_bo.robj = bo; ++ ctx->kfd_bo.prefered_domains = bo->initial_domain; ++ ctx->kfd_bo.allowed_domains = bo->initial_domain; ++ ctx->kfd_bo.priority = 0; ++ ctx->kfd_bo.tv.bo = &bo->tbo; ++ ctx->kfd_bo.tv.shared = true; ++ ctx->kfd_bo.user_pages = NULL; ++ list_add(&ctx->kfd_bo.tv.head, &ctx->list); ++ ++ ctx->reserved = false; ++ ++ ctx->n_vms = 0; ++ list_for_each_entry(entry, bo_va_list, bo_list) { ++ if ((vm && vm != entry->bo_va->vm) || ++ entry->is_mapped != is_mapped) ++ continue; ++ ctx->n_vms++; ++ } ++ if (ctx->n_vms == 0) ++ ctx->vm_bos = NULL; ++ else { ++ ctx->vm_bos = kzalloc(sizeof(struct amdgpu_bo_list_entry *) ++ * ctx->n_vms, GFP_KERNEL); ++ if (ctx->vm_bos == NULL) ++ return -ENOMEM; ++ } ++ ++ i = 0; ++ list_for_each_entry(entry, bo_va_list, bo_list) { ++ if ((vm && vm != entry->bo_va->vm) || ++ entry->is_mapped != is_mapped) ++ continue; ++ ++ ctx->vm_bos[i] = amdgpu_vm_get_bos(adev, entry->bo_va->vm, ++ &ctx->list); ++ if (!ctx->vm_bos[i]) { ++ pr_err("amdkfd: Failed to get bos from vm\n"); ++ ret = -ENOMEM; ++ goto out; ++ } ++ i++; ++ } ++ ++ ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list, ++ false, &ctx->duplicates); ++ if (!ret) ++ ctx->reserved = true; ++ else ++ pr_err("amdkfd: Failed to reserve buffers in ttm\n"); ++ ++out: ++ if (ret) { ++ for (i = 0; i < ctx->n_vms; i++) { ++ if (ctx->vm_bos[i]) ++ drm_free_large(ctx->vm_bos[i]); ++ } ++ kfree(ctx->vm_bos); ++ ctx->vm_bos = NULL; ++ } ++ ++ return ret; ++} ++ ++static void unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx, ++ bool wait) ++{ ++ if (wait) { ++ struct ttm_validate_buffer *entry; ++ int ret; ++ ++ list_for_each_entry(entry, &ctx->list, head) { ++ ret = ttm_bo_wait(entry->bo, false, false, false); ++ if (ret != 0) ++ pr_err("amdkfd: Failed to wait for PT/PD update (err == %d)\n", ++ ret); ++ } ++ } ++ if (ctx->reserved) ++ ttm_eu_backoff_reservation(&ctx->ticket, &ctx->list); ++ if (ctx->vm_bos) { ++ unsigned i; ++ ++ for (i = 0; i < ctx->n_vms; i++) { ++ if (ctx->vm_bos[i]) ++ drm_free_large(ctx->vm_bos[i]); ++ } ++ kfree(ctx->vm_bos); ++ } ++ ctx->reserved = false; ++ ctx->vm_bos = NULL; ++} ++ ++/* Must be called with mem->data2.lock held and a BO/VM reservation ++ * context. Temporarily drops the lock and reservation for updating ++ * user pointers, to avoid circular lock dependencies between MM locks ++ * and buffer reservations. If user pages are invalidated while the ++ * lock and reservation are dropped, try again. */ ++static int update_user_pages(struct kgd_mem *mem, struct mm_struct *mm, ++ struct bo_vm_reservation_context *ctx) ++{ ++ struct amdgpu_bo *bo; ++ unsigned tries = 10; ++ int ret; ++ ++ bo = mem->data2.bo; ++ if (!amdgpu_ttm_tt_has_userptr(bo->tbo.ttm)) ++ return 0; ++ ++ if (bo->tbo.ttm->state != tt_bound) { ++ struct page **pages; ++ int invalidated; ++ ++ /* get user pages without locking the BO to avoid ++ * circular lock dependency with MMU notifier. 
Retry ++ * until we have the current version. */ ++ ttm_eu_backoff_reservation(&ctx->ticket, &ctx->list); ++ ctx->reserved = false; ++ pages = drm_calloc_large(bo->tbo.ttm->num_pages, ++ sizeof(struct page *)); ++ if (!pages) ++ return -ENOMEM; ++ ++ mutex_unlock(&mem->data2.lock); ++ ++ while (true) { ++ down_read(&mm->mmap_sem); ++ ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, pages); ++ up_read(&mm->mmap_sem); ++ ++ mutex_lock(&mem->data2.lock); ++ if (ret != 0) ++ return ret; ++ ++ BUG_ON(bo != mem->data2.bo); ++ ++ ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list, ++ false, &ctx->duplicates); ++ if (unlikely(ret != 0)) { ++ release_pages(pages, bo->tbo.ttm->num_pages, 0); ++ drm_free_large(pages); ++ return ret; ++ } ++ ctx->reserved = true; ++ if (!amdgpu_ttm_tt_userptr_invalidated(bo->tbo.ttm, ++ &invalidated) || ++ bo->tbo.ttm->state == tt_bound || ++ --tries == 0) ++ break; ++ ++ release_pages(pages, bo->tbo.ttm->num_pages, 0); ++ ttm_eu_backoff_reservation(&ctx->ticket, &ctx->list); ++ ctx->reserved = false; ++ mutex_unlock(&mem->data2.lock); ++ } ++ ++ /* If someone else already bound it, release our pages ++ * array, otherwise copy it into the ttm BO. */ ++ if (bo->tbo.ttm->state == tt_bound || tries == 0) ++ release_pages(pages, bo->tbo.ttm->num_pages, 0); ++ else ++ memcpy(bo->tbo.ttm->pages, pages, ++ sizeof(struct page *) * bo->tbo.ttm->num_pages); ++ drm_free_large(pages); ++ } ++ ++ if (tries == 0) { ++ pr_err("Gave up trying to update user pages\n"); ++ return -EDEADLK; ++ } ++ ++ return 0; ++} ++ ++static int map_bo_to_gpuvm(struct amdgpu_device *adev, struct amdgpu_bo *bo, ++ struct amdgpu_bo_va *bo_va) ++{ ++ struct amdgpu_vm_id *vm_id; ++ struct amdgpu_vm *vm; ++ int ret; ++ ++ /* Pin PTs */ ++ ret = try_pin_pts(bo_va, false); ++ if (ret != 0) { ++ pr_err("amdkfd: Failed to pin PTs\n"); ++ goto err_failed_to_pin_pts; ++ } ++ ++ /* Pin the page directory */ ++ vm = bo_va->vm; ++ vm_id = &vm->ids[7]; ++ ret = try_pin_bo(vm->page_directory, &vm_id->pd_gpu_addr, false, ++ AMDGPU_GEM_DOMAIN_VRAM); ++ if (ret != 0) { ++ pr_err("amdkfd: Failed to pin PD\n"); ++ goto err_failed_to_pin_pd; ++ } ++ ++ mutex_lock(&vm->mutex); ++ ++ /* Update the page directory */ ++ ret = amdgpu_vm_update_page_directory(adev, vm); ++ if (ret != 0) { ++ pr_err("amdkfd: Failed to amdgpu_vm_update_page_directory\n"); ++ goto err_failed_to_update_pd; ++ } ++ ++ /* ++ * The previously "released" BOs are really released and their VAs are ++ * removed from PT. 
This function is called here because it requires ++ * the amdgpu_vm::mutex to be locked and PT to be reserved ++ */ ++ ret = amdgpu_vm_clear_freed(adev, vm); ++ if (ret != 0) { ++ pr_err("amdkfd: Failed to amdgpu_vm_clear_freed\n"); ++ goto err_failed_vm_clear_freed; ++ } ++ ++ /* Update the page tables */ ++ ret = amdgpu_vm_bo_update(adev, bo_va, &bo->tbo.mem); ++ if (ret != 0) { ++ pr_err("amdkfd: Failed to amdgpu_vm_bo_update\n"); ++ goto err_failed_to_update_pts; ++ } ++ ++ ret = amdgpu_vm_clear_invalids(adev, vm, NULL); ++ if (ret != 0) { ++ pr_err("amdkfd: Failed to amdgpu_vm_clear_invalids\n"); ++ goto err_failed_to_vm_clear_invalids; ++ } ++ ++ mutex_unlock(&vm->mutex); ++ ++ return 0; ++ ++err_failed_to_vm_clear_invalids: ++ amdgpu_vm_bo_update(adev, bo_va, NULL); ++err_failed_to_update_pts: ++err_failed_vm_clear_freed: ++err_failed_to_update_pd: ++ mutex_unlock(&vm->mutex); ++ unpin_bo(vm->page_directory, false); ++err_failed_to_pin_pd: ++ unpin_pts(bo_va, vm, false); ++err_failed_to_pin_pts: ++ ++ return ret; ++} ++ ++#define BOOL_TO_STR(b) (b == true) ? "true" : "false" ++ ++int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( ++ struct kgd_dev *kgd, uint64_t va, size_t size, ++ void *vm, struct kgd_mem **mem, ++ uint64_t *offset, void **kptr, ++ struct kfd_process_device *pdd, uint32_t flags) ++{ ++ bool aql_queue, public, readonly, execute, no_sub, userptr; ++ u64 alloc_flag; ++ uint32_t domain; ++ uint64_t *temp_offset; ++ ++ if (!(flags & ALLOC_MEM_FLAGS_NONPAGED)) { ++ pr_err("amdgpu: current hw doesn't support paged memory\n"); ++ return -EINVAL; ++ } ++ ++ domain = 0; ++ alloc_flag = 0; ++ temp_offset = NULL; ++ ++ aql_queue = (flags & ALLOC_MEM_FLAGS_AQL_QUEUE_MEM) ? true : false; ++ public = (flags & ALLOC_MEM_FLAGS_PUBLIC) ? true : false; ++ readonly = (flags & ALLOC_MEM_FLAGS_READONLY) ? true : false; ++ execute = (flags & ALLOC_MEM_FLAGS_EXECUTE_ACCESS) ? true : false; ++ no_sub = (flags & ALLOC_MEM_FLAGS_NO_SUBSTITUTE) ? true : false; ++ userptr = (flags & ALLOC_MEM_FLAGS_USERPTR) ? true : false; ++ ++ if (userptr && kptr) { ++ pr_err("amdgpu: userptr can't be mapped to kernel\n"); ++ return -EINVAL; ++ } ++ ++ /* ++ * Check on which domain to allocate BO ++ */ ++ if (offset && !userptr) ++ *offset = 0; ++ if (flags & ALLOC_MEM_FLAGS_VRAM) { ++ domain = AMDGPU_GEM_DOMAIN_VRAM; ++ alloc_flag = AMDGPU_GEM_CREATE_NO_CPU_ACCESS; ++ if (public) { ++ alloc_flag = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; ++ temp_offset = offset; ++ } ++ } else if (flags & (ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_USERPTR)) { ++ domain = AMDGPU_GEM_DOMAIN_GTT; ++ alloc_flag = 0; ++ temp_offset = offset; ++ } ++ ++ pr_debug("amdgpu: allocating BO domain %d alloc_flag 0x%llx public %s readonly %s execute %s no substitute %s va 0x%llx\n", ++ domain, ++ alloc_flag, ++ BOOL_TO_STR(public), ++ BOOL_TO_STR(readonly), ++ BOOL_TO_STR(execute), ++ BOOL_TO_STR(no_sub), ++ va); ++ ++ return __alloc_memory_of_gpu(kgd, va, size, vm, mem, ++ temp_offset, kptr, pdd, domain, ++ alloc_flag, ++ aql_queue, readonly, execute, ++ no_sub, userptr); ++} ++ ++int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( ++ struct kgd_dev *kgd, struct kgd_mem *mem) ++{ ++ struct amdgpu_device *adev; ++ struct kfd_bo_va_list *entry, *tmp; ++ ++ BUG_ON(kgd == NULL); ++ BUG_ON(mem == NULL); ++ ++ adev = get_amdgpu_device(kgd); ++ ++ mutex_lock(&mem->data2.lock); ++ ++ if (mem->data2.mapped_to_gpu_memory > 0) { ++ pr_err("BO with size %lu bytes is mapped to GPU. 
Need to unmap it before releasing va 0x%llx\n", ++ mem->data2.bo->tbo.mem.size, mem->data2.va); ++ mutex_unlock(&mem->data2.lock); ++ return -EBUSY; ++ } ++ ++ mutex_unlock(&mem->data2.lock); ++ /* lock is not needed after this, since mem is unused and will ++ * be freed anyway */ ++ ++ amdgpu_mn_unregister(mem->data2.bo); ++ if (mem->data2.work.work.func) ++ cancel_delayed_work_sync(&mem->data2.work); ++ ++ /* Remove from VM internal data structures */ ++ list_for_each_entry_safe(entry, tmp, &mem->data2.bo_va_list, bo_list) { ++ pr_debug("Releasing BO with VA %p, size %lu bytes\n", ++ entry->bo_va, ++ mem->data2.bo->tbo.mem.size); ++ if (entry->bo_va->vm != NULL) ++ remove_bo_from_vm( ++ (struct amdgpu_device *)entry->kgd_dev, ++ mem->data2.bo, entry->bo_va); ++ list_del(&entry->bo_list); ++ kfree(entry); ++ } ++ ++ /* Free the BO */ ++ amdgpu_bo_unref(&mem->data2.bo); ++ kfree(mem); ++ ++ return 0; ++} ++int amdgpu_amdkfd_gpuvm_return_bo_size(struct kgd_dev *kgd, struct kgd_mem *mem) ++{ ++ struct amdgpu_bo *bo; ++ ++ BUG_ON(mem == NULL); ++ ++ bo = mem->data2.bo; ++ return bo->tbo.mem.size; ++ ++} ++int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( ++ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) ++{ ++ struct amdgpu_device *adev; ++ int ret; ++ struct amdgpu_bo *bo; ++ uint32_t domain; ++ struct kfd_bo_va_list *entry; ++ struct bo_vm_reservation_context ctx; ++ ++ BUG_ON(kgd == NULL); ++ BUG_ON(mem == NULL); ++ ++ adev = get_amdgpu_device(kgd); ++ ++ mutex_lock(&mem->data2.lock); ++ ++ bo = mem->data2.bo; ++ ++ BUG_ON(bo == NULL); ++ ++ domain = mem->data2.domain; ++ ++ pr_debug("amdgpu: try to map VA 0x%llx domain %d\n", ++ mem->data2.va, domain); ++ ++ if (check_if_add_bo_to_vm((struct amdgpu_vm *)vm, ++ &mem->data2.bo_va_list)) { ++ pr_debug("amdkfd: add new BO_VA to list 0x%llx\n", ++ mem->data2.va); ++ ret = add_bo_to_vm(adev, mem->data2.va, (struct amdgpu_vm *)vm, ++ bo, &mem->data2.bo_va_list, ++ mem->data2.readonly, mem->data2.execute); ++ if (ret != 0) ++ goto add_bo_to_vm_failed; ++ if (mem->data2.aql_queue) { ++ ret = add_bo_to_vm(adev, ++ mem->data2.va + bo->tbo.mem.size, ++ (struct amdgpu_vm *)vm, ++ bo, &mem->data2.bo_va_list, ++ mem->data2.readonly, ++ mem->data2.execute); ++ if (ret != 0) ++ goto add_bo_to_vm_failed; ++ } ++ } ++ ++ if (!mem->data2.evicted) { ++ ret = reserve_bo_and_vms(adev, bo, &mem->data2.bo_va_list, ++ vm, false, &ctx); ++ if (unlikely(ret != 0)) ++ goto bo_reserve_failed; ++ ++ ret = update_user_pages(mem, current->mm, &ctx); ++ if (ret != 0) ++ goto update_user_pages_failed; ++ } ++ ++ list_for_each_entry(entry, &mem->data2.bo_va_list, bo_list) { ++ if (entry->bo_va->vm == vm && entry->is_mapped == false) { ++ if (mem->data2.evicted) { ++ /* If the BO is evicted, just mark the ++ * mapping as mapped and stop the GPU's ++ * queues until the BO is restored. 
*/ ++ ret = kgd2kfd->quiesce_mm(adev->kfd, ++ current->mm); ++ if (ret != 0) ++ goto quiesce_failed; ++ entry->is_mapped = true; ++ mem->data2.mapped_to_gpu_memory++; ++ continue; ++ } ++ ++ pr_debug("amdkfd: Trying to map VA 0x%llx to vm %p\n", ++ mem->data2.va, vm); ++ /* ++ * We need to pin the allocated BO, PD and appropriate PTs and to ++ * create a mapping of virtual to MC address ++ */ ++ /* Pin BO */ ++ ret = try_pin_bo(bo, NULL, false, domain); ++ if (ret != 0) { ++ pr_err("amdkfd: Failed to pin BO\n"); ++ goto pin_bo_failed; ++ } ++ ++ ret = map_bo_to_gpuvm(adev, bo, entry->bo_va); ++ if (ret != 0) { ++ pr_err("amdkfd: Failed to map bo to gpuvm\n"); ++ goto map_bo_to_gpuvm_failed; ++ } ++ entry->is_mapped = true; ++ mem->data2.mapped_to_gpu_memory++; ++ pr_debug("amdgpu: INC mapping count %d\n", ++ mem->data2.mapped_to_gpu_memory); ++ } ++ } ++ ++ if (!mem->data2.evicted) ++ unreserve_bo_and_vms(&ctx, true); ++ mutex_unlock(&mem->data2.lock); ++ return 0; ++ ++map_bo_to_gpuvm_failed: ++ unpin_bo(bo, false); ++pin_bo_failed: ++quiesce_failed: ++update_user_pages_failed: ++ if (!mem->data2.evicted) ++ unreserve_bo_and_vms(&ctx, false); ++bo_reserve_failed: ++add_bo_to_vm_failed: ++ mutex_unlock(&mem->data2.lock); ++ return ret; ++} ++ ++int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm) ++{ ++ int ret; ++ struct amdgpu_vm *new_vm; ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ ++ BUG_ON(kgd == NULL); ++ BUG_ON(vm == NULL); ++ ++ new_vm = kzalloc(sizeof(struct amdgpu_vm), GFP_KERNEL); ++ if (new_vm == NULL) ++ return -ENOMEM; ++ ++ /* Initialize the VM context, allocate the page directory and zero it */ ++ ret = amdgpu_vm_init(adev, new_vm); ++ if (ret != 0) { ++ pr_err("amdgpu: failed to init vm, ret %d\n", ret); ++ /* Undo everything related to the new VM context */ ++ amdgpu_vm_fini(adev, new_vm); ++ kfree(new_vm); ++ return ret; ++ } ++ ++ *vm = (void *) new_vm; ++ ++ /* ++ * The previously "released" BOs are really released and their VAs are ++ * removed from PT. 
This function is called here because it requires ++ * the amdgpu_vm::mutex to be locked and PT to be reserved ++ */ ++ ret = amdgpu_vm_clear_freed(adev, new_vm); ++ if (ret != 0) ++ pr_err("amdgpu: Failed to amdgpu_vm_clear_freed\n"); ++ ++ pr_debug("amdgpu: created process vm with address 0x%llx\n", ++ new_vm->ids[7].pd_gpu_addr); ++ ++ return ret; ++} ++ ++void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm) ++{ ++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; ++ struct amdgpu_vm *avm = (struct amdgpu_vm *) vm; ++ ++ BUG_ON(kgd == NULL); ++ BUG_ON(vm == NULL); ++ ++ pr_debug("Destroying process vm with address %p\n", vm); ++ ++ /* Release the VM context */ ++ amdgpu_vm_fini(adev, avm); ++ kfree(vm); ++} ++ ++uint32_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm) ++{ ++ struct amdgpu_vm *avm = (struct amdgpu_vm *) vm; ++ struct amdgpu_vm_id *vm_id; ++ ++ BUG_ON(avm == NULL); ++ ++ vm_id = &avm->ids[7]; ++ return vm_id->pd_gpu_addr >> AMDGPU_GPU_PAGE_SHIFT; ++} ++ ++int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd, ++ struct kfd_vm_fault_info *mem) ++{ ++ struct amdgpu_device *adev; ++ ++ BUG_ON(kgd == NULL); ++ adev = (struct amdgpu_device *) kgd; ++ if (atomic_read(&adev->mc.vm_fault_info_updated) == 1) { ++ *mem = *adev->mc.vm_fault_info; ++ mb(); ++ atomic_set(&adev->mc.vm_fault_info_updated, 0); ++ } ++ return 0; ++} ++ ++static int unmap_bo_from_gpuvm(struct amdgpu_device *adev, ++ struct amdgpu_bo_va *bo_va) ++{ ++ struct amdgpu_vm *vm; ++ int ret; ++ struct ttm_validate_buffer tv; ++ struct amdgpu_bo_list_entry *vm_bos; ++ struct ww_acquire_ctx ticket; ++ struct list_head list, duplicates; ++ ++ INIT_LIST_HEAD(&list); ++ INIT_LIST_HEAD(&duplicates); ++ ++ vm = bo_va->vm; ++ tv.bo = &bo_va->bo->tbo; ++ tv.shared = true; ++ list_add(&tv.head, &list); ++ ++ vm_bos = amdgpu_vm_get_bos(adev, vm, &list); ++ if (!vm_bos) { ++ pr_err("amdkfd: Failed to get bos from vm\n"); ++ ret = -ENOMEM; ++ goto err_failed_to_get_bos; ++ } ++ ++ ret = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates); ++ if (ret) { ++ pr_err("amdkfd: Failed to reserve buffers in ttm\n"); ++ goto err_failed_to_ttm_reserve; ++ } ++ ++ mutex_lock(&vm->mutex); ++ ++ /* ++ * The previously "released" BOs are really released and their VAs are ++ * removed from PT. 
This function is called here because it requires ++ * the amdgpu_vm::mutex to be locked and PT to be reserved ++ */ ++ amdgpu_vm_clear_freed(adev, vm); ++ ++ /* Update the page tables - Remove the mapping from bo_va */ ++ amdgpu_vm_bo_update(adev, bo_va, NULL); ++ ++ amdgpu_vm_clear_invalids(adev, vm, NULL); ++ ++ mutex_unlock(&vm->mutex); ++ ++ ttm_eu_backoff_reservation(&ticket, &list); ++ drm_free_large(vm_bos); ++ ++ return 0; ++err_failed_to_ttm_reserve: ++ drm_free_large(vm_bos); ++err_failed_to_get_bos: ++ return ret; ++} ++ ++static bool is_mem_on_local_device(struct kgd_dev *kgd, ++ struct list_head *bo_va_list, void *vm) ++{ ++ struct kfd_bo_va_list *entry; ++ ++ list_for_each_entry(entry, bo_va_list, bo_list) { ++ if (entry->kgd_dev == kgd && entry->bo_va->vm == vm) ++ return true; ++ } ++ ++ return false; ++} ++ ++int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( ++ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) ++{ ++ struct kfd_bo_va_list *entry; ++ struct amdgpu_device *adev; ++ unsigned mapped_before; ++ int ret = 0; ++ ++ BUG_ON(kgd == NULL); ++ BUG_ON(mem == NULL); ++ ++ adev = (struct amdgpu_device *) kgd; ++ ++ mutex_lock(&mem->data2.lock); ++ ++ /* ++ * Make sure that this BO is mapped on KGD before unmapping it ++ */ ++ if (!is_mem_on_local_device(kgd, &mem->data2.bo_va_list, vm)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (mem->data2.mapped_to_gpu_memory == 0) { ++ pr_debug("BO size %lu bytes at va 0x%llx is not mapped\n", ++ mem->data2.bo->tbo.mem.size, mem->data2.va); ++ ret = -EINVAL; ++ goto out; ++ } ++ mapped_before = mem->data2.mapped_to_gpu_memory; ++ ++ list_for_each_entry(entry, &mem->data2.bo_va_list, bo_list) { ++ if (entry->kgd_dev == kgd && ++ entry->bo_va->vm == vm && ++ entry->is_mapped) { ++ if (mem->data2.evicted) { ++ /* If the BO is evicted, just mark the ++ * mapping as unmapped and allow the ++ * GPU's queues to resume. 
*/ ++ ret = kgd2kfd->resume_mm(adev->kfd, ++ current->mm); ++ if (ret != 0) ++ goto out; ++ entry->is_mapped = false; ++ mem->data2.mapped_to_gpu_memory--; ++ continue; ++ } ++ ++ pr_debug("unmapping BO with VA 0x%llx, size %lu bytes from GPU memory\n", ++ mem->data2.va, ++ mem->data2.bo->tbo.mem.size); ++ /* Unpin the PD directory*/ ++ unpin_bo(entry->bo_va->vm->page_directory, true); ++ /* Unpin PTs */ ++ unpin_pts(entry->bo_va, entry->bo_va->vm, true); ++ ++ /* Unpin BO*/ ++ unpin_bo(mem->data2.bo, true); ++ ret = unmap_bo_from_gpuvm(adev, entry->bo_va); ++ if (ret == 0) { ++ entry->is_mapped = false; ++ } else { ++ pr_err("amdgpu: failed unmap va 0x%llx\n", ++ mem->data2.va); ++ goto out; ++ } ++ mem->data2.mapped_to_gpu_memory--; ++ pr_debug("amdgpu: DEC mapping count %d\n", ++ mem->data2.mapped_to_gpu_memory); ++ } ++ } ++ if (mapped_before == mem->data2.mapped_to_gpu_memory) { ++ pr_debug("BO size %lu bytes at va 0x%llx is not mapped on GPU %x:%x.%x\n", ++ mem->data2.bo->tbo.mem.size, mem->data2.va, ++ adev->pdev->bus->number, PCI_SLOT(adev->pdev->devfn), ++ PCI_FUNC(adev->pdev->devfn)); ++ ret = -EINVAL; ++ } ++ ++out: ++ mutex_unlock(&mem->data2.lock); ++ return ret; ++} ++ ++int amdgpu_amdkfd_gpuvm_mmap_bo(struct kgd_dev *kgd, struct vm_area_struct *vma) ++{ ++ struct amdgpu_device *adev; ++ ++ adev = get_amdgpu_device(kgd); ++ BUG_ON(!adev); ++ ++ return amdgpu_bo_mmap(NULL, vma, &adev->mman.bdev); ++} ++ ++int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd, ++ struct kgd_mem *mem, void **kptr) ++{ ++ int ret; ++ struct amdgpu_device *adev; ++ struct amdgpu_bo *bo; ++ ++ adev = get_amdgpu_device(kgd); ++ ++ mutex_lock(&mem->data2.lock); ++ ++ bo = mem->data2.bo; ++ /* map the buffer */ ++ ret = amdgpu_bo_reserve(bo, true); ++ if (ret) { ++ dev_err(adev->dev, "(%d) failed to reserve bo for amdkfd\n", ret); ++ mutex_unlock(&mem->data2.lock); ++ return ret; ++ } ++ ++ ret = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT, ++ NULL); ++ if (ret) { ++ dev_err(adev->dev, "(%d) failed to pin bo for amdkfd\n", ret); ++ amdgpu_bo_unreserve(bo); ++ mutex_unlock(&mem->data2.lock); ++ return ret; ++ } ++ ++ ret = amdgpu_bo_kmap(bo, kptr); ++ if (ret) { ++ dev_err(adev->dev, ++ "(%d) failed to map bo to kernel for amdkfd\n", ret); ++ amdgpu_bo_unpin(bo); ++ amdgpu_bo_unreserve(bo); ++ mutex_unlock(&mem->data2.lock); ++ return ret; ++ } ++ ++ mem->data2.kptr = *kptr; ++ ++ amdgpu_bo_unreserve(bo); ++ mutex_unlock(&mem->data2.lock); ++ ++ return 0; ++} ++ ++static int pin_bo_wo_map(struct kgd_mem *mem) ++{ ++ struct amdgpu_bo *bo = mem->data2.bo; ++ int ret = 0; ++ ++ ret = amdgpu_bo_reserve(bo, false); ++ if (unlikely(ret != 0)) ++ return ret; ++ ++ ret = amdgpu_bo_pin(bo, mem->data2.domain, NULL); ++ amdgpu_bo_unreserve(bo); ++ ++ return ret; ++} ++ ++static void unpin_bo_wo_map(struct kgd_mem *mem) ++{ ++ struct amdgpu_bo *bo = mem->data2.bo; ++ int ret = 0; ++ ++ ret = amdgpu_bo_reserve(bo, false); ++ if (unlikely(ret != 0)) ++ return; ++ ++ amdgpu_bo_unpin(bo); ++ amdgpu_bo_unreserve(bo); ++} ++ ++#define AMD_GPU_PAGE_SHIFT PAGE_SHIFT ++#define AMD_GPU_PAGE_SIZE (_AC(1, UL) << AMD_GPU_PAGE_SHIFT) ++ ++static int get_sg_table(struct amdgpu_device *adev, ++ struct kgd_mem *mem, uint64_t offset, ++ uint64_t size, struct sg_table **ret_sg) ++{ ++ struct amdgpu_bo *bo = mem->data2.bo; ++ struct sg_table *sg = NULL; ++ unsigned long bus_addr; ++ unsigned int chunks; ++ unsigned int i; ++ struct scatterlist *s; ++ uint64_t offset_in_page; ++ unsigned int page_size; ++ int ret; ++ ++ sg = 
kmalloc(sizeof(struct sg_table), GFP_KERNEL); ++ if (!sg) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ if (bo->initial_domain == AMDGPU_GEM_DOMAIN_VRAM) ++ page_size = AMD_GPU_PAGE_SIZE; ++ else ++ page_size = PAGE_SIZE; ++ ++ ++ offset_in_page = offset & (page_size - 1); ++ chunks = (size + offset_in_page + page_size - 1) ++ / page_size; ++ ++ ret = sg_alloc_table(sg, chunks, GFP_KERNEL); ++ if (unlikely(ret)) ++ goto out; ++ ++ if (bo->initial_domain == AMDGPU_GEM_DOMAIN_VRAM) { ++ bus_addr = bo->tbo.offset + adev->mc.aper_base + offset; ++ ++ for_each_sg(sg->sgl, s, sg->orig_nents, i) { ++ uint64_t chunk_size, length; ++ ++ chunk_size = page_size - offset_in_page; ++ length = min(size, chunk_size); ++ ++ sg_set_page(s, NULL, length, offset_in_page); ++ s->dma_address = bus_addr; ++ s->dma_length = length; ++ ++ size -= length; ++ offset_in_page = 0; ++ bus_addr += length; ++ } ++ } else { ++ struct page **pages; ++ unsigned int cur_page; ++ ++ pages = bo->tbo.ttm->pages; ++ ++ cur_page = offset / page_size; ++ for_each_sg(sg->sgl, s, sg->orig_nents, i) { ++ uint64_t chunk_size, length; ++ ++ chunk_size = page_size - offset_in_page; ++ length = min(size, chunk_size); ++ ++ sg_set_page(s, pages[cur_page], length, offset_in_page); ++ s->dma_address = page_to_phys(pages[cur_page]); ++ s->dma_length = length; ++ ++ size -= length; ++ offset_in_page = 0; ++ cur_page++; ++ } ++ } ++ ++ *ret_sg = sg; ++ return 0; ++out: ++ kfree(sg); ++ *ret_sg = NULL; ++ return ret; ++} ++ ++int amdgpu_amdkfd_gpuvm_pin_get_sg_table(struct kgd_dev *kgd, ++ struct kgd_mem *mem, uint64_t offset, ++ uint64_t size, struct sg_table **ret_sg) ++{ ++ int ret; ++ struct amdgpu_device *adev; ++ ++ ret = pin_bo_wo_map(mem); ++ if (unlikely(ret != 0)) ++ return ret; ++ ++ adev = get_amdgpu_device(kgd); ++ ++ ret = get_sg_table(adev, mem, offset, size, ret_sg); ++ if (ret) ++ unpin_bo_wo_map(mem); ++ ++ return ret; ++} ++ ++void amdgpu_amdkfd_gpuvm_unpin_put_sg_table( ++ struct kgd_mem *mem, struct sg_table *sg) ++{ ++ sg_free_table(sg); ++ kfree(sg); ++ ++ unpin_bo_wo_map(mem); ++} ++ ++int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd, int dma_buf_fd, ++ uint64_t va, void *vm, ++ struct kgd_mem **mem, uint64_t *size) ++{ ++ struct amdgpu_device *adev = (struct amdgpu_device *)kgd; ++ struct dma_buf *dma_buf; ++ struct drm_gem_object *obj; ++ struct amdgpu_bo *bo; ++ int r = -EINVAL; ++ ++ dma_buf = dma_buf_get(dma_buf_fd); ++ if (IS_ERR(dma_buf)) ++ return PTR_ERR(dma_buf); ++ ++ if (dma_buf->ops != &drm_gem_prime_dmabuf_ops) ++ /* Can't handle non-graphics buffers */ ++ goto out_put; ++ ++ obj = dma_buf->priv; ++ if (obj->dev->dev_private != adev) ++ /* Can't handle buffers from other devices */ ++ goto out_put; ++ ++ bo = gem_to_amdgpu_bo(obj); ++ if (!(bo->initial_domain & (AMDGPU_GEM_DOMAIN_VRAM | ++ AMDGPU_GEM_DOMAIN_GTT))) ++ /* Only VRAM and GTT BOs are supported */ ++ goto out_put; ++ ++ if (size) ++ *size = amdgpu_bo_size(bo); ++ ++ *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); ++ if (*mem == NULL) { ++ r = -ENOMEM; ++ goto out_put; ++ } ++ ++ INIT_LIST_HEAD(&(*mem)->data2.bo_va_list); ++ mutex_init(&(*mem)->data2.lock); ++ (*mem)->data2.execute = true; /* executable by default */ ++ ++ (*mem)->data2.bo = amdgpu_bo_ref(bo); ++ (*mem)->data2.va = va; ++ (*mem)->data2.domain = (bo->initial_domain & AMDGPU_GEM_DOMAIN_VRAM) ? 
++ AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT; ++ (*mem)->data2.mapped_to_gpu_memory = 0; ++ ++ r = add_bo_to_vm(adev, va, vm, bo, &(*mem)->data2.bo_va_list, ++ false, true); ++ ++ if (r) { ++ amdgpu_bo_unref(&bo); ++ kfree(*mem); ++ *mem = NULL; ++ } ++ ++out_put: ++ dma_buf_put(dma_buf); ++ return r; ++} ++ ++/* Runs out of process context. mem->data2.lock must be held. */ ++int amdgpu_amdkfd_gpuvm_evict_mem(struct kgd_mem *mem, struct mm_struct *mm) ++{ ++ struct kfd_bo_va_list *entry; ++ unsigned n_evicted; ++ int r = 0; ++ ++ pr_debug("Evicting buffer %p\n", mem); ++ ++ if (mem->data2.mapped_to_gpu_memory == 0) ++ return 0; ++ ++ /* Remove all GPU mappings of the buffer, but don't change any ++ * of the is_mapped flags so we can restore it later. The ++ * queues of the affected GPUs are quiesced first. Count the ++ * number of evicted mappings so we can roll back if something ++ * goes wrong. */ ++ n_evicted = 0; ++ list_for_each_entry(entry, &mem->data2.bo_va_list, bo_list) { ++ struct amdgpu_device *adev; ++ ++ if (!entry->is_mapped) ++ continue; ++ ++ adev = (struct amdgpu_device *)entry->kgd_dev; ++ ++ r = kgd2kfd->quiesce_mm(adev->kfd, mm); ++ if (r != 0) { ++ pr_err("failed to quiesce KFD\n"); ++ goto fail; ++ } ++ ++ r = unmap_bo_from_gpuvm(adev, entry->bo_va); ++ if (r != 0) { ++ pr_err("failed unmap va 0x%llx\n", ++ mem->data2.va); ++ kgd2kfd->resume_mm(adev->kfd, mm); ++ goto fail; ++ } ++ ++ /* Unpin the PD directory*/ ++ unpin_bo(entry->bo_va->vm->page_directory, true); ++ /* Unpin PTs */ ++ unpin_pts(entry->bo_va, entry->bo_va->vm, true); ++ ++ /* Unpin BO*/ ++ unpin_bo(mem->data2.bo, true); ++ ++ n_evicted++; ++ } ++ ++ return 0; ++ ++fail: ++ /* To avoid hangs and keep state consistent, roll back partial ++ * eviction by restoring queues and marking mappings as ++ * unmapped. Access to now unmapped buffers will fault. */ ++ list_for_each_entry(entry, &mem->data2.bo_va_list, bo_list) { ++ struct amdgpu_device *adev; ++ ++ if (n_evicted == 0) ++ break; ++ if (!entry->is_mapped) ++ continue; ++ ++ entry->is_mapped = false; ++ ++ adev = (struct amdgpu_device *)entry->kgd_dev; ++ if (kgd2kfd->resume_mm(adev->kfd, mm)) ++ pr_err("Failed to resume KFD\n"); ++ ++ n_evicted--; ++ } ++ ++ return r; ++} ++ ++/* Runs out of process context. mem->data2.lock must be held. */ ++int amdgpu_amdkfd_gpuvm_restore_mem(struct kgd_mem *mem, struct mm_struct *mm) ++{ ++ struct bo_vm_reservation_context ctx; ++ struct kfd_bo_va_list *entry; ++ uint32_t domain; ++ int r, ret = 0; ++ bool have_pages = false; ++ ++ pr_debug("Restoring buffer %p\n", mem); ++ ++ if (mem->data2.mapped_to_gpu_memory == 0) ++ return 0; ++ ++ domain = mem->data2.domain; ++ ++ ret = reserve_bo_and_vms(mem->data2.bo->adev, mem->data2.bo, ++ &mem->data2.bo_va_list, NULL, true, &ctx); ++ if (likely(ret == 0)) { ++ ret = update_user_pages(mem, mm, &ctx); ++ have_pages = !ret; ++ if (!have_pages) ++ unreserve_bo_and_vms(&ctx, false); ++ } ++ ++ /* update_user_pages drops the lock briefly. Check if someone ++ * else evicted or restored the buffer in the mean time */ ++ if (mem->data2.evicted != 1) { ++ unreserve_bo_and_vms(&ctx, false); ++ return 0; ++ } ++ ++ /* Try to restore all mappings. Mappings that fail to restore ++ * will be marked as unmapped. If we failed to get the user ++ * pages, all mappings will be marked as unmapped. 
*/ ++ list_for_each_entry(entry, &mem->data2.bo_va_list, bo_list) { ++ struct amdgpu_device *adev; ++ ++ if (!entry->is_mapped) ++ continue; ++ ++ adev = (struct amdgpu_device *)entry->kgd_dev; ++ ++ if (unlikely(!have_pages)) { ++ entry->is_mapped = false; ++ goto resume_kfd; ++ } ++ ++ r = try_pin_bo(mem->data2.bo, NULL, false, domain); ++ if (unlikely(r != 0)) { ++ pr_err("Failed to pin BO\n"); ++ entry->is_mapped = false; ++ if (ret == 0) ++ ret = r; ++ goto resume_kfd; ++ } ++ ++ r = map_bo_to_gpuvm(adev, mem->data2.bo, entry->bo_va); ++ if (unlikely(r != 0)) { ++ pr_err("Failed to map BO to gpuvm\n"); ++ entry->is_mapped = false; ++ unpin_bo(mem->data2.bo, true); ++ if (ret == 0) ++ ret = r; ++ } ++ ++ /* Resume queues even if restore failed. Worst case ++ * the app will get a GPUVM fault. That's better than ++ * hanging the queues indefinitely. */ ++resume_kfd: ++ r = kgd2kfd->resume_mm(adev->kfd, mm); ++ if (r != 0) { ++ pr_err("Failed to resume KFD\n"); ++ if (ret == 0) ++ ret = r; ++ } ++ } ++ ++ if (have_pages) ++ unreserve_bo_and_vms(&ctx, true); ++ ++ return ret; ++} +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +index 06b824c..5ce6528 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +@@ -381,7 +381,7 @@ void amdgpu_ring_lru_touch(struct amdgpu_device *adev, struct amdgpu_ring *ring) + static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf, + size_t size, loff_t *pos) + { +- struct amdgpu_ring *ring = (struct amdgpu_ring*)kcl_file_private(f); ++ struct amdgpu_ring *ring = file_inode(f)->i_private; + int r, i; + uint32_t value, result, early[3]; + +diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig +index e13c67c..ac49532 100644 +--- a/drivers/gpu/drm/amd/amdkfd/Kconfig ++++ b/drivers/gpu/drm/amd/amdkfd/Kconfig +@@ -5,5 +5,6 @@ + config HSA_AMD + tristate "HSA kernel driver for AMD GPU devices" + depends on (DRM_RADEON || DRM_AMDGPU) && AMD_IOMMU_V2 && X86_64 ++ select DRM_AMDGPU_USERPTR + help + Enable this if you want to use HSA features on AMD GPU devices. 
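The Kconfig change above makes the KFD build select amdgpu's userptr support, which the new ALLOC_MEM_FLAGS_USERPTR path in __alloc_memory_of_gpu() depends on. As a reading aid, here is a minimal plain-C sketch of the pin-count scheme that try_pin_bo()/unpin_bo() earlier in this patch apply to userptr BOs: amdgpu_bo_pin() cannot handle userptr pages, so bo->pin_count is reused as a private counter, validating the BO into GTT on the first pin and back to the CPU domain on the last unpin. Everything below (model_bo, validate(), the domain enum) is an illustrative stand-in, not an amdgpu API, and is not part of the patch.

/* Simplified model of the userptr branch of try_pin_bo()/unpin_bo().
 * All names here are hypothetical; validate() stands in for
 * ttm_bo_validate() after amdgpu_ttm_placement_from_domain(). */
#include <stdio.h>

enum domain { DOMAIN_CPU, DOMAIN_GTT };

struct model_bo {
	int pin_count;           /* reused as the private userptr pin counter */
	enum domain placement;
};

static int validate(struct model_bo *bo, enum domain d)
{
	bo->placement = d;       /* pretend the migration always succeeds */
	return 0;
}

static int userptr_pin(struct model_bo *bo)
{
	int ret = 0;

	if (bo->pin_count == 0)  /* only the first pin migrates to GTT */
		ret = validate(bo, DOMAIN_GTT);
	if (ret == 0)
		bo->pin_count++;
	return ret;
}

static int userptr_unpin(struct model_bo *bo)
{
	if (--bo->pin_count == 0) /* last unpin releases back to CPU */
		return validate(bo, DOMAIN_CPU);
	return 0;
}

int main(void)
{
	struct model_bo bo = { 0, DOMAIN_CPU };

	userptr_pin(&bo);        /* placement -> GTT */
	userptr_pin(&bo);        /* nested pin: counter only */
	userptr_unpin(&bo);
	userptr_unpin(&bo);      /* placement -> CPU */
	printf("pin_count=%d placement=%d\n", bo.pin_count, (int)bo.placement);
	return 0;
}

The point of the counter is the same as in the patch: repeated map calls from multiple VMs must not re-validate an already-pinned userptr BO, and only the final unmap may release its pages.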
+diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile +index b400d56..60c60c0 100644 +--- a/drivers/gpu/drm/amd/amdkfd/Makefile ++++ b/drivers/gpu/drm/amd/amdkfd/Makefile +@@ -14,6 +14,6 @@ amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \ + kfd_process_queue_manager.o kfd_device_queue_manager.o \ + kfd_device_queue_manager_cik.o kfd_device_queue_manager_vi.o \ + kfd_interrupt.o kfd_events.o cik_event_interrupt.o \ +- kfd_dbgdev.o kfd_dbgmgr.o ++ kfd_dbgdev.o kfd_dbgmgr.o kfd_flat_memory.o kfd_crat.o kfd_rdma.o + + obj-$(CONFIG_HSA_AMD) += amdkfd.o +diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +index 211fc48..02a9082 100644 +--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c ++++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +@@ -24,40 +24,59 @@ + #include "kfd_events.h" + #include "cik_int.h" + +-static bool cik_event_interrupt_isr(struct kfd_dev *dev, ++static bool is_cpc_vm_fault(struct kfd_dev *dev, + const uint32_t *ih_ring_entry) + { +- unsigned int pasid; + const struct cik_ih_ring_entry *ihre = + (const struct cik_ih_ring_entry *)ih_ring_entry; + +- pasid = (ihre->ring_id & 0xffff0000) >> 16; ++ if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || ++ ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) && ++ ihre->vmid >= dev->vm_info.first_vmid_kfd && ++ ihre->vmid <= dev->vm_info.last_vmid_kfd) ++ return true; ++ return false; ++} ++static bool cik_event_interrupt_isr(struct kfd_dev *dev, ++ const uint32_t *ih_ring_entry) ++{ ++ const struct cik_ih_ring_entry *ihre = ++ (const struct cik_ih_ring_entry *)ih_ring_entry; + + /* Do not process in ISR, just request it to be forwarded to WQ. */ +- return (pasid != 0) && ++ return (ihre->pasid != 0) && + (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || + ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG || +- ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE); ++ ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE || ++ is_cpc_vm_fault(dev, ih_ring_entry)); + } + + static void cik_event_interrupt_wq(struct kfd_dev *dev, + const uint32_t *ih_ring_entry) + { +- unsigned int pasid; + const struct cik_ih_ring_entry *ihre = + (const struct cik_ih_ring_entry *)ih_ring_entry; + +- pasid = (ihre->ring_id & 0xffff0000) >> 16; +- +- if (pasid == 0) ++ if (ihre->pasid == 0) + return; + + if (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE) +- kfd_signal_event_interrupt(pasid, 0, 0); ++ kfd_signal_event_interrupt(ihre->pasid, 0, 0); + else if (ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG) +- kfd_signal_event_interrupt(pasid, ihre->data & 0xFF, 8); ++ kfd_signal_event_interrupt(ihre->pasid, ihre->data & 0xFF, 8); + else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE) +- kfd_signal_hw_exception_event(pasid); ++ kfd_signal_hw_exception_event(ihre->pasid); ++ else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || ++ ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { ++ struct kfd_vm_fault_info info; ++ ++ dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info); ++ kfd_process_vm_fault(dev->dqm, ihre->pasid); ++ if (info.vmid == ihre->vmid) ++ kfd_signal_vm_fault_event(dev, ihre->pasid, &info); ++ else ++ kfd_signal_vm_fault_event(dev, ihre->pasid, NULL); ++ } + } + + const struct kfd_event_interrupt_class event_interrupt_class_cik = { +diff --git a/drivers/gpu/drm/amd/amdkfd/cik_int.h b/drivers/gpu/drm/amd/amdkfd/cik_int.h +index 79a16d2..feb3c24 100644 +--- a/drivers/gpu/drm/amd/amdkfd/cik_int.h ++++ b/drivers/gpu/drm/amd/amdkfd/cik_int.h 
+@@ -26,16 +26,30 @@ + #include <linux/types.h> + + struct cik_ih_ring_entry { +- uint32_t source_id; +- uint32_t data; +- uint32_t ring_id; +- uint32_t reserved; ++ uint32_t source_id:8; ++ uint32_t reserved1:8; ++ uint32_t reserved2:16; ++ ++ uint32_t data:28; ++ uint32_t reserved3:4; ++ ++ /* pipeid, meid and unused3 are officially called RINGID, ++ * but for our purposes, they always decode into pipe and ME. */ ++ uint32_t pipeid:2; ++ uint32_t meid:2; ++ uint32_t reserved4:4; ++ uint32_t vmid:8; ++ uint32_t pasid:16; ++ ++ uint32_t reserved5; + }; + + #define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6 + #define CIK_INTSRC_CP_END_OF_PIPE 0xB5 + #define CIK_INTSRC_CP_BAD_OPCODE 0xB7 + #define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF ++#define CIK_INTSRC_GFX_PAGE_INV_FAULT 0x92 ++#define CIK_INTSRC_GFX_MEM_PROT_FAULT 0x93 + + #endif + +diff --git a/drivers/gpu/drm/amd/amdkfd/cik_regs.h b/drivers/gpu/drm/amd/amdkfd/cik_regs.h +index 48769d1..607fc5c 100644 +--- a/drivers/gpu/drm/amd/amdkfd/cik_regs.h ++++ b/drivers/gpu/drm/amd/amdkfd/cik_regs.h +@@ -23,11 +23,33 @@ + #ifndef CIK_REGS_H + #define CIK_REGS_H + ++#define IH_VMID_0_LUT 0x3D40u ++ ++#define BIF_DOORBELL_CNTL 0x530Cu ++ ++#define SRBM_GFX_CNTL 0xE44 ++#define PIPEID(x) ((x) << 0) ++#define MEID(x) ((x) << 2) ++#define VMID(x) ((x) << 4) ++#define QUEUEID(x) ((x) << 8) ++ ++#define SQ_CONFIG 0x8C00 ++ ++#define SH_MEM_BASES 0x8C28 + /* if PTR32, these are the bases for scratch and lds */ + #define PRIVATE_BASE(x) ((x) << 0) /* scratch */ + #define SHARED_BASE(x) ((x) << 16) /* LDS */ ++#define SH_MEM_APE1_BASE 0x8C2C ++/* if PTR32, this is the base location of GPUVM */ ++#define SH_MEM_APE1_LIMIT 0x8C30 ++/* if PTR32, this is the upper limit of GPUVM */ ++#define SH_MEM_CONFIG 0x8C34 + #define PTR32 (1 << 0) ++#define PRIVATE_ATC (1 << 1) + #define ALIGNMENT_MODE(x) ((x) << 2) ++#define SH_MEM_ALIGNMENT_MODE_DWORD 0 ++#define SH_MEM_ALIGNMENT_MODE_DWORD_STRICT 1 ++#define SH_MEM_ALIGNMENT_MODE_STRICT 2 + #define SH_MEM_ALIGNMENT_MODE_UNALIGNED 3 + #define DEFAULT_MTYPE(x) ((x) << 4) + #define APE1_MTYPE(x) ((x) << 7) +@@ -36,37 +58,164 @@ + #define MTYPE_CACHED 0 + #define MTYPE_NONCACHED 3 + ++ ++#define SH_STATIC_MEM_CONFIG 0x9604u ++ ++#define TC_CFG_L1_LOAD_POLICY0 0xAC68 ++#define TC_CFG_L1_LOAD_POLICY1 0xAC6C ++#define TC_CFG_L1_STORE_POLICY 0xAC70 ++#define TC_CFG_L2_LOAD_POLICY0 0xAC74 ++#define TC_CFG_L2_LOAD_POLICY1 0xAC78 ++#define TC_CFG_L2_STORE_POLICY0 0xAC7C ++#define TC_CFG_L2_STORE_POLICY1 0xAC80 ++#define TC_CFG_L2_ATOMIC_POLICY 0xAC84 ++#define TC_CFG_L1_VOLATILE 0xAC88 ++#define TC_CFG_L2_VOLATILE 0xAC8C ++ ++#define CP_PQ_WPTR_POLL_CNTL 0xC20C ++#define WPTR_POLL_EN (1 << 31) ++ ++#define CPC_INT_CNTL 0xC2D0 ++#define CP_ME1_PIPE0_INT_CNTL 0xC214 ++#define CP_ME1_PIPE1_INT_CNTL 0xC218 ++#define CP_ME1_PIPE2_INT_CNTL 0xC21C ++#define CP_ME1_PIPE3_INT_CNTL 0xC220 ++#define CP_ME2_PIPE0_INT_CNTL 0xC224 ++#define CP_ME2_PIPE1_INT_CNTL 0xC228 ++#define CP_ME2_PIPE2_INT_CNTL 0xC22C ++#define CP_ME2_PIPE3_INT_CNTL 0xC230 ++#define DEQUEUE_REQUEST_INT_ENABLE (1 << 13) ++#define WRM_POLL_TIMEOUT_INT_ENABLE (1 << 17) ++#define PRIV_REG_INT_ENABLE (1 << 23) ++#define TIME_STAMP_INT_ENABLE (1 << 26) ++#define GENERIC2_INT_ENABLE (1 << 29) ++#define GENERIC1_INT_ENABLE (1 << 30) ++#define GENERIC0_INT_ENABLE (1 << 31) ++#define CP_ME1_PIPE0_INT_STATUS 0xC214 ++#define CP_ME1_PIPE1_INT_STATUS 0xC218 ++#define CP_ME1_PIPE2_INT_STATUS 0xC21C ++#define CP_ME1_PIPE3_INT_STATUS 0xC220 ++#define CP_ME2_PIPE0_INT_STATUS 0xC224 ++#define 
CP_ME2_PIPE1_INT_STATUS 0xC228 ++#define CP_ME2_PIPE2_INT_STATUS 0xC22C ++#define CP_ME2_PIPE3_INT_STATUS 0xC230 ++#define DEQUEUE_REQUEST_INT_STATUS (1 << 13) ++#define WRM_POLL_TIMEOUT_INT_STATUS (1 << 17) ++#define PRIV_REG_INT_STATUS (1 << 23) ++#define TIME_STAMP_INT_STATUS (1 << 26) ++#define GENERIC2_INT_STATUS (1 << 29) ++#define GENERIC1_INT_STATUS (1 << 30) ++#define GENERIC0_INT_STATUS (1 << 31) ++ ++#define CP_HPD_EOP_BASE_ADDR 0xC904 ++#define CP_HPD_EOP_BASE_ADDR_HI 0xC908 ++#define CP_HPD_EOP_VMID 0xC90C ++#define CP_HPD_EOP_CONTROL 0xC910 ++#define EOP_SIZE(x) ((x) << 0) ++#define EOP_SIZE_MASK (0x3f << 0) ++#define CP_MQD_BASE_ADDR 0xC914 ++#define CP_MQD_BASE_ADDR_HI 0xC918 ++#define CP_HQD_ACTIVE 0xC91C ++#define CP_HQD_VMID 0xC920 ++ ++#define CP_HQD_PERSISTENT_STATE 0xC924u + #define DEFAULT_CP_HQD_PERSISTENT_STATE (0x33U << 8) + #define PRELOAD_REQ (1 << 0) + +-#define MQD_CONTROL_PRIV_STATE_EN (1U << 8) +- +-#define DEFAULT_MIN_IB_AVAIL_SIZE (3U << 20) +- +-#define IB_ATC_EN (1U << 23) +- ++#define CP_HQD_PIPE_PRIORITY 0xC928u ++#define CP_HQD_QUEUE_PRIORITY 0xC92Cu ++#define CP_HQD_QUANTUM 0xC930u + #define QUANTUM_EN 1U + #define QUANTUM_SCALE_1MS (1U << 4) + #define QUANTUM_DURATION(x) ((x) << 8) + ++#define CP_HQD_PQ_BASE 0xC934 ++#define CP_HQD_PQ_BASE_HI 0xC938 ++#define CP_HQD_PQ_RPTR 0xC93C ++#define CP_HQD_PQ_RPTR_REPORT_ADDR 0xC940 ++#define CP_HQD_PQ_RPTR_REPORT_ADDR_HI 0xC944 ++#define CP_HQD_PQ_WPTR_POLL_ADDR 0xC948 ++#define CP_HQD_PQ_WPTR_POLL_ADDR_HI 0xC94C ++#define CP_HQD_PQ_DOORBELL_CONTROL 0xC950 ++#define DOORBELL_OFFSET(x) ((x) << 2) ++#define DOORBELL_OFFSET_MASK (0x1fffff << 2) ++#define DOORBELL_SOURCE (1 << 28) ++#define DOORBELL_SCHD_HIT (1 << 29) ++#define DOORBELL_EN (1 << 30) ++#define DOORBELL_HIT (1 << 31) ++#define CP_HQD_PQ_WPTR 0xC954 ++#define CP_HQD_PQ_CONTROL 0xC958 ++#define QUEUE_SIZE(x) ((x) << 0) ++#define QUEUE_SIZE_MASK (0x3f << 0) + #define RPTR_BLOCK_SIZE(x) ((x) << 8) ++#define RPTR_BLOCK_SIZE_MASK (0x3f << 8) + #define MIN_AVAIL_SIZE(x) ((x) << 20) ++#define PQ_ATC_EN (1 << 23) ++#define PQ_VOLATILE (1 << 26) ++#define NO_UPDATE_RPTR (1 << 27) ++#define UNORD_DISPATCH (1 << 28) ++#define ROQ_PQ_IB_FLIP (1 << 29) ++#define PRIV_STATE (1 << 30) ++#define KMD_QUEUE (1 << 31) ++ + #define DEFAULT_RPTR_BLOCK_SIZE RPTR_BLOCK_SIZE(5) + #define DEFAULT_MIN_AVAIL_SIZE MIN_AVAIL_SIZE(3) + +-#define PQ_ATC_EN (1 << 23) +-#define NO_UPDATE_RPTR (1 << 27) ++#define CP_HQD_IB_BASE_ADDR 0xC95Cu ++#define CP_HQD_IB_BASE_ADDR_HI 0xC960u ++#define CP_HQD_IB_RPTR 0xC964u ++#define CP_HQD_IB_CONTROL 0xC968u ++#define IB_ATC_EN (1U << 23) ++#define DEFAULT_MIN_IB_AVAIL_SIZE (3U << 20) + +-#define DOORBELL_OFFSET(x) ((x) << 2) +-#define DOORBELL_EN (1 << 30) ++#define CP_HQD_DEQUEUE_REQUEST 0xC974 ++#define DEQUEUE_REQUEST_DRAIN 1 ++#define DEQUEUE_REQUEST_RESET 2 ++#define DEQUEUE_INT (1U << 8) + +-#define PRIV_STATE (1 << 30) +-#define KMD_QUEUE (1 << 31) ++#define CP_HQD_SEMA_CMD 0xC97Cu ++#define CP_HQD_MSG_TYPE 0xC980u ++#define CP_HQD_ATOMIC0_PREOP_LO 0xC984u ++#define CP_HQD_ATOMIC0_PREOP_HI 0xC988u ++#define CP_HQD_ATOMIC1_PREOP_LO 0xC98Cu ++#define CP_HQD_ATOMIC1_PREOP_HI 0xC990u ++#define CP_HQD_HQ_SCHEDULER0 0xC994u ++#define CP_HQD_HQ_SCHEDULER1 0xC998u + +-#define AQL_ENABLE 1 ++ ++#define CP_MQD_CONTROL 0xC99C ++#define MQD_VMID(x) ((x) << 0) ++#define MQD_VMID_MASK (0xf << 0) ++#define MQD_CONTROL_PRIV_STATE_EN (1U << 8) + + #define GRBM_GFX_INDEX 0x30800 ++#define INSTANCE_INDEX(x) ((x) << 0) ++#define SH_INDEX(x) ((x) << 8) 
++#define SE_INDEX(x) ((x) << 16) ++#define SH_BROADCAST_WRITES (1 << 29) ++#define INSTANCE_BROADCAST_WRITES (1 << 30) ++#define SE_BROADCAST_WRITES (1 << 31) + ++#define SQC_CACHES 0x30d20 ++#define SQC_POLICY 0x8C38u ++#define SQC_VOLATILE 0x8C3Cu ++ ++#define CP_PERFMON_CNTL 0x36020 ++ ++#define ATC_VMID0_PASID_MAPPING 0x339Cu ++#define ATC_VMID_PASID_MAPPING_UPDATE_STATUS 0x3398u + #define ATC_VMID_PASID_MAPPING_VALID (1U << 31) + ++#define ATC_VM_APERTURE0_CNTL 0x3310u ++#define ATS_ACCESS_MODE_NEVER 0 ++#define ATS_ACCESS_MODE_ALWAYS 1 ++ ++#define ATC_VM_APERTURE0_CNTL2 0x3318u ++#define ATC_VM_APERTURE0_HIGH_ADDR 0x3308u ++#define ATC_VM_APERTURE0_LOW_ADDR 0x3300u ++#define ATC_VM_APERTURE1_CNTL 0x3314u ++#define ATC_VM_APERTURE1_CNTL2 0x331Cu ++#define ATC_VM_APERTURE1_HIGH_ADDR 0x330Cu ++#define ATC_VM_APERTURE1_LOW_ADDR 0x3304u ++ + #endif +diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h +new file mode 100644 +index 0000000..1880dc0 +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h +@@ -0,0 +1,1377 @@ ++/* ++ * Copyright 2015 Advanced Micro Devices, Inc. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#if 0 ++ HW (CARRIZO) source code for CWSR trap handler ++ ++var G8SR_WDMEM_HWREG_OFFSET = 0 ++var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes ++ ++// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore. 
++ ++var G8SR_DEBUG_TIMESTAMP = 0 ++var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset ++var s_g8sr_ts_save_s = s[34:35] // save start ++var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader sends the SAVEWAVE msg to spi ++var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI writes the sr address to SQ ++var s_g8sr_ts_save_d = s[40:41] // save end ++var s_g8sr_ts_restore_s = s[42:43] // restore start ++var s_g8sr_ts_restore_d = s[44:45] // restore end ++ ++var G8SR_VGPR_SR_IN_DWX4 = 0 ++var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes ++var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 ++ ++ ++/*************************************************************************/ ++/* control on how to run the shader */ ++/*************************************************************************/ ++//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run) ++var EMU_RUN_HACK = 0 ++var EMU_RUN_HACK_RESTORE_NORMAL = 0 ++var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 ++var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 ++var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK ++var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK ++var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK ++var SAVE_LDS = 1 ++var WG_BASE_ADDR_LO = 0x9000a000 ++var WG_BASE_ADDR_HI = 0x0 ++var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem ++var CTX_SAVE_CONTROL = 0x0 ++var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL ++var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run) ++var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write ++var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes ++var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing ++ ++/**************************************************************************/ ++/* variables */ ++/**************************************************************************/ ++var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 ++var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 ++ ++var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 ++var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 ++var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 ++var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 ++var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 ++var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits ++ ++var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 ++var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask ++var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 ++var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 ++var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 ++var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF ++var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 ++var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 ++var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 ++var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 ++var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 ++ ++var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME ++var SQ_WAVE_IB_STS_RCNT_SIZE = 4 //FIXME ++var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME ++var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1 //FIXME ++var 
SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME ++ ++var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 ++var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 ++ ++ ++/* Save */ ++var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes ++var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE ++ ++var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit ++var S_SAVE_SPI_INIT_ATC_SHIFT = 27 ++var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype ++var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 ++var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG ++var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 ++ ++var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used ++var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME ++var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME ++var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME ++ ++var s_save_spi_init_lo = exec_lo ++var s_save_spi_init_hi = exec_hi ++ ++ //tba_lo and tba_hi need to be saved/restored ++var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} ++var s_save_pc_hi = ttmp1 ++var s_save_exec_lo = ttmp2 ++var s_save_exec_hi = ttmp3 ++var s_save_status = ttmp4 ++var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine ++var s_save_xnack_mask_lo = ttmp6 ++var s_save_xnack_mask_hi = ttmp7 ++var s_save_buf_rsrc0 = ttmp8 ++var s_save_buf_rsrc1 = ttmp9 ++var s_save_buf_rsrc2 = ttmp10 ++var s_save_buf_rsrc3 = ttmp11 ++ ++var s_save_mem_offset = tma_lo ++var s_save_alloc_size = s_save_trapsts //conflict ++var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time) ++var s_save_m0 = tma_hi ++ ++/* Restore */ ++var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE ++var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC ++ ++var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit ++var S_RESTORE_SPI_INIT_ATC_SHIFT = 27 ++var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype ++var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28 ++var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG ++var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 ++ ++var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT ++var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK ++var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT ++var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK ++ ++var s_restore_spi_init_lo = exec_lo ++var s_restore_spi_init_hi = exec_hi ++ ++var s_restore_mem_offset = ttmp2 ++var s_restore_alloc_size = ttmp3 ++var s_restore_tmp = ttmp6 //tba_lo/hi need to be restored ++var s_restore_mem_offset_save = s_restore_tmp //no conflict ++ ++var s_restore_m0 = s_restore_alloc_size //no conflict ++ ++var s_restore_mode = ttmp7 ++ ++var s_restore_pc_lo = ttmp0 ++var s_restore_pc_hi = ttmp1 ++var s_restore_exec_lo = tma_lo //no conflict ++var s_restore_exec_hi = tma_hi //no conflict ++var s_restore_status = ttmp4 ++var s_restore_trapsts = ttmp5 ++var s_restore_xnack_mask_lo = xnack_mask_lo ++var s_restore_xnack_mask_hi = xnack_mask_hi ++var s_restore_buf_rsrc0 = ttmp8 ++var s_restore_buf_rsrc1 = ttmp9 ++var s_restore_buf_rsrc2 = ttmp10 ++var s_restore_buf_rsrc3 = ttmp11 ++ ++/**************************************************************************/ ++/* 
trap handler entry points */ ++/**************************************************************************/ ++/* Shader Main*/ ++ ++shader main ++ asic(CARRIZO) ++ type(CS) ++ ++ ++ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore ++ //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC ++ s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC ++ s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. ++ s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE ++ //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE ++ s_branch L_SKIP_RESTORE //NOT restore, SAVE actually ++ else ++ s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save ++ end ++ ++L_JUMP_TO_RESTORE: ++ s_branch L_RESTORE //restore ++ ++L_SKIP_RESTORE: ++ ++ s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC ++ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) ++ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save ++ s_cbranch_scc1 L_SAVE //this is the operation for save ++ ++ // ********* Handle non-CWSR traps ******************* ++if (!EMU_RUN_HACK) ++ /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */ ++ s_load_dwordx4 [ttmp8,ttmp9,ttmp10, ttmp11], [tma_lo,tma_hi], 0 ++ s_waitcnt lgkmcnt(0) ++ s_or_b32 ttmp7, ttmp8, ttmp9 ++ s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set ++ s_mov_b32 tma_lo, ttmp10 //set tma_lo/hi for next level trap handler ++ s_mov_b32 tma_hi, ttmp11 ++ s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) ++ s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler ++ ++L_NO_NEXT_TRAP: ++ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) ++ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception ++ s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. ++ s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 ++ s_addc_u32 ttmp1, ttmp1, 0 ++L_EXCP_CASE: ++ s_and_b32 ttmp1, ttmp1, 0xFFFF ++ s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) ++ s_rfe_b64 [ttmp0, ttmp1] ++end ++ // ********* End handling of non-CWSR traps ******************* ++ ++/**************************************************************************/ ++/* save routine */ ++/**************************************************************************/ ++ ++L_SAVE: ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_save_s ++ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? 
++end ++ ++ //check whether there is mem_viol ++ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) ++ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK ++ s_cbranch_scc0 L_NO_PC_REWIND ++ ++ //if so, need to rewind the PC, assuming the GDS operation gets NACKed ++ s_mov_b32 s_save_tmp, 0 //clear mem_viol bit ++ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit ++ s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] ++ s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8 ++ s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc ++ ++L_NO_PC_REWIND: ++ s_mov_b32 s_save_tmp, 0 //clear saveCtx bit ++ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit ++ ++ s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK ++ s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //XNACK_MASK must be saved before any memory operation ++ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT ++ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT ++ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp ++ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY ++ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT ++ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp ++ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS ++ s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG ++ ++ s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp ++ ++ /* inform SPI of readiness and wait for SPI's go signal */ ++ s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI ++ s_mov_b32 s_save_exec_hi, exec_hi ++ s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_sq_save_msg ++ s_waitcnt lgkmcnt(0) ++end ++ ++ if (EMU_RUN_HACK) ++ ++ else ++ s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC ++ end ++ ++ L_SLEEP: ++ s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD and would hang SQ, since the 7th/8th wave could not win arbitration to execute an instruction while the other waves are stuck in this sleep loop waiting for wrexec!=0 ++ ++ if (EMU_RUN_HACK) ++ ++ else ++ s_cbranch_execz L_SLEEP ++ end ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_spi_wrexec ++ s_waitcnt lgkmcnt(0) ++end ++ ++ /* setup Resource Constants */ ++ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) ++ //calculate wd_addr using absolute thread id ++ v_readlane_b32 s_save_tmp, v9, 0 ++ s_lshr_b32 s_save_tmp, s_save_tmp, 6 ++ s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE ++ s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO ++ s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI ++ s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL ++ else ++ end ++ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) ++ s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO ++ s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI ++ s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL ++ else ++ end ++ ++ ++ s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo ++ s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi ++ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE ++ s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) 
although not necessarily initialized ++ s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC ++ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK ++ s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position ++ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC ++ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK ++ s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position ++ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE ++ ++ //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?) ++ s_mov_b32 s_save_m0, m0 //save M0 ++ ++ /* global mem offset */ ++ s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 ++ ++ ++ ++ ++ /* save HW registers */ ++ ////////////////////////////// ++ ++ L_SAVE_HWREG: ++ // HWREG SR memory offset : size(VGPR)+size(SGPR) ++ get_vgpr_size_bytes(s_save_mem_offset) ++ get_sgpr_size_bytes(s_save_tmp) ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp ++ ++ ++ s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes ++ if (SWIZZLE_EN) ++ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ ++ write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0 ++ ++ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) ++ s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 ++ s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over ++ s_mov_b32 tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO ++ s_mov_b32 tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI ++ end ++ ++ write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC ++ write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) ++ write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC ++ write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) ++ // Save the tma_lo and tma_hi content from exec_lo and ttmp5 ++ s_mov_b32 s_save_exec_lo, exec_lo ++ s_mov_b32 s_save_exec_hi, ttmp5 ++ write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS ++ ++ //s_save_trapsts conflicts with s_save_alloc_size ++ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) ++ write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS ++ ++ write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO ++ write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI ++ ++ //using s_save_tmp here would introduce a conflict between s_save_tmp and s_save_buf_rsrc2 ++ s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE ++ write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) ++ write_hwreg_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset) //TBA_LO ++ write_hwreg_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset) //TBA_HI ++ write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //TMA_LO ++ write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) //TMA_HI ++ ++ /* the first wave in the threadgroup */ ++ // save first_wave bit in unused bit.26 ++ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract first wave bit ++ //s_or_b32 tba_hi, s_save_tmp, tba_hi // save first wave bit to tba_hi.bits[26] ++ 
s_mov_b32 s_save_exec_hi, 0x0 ++ s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26] ++ ++ ++ /* save SGPRs */ ++ // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... ++ ////////////////////////////// ++ ++ // SGPR SR memory offset : size(VGPR) ++ get_vgpr_size_bytes(s_save_mem_offset) ++ // TODO, change RSRC word to rearrange memory layout for SGPRS ++ ++ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size ++ s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 ++ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) ++ ++ if (SGPR_SAVE_USE_SQC) ++ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes ++ else ++ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) ++ end ++ ++ if (SWIZZLE_EN) ++ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ ++ // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 ++ //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0 ++ s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0 ++ s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset ++ ++ s_mov_b32 m0, 0x0 //SGPR initial index value =0 ++ L_SAVE_SGPR_LOOP: ++ // SGPR is allocated in 16 SGPR granularity ++ s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] ++ s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] ++ s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] ++ s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] ++ s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] ++ s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] ++ s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] ++ s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] ++ ++ write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4 ++ s_add_u32 m0, m0, 16 //next sgpr index ++ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 ++ s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete? ++ // restore s_save_buf_rsrc0,1 ++ //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo ++ s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo ++ ++ ++ ++ ++ /* save first 4 VGPR, then LDS save could use */ ++ // each wave will alloc 4 vgprs at least... ++ ///////////////////////////////////////////////////////////////////////////////////// ++ ++ s_mov_b32 s_save_mem_offset, 0 ++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on ++ s_mov_b32 exec_hi, 0xFFFFFFFF ++ ++ if (SWIZZLE_EN) ++ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
++ else ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ ++ // VGPR Allocated in 4-GPR granularity ++ ++if G8SR_VGPR_SR_IN_DWX4 ++ // the const stride for DWx4 is 4*4 bytes ++ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes ++ ++ buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 ++ ++ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes ++else ++ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 ++ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 ++ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 ++ buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 ++end ++ ++ ++ ++ /* save LDS */ ++ ////////////////////////////// ++ ++ L_SAVE_LDS: ++ ++ // Change EXEC to all threads... ++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on ++ s_mov_b32 exec_hi, 0xFFFFFFFF ++ ++ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size ++ s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? ++ s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE ++ ++ s_barrier //LDS is used? wait for other waves in the same TG ++ //s_and_b32 s_save_tmp, tba_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here ++ s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here ++ s_cbranch_scc0 L_SAVE_LDS_DONE ++ ++ // first wave do LDS save; ++ ++ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw ++ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes ++ s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes ++ ++ // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) ++ // ++ get_vgpr_size_bytes(s_save_mem_offset) ++ get_sgpr_size_bytes(s_save_tmp) ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() ++ ++ ++ if (SWIZZLE_EN) ++ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ s_mov_b32 m0, 0x0 //lds_offset initial value = 0 ++ ++ ++var LDS_DMA_ENABLE = 0 ++var UNROLL = 0 ++if UNROLL==0 && LDS_DMA_ENABLE==1 ++ s_mov_b32 s3, 256*2 ++ s_nop 0 ++ s_nop 0 ++ s_nop 0 ++ L_SAVE_LDS_LOOP: ++ //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.??? ++ if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity ++ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW ++ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW ++ end ++ ++ s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes ++ s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 ++ s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete? 
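++ //note: each pass issues two buffer_store_lds_dword ops (2 x 64DW), so m0 and the mem offset actually advance by 512 bytes per pass (s3 = 256*2)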
++ ++elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROLL, has icache miss ++ // store from highest LDS address to lowest ++ s_mov_b32 s3, 256*2 ++ s_sub_u32 m0, s_save_alloc_size, s3 ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, m0 ++ s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128DW chunks... ++ s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from highest addr to lowest ++ s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block costs 6*4 bytes of instructions ++ s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //3*4 covers the 3 insts below: s_add, s_addc and s_setpc ++ s_nop 0 ++ s_nop 0 ++ s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes ++ s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved ++ s_add_u32 s0, s0,s_save_alloc_size ++ s_addc_u32 s1, s1, 0 ++ s_setpc_b64 s[0:1] ++ ++ ++ for var i =0; i< 128; i++ ++ // be careful to make this a 64-byte aligned address, which could improve performance... ++ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW ++ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW ++ ++ if i!=127 ++ s_sub_u32 m0, m0, s3 // use an sgpr to shrink the 2DW inst to a 1DW inst to improve performance, i.e. pack more LDS_DMA inst into one cacheline ++ s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3 ++ end ++ end ++ ++else // BUFFER_STORE ++ v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 ++ v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid ++ v_mul_i32_i24 v2, v3, 8 // tid*8 ++ v_mov_b32 v3, 256*2 ++ s_mov_b32 m0, 0x10000 ++ s_mov_b32 s0, s_save_buf_rsrc3 ++ s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid ++ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT ++ ++L_SAVE_LDS_LOOP_VECTOR: ++ ds_read_b64 v[0:1], v2 //x = LDS[a], byte address ++ s_waitcnt lgkmcnt(0) ++ buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1 ++// s_waitcnt vmcnt(0) ++ v_add_u32 v2, vcc[0:1], v2, v3 ++ v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size ++ s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR ++ ++ // restore rsrc3 ++ s_mov_b32 s_save_buf_rsrc3, s0 ++ ++end ++ ++L_SAVE_LDS_DONE: ++ ++ ++ /* save VGPRs - save the rest of the VGPRs */ ++ ////////////////////////////////////////////////////////////////////////////////////// ++ L_SAVE_VGPR: ++ // VGPR SR memory offset: 0 ++ // TODO rearrange the RSRC words to use swizzle for VGPR save... ++ ++ s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs ++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on ++ s_mov_b32 exec_hi, 0xFFFFFFFF ++ ++ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vgpr_size ++ s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 ++ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible ++ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) ++ if (SWIZZLE_EN) ++ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
++ else ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ ++ // VGPR Allocated in 4-GPR granularity ++ ++if G8SR_VGPR_SR_IN_DWX4 ++ // the const stride for DWx4 is 4*4 bytes ++ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes ++ ++ s_mov_b32 m0, 4 // skip first 4 VGPRs ++ s_cmp_lt_u32 m0, s_save_alloc_size ++ s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs ++ ++ s_set_gpr_idx_on m0, 0x1 // This will change M0 ++ s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 ++L_SAVE_VGPR_LOOP: ++ v_mov_b32 v0, v0 // v0 = v[0+m0] ++ v_mov_b32 v1, v1 ++ v_mov_b32 v2, v2 ++ v_mov_b32 v3, v3 ++ ++ ++ buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 ++ s_add_u32 m0, m0, 4 ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 ++ s_cmp_lt_u32 m0, s_save_alloc_size ++ s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? ++ s_set_gpr_idx_off ++L_SAVE_VGPR_LOOP_END: ++ ++ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes ++else ++ // VGPR store using dw burst ++ s_mov_b32 m0, 0x4 //VGPR initial index value =0 ++ s_cmp_lt_u32 m0, s_save_alloc_size ++ s_cbranch_scc0 L_SAVE_VGPR_END ++ ++ ++ s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 ++ s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later ++ ++ L_SAVE_VGPR_LOOP: ++ v_mov_b32 v0, v0 //v0 = v[0+m0] ++ v_mov_b32 v1, v1 //v0 = v[0+m0] ++ v_mov_b32 v2, v2 //v0 = v[0+m0] ++ v_mov_b32 v3, v3 //v0 = v[0+m0] ++ ++ if(USE_MTBUF_INSTEAD_OF_MUBUF) ++ tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 ++ else ++ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 ++ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 ++ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 ++ buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 ++ end ++ ++ s_add_u32 m0, m0, 4 //next vgpr index ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes ++ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 ++ s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? ++ s_set_gpr_idx_off ++end ++ ++L_SAVE_VGPR_END: ++ ++ ++ ++ ++ ++ ++ /* S_PGM_END_SAVED */ //FIXME graphics ONLY ++ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) ++ s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] ++ s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 ++ s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over ++ s_rfe_b64 s_save_pc_lo //Return to the main shader program ++ else ++ end ++ ++// Save Done timestamp ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_save_d ++ // SGPR SR memory offset : size(VGPR) ++ get_vgpr_size_bytes(s_save_mem_offset) ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET ++ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? ++ // Need reset rsrc2?? 
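++ //rsrc2 is reset to the maximum NUM_RECORDS below so the SQC store of the timestamp is not limited by a stale record count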
++ s_mov_b32 m0, s_save_mem_offset ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1 ++end ++ ++ ++ s_branch L_END_PGM ++ ++ ++ ++/**************************************************************************/ ++/* restore routine */ ++/**************************************************************************/ ++ ++L_RESTORE: ++ /* Setup Resource Constants */ ++ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) ++ //calculate wd_addr using absolute thread id ++ v_readlane_b32 s_restore_tmp, v9, 0 ++ s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 ++ s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE ++ s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO ++ s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI ++ s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL ++ else ++ end ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_restore_s ++ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? ++ // tma_lo/hi are sgpr 110, 111, which will not be used for the 112-SGPR-allocated case... ++ s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0] ++ s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, since exec will be restored at the end.. ++end ++ ++ ++ ++ s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo ++ s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi ++ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE ++ s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) ++ s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC ++ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK ++ s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position ++ s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC ++ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK ++ s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position ++ s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE ++ ++ /* global mem offset */ ++// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0 ++ ++ /* the first wave in the threadgroup */ ++ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK ++ s_cbranch_scc0 L_RESTORE_VGPR ++ ++ /* restore LDS */ ++ ////////////////////////////// ++ L_RESTORE_LDS: ++ ++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead ++ s_mov_b32 exec_hi, 0xFFFFFFFF ++ ++ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size ++ s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? ++ s_cbranch_scc0 L_RESTORE_VGPR //no lds used? 
jump to L_RESTORE_VGPR ++ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw ++ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes ++ s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes ++ ++ // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) ++ // ++ get_vgpr_size_bytes(s_restore_mem_offset) ++ get_sgpr_size_bytes(s_restore_tmp) ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? ++ ++ ++ if (SWIZZLE_EN) ++ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ s_mov_b32 m0, 0x0 //lds_offset initial value = 0 ++ ++ L_RESTORE_LDS_LOOP: ++ if (SAVE_LDS) ++ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW ++ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW ++ end ++ s_add_u32 m0, m0, 256*2 // 128 DW ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW ++ s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 ++ s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? ++ ++ ++ /* restore VGPRs */ ++ ////////////////////////////// ++ L_RESTORE_VGPR: ++ // VGPR SR memory offset : 0 ++ s_mov_b32 s_restore_mem_offset, 0x0 ++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead ++ s_mov_b32 exec_hi, 0xFFFFFFFF ++ ++ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size ++ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 ++ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) ++ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) ++ if (SWIZZLE_EN) ++ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++if G8SR_VGPR_SR_IN_DWX4 ++ get_vgpr_size_bytes(s_restore_mem_offset) ++ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 ++ ++ // the const stride for DWx4 is 4*4 bytes ++ s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes ++ ++ s_mov_b32 m0, s_restore_alloc_size ++ s_set_gpr_idx_on m0, 0x8 // Note.. 
This will change m0 ++ ++L_RESTORE_VGPR_LOOP: ++ buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 ++ s_waitcnt vmcnt(0) ++ s_sub_u32 m0, m0, 4 ++ v_mov_b32 v0, v0 // v[0+m0] = v0 ++ v_mov_b32 v1, v1 ++ v_mov_b32 v2, v2 ++ v_mov_b32 v3, v3 ++ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 ++ s_cmp_eq_u32 m0, 0x8000 ++ s_cbranch_scc0 L_RESTORE_VGPR_LOOP ++ s_set_gpr_idx_off ++ ++ s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes ++ ++else ++ // VGPR load using dw burst ++ s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 ++ s_mov_b32 m0, 4 //VGPR initial index value = 1 ++ s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 ++ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later ++ ++ L_RESTORE_VGPR_LOOP: ++ if(USE_MTBUF_INSTEAD_OF_MUBUF) ++ tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 ++ else ++ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 ++ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 ++ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 ++ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 ++ end ++ s_waitcnt vmcnt(0) //ensure data ready ++ v_mov_b32 v0, v0 //v[0+m0] = v0 ++ v_mov_b32 v1, v1 ++ v_mov_b32 v2, v2 ++ v_mov_b32 v3, v3 ++ s_add_u32 m0, m0, 4 //next vgpr index ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes ++ s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 ++ s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? 
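++ //v0..v3 were used as staging registers for the loads above, so the first 4-VGPR group is reloaded last (below) from s_restore_mem_offset_save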
++ s_set_gpr_idx_off ++ /* VGPR restore on v0 */ ++ if(USE_MTBUF_INSTEAD_OF_MUBUF) ++ tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 ++ else ++ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 ++ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 ++ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 ++ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 ++ end ++ ++end ++ ++ /* restore SGPRs */ ++ ////////////////////////////// ++ ++ // SGPR SR memory offset : size(VGPR) ++ get_vgpr_size_bytes(s_restore_mem_offset) ++ get_sgpr_size_bytes(s_restore_tmp) ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp ++ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPRs from S[n] down to S[0], in groups of 16 sgprs ++ // TODO, change RSRC word to rearrange memory layout for SGPRS ++ ++ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //sgpr_size ++ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 ++ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) ++ ++ if (SGPR_SAVE_USE_SQC) ++ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes ++ else ++ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) ++ end ++ if (SWIZZLE_EN) ++ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ /* If 112 SGPRs are allocated, 4 sgprs are not used: TBA(108,109), TMA(110,111). ++ However, it is safe to restore these 4 SGPRs anyway, since TBA,TMA will later be restored by HWREG ++ */ ++ s_mov_b32 m0, s_restore_alloc_size ++ ++ L_RESTORE_SGPR_LOOP: ++ read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made ++ s_waitcnt lgkmcnt(0) //ensure data ready ++ ++ s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] ++ ++ s_movreld_b64 s0, s0 //s[0+m0] = s0 ++ s_movreld_b64 s2, s2 ++ s_movreld_b64 s4, s4 ++ s_movreld_b64 s6, s6 ++ s_movreld_b64 s8, s8 ++ s_movreld_b64 s10, s10 ++ s_movreld_b64 s12, s12 ++ s_movreld_b64 s14, s14 ++ ++ s_cmp_eq_u32 m0, 0 //scc = (m0 == 0) ? 1 : 0 ++ s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete? ++ ++ /* restore HW registers */ ++ ////////////////////////////// ++ L_RESTORE_HWREG: ++ ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo ++ s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi ++end ++ ++ // HWREG SR memory offset : size(VGPR)+size(SGPR) ++ get_vgpr_size_bytes(s_restore_mem_offset) ++ get_sgpr_size_bytes(s_restore_tmp) ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp ++ ++ ++ s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes ++ if (SWIZZLE_EN) ++ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
++ else ++ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 ++ read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC ++ read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) ++ read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC ++ read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) ++ read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS ++ read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS ++ read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO ++ read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI ++ read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE ++ read_hwreg_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_LO ++ read_hwreg_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_HI ++ ++ s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS ++ ++ s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS ++ ++ //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: ++ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) ++ s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) ++ s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over ++ end ++ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) ++ s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal ++ s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over ++ end ++ ++ s_mov_b32 m0, s_restore_m0 ++ s_mov_b32 exec_lo, s_restore_exec_lo ++ s_mov_b32 exec_hi, s_restore_exec_hi ++ ++ read_hwreg_from_mem(tma_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //tma_lo ++ read_hwreg_from_mem(tma_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //tma_hi ++ s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS ++ s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts ++ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 ++ s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts ++ s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT ++ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 ++ //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore ++ s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode ++ //reuse s_restore_m0 as a temp register ++ s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK ++ s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT ++ s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT ++ s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero ++ s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 ++ s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK ++ s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT ++ s_lshl_b32 
s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT ++ s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 ++ s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK ++ s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT ++ s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp ++ ++ s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 ++ s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 ++ s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu ++ ++ s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_restore_d ++ s_waitcnt lgkmcnt(0) ++end ++ ++// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution ++ s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc ++ ++ ++/**************************************************************************/ ++/* the END */ ++/**************************************************************************/ ++L_END_PGM: ++ s_endpgm ++ ++end ++ ++ ++/**************************************************************************/ ++/* the helper functions */ ++/**************************************************************************/ ++ ++//Only for saving hwreg to mem ++function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) ++ s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on ++ s_mov_b32 m0, s_mem_offset ++ s_buffer_store_dword s, s_rsrc, m0 glc:0 ++ s_add_u32 s_mem_offset, s_mem_offset, 4 ++ s_mov_b32 m0, exec_lo ++end ++ ++//Only for saving hwreg to mem ++function write_tma_to_mem(s, s_rsrc, offset_imm) ++ s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on ++ s_mov_b32 m0, offset_imm ++ s_buffer_store_dword s, s_rsrc, m0 glc:0 ++ s_mov_b32 m0, exec_lo ++end ++ ++// HWREGs are saved before SGPRs, so all HWREGs can be used. 
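++// write_16sgpr_to_mem stores s[0:15] as four dwordx4 SQC writes at immediate offsets 0/16/32/48, then advances the rsrc base address itself by 64 bytes instead of using an offset register.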
++function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) ++ ++ s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:0 ++ s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:0 ++ s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:0 ++ s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:0 ++ s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 ++ s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc ++end ++ ++ ++function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) ++ s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 ++ s_add_u32 s_mem_offset, s_mem_offset, 4 ++end ++ ++function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) ++ s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 ++ s_sub_u32 s_mem_offset, s_mem_offset, 4*16 ++end ++ ++ ++ ++function get_lds_size_bytes(s_lds_size_byte) ++ // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW ++ s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size ++ s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW ++end ++ ++function get_vgpr_size_bytes(s_vgpr_size_byte) ++ s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size ++ s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 ++ s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible ++end ++ ++function get_sgpr_size_bytes(s_sgpr_size_byte) ++ s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size ++ s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1 ++ s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value) ++end ++ ++function get_hwreg_size_bytes ++ return 128 //HWREG size 128 bytes ++end ++ ++#endif ++ ++static const uint32_t cwsr_trap_carrizo_hex[] = { ++ 0xbf820001, 0xbf820131, ++ 0xb8f4f802, 0xb8f5f803, ++ 0x8675ff75, 0x00000400, ++ 0xbf850013, 0xc00a1e37, ++ 0x00000000, 0xbf8c007f, ++ 0x87777978, 0xbf840004, ++ 0xbeee007a, 0xbeef007b, ++ 0xb974f802, 0xbe801d78, ++ 0xb8f5f803, 0x8675ff75, ++ 0x000001ff, 0xbf850002, ++ 0x80708470, 0x82718071, ++ 0x8671ff71, 0x0000ffff, ++ 0xb974f802, 0xbe801f70, ++ 0xb8f5f803, 0x8675ff75, ++ 0x00000100, 0xbf840006, ++ 0xbefa0080, 0xb97a0203, ++ 0x8671ff71, 0x0000ffff, ++ 0x80f08870, 0x82f18071, ++ 0xbefa0080, 0xb97a0283, ++ 0xbef60068, 0xbef70069, ++ 0xb8fa1c07, 0x8e7a9c7a, ++ 0x87717a71, 0xb8fa03c7, ++ 0x8e7a9b7a, 0x87717a71, ++ 0xb8faf807, 0x867aff7a, ++ 0x00007fff, 0xb97af807, ++ 0xbef2007e, 0xbef3007f, ++ 0xbefe0180, 0xbf900004, ++ 0xbf8e0002, 0xbf88fffe, ++ 0xbef8007e, 0x8679ff7f, ++ 0x0000ffff, 0x8779ff79, ++ 0x00040000, 0xbefa0080, ++ 0xbefb00ff, 0x00807fac, ++ 0x867aff7f, 0x08000000, ++ 0x8f7a837a, 0x877b7a7b, ++ 0x867aff7f, 0x70000000, ++ 0x8f7a817a, 0x877b7a7b, ++ 0xbeef007c, 0xbeee0080, ++ 0xb8ee2a05, 0x806e816e, ++ 0x8e6e8a6e, 0xb8fa1605, ++ 0x807a817a, 0x8e7a867a, ++ 0x806e7a6e, 0xbefa0084, ++ 0xbefa00ff, 0x01000000, ++ 0xbefe007c, 0xbefc006e, ++ 0xc0601bfc, 0x0000007c, ++ 0x806e846e, 0xbefc007e, ++ 0xbefe007c, 0xbefc006e, ++ 0xc0601c3c, 0x0000007c, ++ 0x806e846e, 0xbefc007e, ++ 0xbefe007c, 0xbefc006e, ++ 0xc0601c7c, 0x0000007c, ++ 0x806e846e, 0xbefc007e, ++ 0xbefe007c, 0xbefc006e, ++ 0xc0601cbc, 0x0000007c, ++ 0x806e846e, 0xbefc007e, ++ 0xbefe007c, 0xbefc006e, ++ 0xc0601cfc, 0x0000007c, ++ 0x806e846e, 0xbefc007e, ++ 0xbef2007e, 0xbef30075, ++ 
0xbefe007c, 0xbefc006e, ++ 0xc0601d3c, 0x0000007c, ++ 0x806e846e, 0xbefc007e, ++ 0xb8f5f803, 0xbefe007c, ++ 0xbefc006e, 0xc0601d7c, ++ 0x0000007c, 0x806e846e, ++ 0xbefc007e, 0xbefe007c, ++ 0xbefc006e, 0xc0601dbc, ++ 0x0000007c, 0x806e846e, ++ 0xbefc007e, 0xbefe007c, ++ 0xbefc006e, 0xc0601dfc, ++ 0x0000007c, 0x806e846e, ++ 0xbefc007e, 0xb8eff801, ++ 0xbefe007c, 0xbefc006e, ++ 0xc0601bfc, 0x0000007c, ++ 0x806e846e, 0xbefc007e, ++ 0xbefe007c, 0xbefc006e, ++ 0xc0601b3c, 0x0000007c, ++ 0x806e846e, 0xbefc007e, ++ 0xbefe007c, 0xbefc006e, ++ 0xc0601b7c, 0x0000007c, ++ 0x806e846e, 0xbefc007e, ++ 0xbefe007c, 0xbefc006e, ++ 0xc0601cbc, 0x0000007c, ++ 0x806e846e, 0xbefc007e, ++ 0xbefe007c, 0xbefc006e, ++ 0xc0601cfc, 0x0000007c, ++ 0x806e846e, 0xbefc007e, ++ 0x867aff7f, 0x04000000, ++ 0xbef30080, 0x8773737a, ++ 0xb8ee2a05, 0x806e816e, ++ 0x8e6e8a6e, 0xb8f51605, ++ 0x80758175, 0x8e758475, ++ 0x8e7a8275, 0xbefa00ff, ++ 0x01000000, 0xbef60178, ++ 0x80786e78, 0xbefc0080, ++ 0xbe802b00, 0xbe822b02, ++ 0xbe842b04, 0xbe862b06, ++ 0xbe882b08, 0xbe8a2b0a, ++ 0xbe8c2b0c, 0xbe8e2b0e, ++ 0xc06a003c, 0x00000000, ++ 0xc06a013c, 0x00000010, ++ 0xc06a023c, 0x00000020, ++ 0xc06a033c, 0x00000030, ++ 0x8078c078, 0x82798079, ++ 0x807c907c, 0xbf0a757c, ++ 0xbf85ffeb, 0xbef80176, ++ 0xbeee0080, 0xbefe00c1, ++ 0xbeff00c1, 0xbefa00ff, ++ 0x01000000, 0xe0724000, ++ 0x6e1e0000, 0xe0724100, ++ 0x6e1e0100, 0xe0724200, ++ 0x6e1e0200, 0xe0724300, ++ 0x6e1e0300, 0xbefe00c1, ++ 0xbeff00c1, 0xb8f54306, ++ 0x8675c175, 0xbf84002c, ++ 0xbf8a0000, 0x867aff73, ++ 0x04000000, 0xbf840028, ++ 0x8e758675, 0x8e758275, ++ 0xbefa0075, 0xb8ee2a05, ++ 0x806e816e, 0x8e6e8a6e, ++ 0xb8fa1605, 0x807a817a, ++ 0x8e7a867a, 0x806e7a6e, ++ 0x806eff6e, 0x00000080, ++ 0xbefa00ff, 0x01000000, ++ 0xbefc0080, 0xd28c0002, ++ 0x000100c1, 0xd28d0003, ++ 0x000204c1, 0xd1060002, ++ 0x00011103, 0x7e0602ff, ++ 0x00000200, 0xbefc00ff, ++ 0x00010000, 0xbe80007b, ++ 0x867bff7b, 0xff7fffff, ++ 0x877bff7b, 0x00058000, ++ 0xd8ec0000, 0x00000002, ++ 0xbf8c007f, 0xe0765000, ++ 0x6e1e0002, 0x32040702, ++ 0xd0c9006a, 0x0000eb02, ++ 0xbf87fff7, 0xbefb0000, ++ 0xbeee00ff, 0x00000400, ++ 0xbefe00c1, 0xbeff00c1, ++ 0xb8f52a05, 0x80758175, ++ 0x8e758275, 0x8e7a8875, ++ 0xbefa00ff, 0x01000000, ++ 0xbefc0084, 0xbf0a757c, ++ 0xbf840015, 0xbf11017c, ++ 0x8075ff75, 0x00001000, ++ 0x7e000300, 0x7e020301, ++ 0x7e040302, 0x7e060303, ++ 0xe0724000, 0x6e1e0000, ++ 0xe0724100, 0x6e1e0100, ++ 0xe0724200, 0x6e1e0200, ++ 0xe0724300, 0x6e1e0300, ++ 0x807c847c, 0x806eff6e, ++ 0x00000400, 0xbf0a757c, ++ 0xbf85ffef, 0xbf9c0000, ++ 0xbf8200d1, 0xbef8007e, ++ 0x8679ff7f, 0x0000ffff, ++ 0x8779ff79, 0x00040000, ++ 0xbefa0080, 0xbefb00ff, ++ 0x00807fac, 0x8676ff7f, ++ 0x08000000, 0x8f768376, ++ 0x877b767b, 0x8676ff7f, ++ 0x70000000, 0x8f768176, ++ 0x877b767b, 0x8676ff7f, ++ 0x04000000, 0xbf84001e, ++ 0xbefe00c1, 0xbeff00c1, ++ 0xb8f34306, 0x8673c173, ++ 0xbf840019, 0x8e738673, ++ 0x8e738273, 0xbefa0073, ++ 0xb8f22a05, 0x80728172, ++ 0x8e728a72, 0xb8f61605, ++ 0x80768176, 0x8e768676, ++ 0x80727672, 0x8072ff72, ++ 0x00000080, 0xbefa00ff, ++ 0x01000000, 0xbefc0080, ++ 0xe0510000, 0x721e0000, ++ 0xe0510100, 0x721e0000, ++ 0x807cff7c, 0x00000200, ++ 0x8072ff72, 0x00000200, ++ 0xbf0a737c, 0xbf85fff6, ++ 0xbef20080, 0xbefe00c1, ++ 0xbeff00c1, 0xb8f32a05, ++ 0x80738173, 0x8e738273, ++ 0x8e7a8873, 0xbefa00ff, ++ 0x01000000, 0xbef60072, ++ 0x8072ff72, 0x00000400, ++ 0xbefc0084, 0xbf11087c, ++ 0x8073ff73, 0x00008000, ++ 0xe0524000, 0x721e0000, ++ 0xe0524100, 0x721e0100, ++ 0xe0524200, 0x721e0200, ++ 0xe0524300, 
0x721e0300, ++ 0xbf8c0f70, 0x7e000300, ++ 0x7e020301, 0x7e040302, ++ 0x7e060303, 0x807c847c, ++ 0x8072ff72, 0x00000400, ++ 0xbf0a737c, 0xbf85ffee, ++ 0xbf9c0000, 0xe0524000, ++ 0x761e0000, 0xe0524100, ++ 0x761e0100, 0xe0524200, ++ 0x761e0200, 0xe0524300, ++ 0x761e0300, 0xb8f22a05, ++ 0x80728172, 0x8e728a72, ++ 0xb8f61605, 0x80768176, ++ 0x8e768676, 0x80727672, ++ 0x80f2c072, 0xb8f31605, ++ 0x80738173, 0x8e738473, ++ 0x8e7a8273, 0xbefa00ff, ++ 0x01000000, 0xbefc0073, ++ 0xc031003c, 0x00000072, ++ 0x80f2c072, 0xbf8c007f, ++ 0x80fc907c, 0xbe802d00, ++ 0xbe822d02, 0xbe842d04, ++ 0xbe862d06, 0xbe882d08, ++ 0xbe8a2d0a, 0xbe8c2d0c, ++ 0xbe8e2d0e, 0xbf06807c, ++ 0xbf84fff1, 0xb8f22a05, ++ 0x80728172, 0x8e728a72, ++ 0xb8f61605, 0x80768176, ++ 0x8e768676, 0x80727672, ++ 0xbefa0084, 0xbefa00ff, ++ 0x01000000, 0xc0211cfc, ++ 0x00000072, 0x80728472, ++ 0xc0211c3c, 0x00000072, ++ 0x80728472, 0xc0211c7c, ++ 0x00000072, 0x80728472, ++ 0xc0211bbc, 0x00000072, ++ 0x80728472, 0xc0211bfc, ++ 0x00000072, 0x80728472, ++ 0xc0211d3c, 0x00000072, ++ 0x80728472, 0xc0211d7c, ++ 0x00000072, 0x80728472, ++ 0xc0211a3c, 0x00000072, ++ 0x80728472, 0xc0211a7c, ++ 0x00000072, 0x80728472, ++ 0xc0211dfc, 0x00000072, ++ 0x80728472, 0xc0211b3c, ++ 0x00000072, 0x80728472, ++ 0xc0211b7c, 0x00000072, ++ 0x80728472, 0xbf8c007f, ++ 0x8671ff71, 0x0000ffff, ++ 0xbefc0073, 0xbefe006e, ++ 0xbeff006f, 0xc0211bbc, ++ 0x00000072, 0x80728472, ++ 0xc0211bfc, 0x00000072, ++ 0x80728472, 0xbf8c007f, ++ 0x867375ff, 0x000003ff, ++ 0xb9734803, 0x867375ff, ++ 0xfffff800, 0x8f738b73, ++ 0xb973a2c3, 0xb977f801, ++ 0x8673ff71, 0xf0000000, ++ 0x8f739c73, 0x8e739073, ++ 0xbef60080, 0x87767376, ++ 0x8673ff71, 0x08000000, ++ 0x8f739b73, 0x8e738f73, ++ 0x87767376, 0x8673ff74, ++ 0x00800000, 0x8f739773, ++ 0xb976f807, 0x86fe7e7e, ++ 0x86ea6a6a, 0xb974f802, ++ 0xbf8a0000, 0x95807370, ++ 0xbf810000, 0x00000000, ++}; ++ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +index 6316aad..595640a 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +@@ -31,16 +31,23 @@ + #include <uapi/linux/kfd_ioctl.h> + #include <linux/time.h> + #include <linux/mm.h> +-#include <linux/mman.h> ++#include <uapi/asm-generic/mman-common.h> + #include <asm/processor.h> ++ + #include "kfd_priv.h" + #include "kfd_device_queue_manager.h" + #include "kfd_dbgmgr.h" ++#include "cik_regs.h" + + static long kfd_ioctl(struct file *, unsigned int, unsigned long); + static int kfd_open(struct inode *, struct file *); + static int kfd_mmap(struct file *, struct vm_area_struct *); ++static uint32_t kfd_convert_user_mem_alloction_flags( ++ struct kfd_dev *dev, ++ uint32_t userspace_flags); ++static bool kfd_is_large_bar(struct kfd_dev *dev); + ++static int kfd_evict(struct file *filep, struct kfd_process *p, void *data); + static const char kfd_dev_name[] = "kfd"; + + static const struct file_operations kfd_fops = { +@@ -117,7 +124,7 @@ static int kfd_open(struct inode *inode, struct file *filep) + return -EPERM; + } + +- process = kfd_create_process(current); ++ process = kfd_create_process(filep); + if (IS_ERR(process)) + return PTR_ERR(process); + +@@ -206,6 +213,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, + q_properties->ctx_save_restore_area_address = + args->ctx_save_restore_address; + q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size; ++ q_properties->ctl_stack_size = args->ctl_stack_size; + if (args->queue_type == 
KFD_IOC_QUEUE_TYPE_COMPUTE || + args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) + q_properties->type = KFD_QUEUE_TYPE_COMPUTE; +@@ -270,7 +278,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, + return -EINVAL; + } + +- mutex_lock(&p->mutex); ++ down_write(&p->lock); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { +@@ -282,8 +290,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, + p->pasid, + dev->id); + +- err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, +- 0, q_properties.type, &queue_id); ++ err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id); + if (err != 0) + goto err_create_queue; + +@@ -291,10 +298,10 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, + + + /* Return gpu_id as doorbell offset for mmap usage */ +- args->doorbell_offset = (KFD_MMAP_DOORBELL_MASK | args->gpu_id); ++ args->doorbell_offset = (KFD_MMAP_TYPE_DOORBELL | args->gpu_id); + args->doorbell_offset <<= PAGE_SHIFT; + +- mutex_unlock(&p->mutex); ++ up_write(&p->lock); + + pr_debug("kfd: queue id %d was created successfully\n", args->queue_id); + +@@ -311,7 +318,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, + + err_create_queue: + err_bind_process: +- mutex_unlock(&p->mutex); ++ up_write(&p->lock); + return err; + } + +@@ -325,11 +332,11 @@ static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p, + args->queue_id, + p->pasid); + +- mutex_lock(&p->mutex); ++ down_write(&p->lock); + + retval = pqm_destroy_queue(&p->pqm, args->queue_id); + +- mutex_unlock(&p->mutex); ++ up_write(&p->lock); + return retval; + } + +@@ -371,11 +378,33 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, + pr_debug("kfd: updating queue id %d for PASID %d\n", + args->queue_id, p->pasid); + +- mutex_lock(&p->mutex); ++ down_write(&p->lock); + + retval = pqm_update_queue(&p->pqm, args->queue_id, &properties); + +- mutex_unlock(&p->mutex); ++ up_write(&p->lock); ++ ++ return retval; ++} ++ ++static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p, ++ void *data) ++{ ++ int retval; ++ struct kfd_ioctl_set_cu_mask_args *args = data; ++ struct queue_properties properties; ++ uint32_t __user *cu_mask_ptr = (uint32_t __user *)args->cu_mask_ptr; ++ ++ if (get_user(properties.cu_mask, cu_mask_ptr)) ++ return -EFAULT; ++ if (properties.cu_mask == 0) ++ return 0; ++ ++ down_write(&p->lock); ++ ++ retval = pqm_set_cu_mask(&p->pqm, args->queue_id, &properties); ++ ++ up_write(&p->lock); + + return retval; + } +@@ -403,7 +432,7 @@ static int kfd_ioctl_set_memory_policy(struct file *filep, + if (dev == NULL) + return -EINVAL; + +- mutex_lock(&p->mutex); ++ down_write(&p->lock); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { +@@ -427,46 +456,80 @@ static int kfd_ioctl_set_memory_policy(struct file *filep, + err = -EINVAL; + + out: +- mutex_unlock(&p->mutex); ++ up_write(&p->lock); + + return err; + } + +-static int kfd_ioctl_dbg_register(struct file *filep, +- struct kfd_process *p, void *data) ++static int kfd_ioctl_set_trap_handler(struct file *filep, ++ struct kfd_process *p, void *data) + { +- struct kfd_ioctl_dbg_register_args *args = data; ++ struct kfd_ioctl_set_trap_handler_args *args = data; + struct kfd_dev *dev; +- struct kfd_dbgmgr *dbgmgr_ptr; ++ int err = 0; + struct kfd_process_device *pdd; +- bool create_ok; +- long status = 0; + + dev = kfd_device_by_id(args->gpu_id); + if (dev == 
NULL) + return -EINVAL; + +- if (dev->device_info->asic_family == CHIP_CARRIZO) { +- pr_debug("kfd_ioctl_dbg_register not supported on CZ\n"); +- return -EINVAL; ++ down_write(&p->lock); ++ ++ pdd = kfd_bind_process_to_device(dev, p); ++ if (IS_ERR(pdd)) { ++ err = -ESRCH; ++ goto out; ++ } ++ if (!dev->cwsr_enabled || !pdd->qpd.cwsr_kaddr) { ++ pr_err("kfd: CWSR is not enabled, can't set trap handler.\n"); ++ err = -EINVAL; ++ goto out; + } + +- mutex_lock(kfd_get_dbgmgr_mutex()); +- mutex_lock(&p->mutex); ++ if (dev->dqm->ops.set_trap_handler(dev->dqm, ++ &pdd->qpd, ++ args->tba_addr, ++ args->tma_addr)) ++ err = -EINVAL; + +- /* +- * make sure that we have pdd, if this the first queue created for +- * this process +- */ ++out: ++ up_write(&p->lock); ++ ++ return err; ++} ++ ++static int ++kfd_ioctl_dbg_register(struct file *filep, struct kfd_process *p, void *data) ++{ ++ long status = -EFAULT; ++ struct kfd_ioctl_dbg_register_args *args = data; ++ struct kfd_dev *dev; ++ struct kfd_dbgmgr *dbgmgr_ptr; ++ struct kfd_process_device *pdd; ++ bool create_ok = false; ++ ++ pr_debug("kfd:dbg: %s\n", __func__); ++ ++ dev = kfd_device_by_id(args->gpu_id); ++ if (!dev) { ++ dev_info(NULL, "Error! kfd: In func %s >> getting device by id failed\n", __func__); ++ return status; ++ } ++ ++ down_write(&p->lock); ++ mutex_lock(get_dbgmgr_mutex()); ++ ++ /* make sure that we have pdd, if this the first queue created for this process */ + pdd = kfd_bind_process_to_device(dev, p); +- if (IS_ERR(pdd)) { +- mutex_unlock(&p->mutex); +- mutex_unlock(kfd_get_dbgmgr_mutex()); ++ if (IS_ERR(pdd) < 0) { ++ mutex_unlock(get_dbgmgr_mutex()); ++ up_write(&p->lock); + return PTR_ERR(pdd); + } + + if (dev->dbgmgr == NULL) { + /* In case of a legal call, we have no dbgmgr yet */ ++ + create_ok = kfd_dbgmgr_create(&dbgmgr_ptr, dev); + if (create_ok) { + status = kfd_dbgmgr_register(dbgmgr_ptr, p); +@@ -475,13 +538,10 @@ static int kfd_ioctl_dbg_register(struct file *filep, + else + dev->dbgmgr = dbgmgr_ptr; + } +- } else { +- pr_debug("debugger already registered\n"); +- status = -EINVAL; + } + +- mutex_unlock(&p->mutex); +- mutex_unlock(kfd_get_dbgmgr_mutex()); ++ mutex_unlock(get_dbgmgr_mutex()); ++ up_write(&p->lock); + + return status; + } +@@ -489,9 +549,9 @@ static int kfd_ioctl_dbg_register(struct file *filep, + static int kfd_ioctl_dbg_unregister(struct file *filep, + struct kfd_process *p, void *data) + { ++ long status = -EFAULT; + struct kfd_ioctl_dbg_unregister_args *args = data; + struct kfd_dev *dev; +- long status; + + dev = kfd_device_by_id(args->gpu_id); + if (dev == NULL) +@@ -502,7 +562,7 @@ static int kfd_ioctl_dbg_unregister(struct file *filep, + return -EINVAL; + } + +- mutex_lock(kfd_get_dbgmgr_mutex()); ++ mutex_lock(get_dbgmgr_mutex()); + + status = kfd_dbgmgr_unregister(dev->dbgmgr, p); + if (status == 0) { +@@ -510,7 +570,7 @@ static int kfd_ioctl_dbg_unregister(struct file *filep, + dev->dbgmgr = NULL; + } + +- mutex_unlock(kfd_get_dbgmgr_mutex()); ++ mutex_unlock(get_dbgmgr_mutex()); + + return status; + } +@@ -519,125 +579,144 @@ static int kfd_ioctl_dbg_unregister(struct file *filep, + * Parse and generate variable size data structure for address watch. + * Total size of the buffer and # watch points is limited in order + * to prevent kernel abuse. (no bearing to the much smaller HW limitation +- * which is enforced by dbgdev module) ++ * which is enforced by dbgdev module. 
+ * please also note that the watch address itself are not "copied from user", + * since it be set into the HW in user mode values. + * + */ +-static int kfd_ioctl_dbg_address_watch(struct file *filep, +- struct kfd_process *p, void *data) ++ ++static int ++kfd_ioctl_dbg_address_watch(struct file *filep, ++ struct kfd_process *p, ++ void *data) + { ++ long status = -EFAULT; + struct kfd_ioctl_dbg_address_watch_args *args = data; + struct kfd_dev *dev; + struct dbg_address_watch_info aw_info; +- unsigned char *args_buff; +- long status; +- void __user *cmd_from_user; +- uint64_t watch_mask_value = 0; ++ unsigned char *args_buff = NULL; + unsigned int args_idx = 0; ++ uint64_t watch_mask_value = 0; + + memset((void *) &aw_info, 0, sizeof(struct dbg_address_watch_info)); + +- dev = kfd_device_by_id(args->gpu_id); +- if (dev == NULL) +- return -EINVAL; ++ do { ++ dev = kfd_device_by_id(args->gpu_id); ++ if (!dev) { ++ dev_info(NULL, ++ "Error! kfd: In func %s >> get device by id failed\n", ++ __func__); ++ break; ++ } + +- if (dev->device_info->asic_family == CHIP_CARRIZO) { +- pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); +- return -EINVAL; +- } ++ if (args->buf_size_in_bytes > MAX_ALLOWED_AW_BUFF_SIZE) { ++ status = -EINVAL; ++ break; ++ } + +- cmd_from_user = (void __user *) args->content_ptr; ++ if (args->buf_size_in_bytes <= sizeof(*args)) { ++ status = -EINVAL; ++ break; ++ } + +- /* Validate arguments */ ++ /* this is the actual buffer to work with */ + +- if ((args->buf_size_in_bytes > MAX_ALLOWED_AW_BUFF_SIZE) || +- (args->buf_size_in_bytes <= sizeof(*args) + sizeof(int) * 2) || +- (cmd_from_user == NULL)) +- return -EINVAL; ++ args_buff = kzalloc(args->buf_size_in_bytes - ++ sizeof(*args), GFP_KERNEL); ++ if (args_buff == NULL) { ++ status = -ENOMEM; ++ break; ++ } + +- /* this is the actual buffer to work with */ +- args_buff = memdup_user(cmd_from_user, +- args->buf_size_in_bytes - sizeof(*args)); +- if (IS_ERR(args_buff)) +- return PTR_ERR(args_buff); ++ /* this is the actual buffer to work with */ ++ args_buff = memdup_user(cmd_from_user, ++ args->buf_size_in_bytes - sizeof(*args)); ++ if (IS_ERR(args_buff)) ++ return PTR_ERR(args_buff); + +- aw_info.process = p; ++ aw_info.process = p; + +- aw_info.num_watch_points = *((uint32_t *)(&args_buff[args_idx])); +- args_idx += sizeof(aw_info.num_watch_points); ++ aw_info.num_watch_points = *((uint32_t *)(&args_buff[args_idx])); ++ args_idx += sizeof(aw_info.num_watch_points); + +- aw_info.watch_mode = (enum HSA_DBG_WATCH_MODE *) &args_buff[args_idx]; +- args_idx += sizeof(enum HSA_DBG_WATCH_MODE) * aw_info.num_watch_points; ++ aw_info.watch_mode = (HSA_DBG_WATCH_MODE *) &args_buff[args_idx]; ++ args_idx += sizeof(HSA_DBG_WATCH_MODE) * aw_info.num_watch_points; + +- /* +- * set watch address base pointer to point on the array base +- * within args_buff +- */ +- aw_info.watch_address = (uint64_t *) &args_buff[args_idx]; ++ /* set watch address base pointer to point on the array base within args_buff */ + +- /* skip over the addresses buffer */ +- args_idx += sizeof(aw_info.watch_address) * aw_info.num_watch_points; ++ aw_info.watch_address = (uint64_t *) &args_buff[args_idx]; + +- if (args_idx >= args->buf_size_in_bytes - sizeof(*args)) { +- kfree(args_buff); +- return -EINVAL; +- } ++ /*skip over the addresses buffer */ ++ args_idx += sizeof(aw_info.watch_address) * aw_info.num_watch_points; + +- watch_mask_value = (uint64_t) args_buff[args_idx]; ++ if (args_idx >= args->buf_size_in_bytes) { ++ status = -EINVAL; ++ 
break; ++ } + +- if (watch_mask_value > 0) { +- /* +- * There is an array of masks. +- * set watch mask base pointer to point on the array base +- * within args_buff +- */ +- aw_info.watch_mask = (uint64_t *) &args_buff[args_idx]; ++ watch_mask_value = (uint64_t) args_buff[args_idx]; + +- /* skip over the masks buffer */ +- args_idx += sizeof(aw_info.watch_mask) * +- aw_info.num_watch_points; +- } else { +- /* just the NULL mask, set to NULL and skip over it */ +- aw_info.watch_mask = NULL; +- args_idx += sizeof(aw_info.watch_mask); +- } ++ if (watch_mask_value > 0) { ++ /* there is an array of masks */ + +- if (args_idx >= args->buf_size_in_bytes - sizeof(args)) { +- kfree(args_buff); +- return -EINVAL; +- } ++ /* set watch mask base pointer to point on the array base within args_buff */ ++ aw_info.watch_mask = (uint64_t *) &args_buff[args_idx]; ++ ++ /*skip over the masks buffer */ ++ args_idx += sizeof(aw_info.watch_mask) * aw_info.num_watch_points; ++ } ++ ++ else ++ /* just the NULL mask, set to NULL and skip over it */ ++ { ++ aw_info.watch_mask = NULL; ++ args_idx += sizeof(aw_info.watch_mask); ++ } ++ ++ if (args_idx > args->buf_size_in_bytes) { ++ status = -EINVAL; ++ break; ++ } ++ ++ aw_info.watch_event = NULL; /* Currently HSA Event is not supported for DBG */ ++ status = 0; ++ ++ } while (0); + +- /* Currently HSA Event is not supported for DBG */ +- aw_info.watch_event = NULL; ++ if (status == 0) { ++ mutex_lock(get_dbgmgr_mutex()); + +- mutex_lock(kfd_get_dbgmgr_mutex()); ++ status = kfd_dbgmgr_address_watch(dev->dbgmgr, &aw_info); + +- status = kfd_dbgmgr_address_watch(dev->dbgmgr, &aw_info); ++ mutex_unlock(get_dbgmgr_mutex()); + +- mutex_unlock(kfd_get_dbgmgr_mutex()); ++ } + + kfree(args_buff); + + return status; + } + +-/* Parse and generate fixed size data structure for wave control */ +-static int kfd_ioctl_dbg_wave_control(struct file *filep, +- struct kfd_process *p, void *data) ++/* ++ * Parse and generate fixed size data structure for wave control. ++ * Buffer is generated in a "packed" form, for avoiding structure packing/pending dependencies. 
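
/*
 * Illustrative sketch, not from the patch: the "compact" payload that
 * the wave-control handler below expects, written as a packed struct.
 * Field order follows the parser (operand, mode, trap id, wave-message
 * value); the 32-bit enum widths and the trailing 64-bit MemoryVA slot
 * are assumptions.  The handler rejects the ioctl unless
 * buf_size_in_bytes equals sizeof(*args) plus the sum of these fields.
 */
#include <stdint.h>

struct wave_control_compact {
	uint32_t operand;	/* HSA_DBG_WAVEOP */
	uint32_t mode;		/* HSA_DBG_WAVEMODE */
	uint32_t trap_id;
	uint32_t wave_msg_value;	/* DbgWaveMsg.WaveMsgInfoGen2.Value */
	uint64_t memory_va;	/* sized into the buffer, forced to NULL by the kernel */
} __attribute__((packed));
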
++ */ ++ ++static int ++kfd_ioctl_dbg_wave_control(struct file *filep, struct kfd_process *p, void *data) + { ++ long status = -EFAULT; + struct kfd_ioctl_dbg_wave_control_args *args = data; + struct kfd_dev *dev; + struct dbg_wave_control_info wac_info; +- unsigned char *args_buff; +- uint32_t computed_buff_size; +- long status; +- void __user *cmd_from_user; ++ unsigned char *args_buff = NULL; + unsigned int args_idx = 0; ++ uint32_t computed_buff_size; + + memset((void *) &wac_info, 0, sizeof(struct dbg_wave_control_info)); + + /* we use compact form, independent of the packing attribute value */ ++ + computed_buff_size = sizeof(*args) + + sizeof(wac_info.mode) + + sizeof(wac_info.operand) + +@@ -645,26 +724,25 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep, + sizeof(wac_info.dbgWave_msg.MemoryVA) + + sizeof(wac_info.trapId); + +- dev = kfd_device_by_id(args->gpu_id); +- if (dev == NULL) +- return -EINVAL; + +- if (dev->device_info->asic_family == CHIP_CARRIZO) { +- pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); +- return -EINVAL; +- } ++ dev_info(NULL, "kfd: In func %s - start\n", __func__); + +- /* input size must match the computed "compact" size */ +- if (args->buf_size_in_bytes != computed_buff_size) { +- pr_debug("size mismatch, computed : actual %u : %u\n", +- args->buf_size_in_bytes, computed_buff_size); +- return -EINVAL; +- } ++ do { ++ dev = kfd_device_by_id(args->gpu_id); ++ if (!dev) { ++ dev_info(NULL, "Error! kfd: In func %s >> getting device by id failed\n", __func__); ++ break; ++ } + +- cmd_from_user = (void __user *) args->content_ptr; ++ /* input size must match the computed "compact" size */ + +- if (cmd_from_user == NULL) +- return -EINVAL; ++ if (args->buf_size_in_bytes != computed_buff_size) { ++ dev_info(NULL, ++ "Error! kfd: In func %s >> size mismatch, computed : actual %u : %u\n", ++ __func__, args->buf_size_in_bytes, computed_buff_size); ++ status = -EINVAL; ++ break; ++ } + + /* copy the entire buffer from user */ + +@@ -673,34 +751,51 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep, + if (IS_ERR(args_buff)) + return PTR_ERR(args_buff); + +- /* move ptr to the start of the "pay-load" area */ +- wac_info.process = p; ++ if (copy_from_user(args_buff, ++ (void __user *) args->content_ptr, ++ args->buf_size_in_bytes - sizeof(*args))) { ++ dev_info(NULL, ++ "Error! 
kfd: In func %s >> copy_from_user failed\n", ++ __func__); ++ break; ++ } ++ ++ /* move ptr to the start of the "pay-load" area */ + +- wac_info.operand = *((enum HSA_DBG_WAVEOP *)(&args_buff[args_idx])); +- args_idx += sizeof(wac_info.operand); + +- wac_info.mode = *((enum HSA_DBG_WAVEMODE *)(&args_buff[args_idx])); +- args_idx += sizeof(wac_info.mode); ++ wac_info.process = p; + +- wac_info.trapId = *((uint32_t *)(&args_buff[args_idx])); +- args_idx += sizeof(wac_info.trapId); ++ wac_info.operand = (HSA_DBG_WAVEOP) *((HSA_DBG_WAVEOP *)(&args_buff[args_idx])); ++ args_idx += sizeof(wac_info.operand); + +- wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value = +- *((uint32_t *)(&args_buff[args_idx])); +- wac_info.dbgWave_msg.MemoryVA = NULL; ++ wac_info.mode = (HSA_DBG_WAVEMODE) *((HSA_DBG_WAVEMODE *)(&args_buff[args_idx])); ++ args_idx += sizeof(wac_info.mode); + +- mutex_lock(kfd_get_dbgmgr_mutex()); ++ wac_info.trapId = (uint32_t) *((uint32_t *)(&args_buff[args_idx])); ++ args_idx += sizeof(wac_info.trapId); + +- pr_debug("Calling dbg manager process %p, operand %u, mode %u, trapId %u, message %u\n", +- wac_info.process, wac_info.operand, +- wac_info.mode, wac_info.trapId, +- wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); ++ wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value = *((uint32_t *)(&args_buff[args_idx])); ++ wac_info.dbgWave_msg.MemoryVA = NULL; + +- status = kfd_dbgmgr_wave_control(dev->dbgmgr, &wac_info); + +- pr_debug("Returned status of dbg manager is %ld\n", status); ++ status = 0; + +- mutex_unlock(kfd_get_dbgmgr_mutex()); ++ } while (0); ++ if (status == 0) { ++ mutex_lock(get_dbgmgr_mutex()); ++ ++ dev_info(NULL, ++ "kfd: In func %s >> calling dbg manager process %p, operand %u, mode %u, trapId %u, message %u\n", ++ __func__, wac_info.process, wac_info.operand, wac_info.mode, wac_info.trapId, ++ wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); ++ ++ status = kfd_dbgmgr_wave_control(dev->dbgmgr, &wac_info); ++ ++ dev_info(NULL, "kfd: In func %s >> returned status of dbg manager is %ld\n", __func__, status); ++ ++ mutex_unlock(get_dbgmgr_mutex()); ++ ++ } + + kfree(args_buff); + +@@ -715,12 +810,13 @@ static int kfd_ioctl_get_clock_counters(struct file *filep, + struct timespec64 time; + + dev = kfd_device_by_id(args->gpu_id); +- if (dev == NULL) +- return -EINVAL; +- +- /* Reading GPU clock counter from KGD */ +- args->gpu_clock_counter = +- dev->kfd2kgd->get_gpu_clock_counter(dev->kgd); ++ if (dev) ++ /* Reading GPU clock counter from KGD */ ++ args->gpu_clock_counter = ++ dev->kfd2kgd->get_gpu_clock_counter(dev->kgd); ++ else ++ /* Node without GPU resource */ ++ args->gpu_clock_counter = 0; + + /* No access to rdtsc. 
Using raw monotonic time */ + getrawmonotonic64(&time); +@@ -747,7 +843,7 @@ static int kfd_ioctl_get_process_apertures(struct file *filp, + + args->num_of_nodes = 0; + +- mutex_lock(&p->mutex); ++ down_write(&p->lock); + + /*if the process-device list isn't empty*/ + if (kfd_has_process_device_data(p)) { +@@ -786,52 +882,180 @@ static int kfd_ioctl_get_process_apertures(struct file *filp, + (args->num_of_nodes < NUM_OF_SUPPORTED_GPUS)); + } + +- mutex_unlock(&p->mutex); ++ up_write(&p->lock); + + return 0; + } + +-static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p, +- void *data) ++static int kfd_ioctl_get_process_apertures_new(struct file *filp, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_get_process_apertures_new_args *args = data; ++ struct kfd_process_device_apertures *pa; ++ struct kfd_process_device *pdd; ++ uint32_t nodes = 0; ++ int ret; ++ ++ dev_dbg(kfd_device, "get apertures for PASID %d", p->pasid); ++ ++ if (args->num_of_nodes == 0) { ++ /* Return number of nodes, so that user space can alloacate ++ * sufficient memory */ ++ down_write(&p->lock); ++ ++ if (!kfd_has_process_device_data(p)) { ++ up_write(&p->lock); ++ return 0; ++ } ++ ++ /* Run over all pdd of the process */ ++ pdd = kfd_get_first_process_device_data(p); ++ do { ++ args->num_of_nodes++; ++ } while ((pdd = ++ kfd_get_next_process_device_data(p, pdd)) != NULL); ++ ++ up_write(&p->lock); ++ return 0; ++ } ++ ++ /* Fill in process-aperture information for all available ++ * nodes, but not more than args->num_of_nodes as that is ++ * the amount of memory allocated by user */ ++ pa = kzalloc((sizeof(struct kfd_process_device_apertures) * ++ args->num_of_nodes), GFP_KERNEL); ++ if (!pa) ++ return -ENOMEM; ++ ++ down_write(&p->lock); ++ ++ if (!kfd_has_process_device_data(p)) { ++ up_write(&p->lock); ++ args->num_of_nodes = 0; ++ kfree(pa); ++ return 0; ++ } ++ ++ /* Run over all pdd of the process */ ++ pdd = kfd_get_first_process_device_data(p); ++ do { ++ pa[nodes].gpu_id = pdd->dev->id; ++ pa[nodes].lds_base = pdd->lds_base; ++ pa[nodes].lds_limit = pdd->lds_limit; ++ pa[nodes].gpuvm_base = pdd->gpuvm_base; ++ pa[nodes].gpuvm_limit = pdd->gpuvm_limit; ++ pa[nodes].scratch_base = pdd->scratch_base; ++ pa[nodes].scratch_limit = pdd->scratch_limit; ++ ++ dev_dbg(kfd_device, ++ "gpu id %u\n", pdd->dev->id); ++ dev_dbg(kfd_device, ++ "lds_base %llX\n", pdd->lds_base); ++ dev_dbg(kfd_device, ++ "lds_limit %llX\n", pdd->lds_limit); ++ dev_dbg(kfd_device, ++ "gpuvm_base %llX\n", pdd->gpuvm_base); ++ dev_dbg(kfd_device, ++ "gpuvm_limit %llX\n", pdd->gpuvm_limit); ++ dev_dbg(kfd_device, ++ "scratch_base %llX\n", pdd->scratch_base); ++ dev_dbg(kfd_device, ++ "scratch_limit %llX\n", pdd->scratch_limit); ++ nodes++; ++ } while ( ++ (pdd = kfd_get_next_process_device_data(p, pdd)) != NULL && ++ (nodes < args->num_of_nodes)); ++ up_write(&p->lock); ++ ++ args->num_of_nodes = nodes; ++ ret = copy_to_user( ++ (void __user *)args->kfd_process_device_apertures_ptr, ++ pa, ++ (nodes * sizeof(struct kfd_process_device_apertures))); ++ kfree(pa); ++ return ret ? 
-EFAULT : 0; ++} ++ ++static int ++kfd_ioctl_create_event(struct file *filp, struct kfd_process *p, void *data) + { + struct kfd_ioctl_create_event_args *args = data; +- int err; ++ struct kfd_dev *kfd; ++ struct kfd_process_device *pdd; ++ int err = -EINVAL; ++ void *mem, *kern_addr = NULL; ++ ++ pr_debug("amdkfd: Event page offset 0x%llx\n", args->event_page_offset); ++ ++ if (args->event_page_offset) { ++ kfd = kfd_device_by_id(GET_GPU_ID(args->event_page_offset)); ++ if (!kfd) { ++ pr_err("amdkfd: can't find kfd device\n"); ++ return -EFAULT; ++ } ++ if (KFD_IS_DGPU(kfd->device_info->asic_family)) { ++ down_write(&p->lock); ++ pdd = kfd_bind_process_to_device(kfd, p); ++ if (IS_ERR(pdd) < 0) { ++ err = PTR_ERR(pdd); ++ up_write(&p->lock); ++ return -EFAULT; ++ } ++ mem = kfd_process_device_translate_handle(pdd, ++ GET_IDR_HANDLE(args->event_page_offset)); ++ if (!mem) { ++ pr_err("amdkfd: can't find BO offset is 0x%llx\n", ++ args->event_page_offset); ++ up_write(&p->lock); ++ return -EFAULT; ++ } ++ up_write(&p->lock); ++ ++ /* Map dGPU gtt BO to kernel */ ++ kfd->kfd2kgd->map_gtt_bo_to_kernel(kfd->kgd, ++ mem, &kern_addr); ++ } ++ } + +- err = kfd_event_create(filp, p, args->event_type, +- args->auto_reset != 0, args->node_id, +- &args->event_id, &args->event_trigger_data, +- &args->event_page_offset, +- &args->event_slot_index); ++ err = kfd_event_create(filp, p, ++ args->event_type, ++ args->auto_reset != 0, ++ args->node_id, ++ &args->event_id, ++ &args->event_trigger_data, ++ &args->event_page_offset, ++ &args->event_slot_index, ++ kern_addr); + + return err; + } + +-static int kfd_ioctl_destroy_event(struct file *filp, struct kfd_process *p, +- void *data) ++static int ++kfd_ioctl_destroy_event(struct file *filp, struct kfd_process *p, void *data) + { + struct kfd_ioctl_destroy_event_args *args = data; + + return kfd_event_destroy(p, args->event_id); + } + +-static int kfd_ioctl_set_event(struct file *filp, struct kfd_process *p, +- void *data) ++static int ++kfd_ioctl_set_event(struct file *filp, struct kfd_process *p, void *data) + { + struct kfd_ioctl_set_event_args *args = data; + + return kfd_set_event(p, args->event_id); + } + +-static int kfd_ioctl_reset_event(struct file *filp, struct kfd_process *p, +- void *data) ++static int ++kfd_ioctl_reset_event(struct file *filp, struct kfd_process *p, void *data) + { + struct kfd_ioctl_reset_event_args *args = data; + + return kfd_reset_event(p, args->event_id); + } + +-static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, +- void *data) ++static int ++kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, void *data) + { + struct kfd_ioctl_wait_events_args *args = data; + enum kfd_event_wait_result wait_result; +@@ -846,6 +1070,711 @@ static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, + + return err; + } ++static int kfd_ioctl_alloc_scratch_memory(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_alloc_memory_of_gpu_args *args = ++ (struct kfd_ioctl_alloc_memory_of_gpu_args *)data; ++ struct kfd_process_device *pdd; ++ struct kfd_dev *dev; ++ long err; ++ ++ if (args->size == 0) ++ return -EINVAL; ++ ++ dev = kfd_device_by_id(args->gpu_id); ++ if (dev == NULL) ++ return -EINVAL; ++ ++ down_write(&p->lock); ++ ++ pdd = kfd_bind_process_to_device(dev, p); ++ if (IS_ERR(pdd) < 0) { ++ err = PTR_ERR(pdd); ++ goto bind_process_to_device_fail; ++ } ++ ++ pdd->sh_hidden_private_base_vmid = args->va_addr; ++ pdd->qpd.sh_hidden_private_base = 
args->va_addr; ++ ++ up_write(&p->lock); ++ ++ if (sched_policy == KFD_SCHED_POLICY_NO_HWS && pdd->qpd.vmid != 0) { ++ err = dev->kfd2kgd->alloc_memory_of_scratch( ++ dev->kgd, args->va_addr, pdd->qpd.vmid); ++ if (err != 0) ++ goto alloc_memory_of_scratch_failed; ++ } ++ ++ return 0; ++ ++bind_process_to_device_fail: ++ up_write(&p->lock); ++alloc_memory_of_scratch_failed: ++ return -EFAULT; ++} ++ ++static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_alloc_memory_of_gpu_args *args = data; ++ struct kfd_process_device *pdd; ++ void *mem; ++ struct kfd_dev *dev; ++ int idr_handle; ++ long err; ++ ++ if (args->size == 0) ++ return -EINVAL; ++ ++ dev = kfd_device_by_id(args->gpu_id); ++ if (dev == NULL) ++ return -EINVAL; ++ ++ down_write(&p->lock); ++ pdd = kfd_bind_process_to_device(dev, p); ++ up_write(&p->lock); ++ if (IS_ERR(pdd) < 0) ++ return PTR_ERR(pdd); ++ ++ err = dev->kfd2kgd->alloc_memory_of_gpu( ++ dev->kgd, args->va_addr, args->size, ++ pdd->vm, (struct kgd_mem **) &mem, NULL, NULL, pdd, 0); ++ ++ if (err != 0) ++ return err; ++ ++ down_write(&p->lock); ++ idr_handle = kfd_process_device_create_obj_handle(pdd, mem, ++ args->va_addr, args->size); ++ up_write(&p->lock); ++ if (idr_handle < 0) { ++ dev->kfd2kgd->free_memory_of_gpu(dev->kgd, ++ (struct kgd_mem *) mem); ++ return -EFAULT; ++ } ++ ++ args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); ++ ++ return 0; ++} ++ ++bool kfd_is_large_bar(struct kfd_dev *dev) ++{ ++ struct kfd_local_mem_info mem_info; ++ ++ if (debug_largebar) { ++ pr_debug("amdkfd: simulate large-bar allocation on non large-bar machine\n"); ++ return true; ++ } ++ ++ if (!KFD_IS_DGPU(dev->device_info->asic_family)) ++ return false; ++ ++ dev->kfd2kgd->get_local_mem_info(dev->kgd, &mem_info); ++ if (mem_info.local_mem_size_private == 0 && ++ mem_info.local_mem_size_public > 0) ++ return true; ++ return false; ++} ++ ++static uint32_t kfd_convert_user_mem_alloction_flags( ++ struct kfd_dev *dev, ++ uint32_t userspace_flags) ++{ ++ uint32_t kernel_allocation_flags; ++ ++ kernel_allocation_flags = 0; ++ ++ /* Allocate VRAM bo */ ++ if ((userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE) || ++ (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_APU_DEVICE)) { ++ kernel_allocation_flags = ALLOC_MEM_FLAGS_VRAM; ++ if ((userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE) && ++ kfd_is_large_bar(dev)) ++ kernel_allocation_flags |= ALLOC_MEM_FLAGS_PUBLIC; ++ goto out; ++ } ++ /* ++ * Since currently user space library doesn't uses scratch ++ * allocation flag I route it to VRAM ++ */ ++ if ((userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_SCRATCH) || ++ (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_APU_SCRATCH)) { ++ kernel_allocation_flags = ALLOC_MEM_FLAGS_VRAM; ++ goto out; ++ } ++ /* ++ * The current usage for *_HOST allocation flags are for GTT memory ++ * Need to verify if we're node zero or we want to allocate bo on ++ * public domain for P2P buffers. 
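
/*
 * Illustrative sketch, not from the patch: the 64-bit handle returned
 * by the allocation path above packs the GPU id together with the
 * per-process IDR handle, so GET_GPU_ID()/GET_IDR_HANDLE() can recover
 * both on free/map.  The 32/32 bit split shown here is an assumption
 * consistent with how MAKE_HANDLE is used in this file.
 */
#include <stdint.h>

static inline uint64_t make_handle(uint32_t gpu_id, uint32_t idr_handle)
{
	return ((uint64_t)gpu_id << 32) | idr_handle;
}

static inline uint32_t handle_gpu_id(uint64_t h) { return (uint32_t)(h >> 32); }
static inline uint32_t handle_idr(uint64_t h)    { return (uint32_t)h; }
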
++ */ ++ if (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_HOST) { ++ kernel_allocation_flags = ALLOC_MEM_FLAGS_GTT; ++ goto out; ++ } ++ /* Allocate userptr BO */ ++ if (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) { ++ kernel_allocation_flags = ALLOC_MEM_FLAGS_USERPTR; ++ goto out; ++ } ++ ++out: ++ if (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_AQL_QUEUE_MEM) ++ kernel_allocation_flags |= ALLOC_MEM_FLAGS_AQL_QUEUE_MEM; ++ /* Current HW doesn't support non paged memory */ ++ kernel_allocation_flags |= ALLOC_MEM_FLAGS_NONPAGED; ++ /* ++ * Set by default execute access as this buffer might be allocated ++ * for CP's ring buffer ++ */ ++ kernel_allocation_flags |= ALLOC_MEM_FLAGS_EXECUTE_ACCESS; ++ kernel_allocation_flags |= ALLOC_MEM_FLAGS_NO_SUBSTITUTE; ++ ++ pr_debug("amdkfd: user allocation flags 0x%x kernel allocation flags: 0x%x\n", ++ userspace_flags, kernel_allocation_flags); ++ ++ return kernel_allocation_flags; ++} ++ ++static int kfd_ioctl_alloc_memory_of_gpu_new(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_alloc_memory_of_gpu_new_args *args = data; ++ struct kfd_process_device *pdd; ++ void *mem; ++ struct kfd_dev *dev; ++ int idr_handle; ++ long err; ++ uint64_t offset; ++ ++ if (args->size == 0) ++ return -EINVAL; ++ ++ dev = kfd_device_by_id(args->gpu_id); ++ if (dev == NULL) ++ return -EINVAL; ++ ++ down_write(&p->lock); ++ pdd = kfd_bind_process_to_device(dev, p); ++ up_write(&p->lock); ++ if (IS_ERR(pdd) < 0) ++ return PTR_ERR(pdd); ++ ++ offset = args->mmap_offset; ++ err = dev->kfd2kgd->alloc_memory_of_gpu( ++ dev->kgd, args->va_addr, args->size, ++ pdd->vm, (struct kgd_mem **) &mem, &offset, ++ NULL, pdd, ++ kfd_convert_user_mem_alloction_flags(dev, args->flags)); ++ ++ if (err != 0) ++ return err; ++ ++ down_write(&p->lock); ++ idr_handle = kfd_process_device_create_obj_handle(pdd, mem, ++ args->va_addr, args->size); ++ up_write(&p->lock); ++ if (idr_handle < 0) { ++ dev->kfd2kgd->free_memory_of_gpu(dev->kgd, ++ (struct kgd_mem *) mem); ++ return -EFAULT; ++ } ++ ++ args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); ++ if ((args->flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE) != 0 && ++ !kfd_is_large_bar(dev)) { ++ args->mmap_offset = 0; ++ } else { ++ args->mmap_offset = KFD_MMAP_TYPE_MAP_BO; ++ args->mmap_offset |= KFD_MMAP_GPU_ID(args->gpu_id); ++ args->mmap_offset <<= PAGE_SHIFT; ++ args->mmap_offset |= offset; ++ } ++ ++ return 0; ++} ++ ++static int kfd_ioctl_free_memory_of_gpu(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_free_memory_of_gpu_args *args = data; ++ struct kfd_process_device *pdd; ++ struct kfd_bo *buf_obj; ++ struct kfd_dev *dev; ++ int ret; ++ ++ dev = kfd_device_by_id(GET_GPU_ID(args->handle)); ++ if (dev == NULL) ++ return -EINVAL; ++ ++ down_write(&p->lock); ++ ++ pdd = kfd_get_process_device_data(dev, p); ++ if (!pdd) { ++ pr_err("Process device data doesn't exist\n"); ++ ret = -EINVAL; ++ goto err_unlock; ++ } ++ ++ buf_obj = kfd_process_device_find_bo(pdd, ++ GET_IDR_HANDLE(args->handle)); ++ if (buf_obj == NULL) { ++ ret = -EINVAL; ++ goto err_unlock; ++ } ++ run_rdma_free_callback(buf_obj); ++ ++ up_write(&p->lock); ++ ++ ret = dev->kfd2kgd->free_memory_of_gpu(dev->kgd, buf_obj->mem); ++ ++ /* If freeing the buffer failed, leave the handle in place for ++ * clean-up during process tear-down. 
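
/*
 * Illustrative user-space usage, not from the patch: the mmap_offset
 * cookie assembled above (type and GPU id shifted above PAGE_SHIFT,
 * OR'ed with the driver offset) is handed straight back to mmap() on
 * the /dev/kfd file descriptor; kfd_mmap(), later in this patch,
 * decodes the type and GPU id from vma->vm_pgoff.  Error handling
 * elided; the helper name is hypothetical.
 */
#include <stdint.h>
#include <sys/mman.h>

static void *map_kfd_bo(int kfd_fd, uint64_t size, uint64_t mmap_offset)
{
	return mmap(NULL, size, PROT_READ | PROT_WRITE,
		    MAP_SHARED, kfd_fd, (off_t)mmap_offset);
}
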
*/ ++ if (ret == 0) { ++ down_write(&p->lock); ++ kfd_process_device_remove_obj_handle( ++ pdd, GET_IDR_HANDLE(args->handle)); ++ up_write(&p->lock); ++ } ++ ++ return ret; ++ ++err_unlock: ++ up_write(&p->lock); ++ return ret; ++} ++ ++int kfd_map_memory_to_gpu(struct kfd_dev *dev, void *mem, ++ struct kfd_process *p, struct kfd_process_device *pdd) ++{ ++ int err; ++ ++ BUG_ON(!dev); ++ BUG_ON(!pdd); ++ ++ err = dev->kfd2kgd->map_memory_to_gpu( ++ dev->kgd, (struct kgd_mem *) mem, pdd->vm); ++ ++ if (err != 0) ++ return err; ++ ++ radeon_flush_tlb(dev, p->pasid); ++ ++ err = dev->dqm->ops.set_page_directory_base(dev->dqm, &pdd->qpd); ++ if (err != 0) { ++ dev->kfd2kgd->unmap_memory_to_gpu(dev->kgd, ++ (struct kgd_mem *) mem, pdd->vm); ++ return err; ++ } ++ ++ return 0; ++} ++ ++static int kfd_ioctl_map_memory_to_gpu(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_map_memory_to_gpu_new_args *args = data; ++ struct kfd_process_device *pdd, *peer_pdd; ++ void *mem; ++ struct kfd_dev *dev, *peer; ++ long err = 0; ++ int i, num_dev; ++ uint32_t *devices_arr = NULL; ++ int bo_size; ++ ++ dev = kfd_device_by_id(GET_GPU_ID(args->handle)); ++ if (dev == NULL) ++ return -EINVAL; ++ ++ if (args->device_ids_array_size > 0 && ++ (args->device_ids_array_size < sizeof(uint32_t))) { ++ pr_err("amdkfd: err node IDs array size %u\n", ++ args->device_ids_array_size); ++ return -EFAULT; ++ } ++ ++ if (args->device_ids_array_size > 0) { ++ devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL); ++ if (!devices_arr) ++ return -ENOMEM; ++ ++ err = copy_from_user(devices_arr, ++ (void __user *)args->device_ids_array, ++ args->device_ids_array_size); ++ if (err != 0) { ++ err = -EFAULT; ++ goto copy_from_user_failed; ++ } ++ } ++ ++ down_write(&p->lock); ++ ++ pdd = kfd_bind_process_to_device(dev, p); ++ if (IS_ERR(pdd) < 0) { ++ err = PTR_ERR(pdd); ++ goto bind_process_to_device_failed; ++ } ++ ++ mem = kfd_process_device_translate_handle(pdd, ++ GET_IDR_HANDLE(args->handle)); ++ up_write(&p->lock); ++ ++ if (mem == NULL) { ++ err = PTR_ERR(mem); ++ goto get_mem_obj_from_handle_failed; ++ } ++ ++ if (args->device_ids_array_size > 0) { ++ num_dev = args->device_ids_array_size / sizeof(uint32_t); ++ for (i = 0 ; i < num_dev; i++) { ++ peer = kfd_device_by_id(devices_arr[i]); ++ if (!peer) { ++ pr_err("amdkfd: didn't found kfd-dev for 0x%x\n", ++ devices_arr[i]); ++ err = -EFAULT; ++ goto get_mem_obj_from_handle_failed; ++ } ++ down_write(&p->lock); ++ peer_pdd = kfd_bind_process_to_device(peer, p); ++ up_write(&p->lock); ++ if (!peer_pdd) { ++ err = -EFAULT; ++ goto get_mem_obj_from_handle_failed; ++ } ++ err = kfd_map_memory_to_gpu(peer, mem, p, peer_pdd); ++ if (err != 0) ++ pr_err("amdkfd: failed to map\n"); ++ } ++ } else { ++ err = kfd_map_memory_to_gpu(dev, mem, p, pdd); ++ if (err != 0) ++ pr_err("amdkfd: failed to map\n"); ++ } ++ ++ bo_size = dev->kfd2kgd->return_bo_size(dev->kgd, mem); ++ down_write(&p->lock); ++ pdd->mapped_size += bo_size; ++ up_write(&p->lock); ++ ++ if (args->device_ids_array_size > 0 && devices_arr) ++ kfree(devices_arr); ++ ++ return err; ++ ++bind_process_to_device_failed: ++ up_write(&p->lock); ++get_mem_obj_from_handle_failed: ++copy_from_user_failed: ++ kfree(devices_arr); ++ return err; ++} ++ ++static int kfd_ioctl_map_memory_to_gpu_wrapper(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_map_memory_to_gpu_args *args = data; ++ struct kfd_ioctl_map_memory_to_gpu_new_args new_args; ++ ++ 
new_args.handle = args->handle; ++ new_args.device_ids_array = NULL; ++ new_args.device_ids_array_size = 0; ++ ++ return kfd_ioctl_map_memory_to_gpu(filep, p, &new_args); ++} ++ ++static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_unmap_memory_from_gpu_new_args *args = data; ++ struct kfd_process_device *pdd, *peer_pdd; ++ void *mem; ++ struct kfd_dev *dev, *peer; ++ long err = 0; ++ uint32_t *devices_arr = NULL, num_dev, i; ++ int bo_size; ++ ++ dev = kfd_device_by_id(GET_GPU_ID(args->handle)); ++ if (dev == NULL) ++ return -EINVAL; ++ ++ if (args->device_ids_array_size > 0 && ++ (args->device_ids_array_size < sizeof(uint32_t))) { ++ pr_err("amdkfd: err node IDs array size %u\n", ++ args->device_ids_array_size); ++ return -EFAULT; ++ } ++ ++ if (args->device_ids_array_size > 0) { ++ devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL); ++ if (!devices_arr) ++ return -ENOMEM; ++ ++ err = copy_from_user(devices_arr, ++ (void __user *)args->device_ids_array, ++ args->device_ids_array_size); ++ if (err != 0) { ++ err = -EFAULT; ++ goto copy_from_user_failed; ++ } ++ } ++ ++ down_write(&p->lock); ++ ++ pdd = kfd_get_process_device_data(dev, p); ++ if (!pdd) { ++ pr_err("Process device data doesn't exist\n"); ++ err = PTR_ERR(pdd); ++ goto bind_process_to_device_failed; ++ } ++ ++ mem = kfd_process_device_translate_handle(pdd, ++ GET_IDR_HANDLE(args->handle)); ++ up_write(&p->lock); ++ ++ if (mem == NULL) { ++ err = PTR_ERR(mem); ++ goto get_mem_obj_from_handle_failed; ++ } ++ ++ if (args->device_ids_array_size > 0) { ++ num_dev = args->device_ids_array_size / sizeof(uint32_t); ++ for (i = 0 ; i < num_dev; i++) { ++ peer = kfd_device_by_id(devices_arr[i]); ++ if (!peer) { ++ err = -EFAULT; ++ goto get_mem_obj_from_handle_failed; ++ } ++ down_write(&p->lock); ++ peer_pdd = kfd_get_process_device_data(peer, p); ++ up_write(&p->lock); ++ if (!peer_pdd) { ++ err = -EFAULT; ++ goto get_mem_obj_from_handle_failed; ++ } ++ peer->kfd2kgd->unmap_memory_to_gpu(peer->kgd, ++ mem, peer_pdd->vm); ++ radeon_flush_tlb(peer, p->pasid); ++ } ++ } else { ++ dev->kfd2kgd->unmap_memory_to_gpu(dev->kgd, mem, pdd->vm); ++ radeon_flush_tlb(dev, p->pasid); ++ } ++ ++ bo_size = dev->kfd2kgd->return_bo_size(dev->kgd, mem); ++ down_write(&p->lock); ++ pdd->mapped_size -= bo_size; ++ up_write(&p->lock); ++ ++ return 0; ++ ++bind_process_to_device_failed: ++ up_write(&p->lock); ++get_mem_obj_from_handle_failed: ++copy_from_user_failed: ++ kfree(devices_arr); ++ return err; ++} ++ ++static int kfd_ioctl_unmap_memory_from_gpu_wrapper(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_unmap_memory_from_gpu_args *args = data; ++ struct kfd_ioctl_unmap_memory_from_gpu_new_args new_args; ++ ++ new_args.handle = args->handle; ++ new_args.device_ids_array = NULL; ++ new_args.device_ids_array_size = 0; ++ ++ return kfd_ioctl_unmap_memory_from_gpu(filep, p, &new_args); ++} ++ ++static int kfd_ioctl_open_graphic_handle(struct file *filep, ++ struct kfd_process *p, ++ void *data) ++{ ++ struct kfd_ioctl_open_graphic_handle_args *args = data; ++ struct kfd_dev *dev; ++ struct kfd_process_device *pdd; ++ void *mem; ++ int idr_handle; ++ long err; ++ ++ dev = kfd_device_by_id(args->gpu_id); ++ if (dev == NULL) ++ return -EINVAL; ++ ++ if (dev->device_info->asic_family != CHIP_KAVERI) { ++ pr_debug("kfd_ioctl_open_graphic_handle only supported on KV\n"); ++ return -EINVAL; ++ } ++ ++ down_write(&p->lock); ++ pdd = 
kfd_bind_process_to_device(dev, p); ++ up_write(&p->lock); ++ if (IS_ERR(pdd) < 0) ++ return PTR_ERR(pdd); ++ ++ err = dev->kfd2kgd->open_graphic_handle(dev->kgd, ++ args->va_addr, ++ (struct kgd_vm *) pdd->vm, ++ args->graphic_device_fd, ++ args->graphic_handle, ++ (struct kgd_mem **) &mem); ++ ++ if (err != 0) ++ return err; ++ ++ down_write(&p->lock); ++ /*TODO: When open_graphic_handle is implemented, we need to create ++ * the corresponding interval tree. We need to know the size of ++ * the buffer through open_graphic_handle(). We use 1 for now.*/ ++ idr_handle = kfd_process_device_create_obj_handle(pdd, mem, ++ args->va_addr, 1); ++ up_write(&p->lock); ++ if (idr_handle < 0) { ++ /* FIXME: destroy_process_gpumem doesn't seem to be ++ * implemented anywhere */ ++ dev->kfd2kgd->destroy_process_gpumem(dev->kgd, mem); ++ return -EFAULT; ++ } ++ ++ args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); ++ ++ return 0; ++} ++ ++static int kfd_ioctl_set_process_dgpu_aperture(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_set_process_dgpu_aperture_args *args = data; ++ struct kfd_dev *dev; ++ struct kfd_process_device *pdd; ++ long err; ++ ++ dev = kfd_device_by_id(args->gpu_id); ++ if (dev == NULL) ++ return -EINVAL; ++ ++ down_write(&p->lock); ++ ++ pdd = kfd_bind_process_to_device(dev, p); ++ if (IS_ERR(pdd) < 0) { ++ err = PTR_ERR(pdd); ++ goto exit; ++ } ++ ++ err = kfd_set_process_dgpu_aperture(pdd, args->dgpu_base, ++ args->dgpu_limit); ++ ++exit: ++ up_write(&p->lock); ++ return err; ++} ++ ++static int kfd_ioctl_get_dmabuf_info(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_get_dmabuf_info_args *args = data; ++ struct kfd_dev *dev = NULL; ++ struct kgd_dev *dma_buf_kgd; ++ void *metadata_buffer = NULL; ++ uint32_t flags; ++ unsigned i; ++ int r; ++ ++ /* Find a KFD GPU device that supports the get_dmabuf_info query */ ++ for (i = 0; kfd_topology_enum_kfd_devices(i, &dev) == 0; i++) ++ if (dev && dev->kfd2kgd->get_dmabuf_info) ++ break; ++ if (!dev) ++ return -EINVAL; ++ ++ if (args->metadata_ptr) { ++ metadata_buffer = kzalloc(args->metadata_size, GFP_KERNEL); ++ if (!metadata_buffer) ++ return -ENOMEM; ++ } ++ ++ /* Get dmabuf info from KGD */ ++ r = dev->kfd2kgd->get_dmabuf_info(dev->kgd, args->dmabuf_fd, ++ &dma_buf_kgd, &args->size, ++ metadata_buffer, args->metadata_size, ++ &args->metadata_size, &flags); ++ if (r) ++ goto exit; ++ ++ /* Reverse-lookup gpu_id from kgd pointer */ ++ dev = kfd_device_by_kgd(dma_buf_kgd); ++ if (!dev) { ++ r = -EINVAL; ++ goto exit; ++ } ++ args->gpu_id = kfd_get_gpu_id(dev); ++ ++ /* Translate flags */ ++ if (flags & ALLOC_MEM_FLAGS_VRAM) { ++ args->flags = KFD_IS_DGPU(dev->device_info->asic_family) ? 
++ KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE : ++ KFD_IOC_ALLOC_MEM_FLAGS_APU_DEVICE; ++ } else ++ args->flags = KFD_IOC_ALLOC_MEM_FLAGS_DGPU_HOST; ++ ++ /* Copy metadata buffer to user mode */ ++ if (metadata_buffer) { ++ r = copy_to_user((void __user *)args->metadata_ptr, ++ metadata_buffer, args->metadata_size); ++ if (r != 0) ++ r = -EFAULT; ++ } ++ ++exit: ++ kfree(metadata_buffer); ++ ++ return r; ++} ++ ++static int kfd_ioctl_import_dmabuf(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_import_dmabuf_args *args = data; ++ struct kfd_dev *dev; ++ struct kfd_process_device *pdd; ++ void *mem; ++ uint64_t size; ++ int idr_handle; ++ int r; ++ ++ dev = kfd_device_by_id(args->gpu_id); ++ if (!dev || !dev->kfd2kgd->import_dmabuf) ++ return -EINVAL; ++ ++ down_write(&p->lock); ++ pdd = kfd_bind_process_to_device(dev, p); ++ up_write(&p->lock); ++ if (IS_ERR(pdd) < 0) ++ return PTR_ERR(pdd); ++ ++ r = dev->kfd2kgd->import_dmabuf(dev->kgd, args->dmabuf_fd, ++ args->va_addr, pdd->vm, ++ (struct kgd_mem **)&mem, &size); ++ if (r) ++ return r; ++ ++ down_write(&p->lock); ++ idr_handle = kfd_process_device_create_obj_handle(pdd, mem, ++ args->va_addr, size); ++ up_write(&p->lock); ++ if (idr_handle < 0) { ++ dev->kfd2kgd->free_memory_of_gpu(dev->kgd, ++ (struct kgd_mem *)mem); ++ return -EFAULT; ++ } ++ ++ args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); ++ ++ return 0; ++} + + #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ + [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, .cmd_drv = 0, .name = #ioctl} +@@ -899,10 +1828,65 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_WAVE_CONTROL, + kfd_ioctl_dbg_wave_control, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, ++ kfd_ioctl_alloc_memory_of_gpu, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_FREE_MEMORY_OF_GPU, ++ kfd_ioctl_free_memory_of_gpu, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_MAP_MEMORY_TO_GPU, ++ kfd_ioctl_map_memory_to_gpu_wrapper, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, ++ kfd_ioctl_unmap_memory_from_gpu_wrapper, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_OPEN_GRAPHIC_HANDLE, ++ kfd_ioctl_open_graphic_handle, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH, ++ kfd_ioctl_alloc_scratch_memory, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_CU_MASK, ++ kfd_ioctl_set_cu_mask, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE, ++ kfd_ioctl_set_process_dgpu_aperture, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_TRAP_HANDLER, ++ kfd_ioctl_set_trap_handler, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_GPU_NEW, ++ kfd_ioctl_alloc_memory_of_gpu_new, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_MAP_MEMORY_TO_GPU_NEW, ++ kfd_ioctl_map_memory_to_gpu, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU_NEW, ++ kfd_ioctl_unmap_memory_from_gpu, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_PROCESS_APERTURES_NEW, ++ kfd_ioctl_get_process_apertures_new, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_EVICT_MEMORY, ++ kfd_evict, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_DMABUF_INFO, ++ kfd_ioctl_get_dmabuf_info, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_IMPORT_DMABUF, ++ kfd_ioctl_import_dmabuf, 0) + }; + + #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) + ++static int kfd_evict(struct file *filep, struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_eviction_args *args = data; ++ ++ return evict_size(p, args->size, args->type); ++ ++} + static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long 
arg) + { + struct kfd_process *process; +@@ -994,20 +1978,37 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) + static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) + { + struct kfd_process *process; ++ struct kfd_dev *kfd; ++ unsigned long vm_pgoff; ++ int retval; + + process = kfd_get_process(current); + if (IS_ERR(process)) + return PTR_ERR(process); + +- if ((vma->vm_pgoff & KFD_MMAP_DOORBELL_MASK) == +- KFD_MMAP_DOORBELL_MASK) { +- vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_DOORBELL_MASK; ++ vm_pgoff = vma->vm_pgoff; ++ vma->vm_pgoff = KFD_MMAP_OFFSET_VALUE_GET(vma->vm_pgoff); ++ ++ switch (vm_pgoff & KFD_MMAP_TYPE_MASK) { ++ case KFD_MMAP_TYPE_DOORBELL: + return kfd_doorbell_mmap(process, vma); +- } else if ((vma->vm_pgoff & KFD_MMAP_EVENTS_MASK) == +- KFD_MMAP_EVENTS_MASK) { +- vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_EVENTS_MASK; ++ ++ case KFD_MMAP_TYPE_EVENTS: + return kfd_event_mmap(process, vma); ++ ++ case KFD_MMAP_TYPE_MAP_BO: ++ kfd = kfd_device_by_id(KFD_MMAP_GPU_ID_GET(vm_pgoff)); ++ if (!kfd) ++ return -EFAULT; ++ retval = kfd->kfd2kgd->mmap_bo(kfd->kgd, vma); ++ return retval; ++ ++ case KFD_MMAP_TYPE_RESERVED_MEM: ++ return kfd_reserved_mem_mmap(process, vma); ++ + } + + return -EFAULT; + } ++ ++ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +new file mode 100644 +index 0000000..b3d4a50 +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +@@ -0,0 +1,1163 @@ ++#include <linux/kernel.h> ++#include <linux/acpi.h> ++#include <linux/mm.h> ++#include <linux/amd-iommu.h> ++#include <linux/pci.h> ++#include "kfd_crat.h" ++#include "kfd_priv.h" ++#include "kfd_topology.h" ++ ++/* GPU Processor ID base for dGPUs for which VCRAT needs to be created. ++ * GPU processor ID are expressed with Bit[31]=1. ++ * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs ++ * used in the CRAT. 
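
/*
 * Worked example, not from the patch: get_and_inc_gpu_processor_id(),
 * defined just below, hands out monotonically increasing ID bases.
 * With two dGPUs of 64 CUs each (masked-off CUs included in
 * total_cu_count):
 *
 *   GPU0: base = 0x80001000, next base becomes 0x80001040
 *   GPU1: base = 0x80001040, next base becomes 0x80001080
 */
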
*/ ++static uint32_t gpu_processor_id_low = 0x80001000; ++ ++/* Return the next available gpu_processor_id and increment it for next GPU ++ * @total_cu_count - Total CUs present in the GPU including ones masked off ++ */ ++static inline unsigned int get_and_inc_gpu_processor_id( ++ unsigned int total_cu_count) ++{ ++ int current_id = gpu_processor_id_low; ++ ++ gpu_processor_id_low += total_cu_count; ++ return current_id; ++} ++ ++/* Static table to describe GPU Cache information */ ++struct kfd_gpu_cache_info { ++ uint32_t cache_size; ++ uint32_t cache_level; ++ uint32_t flags; ++ /* Indicates how many Compute Units share this cache ++ * Value = 1 indicates the cache is not shared */ ++ uint32_t num_cu_shared; ++}; ++ ++static struct kfd_gpu_cache_info kaveri_cache_info[] = { ++ { ++ /* TCP L1 Cache per CU */ ++ .cache_size = 16, ++ .cache_level = 1, ++ .flags = (CRAT_CACHE_FLAGS_ENABLED | ++ CRAT_CACHE_FLAGS_DATA_CACHE | ++ CRAT_CACHE_FLAGS_SIMD_CACHE), ++ .num_cu_shared = 1, ++ ++ }, ++ { ++ /* Scalar L1 Instruction Cache (in SQC module) per bank */ ++ .cache_size = 16, ++ .cache_level = 1, ++ .flags = (CRAT_CACHE_FLAGS_ENABLED | ++ CRAT_CACHE_FLAGS_INST_CACHE | ++ CRAT_CACHE_FLAGS_SIMD_CACHE), ++ .num_cu_shared = 2, ++ }, ++ { ++ /* Scalar L1 Data Cache (in SQC module) per bank */ ++ .cache_size = 8, ++ .cache_level = 1, ++ .flags = (CRAT_CACHE_FLAGS_ENABLED | ++ CRAT_CACHE_FLAGS_DATA_CACHE | ++ CRAT_CACHE_FLAGS_SIMD_CACHE), ++ .num_cu_shared = 2, ++ }, ++ ++ /* TODO: Add L2 Cache information */ ++}; ++ ++ ++static struct kfd_gpu_cache_info carrizo_cache_info[] = { ++ { ++ /* TCP L1 Cache per CU */ ++ .cache_size = 16, ++ .cache_level = 1, ++ .flags = (CRAT_CACHE_FLAGS_ENABLED | ++ CRAT_CACHE_FLAGS_DATA_CACHE | ++ CRAT_CACHE_FLAGS_SIMD_CACHE), ++ .num_cu_shared = 1, ++ }, ++ { ++ /* Scalar L1 Instruction Cache (in SQC module) per bank */ ++ .cache_size = 8, ++ .cache_level = 1, ++ .flags = (CRAT_CACHE_FLAGS_ENABLED | ++ CRAT_CACHE_FLAGS_INST_CACHE | ++ CRAT_CACHE_FLAGS_SIMD_CACHE), ++ .num_cu_shared = 4, ++ }, ++ { ++ /* Scalar L1 Data Cache (in SQC module) per bank. */ ++ .cache_size = 4, ++ .cache_level = 1, ++ .flags = (CRAT_CACHE_FLAGS_ENABLED | ++ CRAT_CACHE_FLAGS_DATA_CACHE | ++ CRAT_CACHE_FLAGS_SIMD_CACHE), ++ .num_cu_shared = 4, ++ }, ++ ++ /* TODO: Add L2 Cache information */ ++}; ++ ++/* NOTE: In future if more information is added to struct kfd_gpu_cache_info ++ * the following ASICs may need a separate table. 
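
/*
 * Worked example, not from the patch: with the Carrizo table above, a
 * shader array whose cu_bitmap enables 8 CUs contributes
 *   8 TCP L1 entries      (num_cu_shared = 1, one per CU)
 *   2 SQC I-cache entries (num_cu_shared = 4, one per 4-CU bank)
 *   2 SQC D-cache entries (num_cu_shared = 4)
 * i.e. 12 crat_subtype_cache records emitted by the fill loop in
 * kfd_fill_gpu_cache_info() further below.
 */
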
*/ ++#define tonga_cache_info carrizo_cache_info ++#define fiji_cache_info carrizo_cache_info ++ ++static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, ++ struct crat_subtype_computeunit *cu) ++{ ++ BUG_ON(!dev); ++ BUG_ON(!cu); ++ ++ dev->node_props.cpu_cores_count = cu->num_cpu_cores; ++ dev->node_props.cpu_core_id_base = cu->processor_id_low; ++ if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) ++ dev->node_props.capability |= HSA_CAP_ATS_PRESENT; ++ ++ pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, ++ cu->processor_id_low); ++} ++ ++static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, ++ struct crat_subtype_computeunit *cu) ++{ ++ BUG_ON(!dev); ++ BUG_ON(!cu); ++ ++ dev->node_props.simd_id_base = cu->processor_id_low; ++ dev->node_props.simd_count = cu->num_simd_cores; ++ dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; ++ dev->node_props.max_waves_per_simd = cu->max_waves_simd; ++ dev->node_props.wave_front_size = cu->wave_front_size; ++ dev->node_props.array_count = cu->array_count; ++ dev->node_props.cu_per_simd_array = cu->num_cu_per_array; ++ dev->node_props.simd_per_cu = cu->num_simd_per_cu; ++ dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; ++ if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) ++ dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; ++ pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low); ++} ++ ++/* kfd_parse_subtype_cu - parse compute unit subtypes and attach it to correct ++ * topology device present in the device_list ++ */ ++static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu, ++ struct list_head *device_list) ++{ ++ struct kfd_topology_device *dev; ++ ++ BUG_ON(!cu); ++ ++ pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", ++ cu->proximity_domain, cu->hsa_capability); ++ list_for_each_entry(dev, device_list, list) { ++ if (cu->proximity_domain == dev->proximity_domain) { ++ if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) ++ kfd_populated_cu_info_cpu(dev, cu); ++ ++ if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) ++ kfd_populated_cu_info_gpu(dev, cu); ++ break; ++ } ++ } ++ ++ return 0; ++} ++ ++/* kfd_parse_subtype_mem - parse memory subtypes and attach it to correct ++ * topology device present in the device_list ++ */ ++static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem, ++ struct list_head *device_list) ++{ ++ struct kfd_mem_properties *props; ++ struct kfd_topology_device *dev; ++ ++ BUG_ON(!mem); ++ ++ pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n", ++ mem->proximity_domain); ++ list_for_each_entry(dev, device_list, list) { ++ if (mem->proximity_domain == dev->proximity_domain) { ++ props = kfd_alloc_struct(props); ++ if (props == NULL) ++ return -ENOMEM; ++ ++ /* ++ * We're on GPU node ++ */ ++ if (dev->node_props.cpu_cores_count == 0) { ++ /* APU */ ++ if (mem->visibility_type == 0) ++ props->heap_type = ++ HSA_MEM_HEAP_TYPE_FB_PRIVATE; ++ /* dGPU */ ++ else ++ props->heap_type = mem->visibility_type; ++ } ++ else ++ props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; ++ ++ if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) ++ props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; ++ if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) ++ props->flags |= HSA_MEM_FLAGS_NON_VOLATILE; ++ ++ props->size_in_bytes = ++ ((uint64_t)mem->length_high << 32) + ++ mem->length_low; ++ props->width = mem->width; ++ ++ dev->node_props.mem_banks_count++; ++ list_add_tail(&props->list, &dev->mem_props); ++ ++ break; ++ } ++ } ++ ++ return 0; ++} ++ 
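
/*
 * Worked example, not from the patch, of the split-field arithmetic
 * used just above for the memory bank size:
 *   length_high = 0x00000001, length_low = 0x80000000
 *   size_in_bytes = (0x1ULL << 32) + 0x80000000 = 0x180000000 (6 GiB)
 */
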
++/* kfd_parse_subtype_cache - parse cache subtypes and attach it to correct ++ * topology device present in the device_list ++ */ ++static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache, ++ struct list_head *device_list) ++{ ++ struct kfd_cache_properties *props; ++ struct kfd_topology_device *dev; ++ uint32_t id; ++ uint32_t total_num_of_cu; ++ ++ BUG_ON(!cache); ++ ++ id = cache->processor_id_low; ++ ++ list_for_each_entry(dev, device_list, list) { ++ total_num_of_cu = (dev->node_props.array_count * ++ dev->node_props.cu_per_simd_array); ++ ++ /* Cache infomration in CRAT doesn't have proximity_domain ++ * information as it is associated with a CPU core or GPU ++ * Compute Unit. So map the cache using CPU core Id or SIMD ++ * (GPU) ID. ++ * TODO: This works because currently we can safely assume that ++ * Compute Units are parsed before caches are parsed. In future ++ * remove this dependency ++ */ ++ if ((id >= dev->node_props.cpu_core_id_base && ++ id <= dev->node_props.cpu_core_id_base + ++ dev->node_props.cpu_cores_count) || ++ (id >= dev->node_props.simd_id_base && ++ id < dev->node_props.simd_id_base + ++ total_num_of_cu)) { ++ props = kfd_alloc_struct(props); ++ if (props == NULL) ++ return -ENOMEM; ++ ++ props->processor_id_low = id; ++ props->cache_level = cache->cache_level; ++ props->cache_size = cache->cache_size; ++ props->cacheline_size = cache->cache_line_size; ++ props->cachelines_per_tag = cache->lines_per_tag; ++ props->cache_assoc = cache->associativity; ++ props->cache_latency = cache->cache_latency; ++ memcpy(props->sibling_map, cache->sibling_map, ++ sizeof(props->sibling_map)); ++ ++ if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) ++ props->cache_type |= HSA_CACHE_TYPE_DATA; ++ if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) ++ props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; ++ if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) ++ props->cache_type |= HSA_CACHE_TYPE_CPU; ++ if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) ++ props->cache_type |= HSA_CACHE_TYPE_HSACU; ++ ++ dev->cache_count++; ++ dev->node_props.caches_count++; ++ list_add_tail(&props->list, &dev->cache_props); ++ ++ break; ++ } ++ } ++ ++ return 0; ++} ++ ++/* kfd_parse_subtype_iolink - parse iolink subtypes and attach it to correct ++ * topology device present in the device_list ++ */ ++static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink, ++ struct list_head *device_list) ++{ ++ struct kfd_iolink_properties *props; ++ struct kfd_topology_device *dev; ++ uint32_t i = 0; ++ uint32_t id_from; ++ uint32_t id_to; ++ ++ BUG_ON(!iolink); ++ ++ id_from = iolink->proximity_domain_from; ++ id_to = iolink->proximity_domain_to; ++ ++ pr_debug("Found IO link entry in CRAT table with id_from=%d\n", id_from); ++ list_for_each_entry(dev, device_list, list) { ++ if (id_from == dev->proximity_domain) { ++ props = kfd_alloc_struct(props); ++ if (props == NULL) ++ return -ENOMEM; ++ ++ props->node_from = id_from; ++ props->node_to = id_to; ++ props->ver_maj = iolink->version_major; ++ props->ver_min = iolink->version_minor; ++ props->iolink_type = iolink->io_interface_type; ++ ++ /* ++ * weight factor (derived from CDIR), currently always 1 ++ */ ++ props->weight = 1; ++ ++ props->min_latency = iolink->minimum_latency; ++ props->max_latency = iolink->maximum_latency; ++ props->min_bandwidth = iolink->minimum_bandwidth_mbs; ++ props->max_bandwidth = iolink->maximum_bandwidth_mbs; ++ props->rec_transfer_size = ++ iolink->recommended_transfer_size; ++ ++ dev->io_link_count++; ++ 
dev->node_props.io_links_count++; ++ list_add_tail(&props->list, &dev->io_link_props); ++ ++ break; ++ } ++ i++; ++ } ++ ++ return 0; ++} ++ ++/* kfd_parse_subtype - parse subtypes and attach it to correct topology device ++ * present in the device_list ++ * @sub_type_hdr - subtype section of crat_image ++ * @device_list - list of topology devices present in this crat_image ++ */ ++static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr, ++ struct list_head *device_list) ++{ ++ struct crat_subtype_computeunit *cu; ++ struct crat_subtype_memory *mem; ++ struct crat_subtype_cache *cache; ++ struct crat_subtype_iolink *iolink; ++ int ret = 0; ++ ++ BUG_ON(!sub_type_hdr); ++ ++ switch (sub_type_hdr->type) { ++ case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: ++ cu = (struct crat_subtype_computeunit *)sub_type_hdr; ++ ret = kfd_parse_subtype_cu(cu, device_list); ++ break; ++ case CRAT_SUBTYPE_MEMORY_AFFINITY: ++ mem = (struct crat_subtype_memory *)sub_type_hdr; ++ ret = kfd_parse_subtype_mem(mem, device_list); ++ break; ++ case CRAT_SUBTYPE_CACHE_AFFINITY: ++ cache = (struct crat_subtype_cache *)sub_type_hdr; ++ ret = kfd_parse_subtype_cache(cache, device_list); ++ break; ++ case CRAT_SUBTYPE_TLB_AFFINITY: ++ /* ++ * For now, nothing to do here ++ */ ++ pr_debug("Found TLB entry in CRAT table (not processing)\n"); ++ break; ++ case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: ++ /* ++ * For now, nothing to do here ++ */ ++ pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n"); ++ break; ++ case CRAT_SUBTYPE_IOLINK_AFFINITY: ++ iolink = (struct crat_subtype_iolink *)sub_type_hdr; ++ ret = kfd_parse_subtype_iolink(iolink, device_list); ++ break; ++ default: ++ pr_warn("Unknown subtype (%d) in CRAT\n", ++ sub_type_hdr->type); ++ } ++ ++ return ret; ++} ++ ++/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT ++ * create a kfd_topology_device and add in to device_list. 
Also parse
++ * CRAT subtypes and attach it to appropriate kfd_topology_device
++ * @crat_image - input image containing CRAT
++ * @device_list - [OUT] list of kfd_topology_device generated after parsing
++ * crat_image
++ * @proximity_domain - Proximity domain of the first device in the table
++ * Return - 0 if successful else -ve value
++ */
++int kfd_parse_crat_table(void *crat_image,
++		struct list_head *device_list,
++		uint32_t proximity_domain)
++{
++	struct kfd_topology_device *top_dev = NULL;
++	struct crat_subtype_generic *sub_type_hdr;
++	uint16_t node_id;
++	int ret;
++	struct crat_header *crat_table = (struct crat_header *)crat_image;
++	uint16_t num_nodes;
++	uint32_t image_len;
++	uint32_t last_header_type, last_header_length;
++
++	if (!crat_image)
++		return -EINVAL;
++
++	if (!list_empty(device_list))
++		pr_warn("Error: device list should be empty\n");
++
++	num_nodes = crat_table->num_domains;
++	image_len = crat_table->length;
++
++	pr_info("Parsing CRAT table with %d nodes\n", num_nodes);
++
++	for (node_id = 0; node_id < num_nodes; node_id++) {
++		top_dev = kfd_create_topology_device(device_list);
++		if (!top_dev)
++			break;
++		top_dev->proximity_domain = proximity_domain++;
++	}
++
++	if (!top_dev)
++		return -ENOMEM;
++
++	memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH);
++	memcpy(top_dev->oem_table_id, crat_table->oem_table_id, CRAT_OEMTABLEID_LENGTH);
++	top_dev->oem_revision = crat_table->oem_revision;
++
++	last_header_type = last_header_length = 0;
++	sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);
++	while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) <
++			((char *)crat_image) + image_len) {
++		pr_debug("kfd parsing crat sub type header %p enabled: %s type: 0x%x length %d\n",
++			sub_type_hdr,
++			(sub_type_hdr->flags &
++				CRAT_SUBTYPE_FLAGS_ENABLED)
++				? "true" : "false",
++			sub_type_hdr->type,
++			sub_type_hdr->length);
++
++		if (sub_type_hdr->length == 0) {
++			pr_err("amdkfd: Parsing wrong CRAT's sub header last header type: %d last header len %d\n",
++				last_header_type, last_header_length);
++			pr_err("amdkfd: Current header type %d length %d\n",
++				sub_type_hdr->type, sub_type_hdr->length);
++			break;
++		}
++
++		if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) {
++			ret = kfd_parse_subtype(sub_type_hdr, device_list);
++			if (ret != 0)
++				return ret;
++		}
++
++		last_header_type = sub_type_hdr->type;
++		last_header_length = sub_type_hdr->length;
++		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
++				sub_type_hdr->length);
++	}
++
++	return 0;
++}
++
++/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
++static int fill_in_pcache(struct crat_subtype_cache *pcache,
++		struct kfd_gpu_cache_info *pcache_info,
++		struct kfd_cu_info *cu_info,
++		int mem_available,
++		int cu_bitmask,
++		int cache_type, unsigned int cu_processor_id,
++		int cu_block)
++{
++	unsigned int cu_sibling_map_mask;
++	int first_active_cu;
++
++	/* First check if enough memory is available */
++	if (mem_available - sizeof(struct crat_subtype_cache) < 0)
++		return -ENOMEM;
++
++	cu_sibling_map_mask = cu_bitmask;
++	cu_sibling_map_mask >>= cu_block;
++	cu_sibling_map_mask &=
++		((1 << pcache_info[cache_type].num_cu_shared) - 1);
++	first_active_cu = ffs(cu_sibling_map_mask);
++
++	/* CU could be inactive. In case of a shared cache, find the first
++	 * active CU; in case of a non-shared cache, check whether the CU
++	 * is inactive. If it is inactive, skip it. */
++	if (first_active_cu) {
++		memset(pcache, 0, sizeof(struct crat_subtype_cache));
++		pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY;
++		pcache->length = sizeof(struct crat_subtype_cache);
++		pcache->flags = pcache_info[cache_type].flags;
++		pcache->processor_id_low = cu_processor_id
++					+ (first_active_cu - 1);
++		pcache->cache_level = pcache_info[cache_type].cache_level;
++		pcache->cache_size = pcache_info[cache_type].cache_size;
++
++		/* Sibling map is w.r.t processor_id_low, so shift out
++		 * inactive CU */
++		cu_sibling_map_mask =
++			cu_sibling_map_mask >> (first_active_cu - 1);
++
++		pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF);
++		pcache->sibling_map[1] =
++			(uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
++		pcache->sibling_map[2] =
++			(uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
++		pcache->sibling_map[3] =
++			(uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
++		return 0;
++	}
++	return 1;
++}
++
++/* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info tables
++ * @kdev - [IN] GPU device
++ * @gpu_processor_id - [IN] GPU processor ID to which these caches associate
++ * @available_size - [IN] Amount of memory available in pcache
++ * @cu_info - [IN] Compute Unit info obtained from KGD
++ * @pcache - [OUT] memory into which cache data is to be filled in.
++ * @size_filled - [OUT] amount of data used up in pcache.
++ * @num_of_entries - [OUT] number of caches added
++ */
++static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
++		int gpu_processor_id,
++		int available_size,
++		struct kfd_cu_info *cu_info,
++		struct crat_subtype_cache *pcache,
++		int *size_filled,
++		int *num_of_entries)
++{
++	struct kfd_gpu_cache_info *pcache_info;
++	int num_of_cache_types = 0;
++	int i, j, k;
++	int ct = 0;
++	int mem_available = available_size;
++	unsigned int cu_processor_id;
++	int ret;
++
++	switch (kdev->device_info->asic_family) {
++	case CHIP_KAVERI:
++		pcache_info = kaveri_cache_info;
++		num_of_cache_types = ARRAY_SIZE(kaveri_cache_info);
++		break;
++	case CHIP_CARRIZO:
++		pcache_info = carrizo_cache_info;
++		num_of_cache_types = ARRAY_SIZE(carrizo_cache_info);
++		break;
++	case CHIP_TONGA:
++		pcache_info = tonga_cache_info;
++		num_of_cache_types = ARRAY_SIZE(tonga_cache_info);
++		break;
++	case CHIP_FIJI:
++		pcache_info = fiji_cache_info;
++		num_of_cache_types = ARRAY_SIZE(fiji_cache_info);
++		break;
++	default:
++		return -EINVAL;
++	}
++
++	*size_filled = 0;
++	*num_of_entries = 0;
++
++	/* For each type of cache listed in the kfd_gpu_cache_info table,
++	 * go through all available Compute Units.
++ * The [i,j,k] loop will ++ * if kfd_gpu_cache_info.num_cu_shared = 1 ++ * will parse through all available CU ++ * If (kfd_gpu_cache_info.num_cu_shared != 1) ++ * then it will consider only one CU from ++ * the shared unit ++ */ ++ ++ for (ct = 0; ct < num_of_cache_types; ct++) { ++ cu_processor_id = gpu_processor_id; ++ for (i = 0; i < cu_info->num_shader_engines; i++) { ++ for (j = 0; j < cu_info->num_shader_arrays_per_engine; ++ j++) { ++ for (k = 0; k < cu_info->num_cu_per_sh; ++ k += pcache_info[ct].num_cu_shared) { ++ ++ ret = fill_in_pcache(pcache, ++ pcache_info, ++ cu_info, ++ mem_available, ++ cu_info->cu_bitmap[i][j], ++ ct, ++ cu_processor_id, ++ k); ++ ++ if (ret < 0) ++ break; ++ ++ if (!ret) { ++ pcache++; ++ (*num_of_entries)++; ++ mem_available -= ++ sizeof(*pcache); ++ (*size_filled) += ++ sizeof(*pcache); ++ } ++ ++ /* Move to next CU block */ ++ cu_processor_id += ++ pcache_info[ct].num_cu_shared; ++ } ++ } ++ } ++ } ++ ++ pr_debug("Added [%d] GPU cache entries\n", *num_of_entries); ++ ++ return 0; ++} ++ ++/* ++ * kfd_create_crat_image_acpi - Allocates memory for CRAT image and ++ * copies CRAT from ACPI (if available). ++ * ++ * NOTE: Call kfd_destroy_crat_image to free CRAT image memory ++ * ++ * @crat_image: CRAT read from ACPI. If no CRAT in ACPI then ++ * *crat_image will be NULL ++ * @size: [OUT] size of crat_image ++ * ++ * Return 0 if successful else return -ve value ++ */ ++int kfd_create_crat_image_acpi(void **crat_image, size_t *size) ++{ ++ struct acpi_table_header *crat_table; ++ acpi_status status; ++ void *pcrat_image; ++ ++ if (!crat_image) ++ return -EINVAL; ++ ++ *crat_image = NULL; ++ ++ /* ++ * Fetch the CRAT table from ACPI ++ */ ++ status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); ++ if (status == AE_NOT_FOUND) { ++ pr_warn("CRAT table not found\n"); ++ return -ENODATA; ++ } else if (ACPI_FAILURE(status)) { ++ const char *err = acpi_format_exception(status); ++ pr_err("CRAT table error: %s\n", err); ++ return -EINVAL; ++ } ++ ++ pcrat_image = kmalloc(crat_table->length, GFP_KERNEL); ++ if (!pcrat_image) { ++ pr_err("No memory for allocating CRAT image\n"); ++ return -ENOMEM; ++ } ++ ++ memcpy(pcrat_image, crat_table, crat_table->length); ++ ++ *crat_image = pcrat_image; ++ *size = crat_table->length; ++ ++ return 0; ++} ++ ++/* Memory required to create Virtual CRAT. ++ * Since there is no easy way to predict the amount of memory required, the ++ * following amount are allocated for CPU and GPU Virtual CRAT. This is ++ * expected to cover all known conditions. But to be safe additional check ++ * is put in the code to ensure we don't overwrite. 
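
/*
 * Worked example, not from the patch, for fill_in_pcache() above, with
 * num_cu_shared = 4, cu_block = 0 and cu_bitmap = 0b1110 (CU0 fused off):
 *   mask = (0b1110 >> 0) & 0b1111 = 0b1110
 *   ffs(mask) = 2, so processor_id_low = cu_processor_id + 1
 *   sibling_map[0] = 0b1110 >> 1 = 0b0111
 * If the whole 4-CU block were masked off, ffs() would return 0 and the
 * entry would be skipped (fill_in_pcache() returns 1).
 */
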
++ */ ++#define VCRAT_SIZE_FOR_CPU PAGE_SIZE ++#define VCRAT_SIZE_FOR_GPU (3 * PAGE_SIZE) ++ ++/* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node ++ * ++ * @numa_node_id: CPU NUMA node id ++ * @avail_size: Available size in the memory ++ * @sub_type_hdr: Memory into which compute info will be filled in ++ * ++ * Return 0 if successful else return -ve value ++ */ ++static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size, ++ int proximity_domain, ++ struct crat_subtype_computeunit *sub_type_hdr) ++{ ++ const struct cpumask *cpumask; ++ ++ *avail_size -= sizeof(struct crat_subtype_computeunit); ++ if (*avail_size < 0) ++ return -ENOMEM; ++ ++ memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); ++ ++ /* Fill in subtype header data */ ++ sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; ++ sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); ++ sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; ++ ++ cpumask = cpumask_of_node(numa_node_id); ++ ++ /* Fill in CU data */ ++ sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT; ++ sub_type_hdr->proximity_domain = proximity_domain; ++ sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id); ++ if (sub_type_hdr->processor_id_low == -1) ++ return -EINVAL; ++ ++ sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask); ++ ++ return 0; ++} ++ ++/* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node ++ * ++ * @numa_node_id: CPU NUMA node id ++ * @avail_size: Available size in the memory ++ * @sub_type_hdr: Memory into which compute info will be filled in ++ * ++ * Return 0 if successful else return -ve value ++ */ ++static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size, ++ int proximity_domain, ++ struct crat_subtype_memory *sub_type_hdr) ++{ ++ uint64_t mem_in_bytes = 0; ++ pg_data_t *pgdat; ++ int zone_type; ++ ++ *avail_size -= sizeof(struct crat_subtype_computeunit); ++ if (*avail_size < 0) ++ return -ENOMEM; ++ ++ memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); ++ ++ /* Fill in subtype header data */ ++ sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; ++ sub_type_hdr->length = sizeof(struct crat_subtype_memory); ++ sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; ++ ++ /* Fill in Memory Subunit data */ ++ ++ /* Unlike si_meminfo, si_meminfo_node is not exported. So ++ * the following lines are duplicated from si_meminfo_node ++ * function */ ++ pgdat = NODE_DATA(numa_node_id); ++ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) ++ mem_in_bytes += pgdat->node_zones[zone_type].managed_pages; ++ mem_in_bytes <<= PAGE_SHIFT; ++ ++ sub_type_hdr->length_low = lower_32_bits(mem_in_bytes); ++ sub_type_hdr->length_high = upper_32_bits(mem_in_bytes); ++ sub_type_hdr->proximity_domain = proximity_domain; ++ ++ return 0; ++} ++ ++/* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU ++ * ++ * @pcrat_image: Fill in VCRAT for CPU ++ * @size: [IN] allocated size of crat_image. ++ * [OUT] actual size of data filled in crat_image ++ */ ++static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size) ++{ ++ struct crat_header *crat_table = (struct crat_header *)pcrat_image; ++ struct acpi_table_header *acpi_table; ++ acpi_status status; ++ struct crat_subtype_generic *sub_type_hdr; ++ int avail_size = *size; ++ int numa_node_id; ++ int ret = 0; ++ ++ if (pcrat_image == NULL || avail_size < VCRAT_SIZE_FOR_CPU) ++ return -EINVAL; ++ ++ /* Fill in CRAT Header. ++ * Modify length and total_entries as subunits are added. 
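
/*
 * Worked example, not from the patch, for kfd_fill_mem_info_for_cpu()
 * above: a node whose zones sum to 4,194,304 managed 4 KiB pages reports
 *   mem_in_bytes = 4194304 << 12 = 0x400000000 (16 GiB)
 *   length_low = 0x00000000, length_high = 0x00000004
 */
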
++ */ ++ avail_size -= sizeof(struct crat_header); ++ if (avail_size < 0) ++ return -ENOMEM; ++ ++ memset(crat_table, 0, sizeof(struct crat_header)); ++ memcpy(&crat_table->signature, CRAT_SIGNATURE, sizeof(crat_table->signature)); ++ crat_table->length = sizeof(struct crat_header); ++ ++ status = acpi_get_table("DSDT", 0, &acpi_table); ++ if (status == AE_NOT_FOUND) ++ pr_warn("DSDT table not found for OEM information\n"); ++ else { ++ crat_table->oem_revision = acpi_table->revision; ++ memcpy(crat_table->oem_id, acpi_table->oem_id, CRAT_OEMID_LENGTH); ++ memcpy(crat_table->oem_table_id, acpi_table->oem_table_id, CRAT_OEMTABLEID_LENGTH); ++ } ++ crat_table->total_entries = 0; ++ crat_table->num_domains = 0; ++ ++ sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); ++ ++ for_each_online_node(numa_node_id) { ++ /* Fill in Subtype: Compute Unit */ ++ ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size, ++ crat_table->num_domains, ++ (struct crat_subtype_computeunit *)sub_type_hdr); ++ if (ret < 0) ++ return ret; ++ crat_table->length += sub_type_hdr->length; ++ crat_table->total_entries++; ++ ++ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + ++ sub_type_hdr->length); ++ ++ /* Fill in Subtype: Memory */ ++ ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size, ++ crat_table->num_domains, ++ (struct crat_subtype_memory *)sub_type_hdr); ++ if (ret < 0) ++ return ret; ++ crat_table->length += sub_type_hdr->length; ++ crat_table->total_entries++; ++ ++ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + ++ sub_type_hdr->length); ++ ++ crat_table->num_domains++; ++ } ++ ++ /* TODO: Add cache Subtype for CPU. ++ * Currently, CPU cache information is available in function ++ * detect_cache_attributes(cpu) defined in the file ++ * ./arch/x86/kernel/cpu/intel_cacheinfo.c. This function is not exported ++ * and to get the same information the code needs to be duplicated. 
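(The per-node loop above appends variable-length subtypes back to back, so a consumer recovers the list the same way it was built: advance a generic header pointer by its own length byte, total_entries times. A minimal walker sketch against the structs in kfd_crat.h — validation omitted, illustrative only:)

    static void walk_crat_subtypes(struct crat_header *crat)
    {
            struct crat_subtype_generic *hdr =
                    (struct crat_subtype_generic *)(crat + 1);
            uint32_t i;

            for (i = 0; i < crat->total_entries; i++) {
                    /* dispatch on hdr->type: compute unit, memory, cache, ... */
                    hdr = (struct crat_subtype_generic *)
                            ((char *)hdr + hdr->length);
            }
    }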
++ */ ++ ++ *size = crat_table->length; ++ pr_info("Virtual CRAT table created for CPU\n"); ++ ++ return 0; ++} ++ ++static int kfd_fill_gpu_memory_affinity(int *avail_size, ++ struct kfd_dev *kdev, uint8_t type, uint64_t size, ++ struct crat_subtype_memory *sub_type_hdr, ++ uint32_t proximity_domain, ++ const struct kfd_local_mem_info *local_mem_info) ++{ ++ *avail_size -= sizeof(struct crat_subtype_memory); ++ if (*avail_size < 0) ++ return -ENOMEM; ++ ++ memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); ++ sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; ++ sub_type_hdr->length = sizeof(struct crat_subtype_memory); ++ sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED; ++ ++ sub_type_hdr->proximity_domain = proximity_domain; ++ ++ pr_debug("amdkfd: fill gpu memory affinity - type 0x%x size 0x%llx\n", ++ type, size); ++ ++ sub_type_hdr->length_low = lower_32_bits(size); ++ sub_type_hdr->length_high = upper_32_bits(size); ++ ++ sub_type_hdr->width = local_mem_info->vram_width; ++ sub_type_hdr->visibility_type = type; ++ ++ return 0; ++} ++ ++/* kfd_fill_gpu_direct_io_link - Fill in direct io link from GPU ++ * to its NUMA node ++ * ++ * @avail_size: Available size in the memory ++ * @kdev - [IN] GPU device ++ * @sub_type_hdr: Memory into which io link info will be filled in ++ * @proximity_domain - proximity domain of the GPU node ++ * ++ * Return 0 if successful else return -ve value ++ */ ++static int kfd_fill_gpu_direct_io_link(int *avail_size, ++ struct kfd_dev *kdev, ++ struct crat_subtype_iolink *sub_type_hdr, ++ uint32_t proximity_domain) ++{ ++ int proximity_domain_to; ++ *avail_size -= sizeof(struct crat_subtype_iolink); ++ if (*avail_size < 0) ++ return -ENOMEM; ++ ++ memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink)); ++ ++ /* Fill in subtype header data */ ++ sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY; ++ sub_type_hdr->length = sizeof(struct crat_subtype_iolink); ++ sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED; ++ ++ /* Fill in IOLINK subtype. ++ * TODO: Fill-in other fields of iolink subtype */ ++ sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS; ++ sub_type_hdr->proximity_domain_from = proximity_domain; ++ proximity_domain_to = ++ kfd_get_proximity_domain(kdev->pdev->bus); ++ if (proximity_domain_to == -1) ++ return -EINVAL; ++ ++ sub_type_hdr->proximity_domain_to = proximity_domain_to; ++ return 0; ++} ++ ++/* kfd_create_vcrat_image_gpu - Create Virtual CRAT for CPU ++ * ++ * @pcrat_image: Fill in VCRAT for GPU ++ * @size: [IN] allocated size of crat_image. ++ * [OUT] actual size of data filled in crat_image ++ */ ++static int kfd_create_vcrat_image_gpu(void *pcrat_image, ++ size_t *size, struct kfd_dev *kdev, ++ uint32_t proximity_domain) ++{ ++ struct crat_header *crat_table = (struct crat_header *)pcrat_image; ++ struct crat_subtype_generic *sub_type_hdr; ++ struct crat_subtype_computeunit *cu; ++ struct kfd_cu_info cu_info; ++ struct amd_iommu_device_info iommu_info; ++ int avail_size = *size; ++ uint32_t total_num_of_cu; ++ int num_of_cache_entries = 0; ++ int cache_mem_filled = 0; ++ int ret = 0; ++ const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP | ++ AMD_IOMMU_DEVICE_FLAG_PRI_SUP | ++ AMD_IOMMU_DEVICE_FLAG_PASID_SUP; ++ struct kfd_local_mem_info local_mem_info; ++ ++ if (pcrat_image == NULL || avail_size < VCRAT_SIZE_FOR_GPU) ++ return -EINVAL; ++ ++ /* Fill the CRAT Header. ++ * Modify length and total_entries as subunits are added. 
++ */ ++ avail_size -= sizeof(struct crat_header); ++ if (avail_size < 0) ++ return -ENOMEM; ++ ++ memset(crat_table, 0, sizeof(struct crat_header)); ++ ++ memcpy(&crat_table->signature, CRAT_SIGNATURE, sizeof(crat_table->signature)); ++ crat_table->length = sizeof(struct crat_header); /* Change length as we add more subtypes*/ ++ crat_table->num_domains = 1; ++ crat_table->total_entries = 0; ++ ++ /* Fill in Subtype: Compute Unit ++ * First fill in the sub type header and then sub type data ++ */ ++ avail_size -= sizeof(struct crat_subtype_computeunit); ++ if (avail_size < 0) ++ return -ENOMEM; ++ ++ sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1); ++ memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); ++ ++ sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; ++ sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); ++ sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; ++ ++ /* Fill CU subtype data */ ++ cu = (struct crat_subtype_computeunit *)sub_type_hdr; ++ cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT; ++ cu->proximity_domain = proximity_domain; ++ ++ kdev->kfd2kgd->get_cu_info(kdev->kgd, &cu_info); ++ cu->num_simd_per_cu = cu_info.simd_per_cu; ++ cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number; ++ cu->max_waves_simd = cu_info.max_waves_per_simd; ++ ++ cu->wave_front_size = cu_info.wave_front_size; ++ cu->array_count = cu_info.num_shader_arrays_per_engine * ++ cu_info.num_shader_engines; ++ total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh); ++ cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu); ++ cu->num_cu_per_array = cu_info.num_cu_per_sh; ++ cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu; ++ cu->num_banks = cu_info.num_shader_engines; ++ cu->lds_size_in_kb = cu_info.lds_size; ++ ++ cu->hsa_capability = 0; ++ ++ /* Check if this node supports IOMMU. During parsing this flag will ++ * translate to HSA_CAP_ATS_PRESENT */ ++ iommu_info.flags = 0; ++ if (0 == amd_iommu_device_info(kdev->pdev, &iommu_info)) { ++ if ((iommu_info.flags & required_iommu_flags) == required_iommu_flags) ++ cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT; ++ } ++ ++ crat_table->length += sub_type_hdr->length; ++ crat_table->total_entries++; ++ ++ /* Fill in Subtype: Memory. Only on systems with large BAR (no ++ * private FB), report memory as public. On other systems ++ * report the total FB size (public+private) as a single ++ * private heap. */ ++ kdev->kfd2kgd->get_local_mem_info(kdev->kgd, &local_mem_info); ++ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + ++ sub_type_hdr->length); ++ ++ if (local_mem_info.local_mem_size_private == 0) ++ ret = kfd_fill_gpu_memory_affinity(&avail_size, ++ kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC, ++ local_mem_info.local_mem_size_public, ++ (struct crat_subtype_memory *)sub_type_hdr, ++ proximity_domain, ++ &local_mem_info); ++ else ++ ret = kfd_fill_gpu_memory_affinity(&avail_size, ++ kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE, ++ local_mem_info.local_mem_size_public + ++ local_mem_info.local_mem_size_private, ++ (struct crat_subtype_memory *)sub_type_hdr, ++ proximity_domain, ++ &local_mem_info); ++ if (ret < 0) ++ return ret; ++ ++ crat_table->length += sizeof(struct crat_subtype_memory); ++ crat_table->total_entries++; ++ ++ /* TODO: Fill in cache information. 
This information is NOT readily ++ * available in KGD */ ++ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + ++ sub_type_hdr->length); ++ ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low, ++ avail_size, ++ &cu_info, ++ (struct crat_subtype_cache *)sub_type_hdr, ++ &cache_mem_filled, ++ &num_of_cache_entries); ++ ++ if (ret < 0) ++ return ret; ++ ++ crat_table->length += cache_mem_filled; ++ crat_table->total_entries += num_of_cache_entries; ++ avail_size -= cache_mem_filled; ++ ++ /* Fill in Subtype: IO_LINKS ++ * Only direct links are added here which is Link from GPU to ++ * to its NUMA node. Indirect links are added by userspace. ++ */ ++ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + ++ cache_mem_filled); ++ ret = kfd_fill_gpu_direct_io_link(&avail_size, kdev, ++ (struct crat_subtype_iolink *)sub_type_hdr, proximity_domain); ++ ++ if (ret < 0) ++ return ret; ++ ++ crat_table->length += sub_type_hdr->length; ++ crat_table->total_entries++; ++ ++ *size = crat_table->length; ++ pr_info("Virtual CRAT table created for GPU\n"); ++ ++ return ret; ++} ++ ++/* kfd_create_crat_image_virtual - Allocates memory for CRAT image and ++ * creates a Virtual CRAT (VCRAT) image ++ * ++ * NOTE: Call kfd_destroy_crat_image to free CRAT image memory ++ * ++ * @crat_image: VCRAT image created because ACPI does not have a ++ * CRAT for this device ++ * @size: [OUT] size of virtual crat_image ++ * @flags: COMPUTE_UNIT_CPU - Create VCRAT for CPU device ++ * COMPUTE_UNIT_GPU - Create VCRAT for GPU ++ * (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU ++ * -- this option is not currently implemented. The assumption ++ * is that all AMD APUs will have CRAT ++ * @kdev: Valid kfd_device required if flags contain COMPUTE_UNIT_GPU ++ * ++ * Return 0 if successful else return -ve value ++*/ ++int kfd_create_crat_image_virtual(void **crat_image, size_t *size, ++ int flags, struct kfd_dev *kdev, uint32_t proximity_domain) ++{ ++ void *pcrat_image; ++ int ret = 0; ++ ++ if (!crat_image) ++ return -EINVAL; ++ ++ *crat_image = NULL; ++ ++ /* Allocate one VCRAT_SIZE_FOR_CPU for CPU virtual CRAT image and ++ * VCRAT_SIZE_FOR_GPU for GPU virtual CRAT image. This should cover ++ * all the current conditions. A check is put not to overwrite beyond ++ * allocated size ++ */ ++ switch (flags) { ++ case COMPUTE_UNIT_CPU: ++ pcrat_image = kmalloc(VCRAT_SIZE_FOR_CPU, GFP_KERNEL); ++ if (!pcrat_image) ++ return -ENOMEM; ++ *size = VCRAT_SIZE_FOR_CPU; ++ ret = kfd_create_vcrat_image_cpu(pcrat_image, size); ++ break; ++ case COMPUTE_UNIT_GPU: ++ if (!kdev) ++ return -EINVAL; ++ pcrat_image = kmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL); ++ if (!pcrat_image) ++ return -ENOMEM; ++ *size = VCRAT_SIZE_FOR_GPU; ++ ret = kfd_create_vcrat_image_gpu(pcrat_image, size, ++ kdev, proximity_domain); ++ break; ++ case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) : ++ /*TODO:*/ ++ ret = -EINVAL; ++ pr_err("VCRAT not implemented for APU\n"); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ if (ret == 0) ++ *crat_image = pcrat_image; ++ ++ return ret; ++} ++ ++ ++/* kfd_destroy_crat_image ++ * ++ * @crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..) 
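(Taken together, the expected pairing of these entry points is create, parse, destroy. A hypothetical caller — error handling elided, proximity domain 0 assumed, not code from the patch — might read:)

    static void crat_usage_example(void)
    {
            void *crat_image = NULL;
            size_t image_size = 0;

            if (kfd_create_crat_image_acpi(&crat_image, &image_size) != 0)
                    /* no CRAT in ACPI: synthesize a virtual one for the CPU */
                    kfd_create_crat_image_virtual(&crat_image, &image_size,
                                                  COMPUTE_UNIT_CPU, NULL, 0);

            /* ... kfd_parse_crat_table(crat_image, ...) ... */

            kfd_destroy_crat_image(crat_image);
    }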
++ * ++ */ ++void kfd_destroy_crat_image(void *crat_image) ++{ ++ if (crat_image) ++ kfree(crat_image); ++ return; ++} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h +index a374fa3..9af3745 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h +@@ -24,6 +24,7 @@ + #define KFD_CRAT_H_INCLUDED + + #include <linux/types.h> ++#include "kfd_priv.h" + + #pragma pack(1) + +@@ -44,6 +45,10 @@ + + #define CRAT_OEMID_64BIT_MASK ((1ULL << (CRAT_OEMID_LENGTH * 8)) - 1) + ++/* Compute Unit flags */ ++#define COMPUTE_UNIT_CPU (1 << 0) /* Create Virtual CRAT for CPU */ ++#define COMPUTE_UNIT_GPU (1 << 1) /* Create Virtual CRAT for GPU */ ++ + struct crat_header { + uint32_t signature; + uint32_t length; +@@ -105,7 +110,7 @@ struct crat_subtype_computeunit { + uint8_t wave_front_size; + uint8_t num_banks; + uint16_t micro_engine_id; +- uint8_t num_arrays; ++ uint8_t array_count; + uint8_t num_cu_per_array; + uint8_t num_simd_per_cu; + uint8_t max_slots_scatch_cu; +@@ -127,13 +132,14 @@ struct crat_subtype_memory { + uint8_t length; + uint16_t reserved; + uint32_t flags; +- uint32_t promixity_domain; ++ uint32_t proximity_domain; + uint32_t base_addr_low; + uint32_t base_addr_high; + uint32_t length_low; + uint32_t length_high; + uint32_t width; +- uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH]; ++ uint8_t visibility_type; /* for virtual (dGPU) CRAT */ ++ uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH - 1]; + }; + + /* +@@ -222,9 +228,12 @@ struct crat_subtype_ccompute { + /* + * HSA IO Link Affinity structure and definitions + */ +-#define CRAT_IOLINK_FLAGS_ENABLED 0x00000001 +-#define CRAT_IOLINK_FLAGS_COHERENCY 0x00000002 +-#define CRAT_IOLINK_FLAGS_RESERVED 0xfffffffc ++#define CRAT_IOLINK_FLAGS_ENABLED (1 << 0) ++#define CRAT_IOLINK_FLAGS_NON_COHERENT (1 << 1) ++#define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT (1 << 2) ++#define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT (1 << 3) ++#define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA (1 << 4) ++#define CRAT_IOLINK_FLAGS_RESERVED_MASK 0xffffffe0 + + /* + * IO interface types +@@ -232,8 +241,16 @@ struct crat_subtype_ccompute { + #define CRAT_IOLINK_TYPE_UNDEFINED 0 + #define CRAT_IOLINK_TYPE_HYPERTRANSPORT 1 + #define CRAT_IOLINK_TYPE_PCIEXPRESS 2 +-#define CRAT_IOLINK_TYPE_OTHER 3 +-#define CRAT_IOLINK_TYPE_MAX 255 ++#define CRAT_IOLINK_TYPE_AMBA 3 ++#define CRAT_IOLINK_TYPE_MIPI 4 ++#define CRAT_IOLINK_TYPE_QPI_1_1 5 ++#define CRAT_IOLINK_TYPE_RESERVED1 6 ++#define CRAT_IOLINK_TYPE_RESERVED2 7 ++#define CRAT_IOLINK_TYPE_RAPID_IO 8 ++#define CRAT_IOLINK_TYPE_INFINIBAND 9 ++#define CRAT_IOLINK_TYPE_RESERVED3 10 ++#define CRAT_IOLINK_TYPE_OTHER 11 ++#define CRAT_IOLINK_TYPE_MAX 255 + + #define CRAT_IOLINK_RESERVED_LENGTH 24 + +@@ -291,4 +308,11 @@ struct cdit_header { + + #pragma pack() + ++int kfd_create_crat_image_acpi(void **crat_image, size_t *size); ++void kfd_destroy_crat_image(void *crat_image); ++int kfd_parse_crat_table(void *crat_image, ++ struct list_head *device_list, ++ uint32_t proximity_domain); ++int kfd_create_crat_image_virtual(void **crat_image, size_t *size, ++ int flags, struct kfd_dev *kdev, uint32_t proximity_domain); + #endif /* KFD_CRAT_H_INCLUDED */ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c +index d5e19b5..4f2311e 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c +@@ -42,8 +42,6 @@ + + static void dbgdev_address_watch_disable_nodiq(struct kfd_dev *dev) + { 
+- BUG_ON(!dev || !dev->kfd2kgd); +- + dev->kfd2kgd->address_watch_disable(dev->kgd); + } + +@@ -51,129 +49,118 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, + unsigned int pasid, uint64_t vmid0_address, + uint32_t *packet_buff, size_t size_in_bytes) + { ++ int status = 0; ++ unsigned int *ib_packet_buff = NULL; + struct pm4__release_mem *rm_packet; + struct pm4__indirect_buffer_pasid *ib_packet; ++ struct kernel_queue *kq = dbgdev->kq; ++ size_t pq_packets_size_in_bytes = sizeof(struct pm4__release_mem) + sizeof(struct pm4__indirect_buffer_pasid); + struct kfd_mem_obj *mem_obj; +- size_t pq_packets_size_in_bytes; ++ ++ uint64_t *rm_state = NULL; ++ + union ULARGE_INTEGER *largep; + union ULARGE_INTEGER addr; +- struct kernel_queue *kq; +- uint64_t *rm_state; +- unsigned int *ib_packet_buff; +- int status; +- +- BUG_ON(!dbgdev || !dbgdev->kq || !packet_buff || !size_in_bytes); +- +- kq = dbgdev->kq; +- +- pq_packets_size_in_bytes = sizeof(struct pm4__release_mem) + +- sizeof(struct pm4__indirect_buffer_pasid); +- +- /* +- * We acquire a buffer from DIQ +- * The receive packet buff will be sitting on the Indirect Buffer +- * and in the PQ we put the IB packet + sync packet(s). +- */ +- status = kq->ops.acquire_packet_buffer(kq, +- pq_packets_size_in_bytes / sizeof(uint32_t), +- &ib_packet_buff); +- if (status != 0) { +- pr_err("amdkfd: acquire_packet_buffer failed\n"); +- return status; +- } + +- memset(ib_packet_buff, 0, pq_packets_size_in_bytes); ++ do { ++ if ((kq == NULL) || (packet_buff == NULL) || (size_in_bytes == 0)) { ++ pr_debug("Error! kfd: In func %s >> Illegal packet parameters\n", __func__); ++ status = -EINVAL; ++ break; ++ } ++ /* todo - enter proper locking to be multithreaded safe */ + +- ib_packet = (struct pm4__indirect_buffer_pasid *) (ib_packet_buff); ++ /* We acquire a buffer from DIQ ++ * The receive packet buff will be sitting on the Indirect Buffer ++ * and in the PQ we put the IB packet + sync packet(s). ++ */ ++ status = kq->ops.acquire_packet_buffer(kq, pq_packets_size_in_bytes / sizeof(uint32_t), &ib_packet_buff); ++ if (status != 0) { ++ pr_debug("Error! kfd: In func %s >> acquire_packet_buffer failed\n", __func__); ++ break; ++ } + +- ib_packet->header.count = 3; +- ib_packet->header.opcode = IT_INDIRECT_BUFFER_PASID; +- ib_packet->header.type = PM4_TYPE_3; ++ memset(ib_packet_buff, 0, pq_packets_size_in_bytes); + +- largep = (union ULARGE_INTEGER *) &vmid0_address; ++ ib_packet = (struct pm4__indirect_buffer_pasid *) (ib_packet_buff); + +- ib_packet->bitfields2.ib_base_lo = largep->u.low_part >> 2; +- ib_packet->bitfields3.ib_base_hi = largep->u.high_part; ++ ib_packet->header.count = 3; ++ ib_packet->header.opcode = IT_INDIRECT_BUFFER_PASID; ++ ib_packet->header.type = PM4_TYPE_3; + +- ib_packet->control = (1 << 23) | (1 << 31) | +- ((size_in_bytes / sizeof(uint32_t)) & 0xfffff); ++ largep = (union ULARGE_INTEGER *) &vmid0_address; + +- ib_packet->bitfields5.pasid = pasid; ++ ib_packet->bitfields2.ib_base_lo = largep->u.low_part >> 2; ++ ib_packet->bitfields3.ib_base_hi = largep->u.high_part; + +- /* +- * for now we use release mem for GPU-CPU synchronization +- * Consider WaitRegMem + WriteData as a better alternative +- * we get a GART allocations ( gpu/cpu mapping), +- * for the sync variable, and wait until: +- * (a) Sync with HW +- * (b) Sync var is written by CP to mem. 
+- */ +- rm_packet = (struct pm4__release_mem *) (ib_packet_buff + +- (sizeof(struct pm4__indirect_buffer_pasid) / +- sizeof(unsigned int))); ++ ib_packet->control = (1 << 23) | (1 << 31) | ++ ((size_in_bytes / sizeof(uint32_t)) & 0xfffff); + +- status = kfd_gtt_sa_allocate(dbgdev->dev, sizeof(uint64_t), +- &mem_obj); ++ ib_packet->bitfields5.pasid = pasid; + +- if (status != 0) { +- pr_err("amdkfd: Failed to allocate GART memory\n"); +- kq->ops.rollback_packet(kq); +- return status; +- } ++ /* ++ * for now we use release mem for GPU-CPU synchronization ++ * Consider WaitRegMem + WriteData as a better alternative ++ * we get a GART allocations ( gpu/cpu mapping), ++ * for the sync variable, and wait until: ++ * (a) Sync with HW ++ * (b) Sync var is written by CP to mem. ++ */ ++ rm_packet = (struct pm4__release_mem *) (ib_packet_buff + ++ (sizeof(struct pm4__indirect_buffer_pasid) / sizeof(unsigned int))); + +- rm_state = (uint64_t *) mem_obj->cpu_ptr; ++ status = kfd_gtt_sa_allocate(dbgdev->dev, sizeof(uint64_t), ++ &mem_obj); + +- *rm_state = QUEUESTATE__ACTIVE_COMPLETION_PENDING; ++ if (status == 0) { + +- rm_packet->header.opcode = IT_RELEASE_MEM; +- rm_packet->header.type = PM4_TYPE_3; +- rm_packet->header.count = sizeof(struct pm4__release_mem) / +- sizeof(unsigned int) - 2; ++ rm_state = (uint64_t *) mem_obj->cpu_ptr; + +- rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; +- rm_packet->bitfields2.event_index = +- event_index___release_mem__end_of_pipe; ++ *rm_state = QUEUESTATE__ACTIVE_COMPLETION_PENDING; + +- rm_packet->bitfields2.cache_policy = cache_policy___release_mem__lru; +- rm_packet->bitfields2.atc = 0; +- rm_packet->bitfields2.tc_wb_action_ena = 1; ++ rm_packet->header.opcode = IT_RELEASE_MEM; ++ rm_packet->header.type = PM4_TYPE_3; ++ rm_packet->header.count = sizeof(struct pm4__release_mem) / sizeof(unsigned int) - 2; + +- addr.quad_part = mem_obj->gpu_addr; ++ rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; ++ rm_packet->bitfields2.event_index = event_index___release_mem__end_of_pipe; ++ rm_packet->bitfields2.cache_policy = cache_policy___release_mem__lru; ++ rm_packet->bitfields2.atc = 0; ++ rm_packet->bitfields2.tc_wb_action_ena = 1; + +- rm_packet->bitfields4.address_lo_32b = addr.u.low_part >> 2; +- rm_packet->address_hi = addr.u.high_part; ++ addr.quad_part = mem_obj->gpu_addr; + +- rm_packet->bitfields3.data_sel = +- data_sel___release_mem__send_64_bit_data; ++ rm_packet->bitfields4.address_lo_32b = addr.u.low_part >> 2; ++ rm_packet->address_hi = addr.u.high_part; + +- rm_packet->bitfields3.int_sel = +- int_sel___release_mem__send_data_after_write_confirm; ++ rm_packet->bitfields3.data_sel = data_sel___release_mem__send_64_bit_data; ++ rm_packet->bitfields3.int_sel = int_sel___release_mem__send_data_after_write_confirm; ++ rm_packet->bitfields3.dst_sel = dst_sel___release_mem__memory_controller; + +- rm_packet->bitfields3.dst_sel = +- dst_sel___release_mem__memory_controller; ++ rm_packet->data_lo = QUEUESTATE__ACTIVE; + +- rm_packet->data_lo = QUEUESTATE__ACTIVE; ++ kq->ops.submit_packet(kq); + +- kq->ops.submit_packet(kq); ++ /* Wait till CP writes sync code: */ + +- /* Wait till CP writes sync code: */ +- status = amdkfd_fence_wait_timeout( +- (unsigned int *) rm_state, +- QUEUESTATE__ACTIVE, 1500); ++ status = amdkfd_fence_wait_timeout( ++ (unsigned int *) rm_state, ++ QUEUESTATE__ACTIVE, 1500); ++ ++ } else { ++ pr_debug("Error! 
kfd: In func %s >> failed to allocate GART memory\n", __func__); ++ } ++ } while (false); + +- kfd_gtt_sa_free(dbgdev->dev, mem_obj); ++ if (rm_state != NULL) ++ kfd_gtt_sa_free(dbgdev->dev, mem_obj); + + return status; + } + + static int dbgdev_register_nodiq(struct kfd_dbgdev *dbgdev) + { +- BUG_ON(!dbgdev); +- +- /* +- * no action is needed in this case, +- * just make sure diq will not be used +- */ ++ /* no action is needed in this case, just make sure diq will not be used */ + + dbgdev->kq = NULL; + +@@ -182,57 +169,68 @@ static int dbgdev_register_nodiq(struct kfd_dbgdev *dbgdev) + + static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev) + { ++ ++ int status = 0; ++ struct kernel_queue *kq = NULL; + struct queue_properties properties; + unsigned int qid; +- struct kernel_queue *kq = NULL; +- int status; ++ struct process_queue_manager *pqm = dbgdev->pqm; + +- BUG_ON(!dbgdev || !dbgdev->pqm || !dbgdev->dev); ++ do { + +- status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL, +- &properties, 0, KFD_QUEUE_TYPE_DIQ, +- &qid); ++ if (!pqm) { ++ pr_debug("Error! kfd: In func %s >> No PQM\n", __func__); ++ status = -EFAULT; ++ break; ++ } + +- if (status) { +- pr_err("amdkfd: Failed to create DIQ\n"); +- return status; +- } ++ properties.type = KFD_QUEUE_TYPE_DIQ; + +- pr_debug("DIQ Created with queue id: %d\n", qid); ++ status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL, ++ &properties, &qid); + +- kq = pqm_get_kernel_queue(dbgdev->pqm, qid); ++ if (status != 0) { ++ pr_debug("Error! kfd: In func %s >> Create Queue failed\n", __func__); ++ break; ++ } + +- if (kq == NULL) { +- pr_err("amdkfd: Error getting DIQ\n"); +- pqm_destroy_queue(dbgdev->pqm, qid); +- return -EFAULT; +- } ++ pr_debug("kfd: DIQ Created with queue id: %d\n", qid); ++ ++ kq = pqm_get_kernel_queue(dbgdev->pqm, qid); ++ ++ if (kq == NULL) { ++ pr_debug("Error! 
kfd: In func %s >> Error getting Kernel Queue\n", __func__); ++ status = -ENOMEM; ++ break; ++ } ++ ++ dbgdev->kq = kq; + +- dbgdev->kq = kq; ++ } while (false); + + return status; + } + + static int dbgdev_unregister_nodiq(struct kfd_dbgdev *dbgdev) + { +- BUG_ON(!dbgdev || !dbgdev->dev); +- + /* disable watch address */ ++ + dbgdev_address_watch_disable_nodiq(dbgdev->dev); + return 0; + } + + static int dbgdev_unregister_diq(struct kfd_dbgdev *dbgdev) + { +- /* todo - disable address watch */ +- int status; +- +- BUG_ON(!dbgdev || !dbgdev->pqm || !dbgdev->kq); +- +- status = pqm_destroy_queue(dbgdev->pqm, +- dbgdev->kq->queue->properties.queue_id); +- dbgdev->kq = NULL; +- ++ /* todo - if needed, kill wavefronts and disable watch */ ++ int status = 0; ++ if ((dbgdev == NULL) || (dbgdev->pqm == NULL) || (dbgdev->kq == NULL)) { ++ pr_debug("kfd Err:In func %s >> can't destroy diq\n", __func__); ++ status = -EFAULT; ++ } else { ++ pqm_destroy_queue(dbgdev->pqm, ++ dbgdev->kq->queue->properties.queue_id); ++ dbgdev->kq = NULL; ++ } + return status; + } + +@@ -241,341 +239,350 @@ static void dbgdev_address_watch_set_registers( + union TCP_WATCH_ADDR_H_BITS *addrHi, + union TCP_WATCH_ADDR_L_BITS *addrLo, + union TCP_WATCH_CNTL_BITS *cntl, +- unsigned int index, unsigned int vmid) ++ unsigned int index, unsigned int vmid, ++ unsigned int asic_family) + { + union ULARGE_INTEGER addr; + +- BUG_ON(!adw_info || !addrHi || !addrLo || !cntl); +- + addr.quad_part = 0; + addrHi->u32All = 0; + addrLo->u32All = 0; + cntl->u32All = 0; + + if (adw_info->watch_mask != NULL) +- cntl->bitfields.mask = +- (uint32_t) (adw_info->watch_mask[index] & +- ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK); ++ cntl->bitfields.mask = (uint32_t) (adw_info->watch_mask[index] & ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK); + else + cntl->bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK; + + addr.quad_part = (unsigned long long) adw_info->watch_address[index]; + +- addrHi->bitfields.addr = addr.u.high_part & +- ADDRESS_WATCH_REG_ADDHIGH_MASK; ++ addrHi->bitfields.addr = addr.u.high_part & ADDRESS_WATCH_REG_ADDHIGH_MASK; + addrLo->bitfields.addr = + (addr.u.low_part >> ADDRESS_WATCH_REG_ADDLOW_SHIFT); + + cntl->bitfields.mode = adw_info->watch_mode[index]; + cntl->bitfields.vmid = (uint32_t) vmid; +- /* for now assume it is an ATC address */ +- cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT; +- ++ /* for APU assume it is an ATC address. 
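(The packing a few lines up splits one 64-bit watch address across two register fields through the ULARGE_INTEGER union; schematically — a sketch reusing the driver's own ADDRESS_WATCH_REG_* names, with watch_address standing in for adw_info->watch_address[i]:)

    static void split_watch_address(uint64_t watch_address,
                                    uint32_t *hi, uint32_t *lo)
    {
            union ULARGE_INTEGER addr;

            addr.quad_part = watch_address;
            *hi = addr.u.high_part & ADDRESS_WATCH_REG_ADDHIGH_MASK;
            *lo = addr.u.low_part >> ADDRESS_WATCH_REG_ADDLOW_SHIFT;
    }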
*/ ++ if (KFD_IS_DGPU(asic_family) == false) ++ cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT; + pr_debug("\t\t%20s %08x\n", "set reg mask :", cntl->bitfields.mask); +- pr_debug("\t\t%20s %08x\n", "set reg add high :", +- addrHi->bitfields.addr); +- pr_debug("\t\t%20s %08x\n", "set reg add low :", +- addrLo->bitfields.addr); ++ pr_debug("\t\t%20s %08x\n", "set reg add high :", addrHi->bitfields.addr); ++ pr_debug("\t\t%20s %08x\n", "set reg add low :", addrLo->bitfields.addr); ++ + } + + static int dbgdev_address_watch_nodiq(struct kfd_dbgdev *dbgdev, + struct dbg_address_watch_info *adw_info) + { ++ ++ int status = 0; ++ + union TCP_WATCH_ADDR_H_BITS addrHi; + union TCP_WATCH_ADDR_L_BITS addrLo; + union TCP_WATCH_CNTL_BITS cntl; +- struct kfd_process_device *pdd; ++ ++ unsigned int vmid; + unsigned int i; + +- BUG_ON(!dbgdev || !dbgdev->dev || !adw_info); ++ struct kfd_process_device *pdd; + +- /* taking the vmid for that process on the safe way using pdd */ +- pdd = kfd_get_process_device_data(dbgdev->dev, +- adw_info->process); +- if (!pdd) { +- pr_err("amdkfd: Failed to get pdd for wave control no DIQ\n"); +- return -EFAULT; +- } ++ do { ++ /* taking the vmid for that process on the safe way using pdd */ ++ pdd = kfd_get_process_device_data(dbgdev->dev, ++ adw_info->process); ++ if (!pdd) { ++ pr_debug("Error! kfd: In func %s >> no PDD available\n", __func__); ++ status = -EFAULT; ++ break; ++ } + +- addrHi.u32All = 0; +- addrLo.u32All = 0; +- cntl.u32All = 0; ++ addrHi.u32All = 0; ++ addrLo.u32All = 0; ++ cntl.u32All = 0; + +- if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) || +- (adw_info->num_watch_points == 0)) { +- pr_err("amdkfd: num_watch_points is invalid\n"); +- return -EINVAL; +- } ++ vmid = pdd->qpd.vmid; + +- if ((adw_info->watch_mode == NULL) || +- (adw_info->watch_address == NULL)) { +- pr_err("amdkfd: adw_info fields are not valid\n"); +- return -EINVAL; +- } ++ if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) ++ || (adw_info->num_watch_points == 0)) { ++ status = -EINVAL; ++ break; ++ } + +- for (i = 0 ; i < adw_info->num_watch_points ; i++) { +- dbgdev_address_watch_set_registers(adw_info, &addrHi, &addrLo, +- &cntl, i, pdd->qpd.vmid); +- +- pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); +- pr_debug("\t\t%20s %08x\n", "register index :", i); +- pr_debug("\t\t%20s %08x\n", "vmid is :", pdd->qpd.vmid); +- pr_debug("\t\t%20s %08x\n", "Address Low is :", +- addrLo.bitfields.addr); +- pr_debug("\t\t%20s %08x\n", "Address high is :", +- addrHi.bitfields.addr); +- pr_debug("\t\t%20s %08x\n", "Address high is :", +- addrHi.bitfields.addr); +- pr_debug("\t\t%20s %08x\n", "Control Mask is :", +- cntl.bitfields.mask); +- pr_debug("\t\t%20s %08x\n", "Control Mode is :", +- cntl.bitfields.mode); +- pr_debug("\t\t%20s %08x\n", "Control Vmid is :", +- cntl.bitfields.vmid); +- pr_debug("\t\t%20s %08x\n", "Control atc is :", +- cntl.bitfields.atc); +- pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); +- +- pdd->dev->kfd2kgd->address_watch_execute( +- dbgdev->dev->kgd, +- i, +- cntl.u32All, +- addrHi.u32All, +- addrLo.u32All); +- } ++ if ((adw_info->watch_mode == NULL) || (adw_info->watch_address == NULL)) { ++ status = -EINVAL; ++ break; ++ } + +- return 0; ++ for (i = 0; i < adw_info->num_watch_points; i++) { ++ ++ dbgdev_address_watch_set_registers( ++ adw_info, ++ &addrHi, ++ &addrLo, ++ &cntl, ++ i, ++ vmid, ++ dbgdev->dev->device_info->asic_family ++ ); ++ ++ pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); ++ 
pr_debug("\t\t%20s %08x\n", "register index :", i); ++ pr_debug("\t\t%20s %08x\n", "vmid is :", vmid); ++ pr_debug("\t\t%20s %08x\n", "Address Low is :", addrLo.bitfields.addr); ++ pr_debug("\t\t%20s %08x\n", "Address high is :", addrHi.bitfields.addr); ++ pr_debug("\t\t%20s %08x\n", "Address high is :", addrHi.bitfields.addr); ++ pr_debug("\t\t%20s %08x\n", "Control Mask is :", cntl.bitfields.mask); ++ pr_debug("\t\t%20s %08x\n", "Control Mode is :", cntl.bitfields.mode); ++ pr_debug("\t\t%20s %08x\n", "Control Vmid is :", cntl.bitfields.vmid); ++ pr_debug("\t\t%20s %08x\n", "Control atc is :", cntl.bitfields.atc); ++ pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); ++ ++ pdd->dev->kfd2kgd->address_watch_execute( ++ dbgdev->dev->kgd, ++ i, ++ cntl.u32All, ++ addrHi.u32All, ++ addrLo.u32All); ++ } ++ ++ } while (false); ++ ++ return status; + } + + static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, + struct dbg_address_watch_info *adw_info) + { +- struct pm4__set_config_reg *packets_vec; ++ ++ int status = 0; ++ unsigned int i = 0; + union TCP_WATCH_ADDR_H_BITS addrHi; + union TCP_WATCH_ADDR_L_BITS addrLo; + union TCP_WATCH_CNTL_BITS cntl; +- struct kfd_mem_obj *mem_obj; +- unsigned int aw_reg_add_dword; +- uint32_t *packet_buff_uint; +- unsigned int i; +- int status; +- size_t ib_size = sizeof(struct pm4__set_config_reg) * 4; ++ + /* we do not control the vmid in DIQ mode, just a place holder */ + unsigned int vmid = 0; + +- BUG_ON(!dbgdev || !dbgdev->dev || !adw_info); ++ struct kfd_mem_obj *mem_obj; ++ uint32_t *packet_buff_uint = NULL; ++ ++ struct pm4__set_config_reg *packets_vec = NULL; ++ ++ size_t ib_size = sizeof(struct pm4__set_config_reg) * 4; ++ ++ unsigned int aw_reg_add_dword; + + addrHi.u32All = 0; + addrLo.u32All = 0; + cntl.u32All = 0; + +- if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) || +- (adw_info->num_watch_points == 0)) { +- pr_err("amdkfd: num_watch_points is invalid\n"); +- return -EINVAL; +- } ++ do { + +- if ((NULL == adw_info->watch_mode) || +- (NULL == adw_info->watch_address)) { +- pr_err("amdkfd: adw_info fields are not valid\n"); +- return -EINVAL; +- } ++ if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) || (adw_info->num_watch_points == 0)) { ++ status = -EINVAL; ++ break; ++ } ++ ++ if ((NULL == adw_info->watch_mode) || (NULL == adw_info->watch_address)) { ++ status = -EINVAL; ++ break; ++ } + +- status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); ++ status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); + +- if (status != 0) { +- pr_err("amdkfd: Failed to allocate GART memory\n"); +- return status; +- } ++ if (status != 0) ++ break; + +- packet_buff_uint = mem_obj->cpu_ptr; +- +- memset(packet_buff_uint, 0, ib_size); +- +- packets_vec = (struct pm4__set_config_reg *) (packet_buff_uint); +- +- packets_vec[0].header.count = 1; +- packets_vec[0].header.opcode = IT_SET_CONFIG_REG; +- packets_vec[0].header.type = PM4_TYPE_3; +- packets_vec[0].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET; +- packets_vec[0].bitfields2.insert_vmid = 1; +- packets_vec[1].ordinal1 = packets_vec[0].ordinal1; +- packets_vec[1].bitfields2.insert_vmid = 0; +- packets_vec[2].ordinal1 = packets_vec[0].ordinal1; +- packets_vec[2].bitfields2.insert_vmid = 0; +- packets_vec[3].ordinal1 = packets_vec[0].ordinal1; +- packets_vec[3].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET; +- packets_vec[3].bitfields2.insert_vmid = 1; +- +- for (i = 0; i < adw_info->num_watch_points; i++) { +- dbgdev_address_watch_set_registers(adw_info, 
+- &addrHi, +- &addrLo, +- &cntl, +- i, +- vmid); +- +- pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); +- pr_debug("\t\t%20s %08x\n", "register index :", i); +- pr_debug("\t\t%20s %08x\n", "vmid is :", vmid); +- pr_debug("\t\t%20s %p\n", "Add ptr is :", +- adw_info->watch_address); +- pr_debug("\t\t%20s %08llx\n", "Add is :", +- adw_info->watch_address[i]); +- pr_debug("\t\t%20s %08x\n", "Address Low is :", +- addrLo.bitfields.addr); +- pr_debug("\t\t%20s %08x\n", "Address high is :", +- addrHi.bitfields.addr); +- pr_debug("\t\t%20s %08x\n", "Control Mask is :", +- cntl.bitfields.mask); +- pr_debug("\t\t%20s %08x\n", "Control Mode is :", +- cntl.bitfields.mode); +- pr_debug("\t\t%20s %08x\n", "Control Vmid is :", +- cntl.bitfields.vmid); +- pr_debug("\t\t%20s %08x\n", "Control atc is :", +- cntl.bitfields.atc); +- pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); +- +- aw_reg_add_dword = +- dbgdev->dev->kfd2kgd->address_watch_get_offset( +- dbgdev->dev->kgd, +- i, +- ADDRESS_WATCH_REG_CNTL); ++ packet_buff_uint = mem_obj->cpu_ptr; ++ ++ memset(packet_buff_uint, 0, ib_size); + +- aw_reg_add_dword /= sizeof(uint32_t); ++ packets_vec = (struct pm4__set_config_reg *) (packet_buff_uint); + +- packets_vec[0].bitfields2.reg_offset = +- aw_reg_add_dword - AMD_CONFIG_REG_BASE; ++ packets_vec[0].header.count = 1; ++ packets_vec[0].header.opcode = IT_SET_CONFIG_REG; ++ packets_vec[0].header.type = PM4_TYPE_3; ++ packets_vec[0].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET; ++ packets_vec[0].bitfields2.insert_vmid = 1; ++ packets_vec[1].ordinal1 = packets_vec[0].ordinal1; ++ packets_vec[1].bitfields2.insert_vmid = 0; ++ packets_vec[2].ordinal1 = packets_vec[0].ordinal1; ++ packets_vec[2].bitfields2.insert_vmid = 0; ++ packets_vec[3].ordinal1 = packets_vec[0].ordinal1; ++ packets_vec[3].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET; ++ packets_vec[3].bitfields2.insert_vmid = 1; + +- packets_vec[0].reg_data[0] = cntl.u32All; ++ for (i = 0; i < adw_info->num_watch_points; i++) { + +- aw_reg_add_dword = +- dbgdev->dev->kfd2kgd->address_watch_get_offset( +- dbgdev->dev->kgd, ++ dbgdev_address_watch_set_registers( ++ adw_info, ++ &addrHi, ++ &addrLo, ++ &cntl, + i, +- ADDRESS_WATCH_REG_ADDR_HI); ++ vmid, ++ dbgdev->dev->device_info->asic_family ++ ); ++ ++ pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); ++ pr_debug("\t\t%20s %08x\n", "register index :", i); ++ pr_debug("\t\t%20s %08x\n", "vmid is :", vmid); ++ pr_debug("\t\t%20s %p\n", "Add ptr is :", adw_info->watch_address); ++ pr_debug("\t\t%20s %08llx\n", "Add is :", adw_info->watch_address[i]); ++ pr_debug("\t\t%20s %08x\n", "Address Low is :", addrLo.bitfields.addr); ++ pr_debug("\t\t%20s %08x\n", "Address high is :", addrHi.bitfields.addr); ++ pr_debug("\t\t%20s %08x\n", "Control Mask is :", cntl.bitfields.mask); ++ pr_debug("\t\t%20s %08x\n", "Control Mode is :", cntl.bitfields.mode); ++ pr_debug("\t\t%20s %08x\n", "Control Vmid is :", cntl.bitfields.vmid); ++ pr_debug("\t\t%20s %08x\n", "Control atc is :", cntl.bitfields.atc); ++ pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); ++ ++ aw_reg_add_dword = ++ dbgdev->dev->kfd2kgd ++ ->address_watch_get_offset( ++ dbgdev->dev->kgd, ++ i, ++ ADDRESS_WATCH_REG_CNTL); ++ ++ packets_vec[0].bitfields2.reg_offset = aw_reg_add_dword - CONFIG_REG_BASE; ++ packets_vec[0].reg_data[0] = cntl.u32All; + +- aw_reg_add_dword /= sizeof(uint32_t); ++ aw_reg_add_dword = ++ dbgdev->dev->kfd2kgd ++ ->address_watch_get_offset( ++ dbgdev->dev->kgd, ++ i, ++ 
ADDRESS_WATCH_REG_ADDR_HI); + +- packets_vec[1].bitfields2.reg_offset = +- aw_reg_add_dword - AMD_CONFIG_REG_BASE; +- packets_vec[1].reg_data[0] = addrHi.u32All; + +- aw_reg_add_dword = +- dbgdev->dev->kfd2kgd->address_watch_get_offset( +- dbgdev->dev->kgd, +- i, +- ADDRESS_WATCH_REG_ADDR_LO); ++ packets_vec[1].bitfields2.reg_offset = aw_reg_add_dword - CONFIG_REG_BASE; ++ packets_vec[1].reg_data[0] = addrHi.u32All; + +- aw_reg_add_dword /= sizeof(uint32_t); ++ aw_reg_add_dword = ++ dbgdev->dev->kfd2kgd ++ ->address_watch_get_offset( ++ dbgdev->dev->kgd, ++ i, ++ ADDRESS_WATCH_REG_ADDR_LO); + +- packets_vec[2].bitfields2.reg_offset = +- aw_reg_add_dword - AMD_CONFIG_REG_BASE; +- packets_vec[2].reg_data[0] = addrLo.u32All; + +- /* enable watch flag if address is not zero*/ +- if (adw_info->watch_address[i] > 0) +- cntl.bitfields.valid = 1; +- else +- cntl.bitfields.valid = 0; ++ packets_vec[2].bitfields2.reg_offset = aw_reg_add_dword - CONFIG_REG_BASE; ++ packets_vec[2].reg_data[0] = addrLo.u32All; + +- aw_reg_add_dword = +- dbgdev->dev->kfd2kgd->address_watch_get_offset( +- dbgdev->dev->kgd, +- i, +- ADDRESS_WATCH_REG_CNTL); ++ /* enable watch flag if address is not zero*/ ++ if (adw_info->watch_address[i] > 0) ++ cntl.bitfields.valid = 1; ++ else ++ cntl.bitfields.valid = 0; + +- aw_reg_add_dword /= sizeof(uint32_t); ++ aw_reg_add_dword = ++ dbgdev->dev->kfd2kgd ++ ->address_watch_get_offset( ++ dbgdev->dev->kgd, ++ i, ++ ADDRESS_WATCH_REG_CNTL); + +- packets_vec[3].bitfields2.reg_offset = +- aw_reg_add_dword - AMD_CONFIG_REG_BASE; +- packets_vec[3].reg_data[0] = cntl.u32All; + +- status = dbgdev_diq_submit_ib( +- dbgdev, +- adw_info->process->pasid, +- mem_obj->gpu_addr, +- packet_buff_uint, +- ib_size); ++ packets_vec[3].bitfields2.reg_offset = aw_reg_add_dword - CONFIG_REG_BASE; ++ packets_vec[3].reg_data[0] = cntl.u32All; ++ ++ status = dbgdev_diq_submit_ib( ++ dbgdev, ++ adw_info->process->pasid, ++ mem_obj->gpu_addr, ++ packet_buff_uint, ++ ib_size); ++ ++ if (status != 0) { ++ pr_debug("Error! 
kfd: In func %s >> failed to submit DIQ packet\n", __func__); ++ break; ++ } + +- if (status != 0) { +- pr_err("amdkfd: Failed to submit IB to DIQ\n"); +- break; + } +- } + +- kfd_gtt_sa_free(dbgdev->dev, mem_obj); ++ } while (false); ++ if (packet_buff_uint != NULL) ++ kfd_gtt_sa_free(dbgdev->dev, mem_obj); ++ + return status; ++ + } + + static int dbgdev_wave_control_set_registers( + struct dbg_wave_control_info *wac_info, + union SQ_CMD_BITS *in_reg_sq_cmd, +- union GRBM_GFX_INDEX_BITS *in_reg_gfx_index) ++ union GRBM_GFX_INDEX_BITS *in_reg_gfx_index, ++ unsigned int asic_family) + { + int status = 0; + union SQ_CMD_BITS reg_sq_cmd; + union GRBM_GFX_INDEX_BITS reg_gfx_index; +- struct HsaDbgWaveMsgAMDGen2 *pMsg; +- +- BUG_ON(!wac_info || !in_reg_sq_cmd || !in_reg_gfx_index); + + reg_sq_cmd.u32All = 0; ++ + reg_gfx_index.u32All = 0; +- pMsg = &wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2; + + switch (wac_info->mode) { +- /* Send command to single wave */ +- case HSA_DBG_WAVEMODE_SINGLE: +- /* +- * Limit access to the process waves only, +- * by setting vmid check +- */ ++ case HSA_DBG_WAVEMODE_SINGLE: /* Send command to single wave */ ++ /*limit access to the process waves only,by setting vmid check */ + reg_sq_cmd.bits.check_vmid = 1; +- reg_sq_cmd.bits.simd_id = pMsg->ui32.SIMD; +- reg_sq_cmd.bits.wave_id = pMsg->ui32.WaveId; ++ reg_sq_cmd.bits.simd_id = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.SIMD; ++ reg_sq_cmd.bits.wave_id = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.WaveId; + reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_SINGLE; + +- reg_gfx_index.bits.sh_index = pMsg->ui32.ShaderArray; +- reg_gfx_index.bits.se_index = pMsg->ui32.ShaderEngine; +- reg_gfx_index.bits.instance_index = pMsg->ui32.HSACU; ++ reg_gfx_index.bits.sh_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.ShaderArray; ++ reg_gfx_index.bits.se_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.ShaderEngine; ++ reg_gfx_index.bits.instance_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.HSACU; + + break; + +- /* Send command to all waves with matching VMID */ +- case HSA_DBG_WAVEMODE_BROADCAST_PROCESS: ++ case HSA_DBG_WAVEMODE_BROADCAST_PROCESS: /* Send command to all waves with matching VMID */ ++ + + reg_gfx_index.bits.sh_broadcast_writes = 1; + reg_gfx_index.bits.se_broadcast_writes = 1; + reg_gfx_index.bits.instance_broadcast_writes = 1; + + reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_BROADCAST; +- + break; + +- /* Send command to all CU waves with matching VMID */ +- case HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU: ++ case HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU: /* Send command to all CU waves with matching VMID */ + + reg_sq_cmd.bits.check_vmid = 1; + reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_BROADCAST; + +- reg_gfx_index.bits.sh_index = pMsg->ui32.ShaderArray; +- reg_gfx_index.bits.se_index = pMsg->ui32.ShaderEngine; +- reg_gfx_index.bits.instance_index = pMsg->ui32.HSACU; ++ reg_gfx_index.bits.sh_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.ShaderArray; ++ reg_gfx_index.bits.se_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.ShaderEngine; ++ reg_gfx_index.bits.instance_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.HSACU; + + break; + + default: +- return -EINVAL; ++ status = -EINVAL; ++ break; + } + + switch (wac_info->operand) { + case HSA_DBG_WAVEOP_HALT: +- reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT; ++ if (asic_family == CHIP_KAVERI) { ++ reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT; ++ pr_debug("kfd:dbgdev: halting 
KV\n"); ++ } else { ++ reg_sq_cmd.bits_sethalt.cmd = SQ_IND_CMD_NEW_SETHALT; ++ reg_sq_cmd.bits_sethalt.data = SQ_IND_CMD_DATA_HALT; ++ pr_debug("kfd:dbgdev: halting CZ\n"); ++ } + break; + + case HSA_DBG_WAVEOP_RESUME: +- reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME; ++ if (asic_family == CHIP_KAVERI) { ++ reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME; ++ pr_debug("kfd:dbgdev: resuming KV\n"); ++ } else { ++ reg_sq_cmd.bits_sethalt.cmd = SQ_IND_CMD_NEW_SETHALT; ++ reg_sq_cmd.bits_sethalt.data = SQ_IND_CMD_DATA_RESUME; ++ pr_debug("kfd:dbgdev: resuming CZ\n"); ++ } + break; + + case HSA_DBG_WAVEOP_KILL: +@@ -601,128 +608,114 @@ static int dbgdev_wave_control_set_registers( + } + + if (status == 0) { +- *in_reg_sq_cmd = reg_sq_cmd; ++ *in_reg_sq_cmd = reg_sq_cmd; + *in_reg_gfx_index = reg_gfx_index; + } +- + return status; ++ + } + + static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, + struct dbg_wave_control_info *wac_info) + { + +- int status; ++ int status = 0; + union SQ_CMD_BITS reg_sq_cmd; + union GRBM_GFX_INDEX_BITS reg_gfx_index; + struct kfd_mem_obj *mem_obj; +- uint32_t *packet_buff_uint; +- struct pm4__set_config_reg *packets_vec; ++ uint32_t *packet_buff_uint = NULL; ++ struct pm4__set_config_reg *packets_vec = NULL; + size_t ib_size = sizeof(struct pm4__set_config_reg) * 3; + +- BUG_ON(!dbgdev || !wac_info); +- + reg_sq_cmd.u32All = 0; ++ do { + +- status = dbgdev_wave_control_set_registers(wac_info, ®_sq_cmd, +- ®_gfx_index); +- if (status) { +- pr_err("amdkfd: Failed to set wave control registers\n"); +- return status; +- } +- +- /* we do not control the VMID in DIQ,so reset it to a known value */ +- reg_sq_cmd.bits.vm_id = 0; +- +- pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); +- +- pr_debug("\t\t mode is: %u\n", wac_info->mode); +- pr_debug("\t\t operand is: %u\n", wac_info->operand); +- pr_debug("\t\t trap id is: %u\n", wac_info->trapId); +- pr_debug("\t\t msg value is: %u\n", +- wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); +- pr_debug("\t\t vmid is: N/A\n"); +- +- pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid); +- pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd); +- pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id); +- pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id); +- pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode); +- pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id); +- pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id); +- +- pr_debug("\t\t ibw is : %u\n", +- reg_gfx_index.bitfields.instance_broadcast_writes); +- pr_debug("\t\t ii is : %u\n", +- reg_gfx_index.bitfields.instance_index); +- pr_debug("\t\t sebw is : %u\n", +- reg_gfx_index.bitfields.se_broadcast_writes); +- pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index); +- pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); +- pr_debug("\t\t sbw is : %u\n", +- reg_gfx_index.bitfields.sh_broadcast_writes); +- +- pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); +- +- status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); +- +- if (status != 0) { +- pr_err("amdkfd: Failed to allocate GART memory\n"); +- return status; +- } +- +- packet_buff_uint = mem_obj->cpu_ptr; ++ status = dbgdev_wave_control_set_registers(wac_info, ++ ®_sq_cmd, ++ ®_gfx_index, ++ dbgdev->dev->device_info->asic_family); + +- memset(packet_buff_uint, 0, ib_size); ++ /* we do not control the VMID in DIQ,so reset it to a known value */ ++ 
reg_sq_cmd.bits.vm_id = 0; ++ if (status != 0) ++ break; ++ pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); ++ ++ pr_debug("\t\t mode is: %u\n", wac_info->mode); ++ pr_debug("\t\t operand is: %u\n", wac_info->operand); ++ pr_debug("\t\t trap id is: %u\n", wac_info->trapId); ++ pr_debug("\t\t msg value is: %u\n", wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); ++ pr_debug("\t\t vmid is: N/A\n"); ++ ++ pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid); ++ pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd); ++ pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id); ++ pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id); ++ pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode); ++ pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id); ++ pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id); ++ ++ pr_debug("\t\t ibw is : %u\n", reg_gfx_index.bitfields.instance_broadcast_writes); ++ pr_debug("\t\t ii is : %u\n", reg_gfx_index.bitfields.instance_index); ++ pr_debug("\t\t sebw is : %u\n", reg_gfx_index.bitfields.se_broadcast_writes); ++ pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index); ++ pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); ++ pr_debug("\t\t sbw is : %u\n", reg_gfx_index.bitfields.sh_broadcast_writes); ++ ++ pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); ++ ++ status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); ++ ++ if (status != 0) ++ break; + +- packets_vec = (struct pm4__set_config_reg *) packet_buff_uint; +- packets_vec[0].header.count = 1; +- packets_vec[0].header.opcode = IT_SET_UCONFIG_REG; +- packets_vec[0].header.type = PM4_TYPE_3; +- packets_vec[0].bitfields2.reg_offset = +- GRBM_GFX_INDEX / (sizeof(uint32_t)) - +- USERCONFIG_REG_BASE; ++ packet_buff_uint = mem_obj->cpu_ptr; + +- packets_vec[0].bitfields2.insert_vmid = 0; +- packets_vec[0].reg_data[0] = reg_gfx_index.u32All; ++ memset(packet_buff_uint, 0, ib_size); + +- packets_vec[1].header.count = 1; +- packets_vec[1].header.opcode = IT_SET_CONFIG_REG; +- packets_vec[1].header.type = PM4_TYPE_3; +- packets_vec[1].bitfields2.reg_offset = SQ_CMD / (sizeof(uint32_t)) - +- AMD_CONFIG_REG_BASE; ++ packets_vec = (struct pm4__set_config_reg *) packet_buff_uint; ++ packets_vec[0].header.count = 1; ++ packets_vec[0].header.opcode = IT_SET_UCONFIG_REG; ++ packets_vec[0].header.type = PM4_TYPE_3; ++ packets_vec[0].bitfields2.reg_offset = GRBM_GFX_INDEX / (sizeof(uint32_t)) - USERCONFIG_REG_BASE; ++ packets_vec[0].bitfields2.insert_vmid = 0; ++ packets_vec[0].reg_data[0] = reg_gfx_index.u32All; + +- packets_vec[1].bitfields2.vmid_shift = SQ_CMD_VMID_OFFSET; +- packets_vec[1].bitfields2.insert_vmid = 1; +- packets_vec[1].reg_data[0] = reg_sq_cmd.u32All; ++ packets_vec[1].header.count = 1; ++ packets_vec[1].header.opcode = IT_SET_CONFIG_REG; ++ packets_vec[1].header.type = PM4_TYPE_3; ++ packets_vec[1].bitfields2.reg_offset = SQ_CMD / (sizeof(uint32_t)) - CONFIG_REG_BASE; ++ packets_vec[1].bitfields2.vmid_shift = SQ_CMD_VMID_OFFSET; ++ packets_vec[1].bitfields2.insert_vmid = 1; ++ packets_vec[1].reg_data[0] = reg_sq_cmd.u32All; + +- /* Restore the GRBM_GFX_INDEX register */ ++ /* Restore the GRBM_GFX_INDEX register */ + +- reg_gfx_index.u32All = 0; +- reg_gfx_index.bits.sh_broadcast_writes = 1; +- reg_gfx_index.bits.instance_broadcast_writes = 1; +- reg_gfx_index.bits.se_broadcast_writes = 1; ++ reg_gfx_index.u32All = 0; ++ reg_gfx_index.bits.sh_broadcast_writes = 1; ++ 
reg_gfx_index.bits.instance_broadcast_writes = 1;
++ reg_gfx_index.bits.se_broadcast_writes = 1;
+
+
+- packets_vec[2].ordinal1 = packets_vec[0].ordinal1;
+- packets_vec[2].bitfields2.reg_offset =
+- GRBM_GFX_INDEX / (sizeof(uint32_t)) -
+- USERCONFIG_REG_BASE;
++ packets_vec[2].ordinal1 = packets_vec[0].ordinal1;
++ packets_vec[2].bitfields2.reg_offset = GRBM_GFX_INDEX / (sizeof(uint32_t)) - USERCONFIG_REG_BASE;
++ packets_vec[2].bitfields2.insert_vmid = 0;
++ packets_vec[2].reg_data[0] = reg_gfx_index.u32All;
+
+- packets_vec[2].bitfields2.insert_vmid = 0;
+- packets_vec[2].reg_data[0] = reg_gfx_index.u32All;
++ status = dbgdev_diq_submit_ib(
++ dbgdev,
++ wac_info->process->pasid,
++ mem_obj->gpu_addr,
++ packet_buff_uint,
++ ib_size);
+
+- status = dbgdev_diq_submit_ib(
+- dbgdev,
+- wac_info->process->pasid,
+- mem_obj->gpu_addr,
+- packet_buff_uint,
+- ib_size);
++ if (status != 0)
++ pr_debug("%s\n", " Critical Error ! Submit diq packet failed ");
+
+- if (status != 0)
+- pr_err("amdkfd: Failed to submit IB to DIQ\n");
++ } while (false);
+
+- kfd_gtt_sa_free(dbgdev->dev, mem_obj);
++ if (packet_buff_uint != NULL)
++ kfd_gtt_sa_free(dbgdev->dev, mem_obj);
+
+ return status;
+ }
+@@ -730,66 +723,69 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev,
+ static int dbgdev_wave_control_nodiq(struct kfd_dbgdev *dbgdev,
+ struct dbg_wave_control_info *wac_info)
+ {
+- int status;
++ int status = 0;
++ unsigned int vmid = 0xffff;
+ union SQ_CMD_BITS reg_sq_cmd;
+ union GRBM_GFX_INDEX_BITS reg_gfx_index;
+- struct kfd_process_device *pdd;
+
+- BUG_ON(!dbgdev || !dbgdev->dev || !wac_info);
++ struct kfd_process_device *pdd = NULL;
+
+ reg_sq_cmd.u32All = 0;
++ status = 0;
+
+ /* taking the VMID for that process on the safe way using PDD */
+ pdd = kfd_get_process_device_data(dbgdev->dev, wac_info->process);
+
+- if (!pdd) {
+- pr_err("amdkfd: Failed to get pdd for wave control no DIQ\n");
+- return -EFAULT;
+- }
+- status = dbgdev_wave_control_set_registers(wac_info, &reg_sq_cmd,
+- &reg_gfx_index);
+- if (status) {
+- pr_err("amdkfd: Failed to set wave control registers\n");
+- return status;
++ if (pdd) {
++ status = dbgdev_wave_control_set_registers(wac_info,
++ &reg_sq_cmd,
++ &reg_gfx_index,
++ dbgdev->dev->device_info->asic_family);
++ if (status == 0) {
++
++ /* for non DIQ we need to patch the VMID: */
++
++ vmid = pdd->qpd.vmid;
++ reg_sq_cmd.bits.vm_id = vmid;
++
++ pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *");
++
++ pr_debug("\t\t mode is: %u\n", wac_info->mode);
++ pr_debug("\t\t operand is: %u\n", wac_info->operand);
++ pr_debug("\t\t trap id is: %u\n", wac_info->trapId);
++ pr_debug("\t\t msg value is: %u\n", wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value);
++ pr_debug("\t\t vmid is: %u\n", vmid);
++
++ pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid);
++ pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd);
++ pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id);
++ pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id);
++ pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode);
++ pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id);
++ pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id);
++
++ pr_debug("\t\t ibw is : %u\n", reg_gfx_index.bitfields.instance_broadcast_writes);
++ pr_debug("\t\t ii is : %u\n", reg_gfx_index.bitfields.instance_index);
++ pr_debug("\t\t sebw is : %u\n", reg_gfx_index.bitfields.se_broadcast_writes);
++ pr_debug("\t\t se_ind is : %u\n",
reg_gfx_index.bitfields.se_index); ++ pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); ++ pr_debug("\t\t sbw is : %u\n", reg_gfx_index.bitfields.sh_broadcast_writes); ++ ++ pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); ++ ++ dbgdev->dev->kfd2kgd ++ ->wave_control_execute(dbgdev->dev->kgd, ++ reg_gfx_index.u32All, ++ reg_sq_cmd.u32All); ++ } else { ++ status = -EINVAL; ++ } ++ } else { ++ status = -EFAULT; + } + +- /* for non DIQ we need to patch the VMID: */ ++ return status; + +- reg_sq_cmd.bits.vm_id = pdd->qpd.vmid; +- +- pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); +- +- pr_debug("\t\t mode is: %u\n", wac_info->mode); +- pr_debug("\t\t operand is: %u\n", wac_info->operand); +- pr_debug("\t\t trap id is: %u\n", wac_info->trapId); +- pr_debug("\t\t msg value is: %u\n", +- wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); +- pr_debug("\t\t vmid is: %u\n", pdd->qpd.vmid); +- +- pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid); +- pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd); +- pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id); +- pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id); +- pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode); +- pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id); +- pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id); +- +- pr_debug("\t\t ibw is : %u\n", +- reg_gfx_index.bitfields.instance_broadcast_writes); +- pr_debug("\t\t ii is : %u\n", +- reg_gfx_index.bitfields.instance_index); +- pr_debug("\t\t sebw is : %u\n", +- reg_gfx_index.bitfields.se_broadcast_writes); +- pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index); +- pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); +- pr_debug("\t\t sbw is : %u\n", +- reg_gfx_index.bitfields.sh_broadcast_writes); +- +- pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); +- +- return dbgdev->dev->kfd2kgd->wave_control_execute(dbgdev->dev->kgd, +- reg_gfx_index.u32All, +- reg_sq_cmd.u32All); + } + + int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) +@@ -800,13 +796,8 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) + union GRBM_GFX_INDEX_BITS reg_gfx_index; + struct kfd_process_device *pdd; + struct dbg_wave_control_info wac_info; +- int temp; +- int first_vmid_to_scan = 8; +- int last_vmid_to_scan = 15; +- +- first_vmid_to_scan = ffs(dev->shared_resources.compute_vmid_bitmap) - 1; +- temp = dev->shared_resources.compute_vmid_bitmap >> first_vmid_to_scan; +- last_vmid_to_scan = first_vmid_to_scan + ffz(temp); ++ int first_vmid_to_scan = dev->vm_info.first_vmid_kfd; ++ int last_vmid_to_scan = dev->vm_info.last_vmid_kfd; + + reg_sq_cmd.u32All = 0; + status = 0; +@@ -823,7 +814,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) + for (vmid = first_vmid_to_scan; vmid <= last_vmid_to_scan; vmid++) { + if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_valid + (dev->kgd, vmid)) { +- if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_valid ++ if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_pasid + (dev->kgd, vmid) == p->pasid) { + pr_debug("Killing wave fronts of vmid %d and pasid %d\n", + vmid, p->pasid); +@@ -833,7 +824,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) + } + + if (vmid > last_vmid_to_scan) { +- pr_err("amdkfd: didn't found vmid for pasid (%d)\n", p->pasid); ++ pr_err("amdkfd: didn't find vmid for pasid 
(%d)\n", p->pasid); + return -EFAULT; + } + +@@ -843,7 +834,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) + return -EFAULT; + + status = dbgdev_wave_control_set_registers(&wac_info, ®_sq_cmd, +- ®_gfx_index); ++ ®_gfx_index, dev->device_info->asic_family); + if (status != 0) + return -EINVAL; + +@@ -858,15 +849,12 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) + } + + void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, +- enum DBGDEV_TYPE type) ++ DBGDEV_TYPE type) + { +- BUG_ON(!pdbgdev || !pdev); +- + pdbgdev->dev = pdev; + pdbgdev->kq = NULL; + pdbgdev->type = type; + pdbgdev->pqm = NULL; +- + switch (type) { + case DBGDEV_TYPE_NODIQ: + pdbgdev->dbgdev_register = dbgdev_register_nodiq; +@@ -876,10 +864,12 @@ void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, + break; + case DBGDEV_TYPE_DIQ: + default: ++ + pdbgdev->dbgdev_register = dbgdev_register_diq; + pdbgdev->dbgdev_unregister = dbgdev_unregister_diq; + pdbgdev->dbgdev_wave_control = dbgdev_wave_control_diq; + pdbgdev->dbgdev_address_watch = dbgdev_address_watch_diq; ++ + break; + } + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h +index 03424c2..82f48ff 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h +@@ -23,6 +23,10 @@ + #ifndef KFD_DBGDEV_H_ + #define KFD_DBGDEV_H_ + ++/* ++ * SQ_IND_CMD_CMD enum ++ */ ++ + enum { + SQ_CMD_VMID_OFFSET = 28, + ADDRESS_WATCH_CNTL_OFFSET = 24 +@@ -48,9 +52,9 @@ enum { + + /* CONFIG reg space definition */ + enum { +- AMD_CONFIG_REG_BASE = 0x2000, /* in dwords */ +- AMD_CONFIG_REG_END = 0x2B00, +- AMD_CONFIG_REG_SIZE = AMD_CONFIG_REG_END - AMD_CONFIG_REG_BASE ++ CONFIG_REG_BASE = 0x2000, /* in dwords */ ++ CONFIG_REG_END = 0x2B00, ++ CONFIG_REG_SIZE = CONFIG_REG_END - CONFIG_REG_BASE + }; + + /* SH reg space definition */ +@@ -60,22 +64,43 @@ enum { + SH_REG_SIZE = SH_REG_END - SH_REG_BASE + }; + ++/* SQ_CMD definitions */ ++ ++enum { ++ SQ_IND_CMD_DATA_RESUME = 0, ++ SQ_IND_CMD_DATA_HALT = 1 ++}; ++ ++enum SQ_IND_CMD_NEW { ++ SQ_IND_CMD_NEW_NULL = 0x00000000, ++ SQ_IND_CMD_NEW_SETHALT = 0x00000001, ++ SQ_IND_CMD_NEW_SAVECTX = 0x00000002, ++ SQ_IND_CMD_NEW_KILL = 0x00000003, ++ SQ_IND_CMD_NEW_DEBUG = 0x00000004, ++ SQ_IND_CMD_NEW_TRAP = 0x00000005, ++ SQ_IND_CMD_NEW_SET_PRIO = 0x00000006 ++ ++}; ++ + enum SQ_IND_CMD_CMD { + SQ_IND_CMD_CMD_NULL = 0x00000000, + SQ_IND_CMD_CMD_HALT = 0x00000001, + SQ_IND_CMD_CMD_RESUME = 0x00000002, + SQ_IND_CMD_CMD_KILL = 0x00000003, + SQ_IND_CMD_CMD_DEBUG = 0x00000004, +- SQ_IND_CMD_CMD_TRAP = 0x00000005, ++ SQ_IND_CMD_CMD_TRAP = 0x00000005 + }; ++/* ++ * SQ_IND_CMD_MODE enum ++ */ + +-enum SQ_IND_CMD_MODE { ++typedef enum SQ_IND_CMD_MODE { + SQ_IND_CMD_MODE_SINGLE = 0x00000000, + SQ_IND_CMD_MODE_BROADCAST = 0x00000001, + SQ_IND_CMD_MODE_BROADCAST_QUEUE = 0x00000002, + SQ_IND_CMD_MODE_BROADCAST_PIPE = 0x00000003, + SQ_IND_CMD_MODE_BROADCAST_ME = 0x00000004, +-}; ++} SQ_IND_CMD_MODE; + + union SQ_IND_INDEX_BITS { + struct { +@@ -106,18 +131,32 @@ union SQ_IND_CMD_BITS { + union SQ_CMD_BITS { + struct { + uint32_t cmd:3; +- uint32_t:1; ++ uint32_t:1; + uint32_t mode:3; + uint32_t check_vmid:1; + uint32_t trap_id:3; +- uint32_t:5; ++ uint32_t:5; + uint32_t wave_id:4; + uint32_t simd_id:2; +- uint32_t:2; ++ uint32_t:2; + uint32_t queue_id:3; +- uint32_t:1; ++ uint32_t:1; + uint32_t vm_id:4; + } bitfields, bits; ++ struct { ++ uint32_t cmd:3; ++ uint32_t:1; ++ 
uint32_t mode:3; ++ uint32_t check_vmid:1; ++ uint32_t data:3; ++ uint32_t:5; ++ uint32_t wave_id:4; ++ uint32_t simd_id:2; ++ uint32_t:2; ++ uint32_t queue_id:3; ++ uint32_t:1; ++ uint32_t vm_id:4; ++ } bitfields_sethalt, bits_sethalt; + uint32_t u32All; + signed int i32All; + float f32All; +@@ -169,7 +208,7 @@ union TCP_WATCH_ADDR_L_BITS { + }; + + enum { +- QUEUESTATE__INVALID = 0, /* so by default we'll get invalid state */ ++ QUEUESTATE__INVALID = 0, /* so by default we'll get invalid state */ + QUEUESTATE__ACTIVE_COMPLETION_PENDING, + QUEUESTATE__ACTIVE + }; +@@ -187,7 +226,6 @@ union ULARGE_INTEGER { + #define KFD_CIK_VMID_END_OFFSET (KFD_CIK_VMID_START_OFFSET + (8)) + + +-void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, +- enum DBGDEV_TYPE type); ++void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, DBGDEV_TYPE type); + +-#endif /* KFD_DBGDEV_H_ */ ++#endif /* KFD_DBGDEV_H_ */ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c +index 56d6763..5d269ea 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c +@@ -36,42 +36,50 @@ + + static DEFINE_MUTEX(kfd_dbgmgr_mutex); + +-struct mutex *kfd_get_dbgmgr_mutex(void) ++struct mutex * ++get_dbgmgr_mutex(void) + { + return &kfd_dbgmgr_mutex; + } + ++/*===========================================================================*/ + +-static void kfd_dbgmgr_uninitialize(struct kfd_dbgmgr *pmgr) ++static void ++kfd_dbgmgr_uninitialize(struct kfd_dbgmgr *pmgr) + { +- BUG_ON(!pmgr); +- + kfree(pmgr->dbgdev); +- + pmgr->dbgdev = NULL; + pmgr->pasid = 0; + pmgr->dev = NULL; + } + +-void kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr) ++/*===========================================================================*/ ++ ++void ++kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr) + { + if (pmgr != NULL) { + kfd_dbgmgr_uninitialize(pmgr); + kfree(pmgr); ++ pmgr = NULL; + } + } + +-bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) ++/*===========================================================================*/ ++ ++bool ++kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) + { +- enum DBGDEV_TYPE type = DBGDEV_TYPE_DIQ; ++ DBGDEV_TYPE type = DBGDEV_TYPE_DIQ; + struct kfd_dbgmgr *new_buff; + + BUG_ON(pdev == NULL); + BUG_ON(!pdev->init_complete); + + new_buff = kfd_alloc_struct(new_buff); +- if (!new_buff) { +- pr_err("amdkfd: Failed to allocate dbgmgr instance\n"); ++ if (!new_buff) ++ { ++ dev_err(NULL, "Error! kfd: In func %s >> failed to allocate dbgmgr instance\n", __func__); + return false; + } + +@@ -79,7 +87,7 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) + new_buff->dev = pdev; + new_buff->dbgdev = kfd_alloc_struct(new_buff->dbgdev); + if (!new_buff->dbgdev) { +- pr_err("amdkfd: Failed to allocate dbgdev instance\n"); ++ dev_err(NULL, "Error! 
kfd: In func %s >> failed to allocate dbgdev\n", __func__); + kfree(new_buff); + return false; + } +@@ -94,75 +102,200 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) + return true; + } + +-long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p) ++/*===========================================================================*/ ++ ++long ++kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p) + { +- BUG_ON(!p || !pmgr || !pmgr->dbgdev); ++ long status = 0; + +- if (pmgr->pasid != 0) { +- pr_debug("H/W debugger is already active using pasid %d\n", +- pmgr->pasid); +- return -EBUSY; +- } ++ do { ++ ++ if ((pmgr == NULL) || (pmgr->dev == NULL) || (pmgr->dbgdev == NULL)) { ++ dev_info(NULL, "Error! kfd: In func %s >> Illegal pointers\n", __func__); ++ /* Invalid Pointer. */ ++ status = -EINVAL; ++ break; ++ } ++ if (pmgr->pasid != 0) { ++ /* HW debugger is already active. */ ++ status = -EBUSY; ++ break; ++ } ++ ++ /* remember pasid */ ++ ++ pmgr->pasid = p->pasid; ++ ++ /* provide the pqm for diq generation */ + +- /* remember pasid */ +- pmgr->pasid = p->pasid; ++ pmgr->dbgdev->pqm = &p->pqm; + +- /* provide the pqm for diq generation */ +- pmgr->dbgdev->pqm = &p->pqm; ++ /* activate the actual registering */ ++ /* todo: you should lock with the process mutex here */ ++ pmgr->dbgdev->dbgdev_register(pmgr->dbgdev); ++ /* todo: you should unlock with the process mutex here */ + +- /* activate the actual registering */ +- pmgr->dbgdev->dbgdev_register(pmgr->dbgdev); ++ } while (false); + +- return 0; ++ return status; + } + +-long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p) ++/* ========================================================================== */ ++ ++long ++kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p) + { +- BUG_ON(!p || !pmgr || !pmgr->dbgdev); + +- /* Is the requests coming from the already registered process? */ +- if (pmgr->pasid != p->pasid) { +- pr_debug("H/W debugger is not registered by calling pasid %d\n", +- p->pasid); +- return -EINVAL; +- } ++ long status = 0; + +- pmgr->dbgdev->dbgdev_unregister(pmgr->dbgdev); ++ do { + +- pmgr->pasid = 0; ++ if ((pmgr == NULL) || (pmgr->dev == NULL) ++ || (pmgr->dbgdev == NULL) || (p == NULL)) { ++ dev_info(NULL, "Error! kfd: In func %s >> Illegal pointers\n", __func__); ++ /* Invalid Pointer */ ++ status = -EINVAL; ++ break; ++ } ++ if (pmgr->pasid != p->pasid) { ++ /* Is the requests coming from the already registered process? */ ++ status = -EINVAL; ++ break; ++ } ++ ++ /* todo: you should lock with the process mutex here */ ++ ++ pmgr->dbgdev->dbgdev_unregister(pmgr->dbgdev); + +- return 0; ++ /* todo: you should unlock with the process mutex here */ ++ ++ pmgr->pasid = 0; ++ ++ } while (false); ++ ++ return status; + } + +-long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, +- struct dbg_wave_control_info *wac_info) ++/* =========================================================================== */ ++ ++long ++kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, struct dbg_wave_control_info *wac_info) + { +- BUG_ON(!pmgr || !pmgr->dbgdev || !wac_info); ++ long status = 0; + +- /* Is the requests coming from the already registered process? 
*/ +- if (pmgr->pasid != wac_info->process->pasid) { +- pr_debug("H/W debugger support was not registered for requester pasid %d\n", +- wac_info->process->pasid); +- return -EINVAL; +- } ++ dev_info(NULL, "kfd: In func %s\n", __func__); ++ ++ do { ++ ++ if ((pmgr == NULL) || (pmgr->dev == NULL) || (pmgr->dbgdev == NULL) || (wac_info == NULL) ++ || (wac_info->process == NULL)) { ++ /* Invalid Pointer */ ++ dev_info(NULL, "Error! kfd: In func %s >> Illegal pointers\n", __func__); ++ status = -EINVAL; ++ break; ++ } ++ /* Is the requests coming from the already registered process? */ ++ if (pmgr->pasid != wac_info->process->pasid) { ++ /* HW debugger support was not registered for requester process */ ++ status = -EINVAL; ++ break; ++ } ++ ++ status = (long) pmgr->dbgdev->dbgdev_wave_control(pmgr->dbgdev, wac_info); ++ ++ } while (false); ++ ++ return status; + +- return (long) pmgr->dbgdev->dbgdev_wave_control(pmgr->dbgdev, wac_info); + } + +-long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, +- struct dbg_address_watch_info *adw_info) ++/* =========================================================================== */ ++ ++long ++kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, struct dbg_address_watch_info *adw_info) + { +- BUG_ON(!pmgr || !pmgr->dbgdev || !adw_info); ++ long status = 0; + ++ dev_info(NULL, "kfd: In func %s\n", __func__); + +- /* Is the requests coming from the already registered process? */ +- if (pmgr->pasid != adw_info->process->pasid) { +- pr_debug("H/W debugger support was not registered for requester pasid %d\n", +- adw_info->process->pasid); +- return -EINVAL; +- } ++ do { ++ ++ if ((pmgr == NULL) || (pmgr->dev == NULL) || (pmgr->dbgdev == NULL) || (adw_info == NULL) ++ || (adw_info->process == NULL)) { ++ /* Invalid Pointer */ ++ dev_info(NULL, "Error! kfd: In func %s >> Illegal pointers\n", __func__); ++ status = -EINVAL; ++ break; ++ } ++ /* Is the requests coming from the already registered process? */ ++ if (pmgr->pasid != adw_info->process->pasid) { ++ /* HW debugger support was not registered for requester process */ ++ status = -EINVAL; ++ break; ++ } ++ ++ status = (long) pmgr->dbgdev->dbgdev_address_watch(pmgr->dbgdev, adw_info); ++ ++ } while (false); ++ ++ return status; + +- return (long) pmgr->dbgdev->dbgdev_address_watch(pmgr->dbgdev, +- adw_info); + } + ++ ++/* =========================================================================== */ ++/* ++ * Handle abnormal process termination ++ * if we are in the midst of a debug session, we should kill all pending waves ++ * of the debugged process and unregister the process from the Debugger. ++ */ ++long ++kfd_dbgmgr_abnormal_termination(struct kfd_dbgmgr *pmgr, struct kfd_process *process) ++{ ++ long status = 0; ++ struct dbg_wave_control_info wac_info; ++ ++ dev_info(NULL, "kfd: In func %s\n", __func__); ++ ++ do { ++ ++ if ((pmgr == NULL) || (pmgr->dev == NULL) || (pmgr->dbgdev == NULL)) { ++ /* Invalid Pointer */ ++ dev_info(NULL, "Error! 
kfd: In func %s >> Illegal pointers\n", __func__); ++ status = -EINVAL; ++ break; ++ } ++ /* first, we kill all the wavefronts of this process */ ++ ++ wac_info.process = process; ++ wac_info.mode = HSA_DBG_WAVEMODE_BROADCAST_PROCESS; ++ wac_info.operand = HSA_DBG_WAVEOP_KILL; ++ wac_info.trapId = 0x0; /* not used for the KILL */ ++ wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value = 0; /* not used for kill */ ++ wac_info.dbgWave_msg.MemoryVA = NULL; /* not used for kill */ ++ ++ status = (long) pmgr->dbgdev->dbgdev_wave_control(pmgr->dbgdev, &wac_info); ++ ++ if (status != 0) { ++ dev_info(NULL, "Error! kfd: In func %s: wave control failed, status is: %ld\n", __func__, status); ++ break; ++ } ++ if (pmgr->pasid == wac_info.process->pasid) { ++ /* if terminated process was registered for debug, then unregister it */ ++ status = kfd_dbgmgr_unregister(pmgr, process); ++ pmgr->pasid = 0; ++ } ++ if (status != 0) ++ dev_info(NULL, ++ "Error! kfd: In func %s: unregister failed, status is: %ld debugger can not be reused\n", ++ __func__, status); ++ ++ } while (false); ++ ++ return status; ++ ++} ++ ++ ++/*///////////////////////////////////////////////////////////////////////////////////////// */ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h +index 257a745..2b6484e 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h +@@ -26,252 +26,242 @@ + + #include "kfd_priv.h" + +-/* must align with hsakmttypes definition */ ++/* ++ * SQ_IND_CMD_CMD enum ++ */ ++ ++ ++/* must align with hsakmttypes definition. */ + #pragma pack(push, 4) + +-enum HSA_DBG_WAVEOP { +- HSA_DBG_WAVEOP_HALT = 1, /* Halts a wavefront */ +- HSA_DBG_WAVEOP_RESUME = 2, /* Resumes a wavefront */ +- HSA_DBG_WAVEOP_KILL = 3, /* Kills a wavefront */ +- HSA_DBG_WAVEOP_DEBUG = 4, /* Causes wavefront to enter +- debug mode */ +- HSA_DBG_WAVEOP_TRAP = 5, /* Causes wavefront to take +- a trap */ ++typedef enum _HSA_DBG_WAVEOP { ++ HSA_DBG_WAVEOP_HALT = 1, /* Halts a wavefront */ ++ HSA_DBG_WAVEOP_RESUME = 2, /* Resumes a wavefront */ ++ HSA_DBG_WAVEOP_KILL = 3, /* Kills a wavefront */ ++ HSA_DBG_WAVEOP_DEBUG = 4, /* Causes wavefront to enter debug mode */ ++ HSA_DBG_WAVEOP_TRAP = 5, /* Causes wavefront to take a trap */ + HSA_DBG_NUM_WAVEOP = 5, + HSA_DBG_MAX_WAVEOP = 0xFFFFFFFF +-}; ++} HSA_DBG_WAVEOP; + +-enum HSA_DBG_WAVEMODE { +- /* send command to a single wave */ +- HSA_DBG_WAVEMODE_SINGLE = 0, +- /* +- * Broadcast to all wavefronts of all processes is not +- * supported for HSA user mode +- */ +- +- /* send to waves within current process */ +- HSA_DBG_WAVEMODE_BROADCAST_PROCESS = 2, +- /* send to waves within current process on CU */ +- HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU = 3, ++typedef enum _HSA_DBG_WAVEMODE { ++ HSA_DBG_WAVEMODE_SINGLE = 0, /* send command to a single wave */ ++ /* Broadcast to all wavefronts of all processes is not supported for HSA user mode */ ++ HSA_DBG_WAVEMODE_BROADCAST_PROCESS = 2, /* send to waves within current process */ ++ HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU = 3, /* send to waves within current process on CU */ + HSA_DBG_NUM_WAVEMODE = 3, + HSA_DBG_MAX_WAVEMODE = 0xFFFFFFFF +-}; ++} HSA_DBG_WAVEMODE; + +-enum HSA_DBG_WAVEMSG_TYPE { ++typedef enum _HSA_DBG_WAVEMSG_TYPE { + HSA_DBG_WAVEMSG_AUTO = 0, + HSA_DBG_WAVEMSG_USER = 1, + HSA_DBG_WAVEMSG_ERROR = 2, + HSA_DBG_NUM_WAVEMSG, + HSA_DBG_MAX_WAVEMSG = 0xFFFFFFFF +-}; ++} HSA_DBG_WAVEMSG_TYPE; + +-enum HSA_DBG_WATCH_MODE { +- HSA_DBG_WATCH_READ = 0, /* 
Read operations only */ +- HSA_DBG_WATCH_NONREAD = 1, /* Write or Atomic operations only */ +- HSA_DBG_WATCH_ATOMIC = 2, /* Atomic Operations only */ +- HSA_DBG_WATCH_ALL = 3, /* Read, Write or Atomic operations */ ++typedef enum _HSA_DBG_WATCH_MODE { ++ HSA_DBG_WATCH_READ = 0, /* Read operations only */ ++ HSA_DBG_WATCH_NONREAD = 1, /* Write or Atomic operations only */ ++ HSA_DBG_WATCH_ATOMIC = 2, /* Atomic Operations only */ ++ HSA_DBG_WATCH_ALL = 3, /* Read, Write or Atomic operations */ + HSA_DBG_WATCH_NUM, + HSA_DBG_WATCH_SIZE = 0xFFFFFFFF +-}; ++} HSA_DBG_WATCH_MODE; + + /* This structure is hardware specific and may change in the future */ +-struct HsaDbgWaveMsgAMDGen2 { ++typedef struct _HsaDbgWaveMsgAMDGen2 { + union { +- struct ui32 { +- uint32_t UserData:8; /* user data */ +- uint32_t ShaderArray:1; /* Shader array */ +- uint32_t Priv:1; /* Privileged */ +- uint32_t Reserved0:4; /* This field is reserved, +- should be 0 */ +- uint32_t WaveId:4; /* wave id */ +- uint32_t SIMD:2; /* SIMD id */ +- uint32_t HSACU:4; /* Compute unit */ +- uint32_t ShaderEngine:2;/* Shader engine */ +- uint32_t MessageType:2; /* see HSA_DBG_WAVEMSG_TYPE */ +- uint32_t Reserved1:4; /* This field is reserved, +- should be 0 */ ++ struct { ++ uint32_t UserData:8; /* user data */ ++ uint32_t ShaderArray:1; /* Shader array */ ++ uint32_t Priv:1; /* Privileged */ ++ uint32_t Reserved0:4; /* This field is reserved, should be 0 */ ++ uint32_t WaveId:4; /* wave id */ ++ uint32_t SIMD:2; /* SIMD id */ ++ uint32_t HSACU:4; /* Compute unit */ ++ uint32_t ShaderEngine:2; /* Shader engine */ ++ uint32_t MessageType:2; /* see HSA_DBG_WAVEMSG_TYPE */ ++ uint32_t Reserved1:4; /* This field is reserved, should be 0 */ + } ui32; + uint32_t Value; + }; +- uint32_t Reserved2; +-}; + +-union HsaDbgWaveMessageAMD { +- struct HsaDbgWaveMsgAMDGen2 WaveMsgInfoGen2; +- /* for future HsaDbgWaveMsgAMDGen3; */ +-}; +- +-struct HsaDbgWaveMessage { +- void *MemoryVA; /* ptr to associated host-accessible data */ +- union HsaDbgWaveMessageAMD DbgWaveMsg; +-}; ++ uint32_t Reserved2; + +-/* +- * TODO: This definitions to be MOVED to kfd_event, once it is implemented. +- * +- * HSA sync primitive, Event and HW Exception notification API definitions. +- * The API functions allow the runtime to define a so-called sync-primitive, +- * a SW object combining a user-mode provided "syncvar" and a scheduler event +- * that can be signaled through a defined GPU interrupt. A syncvar is +- * a process virtual memory location of a certain size that can be accessed +- * by CPU and GPU shader code within the process to set and query the content +- * within that memory. The definition of the content is determined by the HSA +- * runtime and potentially GPU shader code interfacing with the HSA runtime. +- * The syncvar values may be commonly written through an PM4 WRITE_DATA packet +- * in the user mode instruction stream. The OS scheduler event is typically +- * associated and signaled by an interrupt issued by the GPU, but other HSA +- * system interrupt conditions from other HW (e.g. IOMMUv2) may be surfaced +- * by the KFD by this mechanism, too. 
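HsaDbgWaveMsgAMDGen2 is a union of bitfields over a raw 32-bit Value, so a debugger can decode an incoming wave message in a single assignment. A standalone decode sketch using the same field widths as the struct above; the sample value is made up:

#include <stdint.h>
#include <stdio.h>

/* Field widths copied from HsaDbgWaveMsgAMDGen2 above. */
union wave_msg {
        struct {
                uint32_t UserData:8;
                uint32_t ShaderArray:1;
                uint32_t Priv:1;
                uint32_t Reserved0:4;
                uint32_t WaveId:4;
                uint32_t SIMD:2;
                uint32_t HSACU:4;
                uint32_t ShaderEngine:2;
                uint32_t MessageType:2;
                uint32_t Reserved1:4;
        } ui32;
        uint32_t Value;
};

int main(void)
{
        union wave_msg msg = { .Value = 0x00A5007Fu };  /* arbitrary sample */

        printf("user data %u, wave %u, simd %u, cu %u, se %u, type %u\n",
               msg.ui32.UserData, msg.ui32.WaveId, msg.ui32.SIMD,
               msg.ui32.HSACU, msg.ui32.ShaderEngine, msg.ui32.MessageType);
        return 0;
}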
*/ +- +-/* these are the new definitions for events */ +-enum HSA_EVENTTYPE { +- HSA_EVENTTYPE_SIGNAL = 0, /* user-mode generated GPU signal */ +- HSA_EVENTTYPE_NODECHANGE = 1, /* HSA node change (attach/detach) */ +- HSA_EVENTTYPE_DEVICESTATECHANGE = 2, /* HSA device state change +- (start/stop) */ +- HSA_EVENTTYPE_HW_EXCEPTION = 3, /* GPU shader exception event */ +- HSA_EVENTTYPE_SYSTEM_EVENT = 4, /* GPU SYSCALL with parameter info */ +- HSA_EVENTTYPE_DEBUG_EVENT = 5, /* GPU signal for debugging */ +- HSA_EVENTTYPE_PROFILE_EVENT = 6,/* GPU signal for profiling */ +- HSA_EVENTTYPE_QUEUE_EVENT = 7, /* GPU signal queue idle state +- (EOP pm4) */ ++} HsaDbgWaveMsgAMDGen2; ++ ++typedef union _HsaDbgWaveMessageAMD { ++ HsaDbgWaveMsgAMDGen2 WaveMsgInfoGen2; ++ /* for future HsaDbgWaveMsgAMDGen3; */ ++} HsaDbgWaveMessageAMD; ++ ++typedef struct _HsaDbgWaveMessage { ++ void *MemoryVA; /* ptr to associated host-accessible data */ ++ HsaDbgWaveMessageAMD DbgWaveMsg; ++} HsaDbgWaveMessage; ++ ++/* TODO: This definitions to be MOVED to kfd_event, once it is implemented. ++ ++ HSA sync primitive, Event and HW Exception notification API definitions ++ The API functions allow the runtime to define a so-called sync-primitive, a SW object ++ combining a user-mode provided "syncvar" and a scheduler event that can be signaled ++ through a defined GPU interrupt. A syncvar is a process virtual memory location of ++ a certain size that can be accessed by CPU and GPU shader code within the process to set ++ and query the content within that memory. The definition of the content is determined by ++ the HSA runtime and potentially GPU shader code interfacing with the HSA runtime. ++ The syncvar values may be commonly written through an PM4 WRITE_DATA packet in the ++ user mode instruction stream. The OS scheduler event is typically associated and ++ signaled by an interrupt issued by the GPU, but other HSA system interrupt conditions ++ from other HW (e.g. IOMMUv2) may besurfaced by the KFD by this mechanism, too. */ ++ ++/* these are the new definitions for events */ ++ ++typedef enum _HSA_EVENTTYPE { ++ HSA_EVENTTYPE_SIGNAL = 0, /* /user-mode generated GPU signal */ ++ HSA_EVENTTYPE_NODECHANGE = 1, /* HSA node change (attach/detach) */ ++ HSA_EVENTTYPE_DEVICESTATECHANGE = 2, /* HSA device state change( start/stop ) */ ++ HSA_EVENTTYPE_HW_EXCEPTION = 3, /* GPU shader exception event */ ++ HSA_EVENTTYPE_SYSTEM_EVENT = 4, /* GPU SYSCALL with parameter info */ ++ HSA_EVENTTYPE_DEBUG_EVENT = 5, /* GPU signal for debugging */ ++ HSA_EVENTTYPE_PROFILE_EVENT = 6, /* GPU signal for profiling */ ++ HSA_EVENTTYPE_QUEUE_EVENT = 7, /* GPU signal queue idle state (EOP pm4) */ + /* ... 
*/ + HSA_EVENTTYPE_MAXID, + HSA_EVENTTYPE_TYPE_SIZE = 0xFFFFFFFF +-}; ++} HSA_EVENTTYPE; ++ ++typedef uint32_t HSA_EVENTID; + +-/* Sub-definitions for various event types: Syncvar */ +-struct HsaSyncVar { +- union SyncVar { +- void *UserData; /* pointer to user mode data */ +- uint64_t UserDataPtrValue; /* 64bit compatibility of value */ ++/* Subdefinitions for various event types: Syncvar */ ++ ++typedef struct _HsaSyncVar { ++ union { ++ void *UserData; /* pointer to user mode data */ ++ uint64_t UserDataPtrValue; /* 64bit compatibility of value */ + } SyncVar; + uint64_t SyncVarSize; +-}; ++} HsaSyncVar; + +-/* Sub-definitions for various event types: NodeChange */ ++/* ++ Subdefinitions for various event types: NodeChange ++*/ + +-enum HSA_EVENTTYPE_NODECHANGE_FLAGS { ++typedef enum _HSA_EVENTTYPE_NODECHANGE_FLAGS { + HSA_EVENTTYPE_NODECHANGE_ADD = 0, + HSA_EVENTTYPE_NODECHANGE_REMOVE = 1, + HSA_EVENTTYPE_NODECHANGE_SIZE = 0xFFFFFFFF +-}; ++} HSA_EVENTTYPE_NODECHANGE_FLAGS; + +-struct HsaNodeChange { +- /* HSA node added/removed on the platform */ +- enum HSA_EVENTTYPE_NODECHANGE_FLAGS Flags; +-}; ++typedef struct _HsaNodeChange { ++ HSA_EVENTTYPE_NODECHANGE_FLAGS Flags; /* HSA node added/removed on the platform */ ++} HsaNodeChange; ++ ++/* ++ Sub-definitions for various event types: DeviceStateChange ++*/ + +-/* Sub-definitions for various event types: DeviceStateChange */ +-enum HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS { +- /* device started (and available) */ +- HSA_EVENTTYPE_DEVICESTATUSCHANGE_START = 0, +- /* device stopped (i.e. unavailable) */ +- HSA_EVENTTYPE_DEVICESTATUSCHANGE_STOP = 1, ++typedef enum _HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS { ++ HSA_EVENTTYPE_DEVICESTATUSCHANGE_START = 0, /* device started (and available) */ ++ HSA_EVENTTYPE_DEVICESTATUSCHANGE_STOP = 1, /* device stopped (i.e. unavailable) */ + HSA_EVENTTYPE_DEVICESTATUSCHANGE_SIZE = 0xFFFFFFFF +-}; ++} HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS; + +-enum HSA_DEVICE { ++typedef enum _HSA_DEVICE { + HSA_DEVICE_CPU = 0, + HSA_DEVICE_GPU = 1, + MAX_HSA_DEVICE = 2 +-}; ++} HSA_DEVICE; + +-struct HsaDeviceStateChange { ++typedef struct _HsaDeviceStateChange { + uint32_t NodeId; /* F-NUMA node that contains the device */ +- enum HSA_DEVICE Device; /* device type: GPU or CPU */ +- enum HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS Flags; /* event flags */ +-}; ++ HSA_DEVICE Device; /* device type: GPU or CPU */ ++ HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS Flags; /* event flags */ ++} HsaDeviceStateChange; + +-struct HsaEventData { +- enum HSA_EVENTTYPE EventType; /* event type */ +- union EventData { +- /* +- * return data associated with HSA_EVENTTYPE_SIGNAL +- * and other events +- */ +- struct HsaSyncVar SyncVar; ++typedef struct _HsaEventData { ++ HSA_EVENTTYPE EventType; /* event type */ ++ union { ++ /* return data associated with HSA_EVENTTYPE_SIGNAL and other events */ ++ HsaSyncVar SyncVar; + + /* data associated with HSA_EVENTTYPE_NODE_CHANGE */ +- struct HsaNodeChange NodeChangeState; ++ HsaNodeChange NodeChangeState; + + /* data associated with HSA_EVENTTYPE_DEVICE_STATE_CHANGE */ +- struct HsaDeviceStateChange DeviceState; ++ HsaDeviceStateChange DeviceState; + } EventData; + +- /* the following data entries are internal to the KFD & thunk itself */ ++ /* the following data entries are internal to the KFD & thunk itself. 
*/ + +- /* internal thunk store for Event data (OsEventHandle) */ +- uint64_t HWData1; +- /* internal thunk store for Event data (HWAddress) */ +- uint64_t HWData2; +- /* internal thunk store for Event data (HWData) */ +- uint32_t HWData3; +-}; ++ uint64_t HWData1; /* internal thunk store for Event data (OsEventHandle) */ ++ uint64_t HWData2; /* internal thunk store for Event data (HWAddress) */ ++ uint32_t HWData3; /* internal thunk store for Event data (HWData) */ ++} HsaEventData; + +-struct HsaEventDescriptor { +- /* event type to allocate */ +- enum HSA_EVENTTYPE EventType; +- /* H-NUMA node containing GPU device that is event source */ +- uint32_t NodeId; +- /* pointer to user mode syncvar data, syncvar->UserDataPtrValue +- * may be NULL +- */ +- struct HsaSyncVar SyncVar; +-}; ++typedef struct _HsaEventDescriptor { ++ HSA_EVENTTYPE EventType; /* event type to allocate */ ++ uint32_t NodeId; /* H-NUMA node containing GPU device that is event source */ ++ HsaSyncVar SyncVar; /* pointer to user mode syncvar data, syncvar->UserDataPtrValue may be NULL */ ++} HsaEventDescriptor; ++ ++typedef struct _HsaEvent { ++ HSA_EVENTID EventId; ++ HsaEventData EventData; ++} HsaEvent; + +-struct HsaEvent { +- uint32_t EventId; +- struct HsaEventData EventData; +-}; + + #pragma pack(pop) + +-enum DBGDEV_TYPE { ++typedef enum _DBGDEV_TYPE { + DBGDEV_TYPE_ILLEGAL = 0, + DBGDEV_TYPE_NODIQ = 1, + DBGDEV_TYPE_DIQ = 2, + DBGDEV_TYPE_TEST = 3 +-}; ++} DBGDEV_TYPE; + + struct dbg_address_watch_info { + struct kfd_process *process; +- enum HSA_DBG_WATCH_MODE *watch_mode; ++ HSA_DBG_WATCH_MODE *watch_mode; + uint64_t *watch_address; + uint64_t *watch_mask; +- struct HsaEvent *watch_event; ++ HsaEvent *watch_event; + uint32_t num_watch_points; + }; + + struct dbg_wave_control_info { + struct kfd_process *process; + uint32_t trapId; +- enum HSA_DBG_WAVEOP operand; +- enum HSA_DBG_WAVEMODE mode; +- struct HsaDbgWaveMessage dbgWave_msg; ++ HSA_DBG_WAVEOP operand; ++ HSA_DBG_WAVEMODE mode; ++ HsaDbgWaveMessage dbgWave_msg; + }; + + struct kfd_dbgdev { + + /* The device that owns this data. */ ++ + struct kfd_dev *dev; + + /* kernel queue for DIQ */ ++ + struct kernel_queue *kq; + + /* a pointer to the pqm of the calling process */ ++ + struct process_queue_manager *pqm; + + /* type of debug device ( DIQ, non DIQ, etc. 
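To ask the KFD for an event, the thunk fills an HsaEventDescriptor with the event type, the H-NUMA node of the GPU that will signal it, and a syncvar whose user pointer may be NULL. A reduced sketch of that setup; the typedefs are cut down from the definitions above and the node id is an arbitrary example:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef enum { EVENTTYPE_SIGNAL = 0 } eventtype_t;  /* HSA_EVENTTYPE_SIGNAL */

typedef struct {
        union {
                void    *UserData;          /* pointer to user mode data */
                uint64_t UserDataPtrValue;  /* 64bit value compatibility */
        } SyncVar;
        uint64_t SyncVarSize;
} syncvar_t;

typedef struct {
        eventtype_t EventType;
        uint32_t    NodeId;    /* H-NUMA node of the signalling GPU */
        syncvar_t   SyncVar;
} event_descriptor_t;

int main(void)
{
        event_descriptor_t desc = {
                .EventType = EVENTTYPE_SIGNAL,
                .NodeId = 1,                 /* example node */
                .SyncVar = { { NULL }, 0 },  /* no user syncvar attached */
        };

        printf("type=%d node=%u size=%llu\n", desc.EventType, desc.NodeId,
               (unsigned long long)desc.SyncVar.SyncVarSize);
        return 0;
}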
) */ +- enum DBGDEV_TYPE type; ++ ++ DBGDEV_TYPE type; + + /* virtualized function pointers to device dbg */ ++ + int (*dbgdev_register)(struct kfd_dbgdev *dbgdev); + int (*dbgdev_unregister)(struct kfd_dbgdev *dbgdev); +- int (*dbgdev_address_watch)(struct kfd_dbgdev *dbgdev, +- struct dbg_address_watch_info *adw_info); +- int (*dbgdev_wave_control)(struct kfd_dbgdev *dbgdev, +- struct dbg_wave_control_info *wac_info); ++ int (*dbgdev_address_watch)(struct kfd_dbgdev *dbgdev, struct dbg_address_watch_info *adw_info); ++ int (*dbgdev_wave_control)(struct kfd_dbgdev *dbgdev, struct dbg_wave_control_info *wac_info); + + }; + +@@ -282,13 +272,12 @@ struct kfd_dbgmgr { + }; + + /* prototypes for debug manager functions */ +-struct mutex *kfd_get_dbgmgr_mutex(void); ++struct mutex *get_dbgmgr_mutex(void); + void kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr); + bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev); + long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p); + long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p); +-long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, +- struct dbg_wave_control_info *wac_info); +-long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, +- struct dbg_address_watch_info *adw_info); +-#endif /* KFD_DBGMGR_H_ */ ++long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, struct dbg_wave_control_info *wac_info); ++long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, struct dbg_address_watch_info *adw_info); ++long kfd_dbgmgr_abnormal_termination(struct kfd_dbgmgr *pmgr, struct kfd_process *process); ++#endif /* KFD_DBGMGR_H_ */ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c +index 3f95f7c..20592ba 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c +@@ -24,9 +24,11 @@ + #include <linux/bsearch.h> + #include <linux/pci.h> + #include <linux/slab.h> ++#include <linux/highmem.h> + #include "kfd_priv.h" + #include "kfd_device_queue_manager.h" + #include "kfd_pm4_headers.h" ++#include "cwsr_trap_handler_carrizo.h" + + #define MQD_SIZE_ALIGNED 768 + +@@ -38,7 +40,8 @@ static const struct kfd_device_info kaveri_device_info = { + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, +- .mqd_size_aligned = MQD_SIZE_ALIGNED ++ .mqd_size_aligned = MQD_SIZE_ALIGNED, ++ .is_need_iommu_device = true + }; + + static const struct kfd_device_info carrizo_device_info = { +@@ -49,14 +52,50 @@ static const struct kfd_device_info carrizo_device_info = { + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, +- .mqd_size_aligned = MQD_SIZE_ALIGNED ++ .mqd_size_aligned = MQD_SIZE_ALIGNED, ++ .is_need_iommu_device = true + }; + ++static const struct kfd_device_info tonga_device_info = { ++ .asic_family = CHIP_TONGA, ++ .max_pasid_bits = 16, ++ .max_no_of_hqd = 24, ++ .ih_ring_entry_size = 4 * sizeof(uint32_t), ++ .event_interrupt_class = &event_interrupt_class_cik, ++ .num_of_watch_points = 4, ++ .mqd_size_aligned = MQD_SIZE_ALIGNED, ++ .is_need_iommu_device = false ++}; ++ ++static const struct kfd_device_info fiji_device_info = { ++ .asic_family = CHIP_FIJI, ++ .max_pasid_bits = 16, ++ .max_no_of_hqd = 24, ++ .ih_ring_entry_size = 4 * sizeof(uint32_t), ++ .event_interrupt_class = &event_interrupt_class_cik, ++ .num_of_watch_points = 4, ++ .mqd_size_aligned = MQD_SIZE_ALIGNED, ++ 
.is_need_iommu_device = false ++} ++; + struct kfd_deviceid { + unsigned short did; + const struct kfd_device_info *device_info; + }; + ++/* ++ * // ++// TONGA/AMETHYST device IDs (performance segment) ++// ++#define DEVICE_ID_VI_TONGA_P_6920 0x6920 // unfused ++#define DEVICE_ID_VI_TONGA_P_6921 0x6921 // Amethyst XT ++#define DEVICE_ID_VI_TONGA_P_6928 0x6928 // Tonga GL XT ++#define DEVICE_ID_VI_TONGA_P_692B 0x692B // Tonga GL PRO ++#define DEVICE_ID_VI_TONGA_P_692F 0x692F // Tonga GL PRO VF ++#define DEVICE_ID_VI_TONGA_P_6938 0x6938 // Tonga XT ++#define DEVICE_ID_VI_TONGA_P_6939 0x6939 // Tonga PRO ++ * ++ */ + /* Please keep this sorted by increasing device id. */ + static const struct kfd_deviceid supported_devices[] = { + { 0x1304, &kaveri_device_info }, /* Kaveri */ +@@ -85,13 +124,23 @@ static const struct kfd_deviceid supported_devices[] = { + { 0x9874, &carrizo_device_info }, /* Carrizo */ + { 0x9875, &carrizo_device_info }, /* Carrizo */ + { 0x9876, &carrizo_device_info }, /* Carrizo */ +- { 0x9877, &carrizo_device_info } /* Carrizo */ ++ { 0x9877, &carrizo_device_info }, /* Carrizo */ ++ { 0x6920, &tonga_device_info }, /* Tonga */ ++ { 0x6921, &tonga_device_info }, /* Tonga */ ++ { 0x6928, &tonga_device_info }, /* Tonga */ ++ { 0x692B, &tonga_device_info }, /* Tonga */ ++ { 0x692F, &tonga_device_info }, /* Tonga */ ++ { 0x6938, &tonga_device_info }, /* Tonga */ ++ { 0x6939, &tonga_device_info }, /* Tonga */ ++ { 0x7300, &fiji_device_info } /* Fiji */ + }; + + static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, + unsigned int chunk_size); + static void kfd_gtt_sa_fini(struct kfd_dev *kfd); + ++static int kfd_resume(struct kfd_dev *kfd); ++ + static const struct kfd_device_info *lookup_device_info(unsigned short did) + { + size_t i; +@@ -117,6 +166,8 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, + if (!device_info) + return NULL; + ++ BUG_ON(!f2g); ++ + kfd = kzalloc(sizeof(*kfd), GFP_KERNEL); + if (!kfd) + return NULL; +@@ -170,15 +221,8 @@ static bool device_iommu_pasid_init(struct kfd_dev *kfd) + pasid_limit, + kfd->doorbell_process_limit - 1); + +- err = amd_iommu_init_device(kfd->pdev, pasid_limit); +- if (err < 0) { +- dev_err(kfd_device, "error initializing iommu device\n"); +- return false; +- } +- + if (!kfd_set_pasid_limit(pasid_limit)) { + dev_err(kfd_device, "error setting pasid limit\n"); +- amd_iommu_free_device(kfd->pdev); + return false; + } + +@@ -219,13 +263,81 @@ static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid, + return AMD_IOMMU_INV_PRI_RSP_INVALID; + } + ++static int kfd_cwsr_init(struct kfd_dev *kfd) ++{ ++ /* ++ * Initialize the CWSR required memory for TBA and TMA ++ * only support CWSR on VI and up with FW version >=625. 
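lookup_device_info is a linear scan of the supported_devices table that returns the kfd_device_info for a PCI device id, or NULL for unsupported parts, which makes kgd2kfd_probe bail out. The same pattern in miniature, with a table trimmed to three of the ids listed above:

#include <stddef.h>
#include <stdio.h>

struct device_info { const char *asic; };

static const struct device_info kaveri = { "Kaveri" };
static const struct device_info tonga  = { "Tonga"  };
static const struct device_info fiji   = { "Fiji"   };

static const struct { unsigned short did; const struct device_info *info; }
supported[] = {
        { 0x1304, &kaveri },
        { 0x6920, &tonga  },
        { 0x7300, &fiji   },
};

static const struct device_info *lookup(unsigned short did)
{
        size_t i;

        for (i = 0; i < sizeof(supported) / sizeof(supported[0]); i++)
                if (supported[i].did == did)
                        return supported[i].info;
        return NULL;   /* unsupported device: probe gives up */
}

int main(void)
{
        const struct device_info *info = lookup(0x6920);

        printf("%s\n", info ? info->asic : "unsupported");
        return 0;
}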
++ */ ++ if (cwsr_enable && ++ (kfd->mec_fw_version >= KFD_CWSR_CZ_FW_VER)) { ++ void *cwsr_addr = NULL; ++ unsigned int size = sizeof(cwsr_trap_carrizo_hex); ++ ++ if (size > PAGE_SIZE) { ++ pr_err("amdkfd: wrong CWSR ISA size.\n"); ++ return -EINVAL; ++ } ++ kfd->cwsr_size = ++ ALIGN(size, PAGE_SIZE) + PAGE_SIZE; ++ kfd->cwsr_pages = alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, ++ get_order(kfd->cwsr_size)); ++ if (!kfd->cwsr_pages) { ++ pr_err("amdkfd: error alloc CWSR isa memory.\n"); ++ return -ENOMEM; ++ } ++ /*Only first page used for cwsr ISA code */ ++ cwsr_addr = kmap(kfd->cwsr_pages); ++ memset(cwsr_addr, 0, PAGE_SIZE); ++ memcpy(cwsr_addr, cwsr_trap_carrizo_hex, size); ++ kunmap(kfd->cwsr_pages); ++ kfd->tma_offset = ALIGN(size, PAGE_SIZE); ++ kfd->cwsr_enabled = true; ++ dev_info(kfd_device, ++ "Reserved %d pages for cwsr.\n", ++ (kfd->cwsr_size >> PAGE_SHIFT)); ++ } ++ ++ return 0; ++} ++ ++static void kfd_cwsr_fini(struct kfd_dev *kfd) ++{ ++ if (kfd->cwsr_pages) ++ __free_pages(kfd->cwsr_pages, get_order(kfd->cwsr_size)); ++} ++ + bool kgd2kfd_device_init(struct kfd_dev *kfd, + const struct kgd2kfd_shared_resources *gpu_resources) + { + unsigned int size; ++ unsigned int vmid_bitmap_kfd, vmid_num_kfd; ++ ++ kfd->mec_fw_version = kfd->kfd2kgd->get_fw_version(kfd->kgd, ++ KGD_ENGINE_MEC1); + + kfd->shared_resources = *gpu_resources; + ++ vmid_bitmap_kfd = kfd->shared_resources.compute_vmid_bitmap; ++ kfd->vm_info.first_vmid_kfd = ffs(vmid_bitmap_kfd) - 1; ++ kfd->vm_info.last_vmid_kfd = fls(vmid_bitmap_kfd) - 1; ++ vmid_num_kfd = kfd->vm_info.last_vmid_kfd ++ - kfd->vm_info.first_vmid_kfd + 1; ++ kfd->vm_info.vmid_num_kfd = vmid_num_kfd; ++ ++ /* If MEC firmware is too old, turn off hws multiple process mapping */ ++ if (kfd->mec_fw_version < KFD_MULTI_PROC_MAPPING_HWS_SUPPORT) ++ kfd->max_proc_per_quantum = 0; ++ /* Verify module parameters regarding mapped process number*/ ++ else if ((hws_max_conc_proc < 0) ++ || (hws_max_conc_proc > vmid_num_kfd)) { ++ dev_err(kfd_device, ++ "hws_max_conc_proc (%d) must be between 0 and %d, use %d instead\n", ++ hws_max_conc_proc, vmid_num_kfd, vmid_num_kfd); ++ kfd->max_proc_per_quantum = vmid_num_kfd; ++ } else ++ kfd->max_proc_per_quantum = hws_max_conc_proc; ++ + /* calculate max size of mqds needed for queues */ + size = max_num_of_queues_per_device * + kfd->device_info->mqd_size_aligned; +@@ -280,16 +392,6 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, + goto kfd_interrupt_error; + } + +- if (!device_iommu_pasid_init(kfd)) { +- dev_err(kfd_device, +- "Error initializing iommuv2 for device (%x:%x)\n", +- kfd->pdev->vendor, kfd->pdev->device); +- goto device_iommu_pasid_error; +- } +- amd_iommu_set_invalidate_ctx_cb(kfd->pdev, +- iommu_pasid_shutdown_callback); +- amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb); +- + kfd->dqm = device_queue_manager_init(kfd); + if (!kfd->dqm) { + dev_err(kfd_device, +@@ -298,13 +400,21 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, + goto device_queue_manager_error; + } + +- if (kfd->dqm->ops.start(kfd->dqm) != 0) { +- dev_err(kfd_device, +- "Error starting queuen manager for device (%x:%x)\n", +- kfd->pdev->vendor, kfd->pdev->device); +- goto dqm_start_error; ++ if (kfd->device_info->is_need_iommu_device) { ++ if (!device_iommu_pasid_init(kfd)) { ++ dev_err(kfd_device, ++ "Error initializing iommuv2 for device (%x:%x)\n", ++ kfd->pdev->vendor, kfd->pdev->device); ++ goto device_iommu_pasid_error; ++ } + } + ++ if (kfd_cwsr_init(kfd)) ++ goto device_iommu_pasid_error; ++ ++ if 
(kfd_resume(kfd)) ++ goto kfd_resume_error; ++ + kfd->dbgmgr = NULL; + + kfd->init_complete = true; +@@ -316,11 +426,11 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, + + goto out; + +-dqm_start_error: ++kfd_resume_error: ++ kfd_cwsr_fini(kfd); ++device_iommu_pasid_error: + device_queue_manager_uninit(kfd->dqm); + device_queue_manager_error: +- amd_iommu_free_device(kfd->pdev); +-device_iommu_pasid_error: + kfd_interrupt_exit(kfd); + kfd_interrupt_error: + kfd_topology_remove_device(kfd); +@@ -338,8 +448,9 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, + void kgd2kfd_device_exit(struct kfd_dev *kfd) + { + if (kfd->init_complete) { ++ kgd2kfd_suspend(kfd); ++ kfd_cwsr_fini(kfd); + device_queue_manager_uninit(kfd->dqm); +- amd_iommu_free_device(kfd->pdev); + kfd_interrupt_exit(kfd); + kfd_topology_remove_device(kfd); + kfd_gtt_sa_fini(kfd); +@@ -355,32 +466,68 @@ void kgd2kfd_suspend(struct kfd_dev *kfd) + + if (kfd->init_complete) { + kfd->dqm->ops.stop(kfd->dqm); +- amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); +- amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); +- amd_iommu_free_device(kfd->pdev); ++ if (kfd->device_info->is_need_iommu_device) { ++ amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); ++ amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); ++ amd_iommu_free_device(kfd->pdev); ++ } + } + } + +-int kgd2kfd_resume(struct kfd_dev *kfd) ++int kgd2kfd_evict_bo(struct kfd_dev *dev, void *mem) + { +- unsigned int pasid_limit; +- int err; ++ return evict_bo(dev, mem); ++} + ++int kgd2kfd_restore(struct kfd_dev *kfd) ++{ ++ return restore(kfd); ++} ++ ++int kgd2kfd_resume(struct kfd_dev *kfd) ++{ + BUG_ON(kfd == NULL); + +- pasid_limit = kfd_get_pasid_limit(); ++ if (!kfd->init_complete) ++ return 0; ++ ++ return kfd_resume(kfd); ++ ++} ++ ++static int kfd_resume(struct kfd_dev *kfd) ++{ ++ int err = 0; ++ ++ if (kfd->device_info->is_need_iommu_device) { ++ unsigned int pasid_limit = kfd_get_pasid_limit(); + +- if (kfd->init_complete) { + err = amd_iommu_init_device(kfd->pdev, pasid_limit); +- if (err < 0) ++ if (err) + return -ENXIO; + amd_iommu_set_invalidate_ctx_cb(kfd->pdev, +- iommu_pasid_shutdown_callback); +- amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb); +- kfd->dqm->ops.start(kfd->dqm); ++ iommu_pasid_shutdown_callback); ++ amd_iommu_set_invalid_ppr_cb(kfd->pdev, ++ iommu_invalid_ppr_cb); + } + +- return 0; ++ err = kfd->dqm->ops.start(kfd->dqm); ++ if (err) { ++ dev_err(kfd_device, ++ "Error starting queue manager for device (%x:%x)\n", ++ kfd->pdev->vendor, kfd->pdev->device); ++ goto dqm_start_error; ++ } ++ ++ kfd->kfd2kgd->write_config_static_mem(kfd->kgd, true, 1, 3, 0); ++ ++ return err; ++ ++dqm_start_error: ++ if (kfd->device_info->is_need_iommu_device) ++ amd_iommu_free_device(kfd->pdev); ++ ++ return err; + } + + /* This is called directly from KGD at ISR. */ +@@ -399,6 +546,58 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) + spin_unlock(&kfd->interrupt_lock); + } + ++int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm) ++{ ++ struct kfd_process *p; ++ struct kfd_process_device *pdd; ++ int r; ++ ++ BUG_ON(kfd == NULL); ++ if (!kfd->init_complete) ++ return 0; ++ ++ /* Because we are called from arbitrary context (workqueue) as opposed ++ * to process context, kfd_process could attempt to exit while we are ++ * running so the lookup function returns a read-locked process. 
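kgd2kfd_device_init derives the KFD VMID range from the compute_vmid_bitmap handed over by the kgd: first_vmid_kfd is ffs() - 1, last_vmid_kfd is fls() - 1, and the count between them sizes the vmid allocator and bounds the wave-reset scan. A userspace check of that arithmetic, substituting the GCC builtins __builtin_ffs and __builtin_clz for the kernel's ffs()/fls(); the sample bitmap is illustrative:

#include <stdio.h>

int main(void)
{
        unsigned int bitmap = 0xFF00;            /* compute VMIDs 8..15 */
        int first = __builtin_ffs(bitmap) - 1;   /* kernel: ffs() - 1 */
        int last  = 31 - __builtin_clz(bitmap);  /* kernel: fls() - 1 */
        int num   = last - first + 1;

        printf("first_vmid_kfd=%d last_vmid_kfd=%d vmid_num_kfd=%d\n",
               first, last, num);                /* prints 8, 15, 8 */
        return 0;
}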
*/ ++ p = kfd_lookup_process_by_mm(mm); ++ if (!p) ++ return -ENODEV; ++ ++ r = -ENODEV; ++ pdd = kfd_get_process_device_data(kfd, p); ++ if (pdd) ++ r = process_evict_queues(kfd->dqm, &pdd->qpd); ++ ++ up_read(&p->lock); ++ return r; ++} ++ ++int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm) ++{ ++ struct kfd_process *p; ++ struct kfd_process_device *pdd; ++ int r; ++ ++ BUG_ON(kfd == NULL); ++ if (!kfd->init_complete) ++ return 0; ++ ++ /* Because we are called from arbitrary context (workqueue) as opposed ++ * to process context, kfd_process could attempt to exit while we are ++ * running so the lookup function returns a read-locked process. */ ++ p = kfd_lookup_process_by_mm(mm); ++ if (!p) ++ return -ENODEV; ++ ++ r = -ENODEV; ++ pdd = kfd_get_process_device_data(kfd, p); ++ if (pdd) ++ r = process_restore_queues(kfd->dqm, &pdd->qpd); ++ ++ up_read(&p->lock); ++ return r; ++} ++ + static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, + unsigned int chunk_size) + { +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +index 42de22b..e123390 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +@@ -44,9 +44,10 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd); + +-static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock); +-static int destroy_queues_cpsch(struct device_queue_manager *dqm, +- bool preempt_static_queues, bool lock); ++static int execute_queues_cpsch(struct device_queue_manager *dqm); ++static int unmap_queues_cpsch(struct device_queue_manager *dqm, ++ enum kfd_unmap_queues_filter filter, ++ uint32_t filter_param, bool reset); + + static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, +@@ -116,11 +117,11 @@ static int allocate_vmid(struct device_queue_manager *dqm, + if (dqm->vmid_bitmap == 0) + return -ENOMEM; + +- bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap, CIK_VMID_NUM); ++ bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap, ++ dqm->dev->vm_info.vmid_num_kfd); + clear_bit(bit, (unsigned long *)&dqm->vmid_bitmap); + +- /* Kaveri kfd vmid's starts from vmid 8 */ +- allocated_vmid = bit + KFD_VMID_START_OFFSET; ++ allocated_vmid = bit + dqm->dev->vm_info.first_vmid_kfd; + pr_debug("kfd: vmid allocation %d\n", allocated_vmid); + qpd->vmid = allocated_vmid; + q->properties.vmid = allocated_vmid; +@@ -128,6 +129,11 @@ static int allocate_vmid(struct device_queue_manager *dqm, + set_pasid_vmid_mapping(dqm, q->process->pasid, q->properties.vmid); + program_sh_mem_settings(dqm, qpd); + ++ dqm->dev->kfd2kgd->set_vm_context_page_table_base(dqm->dev->kgd, ++ allocated_vmid, ++ qpd->page_table_base); ++ /*invalidate the VM context after pasid and vmid mapping is set up*/ ++ radeon_flush_tlb(dqm->dev, qpd->pqm->process->pasid); + return 0; + } + +@@ -135,7 +141,7 @@ static void deallocate_vmid(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) + { +- int bit = qpd->vmid - KFD_VMID_START_OFFSET; ++ int bit = qpd->vmid - dqm->dev->vm_info.first_vmid_kfd; + + /* Release the vmid mapping */ + set_pasid_vmid_mapping(dqm, 0, qpd->vmid); +@@ -175,6 +181,14 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, + } + *allocated_vmid = qpd->vmid; + q->properties.vmid = qpd->vmid; ++ /* ++ * Eviction state logic: we 
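allocate_vmid hands out the lowest free bit of dqm->vmid_bitmap and offsets it by first_vmid_kfd, so the bitmap stays zero-based while the hardware VMIDs start wherever the kgd placed them; deallocate_vmid reverses the offset. The same bookkeeping in plain C, with GCC builtins standing in for find_first_bit/clear_bit and an assumed offset of 8:

#include <errno.h>
#include <stdio.h>

static unsigned int vmid_bitmap = 0xFF;   /* vmid_num_kfd = 8 free slots */
static const int first_vmid_kfd = 8;      /* example offset from the kgd */

static int allocate_vmid(void)
{
        int bit;

        if (vmid_bitmap == 0)
                return -ENOMEM;                  /* all VMIDs in use */
        bit = __builtin_ffs(vmid_bitmap) - 1;    /* kernel: find_first_bit */
        vmid_bitmap &= ~(1u << bit);             /* kernel: clear_bit */
        return bit + first_vmid_kfd;
}

static void deallocate_vmid(int vmid)
{
        vmid_bitmap |= 1u << (vmid - first_vmid_kfd);   /* release the bit */
}

int main(void)
{
        int a = allocate_vmid(), b = allocate_vmid();

        printf("allocated %d and %d\n", a, b);          /* 8 and 9 */
        deallocate_vmid(a);
        printf("reallocated %d\n", allocate_vmid());    /* 8 again */
        return 0;
}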
only mark active queues as evicted ++ * to avoid the overhead of restoring inactive queues later ++ */ ++ if (qpd->evicted) ++ q->properties.is_evicted = (q->properties.queue_size > 0 && ++ q->properties.queue_percent > 0 && ++ q->properties.queue_address != 0); + + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) + retval = create_compute_queue_nocpsch(dqm, q, qpd); +@@ -281,8 +295,12 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, + q->pipe, + q->queue); + ++ dqm->dev->kfd2kgd->alloc_memory_of_scratch( ++ dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid); ++ + retval = mqd->load_mqd(mqd, q->mqd, q->pipe, +- q->queue, (uint32_t __user *) q->properties.write_ptr); ++ q->queue, (uint32_t __user *) q->properties.write_ptr, ++ qpd->page_table_base); + if (retval != 0) { + deallocate_hqd(dqm, q); + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); +@@ -362,34 +380,56 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) + { + int retval; + struct mqd_manager *mqd; ++ struct kfd_process_device *pdd; ++ + bool prev_active = false; + + BUG_ON(!dqm || !q || !q->mqd); + + mutex_lock(&dqm->lock); ++ ++ pdd = kfd_get_process_device_data(q->device, q->process); ++ if (!pdd) { ++ mutex_unlock(&dqm->lock); ++ return -ENODEV; ++ } + mqd = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + if (mqd == NULL) { + mutex_unlock(&dqm->lock); + return -ENOMEM; + } ++ /* ++ * Eviction state logic: we only mark active queues as evicted ++ * to avoid the overhead of restoring inactive queues later ++ */ ++ if (pdd->qpd.evicted > 0) ++ q->properties.is_evicted = (q->properties.queue_size > 0 && ++ q->properties.queue_percent > 0 && ++ q->properties.queue_address != 0); + ++ /* save previous activity state for counters */ + if (q->properties.is_active) + prev_active = true; + +- /* +- * +- * check active state vs. the previous state +- * and modify counter accordingly +- */ ++ + retval = mqd->update_mqd(mqd, q->mqd, &q->properties); ++ if (sched_policy == KFD_SCHED_POLICY_NO_HWS && ++ q->properties.type == KFD_QUEUE_TYPE_COMPUTE) ++ retval = mqd->load_mqd(mqd, q->mqd, q->pipe, ++ q->queue, ++ (uint32_t __user *)q->properties.write_ptr, 0); ++ /* ++ * check active state vs. 
the previous state ++ * and modify counter accordingly ++ */ + if ((q->properties.is_active) && (!prev_active)) + dqm->queue_count++; + else if ((!q->properties.is_active) && (prev_active)) + dqm->queue_count--; + + if (sched_policy != KFD_SCHED_POLICY_NO_HWS) +- retval = execute_queues_cpsch(dqm, false); ++ retval = execute_queues_cpsch(dqm); + + mutex_unlock(&dqm->lock); + return retval; +@@ -415,15 +455,115 @@ static struct mqd_manager *get_mqd_manager_nocpsch( + return mqd; + } + ++int process_evict_queues(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd) ++{ ++ struct queue *q, *next; ++ struct mqd_manager *mqd; ++ int retval = 0; ++ ++ BUG_ON(!dqm || !qpd); ++ ++ mutex_lock(&dqm->lock); ++ if (qpd->evicted++ > 0) { /* already evicted, do nothing */ ++ mutex_unlock(&dqm->lock); ++ return 0; ++ } ++ /* unactivate all active queues on the qpd */ ++ list_for_each_entry_safe(q, next, &qpd->queues_list, list) { ++ mqd = dqm->ops.get_mqd_manager(dqm, ++ get_mqd_type_from_queue_type(q->properties.type)); ++ if (!mqd) { /* should not be here */ ++ BUG(); ++ continue; ++ } ++ /* if the queue is not active anyway, it is not evicted */ ++ if (q->properties.is_active == true) ++ q->properties.is_evicted = true; ++ ++ retval = mqd->update_mqd(mqd, q->mqd, &q->properties); ++ if (sched_policy == KFD_SCHED_POLICY_NO_HWS && ++ q->properties.type == KFD_QUEUE_TYPE_COMPUTE) ++ retval = mqd->load_mqd(mqd, q->mqd, q->pipe, ++ q->queue, ++ (uint32_t __user *)q->properties.write_ptr, 0); ++ if (q->properties.is_evicted) ++ dqm->queue_count--; ++ } ++ if (sched_policy != KFD_SCHED_POLICY_NO_HWS) ++ retval = execute_queues_cpsch(dqm); ++ ++ mutex_unlock(&dqm->lock); ++ return retval; ++ ++} ++ ++int process_restore_queues(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd) ++{ ++ struct queue *q, *next; ++ struct mqd_manager *mqd; ++ int retval = 0; ++ ++ ++ BUG_ON(!dqm || !qpd); ++ ++ mutex_lock(&dqm->lock); ++ if (qpd->evicted == 0) { /* already restored, do nothing */ ++ mutex_unlock(&dqm->lock); ++ return 0; ++ } ++ ++ if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */ ++ qpd->evicted--; ++ mutex_unlock(&dqm->lock); ++ return 0; ++ } ++ ++ /* activate all active queues on the qpd */ ++ list_for_each_entry_safe(q, next, &qpd->queues_list, list) { ++ mqd = dqm->ops.get_mqd_manager(dqm, ++ get_mqd_type_from_queue_type(q->properties.type)); ++ if (!mqd) { /* should not be here */ ++ BUG(); ++ continue; ++ } ++ if (q->properties.is_evicted) { ++ q->properties.is_evicted = false; ++ retval = mqd->update_mqd(mqd, q->mqd, &q->properties); ++ if (sched_policy == KFD_SCHED_POLICY_NO_HWS && ++ q->properties.type == KFD_QUEUE_TYPE_COMPUTE) ++ retval = ++ mqd->load_mqd( ++ mqd, ++ q->mqd, ++ q->pipe, ++ q->queue, ++ (uint32_t __user *)q->properties.write_ptr, ++ 0); ++ dqm->queue_count++; ++ } ++ } ++ if (sched_policy != KFD_SCHED_POLICY_NO_HWS) ++ retval = execute_queues_cpsch(dqm); ++ ++ if (retval == 0) ++ qpd->evicted = 0; ++ mutex_unlock(&dqm->lock); ++ return retval; ++ ++} ++ + static int register_process_nocpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) + { ++ struct kfd_process_device *pdd; + struct device_process_node *n; + int retval; + + BUG_ON(!dqm || !qpd); + +- pr_debug("kfd: In func %s\n", __func__); ++ pr_debug("In func %s\n", __func__); + + n = kzalloc(sizeof(struct device_process_node), GFP_KERNEL); + if (!n) +@@ -434,6 +574,11 @@ static int register_process_nocpsch(struct device_queue_manager *dqm, + 
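process_evict_queues and process_restore_queues nest through the qpd->evicted counter: only the first evict actually deactivates queues, and only the restore that brings the count back down reactivates them, so overlapping eviction triggers stay balanced. A sketch of just that counting logic, with the queue manipulation replaced by printouts:

#include <stdio.h>

static int evicted;   /* stands in for qpd->evicted */

static void evict(void)
{
        if (evicted++ > 0)        /* already evicted: count and return */
                return;
        printf("deactivating all queues\n");
}

static void restore(void)
{
        if (evicted == 0)         /* already restored: nothing to do */
                return;
        if (evicted > 1) {        /* still referenced: just decrement */
                evicted--;
                return;
        }
        printf("reactivating all queues\n");
        evicted = 0;
}

int main(void)
{
        evict();     /* deactivates */
        evict();     /* counts only */
        restore();   /* decrements only */
        restore();   /* reactivates */
        return 0;
}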
mutex_lock(&dqm->lock); + list_add(&n->list, &dqm->queues); + ++ pdd = qpd_to_pdd(qpd); ++ qpd->page_table_base = ++ dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm); ++ pr_debug("Retrieved PD address == 0x%08u\n", qpd->page_table_base); ++ + retval = dqm->ops_asic_specific.register_process(dqm, qpd); + + dqm->processes_count++; +@@ -499,7 +644,6 @@ static void init_interrupts(struct device_queue_manager *dqm) + if (is_pipe_enabled(dqm, 0, i)) + dqm->dev->kfd2kgd->init_interrupts(dqm->dev->kgd, i); + } +- + static int init_scheduler(struct device_queue_manager *dqm) + { + int retval = 0; +@@ -534,7 +678,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm) + for (i = 0; i < get_pipes_per_mec(dqm); i++) + dqm->allocated_queues[i] = (1 << get_queues_per_pipe(dqm)) - 1; + +- dqm->vmid_bitmap = (1 << VMID_PER_DEVICE) - 1; ++ dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1; + dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1; + + init_scheduler(dqm); +@@ -607,8 +751,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, + if (retval != 0) + return retval; + +- q->properties.sdma_queue_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; +- q->properties.sdma_engine_id = q->sdma_id / CIK_SDMA_ENGINE_NUM; ++ q->properties.sdma_queue_id = q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; ++ q->properties.sdma_engine_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; + + pr_debug("kfd: sdma id is: %d\n", q->sdma_id); + pr_debug(" sdma queue id: %d\n", q->properties.sdma_queue_id); +@@ -623,7 +767,7 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, + } + + retval = mqd->load_mqd(mqd, q->mqd, 0, +- 0, NULL); ++ 0, NULL, 0); + if (retval != 0) { + deallocate_sdma_queue(dqm, q->sdma_id); + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); +@@ -646,8 +790,7 @@ static int set_sched_resources(struct device_queue_manager *dqm) + + pr_debug("kfd: In func %s\n", __func__); + +- res.vmid_mask = (1 << VMID_PER_DEVICE) - 1; +- res.vmid_mask <<= KFD_VMID_START_OFFSET; ++ res.vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap; + + res.queue_mask = 0; + for (i = 0; i < KGD_MAX_QUEUES; ++i) { +@@ -696,6 +839,7 @@ static int initialize_cpsch(struct device_queue_manager *dqm) + dqm->queue_count = dqm->processes_count = 0; + dqm->sdma_queue_count = 0; + dqm->active_runlist = false; ++ dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1; + retval = dqm->ops_asic_specific.initialize(dqm); + if (retval != 0) + goto fail_init_pipelines; +@@ -716,7 +860,7 @@ static int start_cpsch(struct device_queue_manager *dqm) + + retval = 0; + +- retval = pm_init(&dqm->packets, dqm); ++ retval = pm_init(&dqm->packets, dqm, dqm->dev->mec_fw_version); + if (retval != 0) + goto fail_packet_manager_init; + +@@ -743,7 +887,9 @@ static int start_cpsch(struct device_queue_manager *dqm) + kfd_bind_process_to_device(dqm->dev, + node->qpd->pqm->process); + +- execute_queues_cpsch(dqm, true); ++ mutex_lock(&dqm->lock); ++ execute_queues_cpsch(dqm); ++ mutex_unlock(&dqm->lock); + + return 0; + fail_allocate_vidmem: +@@ -760,7 +906,11 @@ static int stop_cpsch(struct device_queue_manager *dqm) + + BUG_ON(!dqm); + +- destroy_queues_cpsch(dqm, true, true); ++ mutex_lock(&dqm->lock); ++ ++ unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, false); ++ ++ mutex_unlock(&dqm->lock); + + list_for_each_entry(node, &dqm->queues, list) { + pdd = qpd_to_pdd(node->qpd); +@@ -799,7 +949,7 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm, + list_add(&kq->list, 
&qpd->priv_queue_list); + dqm->queue_count++; + qpd->is_debug = true; +- execute_queues_cpsch(dqm, false); ++ execute_queues_cpsch(dqm); + mutex_unlock(&dqm->lock); + + return 0; +@@ -815,11 +965,11 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm, + + mutex_lock(&dqm->lock); + /* here we actually preempt the DIQ */ +- destroy_queues_cpsch(dqm, true, false); ++ unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, false); + list_del(&kq->list); + dqm->queue_count--; + qpd->is_debug = false; +- execute_queues_cpsch(dqm, false); ++ execute_queues_cpsch(dqm); + /* + * Unconditionally decrement this counter, regardless of the queue's + * type. +@@ -830,14 +980,6 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm, + mutex_unlock(&dqm->lock); + } + +-static void select_sdma_engine_id(struct queue *q) +-{ +- static int sdma_id; +- +- q->sdma_id = sdma_id; +- sdma_id = (sdma_id + 1) % 2; +-} +- + static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd, int *allocate_vmid) + { +@@ -860,9 +1002,15 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, + goto out; + } + +- if (q->properties.type == KFD_QUEUE_TYPE_SDMA) +- select_sdma_engine_id(q); +- ++ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { ++ retval = allocate_sdma_queue(dqm, &q->sdma_id); ++ if (retval != 0) ++ goto out; ++ q->properties.sdma_queue_id = ++ q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; ++ q->properties.sdma_engine_id = ++ q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; ++ } + mqd = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + +@@ -870,8 +1018,19 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, + mutex_unlock(&dqm->lock); + return -ENOMEM; + } ++ /* ++ * Eviction state logic: we only mark active queues as evicted ++ * to avoid the overhead of restoring inactive queues later ++ */ ++ if (qpd->evicted) ++ q->properties.is_evicted = (q->properties.queue_size > 0 && ++ q->properties.queue_percent > 0 && ++ q->properties.queue_address != 0); + + dqm->ops_asic_specific.init_sdma_vm(dqm, q, qpd); ++ ++ q->properties.tba_addr = qpd->tba_addr; ++ q->properties.tma_addr = qpd->tma_addr; + retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); + if (retval != 0) +@@ -880,7 +1039,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, + list_add(&q->list, &qpd->queues_list); + if (q->properties.is_active) { + dqm->queue_count++; +- retval = execute_queues_cpsch(dqm, false); ++ retval = execute_queues_cpsch(dqm); + } + + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) +@@ -917,20 +1076,20 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr, + return 0; + } + +-static int destroy_sdma_queues(struct device_queue_manager *dqm, ++static int unmap_sdma_queues(struct device_queue_manager *dqm, + unsigned int sdma_engine) + { + return pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA, +- KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES, 0, false, ++ KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false, + sdma_engine); + } + +-static int destroy_queues_cpsch(struct device_queue_manager *dqm, +- bool preempt_static_queues, bool lock) ++/* dqm->lock mutex has to be locked before calling this function */ ++static int unmap_queues_cpsch(struct device_queue_manager *dqm, ++ enum kfd_unmap_queues_filter filter, ++ uint32_t filter_param, bool reset) + { + int retval; +- enum 
kfd_preempt_type_filter preempt_type; +- struct kfd_process_device *pdd; + + BUG_ON(!dqm); + +@@ -940,23 +1099,21 @@ static int destroy_queues_cpsch(struct device_queue_manager *dqm, + mutex_lock(&dqm->lock); + if (!dqm->active_runlist) + goto out; ++ if (dqm->active_runlist == false) ++ return retval; + + pr_debug("kfd: Before destroying queues, sdma queue count is : %u\n", + dqm->sdma_queue_count); + + if (dqm->sdma_queue_count > 0) { +- destroy_sdma_queues(dqm, 0); +- destroy_sdma_queues(dqm, 1); ++ unmap_sdma_queues(dqm, 0); ++ unmap_sdma_queues(dqm, 1); + } + +- preempt_type = preempt_static_queues ? +- KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES : +- KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES; +- + retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE, +- preempt_type, 0, false, 0); ++ filter, filter_param, reset, 0); + if (retval != 0) +- goto out; ++ return retval; + + *dqm->fence_addr = KFD_FENCE_INIT; + pm_send_query_status(&dqm->packets, dqm->fence_gpu_addr, +@@ -965,55 +1122,47 @@ static int destroy_queues_cpsch(struct device_queue_manager *dqm, + retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED, + QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS); + if (retval != 0) { +- pdd = kfd_get_process_device_data(dqm->dev, +- kfd_get_process(current)); +- pdd->reset_wavefronts = true; +- goto out; ++ pr_err("kfd: unmapping queues failed."); ++ return retval; + } ++ + pm_release_ib(&dqm->packets); + dqm->active_runlist = false; + +-out: +- if (lock) +- mutex_unlock(&dqm->lock); + return retval; + } + +-static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock) ++/* dqm->lock mutex has to be locked before calling this function */ ++static int execute_queues_cpsch(struct device_queue_manager *dqm) + { + int retval; + + BUG_ON(!dqm); + +- if (lock) +- mutex_lock(&dqm->lock); +- +- retval = destroy_queues_cpsch(dqm, false, false); ++ retval = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, ++ 0, false); + if (retval != 0) { + pr_err("kfd: the cp might be in an unrecoverable state due to an unsuccessful queues preemption"); +- goto out; ++ return retval; + } + + if (dqm->queue_count <= 0 || dqm->processes_count <= 0) { + retval = 0; +- goto out; ++ return retval; + } + + if (dqm->active_runlist) { + retval = 0; +- goto out; ++ return retval; + } + + retval = pm_send_runlist(&dqm->packets, &dqm->queues); + if (retval != 0) { + pr_err("kfd: failed to execute runlist"); +- goto out; ++ return retval; + } + dqm->active_runlist = true; + +-out: +- if (lock) +- mutex_unlock(&dqm->lock); + return retval; + } + +@@ -1051,14 +1200,16 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, + goto failed; + } + +- if (q->properties.type == KFD_QUEUE_TYPE_SDMA) ++ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + dqm->sdma_queue_count--; ++ deallocate_sdma_queue(dqm, q->sdma_id); ++ } + + list_del(&q->list); + if (q->properties.is_active) + dqm->queue_count--; + +- execute_queues_cpsch(dqm, false); ++ retval = execute_queues_cpsch(dqm); + + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); + +@@ -1072,7 +1223,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, + + mutex_unlock(&dqm->lock); + +- return 0; ++ return retval; + + failed: + failed_try_destroy_debugged_queue: +@@ -1156,6 +1307,172 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm, + return false; + } + ++static int set_trap_handler(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd, ++ uint64_t tba_addr, ++ uint64_t tma_addr) ++{ ++ 
uint64_t *tma; ++ ++ tma = (uint64_t *)(qpd->cwsr_kaddr + dqm->dev->tma_offset); ++ tma[0] = tba_addr; ++ tma[1] = tma_addr; ++ return 0; ++} ++ ++ ++static int set_page_directory_base(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd) ++{ ++ struct kfd_process_device *pdd; ++ uint32_t pd_base; ++ int retval = 0; ++ ++ BUG_ON(!dqm || !qpd); ++ ++ mutex_lock(&dqm->lock); ++ ++ pdd = qpd_to_pdd(qpd); ++ ++ /* Retrieve PD base */ ++ pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm); ++ ++ /* If it has not changed, just get out */ ++ if (qpd->page_table_base == pd_base) ++ goto out; ++ ++ /* Update PD Base in QPD */ ++ qpd->page_table_base = pd_base; ++ pr_debug("Updated PD address == 0x%08u\n", pd_base); ++ ++ /* ++ * Preempt queues, destroy runlist and create new runlist. Queues ++ * will have the update PD base address ++ */ ++ if (sched_policy != KFD_SCHED_POLICY_NO_HWS) ++ retval = execute_queues_cpsch(dqm); ++ ++out: ++ mutex_unlock(&dqm->lock); ++ ++ return retval; ++} ++ ++static int process_termination_nocpsch(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd) ++{ ++ struct queue *q, *next; ++ struct mqd_manager *mqd; ++ struct device_process_node *cur, *next_dpn; ++ ++ mutex_lock(&dqm->lock); ++ ++ /* Clear all user mode queues */ ++ list_for_each_entry_safe(q, next, &qpd->queues_list, list) { ++ mqd = dqm->ops.get_mqd_manager(dqm, ++ get_mqd_type_from_queue_type(q->properties.type)); ++ if (!mqd) { ++ mutex_unlock(&dqm->lock); ++ return -ENOMEM; ++ } ++ ++ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { ++ dqm->sdma_queue_count--; ++ deallocate_sdma_queue(dqm, q->sdma_id); ++ } ++ ++ list_del(&q->list); ++ if (q->properties.is_active) ++ dqm->queue_count--; ++ ++ dqm->total_queue_count--; ++ mqd->destroy_mqd(mqd, q->mqd, ++ KFD_PREEMPT_TYPE_WAVEFRONT_RESET, ++ QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS, ++ q->pipe, q->queue); ++ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); ++ if (list_empty(&qpd->queues_list)) ++ deallocate_vmid(dqm, qpd, q); ++ } ++ ++ /* Unregister process */ ++ list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) { ++ if (qpd == cur->qpd) { ++ list_del(&cur->list); ++ kfree(cur); ++ dqm->processes_count--; ++ break; ++ } ++ } ++ ++ mutex_unlock(&dqm->lock); ++ ++ return 0; ++} ++ ++ ++static int process_termination_cpsch(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd) ++{ ++ int retval; ++ struct queue *q, *next; ++ struct kernel_queue *kq, *kq_next; ++ struct mqd_manager *mqd; ++ struct device_process_node *cur, *next_dpn; ++ ++ retval = 0; ++ ++ mutex_lock(&dqm->lock); ++ ++ /* Clean all kernel queues */ ++ list_for_each_entry_safe(kq, kq_next, &qpd->priv_queue_list, list) { ++ list_del(&kq->list); ++ dqm->queue_count--; ++ qpd->is_debug = false; ++ dqm->total_queue_count--; ++ } ++ ++ /* Clear all user mode queues */ ++ list_for_each_entry(q, &qpd->queues_list, list) { ++ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { ++ dqm->sdma_queue_count--; ++ deallocate_sdma_queue(dqm, q->sdma_id); ++ } ++ ++ if (q->properties.is_active) ++ dqm->queue_count--; ++ ++ dqm->total_queue_count--; ++ } ++ ++ /* Unregister process */ ++ list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) { ++ if (qpd == cur->qpd) { ++ list_del(&cur->list); ++ kfree(cur); ++ dqm->processes_count--; ++ break; ++ } ++ } ++ ++ retval = execute_queues_cpsch(dqm); ++ ++ /* lastly, free mqd resources */ ++ list_for_each_entry_safe(q, next, &qpd->queues_list, list) { ++ mqd = dqm->ops.get_mqd_manager(dqm, ++ 
get_mqd_type_from_queue_type(q->properties.type)); ++ if (!mqd) { ++ mutex_unlock(&dqm->lock); ++ return -ENOMEM; ++ } ++ list_del(&q->list); ++ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); ++ } ++ ++ mutex_unlock(&dqm->lock); ++ return retval; ++} ++ + struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) + { + struct device_queue_manager *dqm; +@@ -1186,6 +1503,9 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) + dqm->ops.create_kernel_queue = create_kernel_queue_cpsch; + dqm->ops.destroy_kernel_queue = destroy_kernel_queue_cpsch; + dqm->ops.set_cache_memory_policy = set_cache_memory_policy; ++ dqm->ops.set_trap_handler = set_trap_handler; ++ dqm->ops.set_page_directory_base = set_page_directory_base; ++ dqm->ops.process_termination = process_termination_cpsch; + break; + case KFD_SCHED_POLICY_NO_HWS: + /* initialize dqm for no cp scheduling */ +@@ -1200,6 +1520,9 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) + dqm->ops.initialize = initialize_nocpsch; + dqm->ops.uninitialize = uninitialize_nocpsch; + dqm->ops.set_cache_memory_policy = set_cache_memory_policy; ++ dqm->ops.set_trap_handler = set_trap_handler; ++ dqm->ops.set_page_directory_base = set_page_directory_base; ++ dqm->ops.process_termination = process_termination_nocpsch; + break; + default: + BUG(); +@@ -1214,6 +1537,11 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) + case CHIP_KAVERI: + device_queue_manager_init_cik(&dqm->ops_asic_specific); + break; ++ ++ case CHIP_TONGA: ++ case CHIP_FIJI: ++ device_queue_manager_init_vi_tonga(&dqm->ops_asic_specific); ++ break; + } + + if (dqm->ops.initialize(dqm) != 0) { +@@ -1231,3 +1559,20 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm) + dqm->ops.uninitialize(dqm); + kfree(dqm); + } ++ ++int kfd_process_vm_fault(struct device_queue_manager *dqm, ++ unsigned int pasid) ++{ ++ struct kfd_process_device *pdd; ++ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); ++ int ret = 0; ++ ++ if (!p) ++ return -EINVAL; ++ pdd = kfd_get_process_device_data(dqm->dev, p); ++ if (pdd) ++ ret = process_evict_queues(dqm, &pdd->qpd); ++ up_read(&p->lock); ++ ++ return ret; ++} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +index faf820a..d6af017 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +@@ -29,10 +29,7 @@ + #include "kfd_priv.h" + #include "kfd_mqd_manager.h" + +-#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (500) +-#define CIK_VMID_NUM (8) +-#define KFD_VMID_START_OFFSET (8) +-#define VMID_PER_DEVICE CIK_VMID_NUM ++#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (9000) + #define KFD_DQM_FIRST_PIPE (0) + #define CIK_SDMA_QUEUES (4) + #define CIK_SDMA_QUEUES_PER_ENGINE (2) +@@ -79,6 +76,12 @@ struct device_process_node { + * @set_cache_memory_policy: Sets memory policy (cached/ non cached) for the + * memory apertures. + * ++ * @set_page_directory_base: Sets the PD base address (GPU local memory) ++ * in all the queues of the relevant process running on the specified device. ++ * It preempts the queues, updates the value and execute the runlist again. ++ * ++ * @process_termination: Clears all process queues belongs to that device. 
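The two new entries round out a classic kernel function-pointer table: device_queue_manager_init() installs either the cpsch or the nocpsch implementations depending on sched_policy, and callers dispatch through dqm->ops without knowing which policy is active. A minimal sketch of such a dispatch site follows; the wrapper name dqm_terminate_process is hypothetical, only the struct members come from the patch.

    static int dqm_terminate_process(struct device_queue_manager *dqm,
                                     struct qcm_process_device *qpd)
    {
            /* The same call site serves process_termination_cpsch and
             * process_termination_nocpsch; init code picked one of them. */
            if (!dqm->ops.process_termination)
                    return -EINVAL;
            return dqm->ops.process_termination(dqm, qpd);
    }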
++ * + */ + + struct device_queue_manager_ops { +@@ -122,6 +125,16 @@ struct device_queue_manager_ops { + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size); ++ ++ int (*set_trap_handler)(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd, ++ uint64_t tba_addr, ++ uint64_t tma_addr); ++ ++ int (*set_page_directory_base)(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd); ++ int (*process_termination)(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd); + }; + + struct device_queue_manager_asic_ops { +@@ -178,12 +191,20 @@ struct device_queue_manager { + + void device_queue_manager_init_cik(struct device_queue_manager_asic_ops *ops); + void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops); ++void device_queue_manager_init_vi_tonga( ++ struct device_queue_manager_asic_ops *ops); + void program_sh_mem_settings(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); + unsigned int get_queues_num(struct device_queue_manager *dqm); + unsigned int get_queues_per_pipe(struct device_queue_manager *dqm); + unsigned int get_pipes_per_mec(struct device_queue_manager *dqm); + ++int process_evict_queues(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd); ++int process_restore_queues(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd); ++ ++ + static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) + { + return (pdd->lds_base >> 16) & 0xFF; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c +index 48dc056..da55e39c 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c +@@ -24,6 +24,7 @@ + #include "kfd_device_queue_manager.h" + #include "cik_regs.h" + #include "oss/oss_2_4_sh_mask.h" ++#include "gca/gfx_7_2_sh_mask.h" + + static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, +@@ -125,6 +126,7 @@ static int register_process_cik(struct device_queue_manager *dqm, + } else { + temp = get_sh_mem_bases_nybble_64(pdd); + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); ++ qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__PRIVATE_ATC__SHIFT; + } + + pr_debug("kfd: is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c +index 7e9cae9..c023e50 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c +@@ -39,6 +39,31 @@ static int initialize_cpsch_vi(struct device_queue_manager *dqm); + static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd); + ++/* ++ * Tonga device queue manager functions ++ */ ++static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd, ++ enum cache_policy default_policy, ++ enum cache_policy alternate_policy, ++ void __user *alternate_aperture_base, ++ uint64_t alternate_aperture_size); ++static int register_process_vi_tonga(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd); ++static void init_sdma_vm_tonga(struct device_queue_manager *dqm, ++ struct queue *q, ++ struct qcm_process_device *qpd); ++ ++void 
device_queue_manager_init_vi_tonga( ++ struct device_queue_manager_asic_ops *ops) ++{ ++ ops->set_cache_memory_policy = set_cache_memory_policy_vi_tonga; ++ ops->register_process = register_process_vi_tonga; ++ ops->initialize = initialize_cpsch_vi; ++ ops->init_sdma_vm = init_sdma_vm_tonga; ++} ++ ++ + void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops) + { + ops->set_cache_memory_policy = set_cache_memory_policy_vi; +@@ -104,6 +129,33 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, + return true; + } + ++static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd, ++ enum cache_policy default_policy, ++ enum cache_policy alternate_policy, ++ void __user *alternate_aperture_base, ++ uint64_t alternate_aperture_size) ++{ ++ uint32_t default_mtype; ++ uint32_t ape1_mtype; ++ ++ default_mtype = (default_policy == cache_policy_coherent) ? ++ MTYPE_UC : ++ MTYPE_NC_NV; ++ ++ ape1_mtype = (alternate_policy == cache_policy_coherent) ? ++ MTYPE_UC : ++ MTYPE_NC_NV; ++ ++ qpd->sh_mem_config = ++ SH_MEM_ALIGNMENT_MODE_UNALIGNED << ++ SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | ++ default_mtype << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | ++ ape1_mtype << SH_MEM_CONFIG__APE1_MTYPE__SHIFT; ++ ++ return true; ++} ++ + static int register_process_vi(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) + { +@@ -137,6 +189,8 @@ static int register_process_vi(struct device_queue_manager *dqm, + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); + qpd->sh_mem_config |= SH_MEM_ADDRESS_MODE_HSA64 << + SH_MEM_CONFIG__ADDRESS_MODE__SHIFT; ++ qpd->sh_mem_config |= 1 << ++ SH_MEM_CONFIG__PRIVATE_ATC__SHIFT; + } + + pr_debug("kfd: is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", +@@ -145,6 +199,41 @@ static int register_process_vi(struct device_queue_manager *dqm, + return 0; + } + ++static int register_process_vi_tonga(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd) ++{ ++ struct kfd_process_device *pdd; ++ unsigned int temp; ++ ++ BUG_ON(!dqm || !qpd); ++ ++ pdd = qpd_to_pdd(qpd); ++ ++ /* check if sh_mem_config register already configured */ ++ if (qpd->sh_mem_config == 0) { ++ qpd->sh_mem_config = ++ SH_MEM_ALIGNMENT_MODE_UNALIGNED << ++ SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | ++ MTYPE_UC << ++ SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | ++ MTYPE_UC << ++ SH_MEM_CONFIG__APE1_MTYPE__SHIFT; ++ ++ qpd->sh_mem_ape1_limit = 0; ++ qpd->sh_mem_ape1_base = 0; ++ } ++ ++ /* On dGPU we're always in GPUVM64 addressing mode with 64-bit ++ * aperture addresses. 
*/ ++ temp = get_sh_mem_bases_nybble_64(pdd); ++ qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); ++ ++ pr_debug("kfd: sh_mem_bases nybble: 0x%X and register 0x%X\n", ++ temp, qpd->sh_mem_bases); ++ ++ return 0; ++} ++ + static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) + { +@@ -161,6 +250,23 @@ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + q->properties.sdma_vm_addr = value; + } + ++static void init_sdma_vm_tonga(struct device_queue_manager *dqm, ++ struct queue *q, ++ struct qcm_process_device *qpd) ++{ ++ uint32_t value = 0; ++ ++ if (q->process->is_32bit_user_mode) ++ value |= (1 << SDMA0_RLC0_VIRTUAL_ADDR__PTR32__SHIFT) | ++ get_sh_mem_bases_32(qpd_to_pdd(qpd)); ++ else ++ value |= ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) << ++ SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) & ++ SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK; ++ q->properties.sdma_vm_addr = value; ++} ++ ++ + static int initialize_cpsch_vi(struct device_queue_manager *dqm) + { + return 0; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +index 453c5d6..d6a7e2a 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +@@ -142,12 +142,11 @@ int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma) + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + +- pr_debug("kfd: mapping doorbell page in %s\n" ++ pr_debug("kfd: mapping doorbell page in kfd_doorbell_mmap\n" + " target user address == 0x%08llX\n" + " physical address == 0x%08llX\n" + " vm_flags == 0x%04lX\n" + " size == 0x%04lX\n", +- __func__, + (unsigned long long) vma->vm_start, address, vma->vm_flags, + doorbell_process_allocation()); + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c +index d1ce83d..23b5936 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c +@@ -32,11 +32,10 @@ + #include "kfd_events.h" + #include <linux/device.h> + +-/* +- * A task can only be on a single wait_queue at a time, but we need to support ++/* A task can only be on a single wait_queue at a time, but we need to support + * waiting on multiple events (any/all). +- * Instead of each event simply having a wait_queue with sleeping tasks, it +- * has a singly-linked list of tasks. ++ * Instead of each event simply having a wait_queue with sleeping tasks, it has a ++ * singly-linked list of tasks. + * A thread that wants to sleep creates an array of these, one for each event + * and adds one to each event's waiter chain. + */ +@@ -52,12 +51,11 @@ struct kfd_event_waiter { + uint32_t input_index; + }; + +-/* +- * Over-complicated pooled allocator for event notification slots. ++/* Over-complicated pooled allocator for event notification slots. + * +- * Each signal event needs a 64-bit signal slot where the signaler will write +- * a 1 before sending an interrupt.l (This is needed because some interrupts +- * do not contain enough spare data bits to identify an event.) ++ * Each signal event needs a 64-bit signal slot where the signaler will write a 1 ++ * before sending an interrupt.l (This is needed because some interrupts do not ++ * contain enough spare data bits to identify an event.) + * We get whole pages from vmalloc and map them to the process VA. + * Individual signal events are then allocated a slot in a page. 
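To make the slot accounting concrete before the kernel version appears below, here is a self-contained userspace model of that pooled allocator: a per-page used-slot bitmap searched first-fit, plus a free counter. The kernel code uses find_first_zero_bit()/__set_bit() on page->used_slot_bitmap; the hand-rolled bit scan here exists only so the sketch builds outside the kernel.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define SLOTS 256                       /* stands in for SLOTS_PER_PAGE */

    struct slot_page {
            uint64_t bitmap[SLOTS / 64];    /* one bit per 64-bit signal slot */
            unsigned int free_slots;
    };

    /* First-fit: find the lowest clear bit, mark it used, return its index. */
    static bool alloc_slot(struct slot_page *p, unsigned int *out)
    {
            if (p->free_slots == 0)
                    return false;
            for (unsigned int i = 0; i < SLOTS; i++) {
                    if (!(p->bitmap[i / 64] & (1ULL << (i % 64)))) {
                            p->bitmap[i / 64] |= 1ULL << (i % 64);
                            p->free_slots--;
                            *out = i;
                            return true;
                    }
            }
            return false;
    }

    int main(void)
    {
            struct slot_page page = { .free_slots = SLOTS };
            unsigned int slot;

            if (alloc_slot(&page, &slot))
                    printf("allocated slot %u, %u left\n", slot, page.free_slots);
            return 0;
    }

Freeing is the mirror image, clear the bit and bump free_slots, which is what release_event_notification_slot() does.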
+ */ +@@ -65,6 +63,7 @@ struct kfd_event_waiter { + struct signal_page { + struct list_head event_pages; /* kfd_process.signal_event_pages */ + uint64_t *kernel_address; ++ uint64_t handle; + uint64_t __user *user_address; + uint32_t page_index; /* Index into the mmap aperture. */ + unsigned int free_slots; +@@ -74,8 +73,7 @@ struct signal_page { + #define SLOTS_PER_PAGE KFD_SIGNAL_EVENT_LIMIT + #define SLOT_BITMAP_SIZE BITS_TO_LONGS(SLOTS_PER_PAGE) + #define BITS_PER_PAGE (ilog2(SLOTS_PER_PAGE)+1) +-#define SIGNAL_PAGE_SIZE (sizeof(struct signal_page) + \ +- SLOT_BITMAP_SIZE * sizeof(long)) ++#define SIGNAL_PAGE_SIZE (sizeof(struct signal_page) + SLOT_BITMAP_SIZE * sizeof(long)) + + /* + * For signal events, the event ID is used as the interrupt user data. +@@ -85,23 +83,27 @@ struct signal_page { + #define INTERRUPT_DATA_BITS 8 + #define SIGNAL_EVENT_ID_SLOT_SHIFT 0 + ++/* We can only create 8 debug events */ ++ ++#define KFD_DEBUG_EVENT_LIMIT 8 ++#define KFD_DEBUG_EVENT_MASK 0x1F ++#define KFD_DEBUG_EVENT_SHIFT 5 ++ + static uint64_t *page_slots(struct signal_page *page) + { + return page->kernel_address; + } + +-static bool allocate_free_slot(struct kfd_process *process, +- struct signal_page **out_page, +- unsigned int *out_slot_index) ++static bool ++allocate_free_slot(struct kfd_process *process, ++ struct signal_page **out_page, ++ unsigned int *out_slot_index) + { + struct signal_page *page; + + list_for_each_entry(page, &process->signal_event_pages, event_pages) { + if (page->free_slots > 0) { +- unsigned int slot = +- find_first_zero_bit(page->used_slot_bitmap, +- SLOTS_PER_PAGE); +- ++ unsigned int slot = find_first_zero_bit(page->used_slot_bitmap, SLOTS_PER_PAGE); + __set_bit(slot, page->used_slot_bitmap); + page->free_slots--; + +@@ -130,6 +132,8 @@ static bool allocate_signal_page(struct file *devkfd, struct kfd_process *p) + { + void *backing_store; + struct signal_page *page; ++ unsigned int slot; ++ int i; + + page = kzalloc(SIGNAL_PAGE_SIZE, GFP_KERNEL); + if (!page) +@@ -137,17 +141,23 @@ static bool allocate_signal_page(struct file *devkfd, struct kfd_process *p) + + page->free_slots = SLOTS_PER_PAGE; + +- backing_store = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, ++ backing_store = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, \ + get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); + if (!backing_store) + goto fail_alloc_signal_store; + + /* prevent user-mode info leaks */ +- memset(backing_store, (uint8_t) UNSIGNALED_EVENT_SLOT, +- KFD_SIGNAL_EVENT_LIMIT * 8); +- ++ memset(backing_store, (uint8_t) UNSIGNALED_EVENT_SLOT, KFD_SIGNAL_EVENT_LIMIT * 8); + page->kernel_address = backing_store; + ++ /* Set bits of debug events to prevent allocation */ ++ for (i = 0 ; i < KFD_DEBUG_EVENT_LIMIT ; i++) { ++ slot = (i << KFD_DEBUG_EVENT_SHIFT) | ++ KFD_DEBUG_EVENT_MASK; ++ __set_bit(slot, page->used_slot_bitmap); ++ page->free_slots--; ++ } ++ + if (list_empty(&p->signal_event_pages)) + page->page_index = 0; + else +@@ -169,10 +179,10 @@ static bool allocate_signal_page(struct file *devkfd, struct kfd_process *p) + return false; + } + +-static bool allocate_event_notification_slot(struct file *devkfd, +- struct kfd_process *p, +- struct signal_page **page, +- unsigned int *signal_slot_index) ++static bool ++allocate_event_notification_slot(struct file *devkfd, struct kfd_process *p, ++ struct signal_page **page, ++ unsigned int *signal_slot_index) + { + bool ret; + +@@ -186,6 +196,88 @@ static bool allocate_event_notification_slot(struct file *devkfd, + return ret; + } + 
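The reservation loop just added to allocate_signal_page() pins down which slots the debug path may use: with KFD_DEBUG_EVENT_SHIFT = 5 and KFD_DEBUG_EVENT_MASK = 0x1F, the reserved slots are exactly those whose low five bits are all set, i.e. 31, 63, 95, 127, 159, 191, 223 and 255, eight slots in total, matching KFD_DEBUG_EVENT_LIMIT. The same formula is used below by allocate_debug_event_notification_slot() to pick a slot from debug_event_count. A trivial standalone check of the arithmetic:

    #include <stdio.h>

    #define KFD_DEBUG_EVENT_LIMIT 8
    #define KFD_DEBUG_EVENT_MASK  0x1F
    #define KFD_DEBUG_EVENT_SHIFT 5

    int main(void)
    {
            /* Prints slots 31, 63, 95, 127, 159, 191, 223, 255. */
            for (int i = 0; i < KFD_DEBUG_EVENT_LIMIT; i++)
                    printf("debug event %d -> slot %d\n", i,
                           (i << KFD_DEBUG_EVENT_SHIFT) | KFD_DEBUG_EVENT_MASK);
            return 0;
    }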
++static bool ++allocate_signal_page_dgpu(struct kfd_process *p, ++ uint64_t *kernel_address, uint64_t handle) ++{ ++ struct signal_page *my_page; ++ ++ my_page = kzalloc(SIGNAL_PAGE_SIZE, GFP_KERNEL); ++ if (!my_page) ++ return false; ++ ++ /* prevent user-mode info leaks */ ++ memset(kernel_address, (uint8_t) UNSIGNALED_EVENT_SLOT, ++ KFD_SIGNAL_EVENT_LIMIT * 8); ++ ++ my_page->kernel_address = kernel_address; ++ my_page->handle = handle; ++ my_page->user_address = NULL; ++ my_page->free_slots = SLOTS_PER_PAGE; ++ if (list_empty(&p->signal_event_pages)) ++ my_page->page_index = 0; ++ else ++ my_page->page_index = list_tail_entry(&p->signal_event_pages, ++ struct signal_page, ++ event_pages)->page_index + 1; ++ ++ pr_debug("allocated new event signal page at %p, for process %p\n", ++ my_page, p); ++ pr_debug("page index is %d\n", my_page->page_index); ++ ++ list_add(&my_page->event_pages, &p->signal_event_pages); ++ ++ return true; ++} ++ ++void kfd_free_signal_page_dgpu(struct kfd_process *p, uint64_t handle) ++{ ++ struct signal_page *page, *tmp; ++ ++ list_for_each_entry_safe(page, tmp, &p->signal_event_pages, ++ event_pages) { ++ if (page->handle == handle) { ++ list_del(&page->event_pages); ++ kfree(page); ++ break; ++ } ++ } ++} ++ ++static bool ++allocate_debug_event_notification_slot(struct file *devkfd, ++ struct kfd_process *p, ++ struct signal_page **out_page, ++ unsigned int *out_slot_index) ++{ ++ struct signal_page *page; ++ unsigned int slot; ++ bool ret; ++ ++ if (list_empty(&p->signal_event_pages)) { ++ ret = allocate_signal_page(devkfd, p); ++ if (ret == false) ++ return ret; ++ } ++ ++ page = list_entry((&p->signal_event_pages)->next, struct signal_page, ++ event_pages); ++ slot = (p->debug_event_count << KFD_DEBUG_EVENT_SHIFT) | ++ KFD_DEBUG_EVENT_MASK; ++ ++ pr_debug("page == %p\n", page); ++ pr_debug("slot == %d\n", slot); ++ ++ page_slots(page)[slot] = UNSIGNALED_EVENT_SLOT; ++ *out_page = page; ++ *out_slot_index = slot; ++ ++ pr_debug("allocated debug event signal slot in page %p, slot %d\n", ++ page, slot); ++ ++ return true; ++} ++ + /* Assumes that the process's event_mutex is locked. */ + static void release_event_notification_slot(struct signal_page *page, + size_t slot_index) +@@ -202,10 +294,7 @@ static struct signal_page *lookup_signal_page_by_index(struct kfd_process *p, + { + struct signal_page *page; + +- /* +- * This is safe because we don't delete signal pages until the +- * process exits. +- */ ++ /* This is safe because we don't delete signal pages until the process exits. */ + list_for_each_entry(page, &p->signal_event_pages, event_pages) + if (page->page_index == page_index) + return page; +@@ -213,10 +302,7 @@ static struct signal_page *lookup_signal_page_by_index(struct kfd_process *p, + return NULL; + } + +-/* +- * Assumes that p->event_mutex is held and of course that p is not going +- * away (current or locked). +- */ ++/* Assumes that p->event_mutex is held and of course that p is not going away (current or locked). 
*/ + static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id) + { + struct kfd_event *ev; +@@ -231,32 +317,27 @@ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id) + static u32 make_signal_event_id(struct signal_page *page, + unsigned int signal_slot_index) + { +- return page->page_index | +- (signal_slot_index << SIGNAL_EVENT_ID_SLOT_SHIFT); ++ return page->page_index | (signal_slot_index << SIGNAL_EVENT_ID_SLOT_SHIFT); + } + +-/* +- * Produce a kfd event id for a nonsignal event. +- * These are arbitrary numbers, so we do a sequential search through +- * the hash table for an unused number. ++/* Produce a kfd event id for a nonsignal event. ++ * These are arbitrary numbers, so we do a sequential search through the hash table ++ * for an unused number. + */ + static u32 make_nonsignal_event_id(struct kfd_process *p) + { + u32 id; + + for (id = p->next_nonsignal_event_id; +- id < KFD_LAST_NONSIGNAL_EVENT_ID && +- lookup_event_by_id(p, id) != NULL; +- id++) ++ id < KFD_LAST_NONSIGNAL_EVENT_ID && lookup_event_by_id(p, id) != NULL; ++ id++) + ; + + if (id < KFD_LAST_NONSIGNAL_EVENT_ID) { + +- /* +- * What if id == LAST_NONSIGNAL_EVENT_ID - 1? +- * Then next_nonsignal_event_id = LAST_NONSIGNAL_EVENT_ID so +- * the first loop fails immediately and we proceed with the +- * wraparound loop below. ++ /* What if id == LAST_NONSIGNAL_EVENT_ID - 1? ++ * Then next_nonsignal_event_id = LAST_NONSIGNAL_EVENT_ID so the first loop ++ * fails immediately and we proceed with the wraparound loop below. + */ + p->next_nonsignal_event_id = id + 1; + +@@ -264,54 +345,68 @@ static u32 make_nonsignal_event_id(struct kfd_process *p) + } + + for (id = KFD_FIRST_NONSIGNAL_EVENT_ID; +- id < KFD_LAST_NONSIGNAL_EVENT_ID && +- lookup_event_by_id(p, id) != NULL; +- id++) ++ id < KFD_LAST_NONSIGNAL_EVENT_ID && lookup_event_by_id(p, id) != NULL; ++ id++) + ; + + + if (id < KFD_LAST_NONSIGNAL_EVENT_ID) { + p->next_nonsignal_event_id = id + 1; + return id; ++ } else { ++ p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID; ++ return 0; + } +- +- p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID; +- return 0; + } + +-static struct kfd_event *lookup_event_by_page_slot(struct kfd_process *p, +- struct signal_page *page, +- unsigned int signal_slot) ++static struct kfd_event * ++lookup_event_by_page_slot(struct kfd_process *p, ++ struct signal_page *page, unsigned int signal_slot) + { + return lookup_event_by_id(p, make_signal_event_id(page, signal_slot)); + } + +-static int create_signal_event(struct file *devkfd, +- struct kfd_process *p, +- struct kfd_event *ev) ++static int ++create_signal_event(struct file *devkfd, struct kfd_process *p, struct kfd_event *ev) + { +- if (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT) { ++ if ((ev->type == KFD_EVENT_TYPE_SIGNAL) && ++ (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT)) { + pr_warn("amdkfd: Signal event wasn't created because limit was reached\n"); + return -ENOMEM; ++ } else if ((ev->type == KFD_EVENT_TYPE_DEBUG) && ++ (p->debug_event_count == KFD_DEBUG_EVENT_LIMIT)) { ++ pr_warn("amdkfd: Debug event wasn't created because limit was reached\n"); ++ return -ENOMEM; + } + +- if (!allocate_event_notification_slot(devkfd, p, &ev->signal_page, ++ if (ev->type == KFD_EVENT_TYPE_SIGNAL) { ++ if (!allocate_event_notification_slot(devkfd, p, ++ &ev->signal_page, + &ev->signal_slot_index)) { +- pr_warn("amdkfd: Signal event wasn't created because out of kernel memory\n"); +- return -ENOMEM; +- } ++ pr_warn("amdkfd: 
Signal event wasn't created because out of kernel memory\n"); ++ return -ENOMEM; ++ } + +- p->signal_event_count++; ++ p->signal_event_count++; + +- ev->user_signal_address = +- &ev->signal_page->user_address[ev->signal_slot_index]; ++ if ((p->signal_event_count & KFD_DEBUG_EVENT_MASK) == ++ KFD_DEBUG_EVENT_MASK) ++ p->signal_event_count++; + +- ev->event_id = make_signal_event_id(ev->signal_page, +- ev->signal_slot_index); ++ } else if (ev->type == KFD_EVENT_TYPE_DEBUG) { ++ if (!allocate_debug_event_notification_slot(devkfd, p, ++ &ev->signal_page, ++ &ev->signal_slot_index)) { ++ pr_warn("amdkfd: Debug event wasn't created because out of kernel memory\n"); ++ return -ENOMEM; ++ } + +- pr_debug("signal event number %zu created with id %d, address %p\n", +- p->signal_event_count, ev->event_id, +- ev->user_signal_address); ++ p->debug_event_count++; ++ } ++ ++ ev->user_signal_address = &ev->signal_page->user_address[ev->signal_slot_index]; ++ ++ ev->event_id = make_signal_event_id(ev->signal_page, ev->signal_slot_index); + + pr_debug("signal event number %zu created with id %d, address %p\n", + p->signal_event_count, ev->event_id, +@@ -320,12 +415,10 @@ static int create_signal_event(struct file *devkfd, + return 0; + } + +-/* +- * No non-signal events are supported yet. +- * We create them as events that never signal. +- * Set event calls from user-mode are failed. +- */ +-static int create_other_event(struct kfd_process *p, struct kfd_event *ev) ++/* No non-signal events are supported yet. ++ * We create them as events that never signal. Set event calls from user-mode are failed. */ ++static int ++create_other_event(struct kfd_process *p, struct kfd_event *ev) + { + ev->event_id = make_nonsignal_event_id(p); + if (ev->event_id == 0) +@@ -341,20 +434,25 @@ void kfd_event_init_process(struct kfd_process *p) + INIT_LIST_HEAD(&p->signal_event_pages); + p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID; + p->signal_event_count = 0; ++ p->debug_event_count = 0; + } + + static void destroy_event(struct kfd_process *p, struct kfd_event *ev) + { + if (ev->signal_page != NULL) { +- release_event_notification_slot(ev->signal_page, +- ev->signal_slot_index); +- p->signal_event_count--; ++ if (ev->type == KFD_EVENT_TYPE_SIGNAL) { ++ release_event_notification_slot(ev->signal_page, ++ ev->signal_slot_index); ++ p->signal_event_count--; ++ if ((p->signal_event_count & KFD_DEBUG_EVENT_MASK) == ++ KFD_DEBUG_EVENT_MASK) ++ p->signal_event_count--; ++ } else if (ev->type == KFD_EVENT_TYPE_DEBUG) { ++ p->debug_event_count--; ++ } + } + +- /* +- * Abandon the list of waiters. Individual waiting threads will +- * clean up their own data. +- */ ++ /* Abandon the list of waiters. Individual waiting threads will clean up their own data.*/ + list_del(&ev->waiters); + + hash_del(&ev->events); +@@ -371,18 +469,17 @@ static void destroy_events(struct kfd_process *p) + destroy_event(p, ev); + } + +-/* +- * We assume that the process is being destroyed and there is no need to +- * unmap the pages or keep bookkeeping data in order. +- */ ++/* We assume that the process is being destroyed and there is no need to unmap the pages ++ * or keep bookkeeping data in order. 
*/ + static void shutdown_signal_pages(struct kfd_process *p) + { + struct signal_page *page, *tmp; + +- list_for_each_entry_safe(page, tmp, &p->signal_event_pages, +- event_pages) { +- free_pages((unsigned long)page->kernel_address, +- get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); ++ list_for_each_entry_safe(page, tmp, &p->signal_event_pages, event_pages) { ++ if (page->user_address) { ++ free_pages((unsigned long)page->kernel_address, ++ get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); ++ } + kfree(page); + } + } +@@ -395,8 +492,7 @@ void kfd_event_free_process(struct kfd_process *p) + + static bool event_can_be_gpu_signaled(const struct kfd_event *ev) + { +- return ev->type == KFD_EVENT_TYPE_SIGNAL || +- ev->type == KFD_EVENT_TYPE_DEBUG; ++ return ev->type == KFD_EVENT_TYPE_SIGNAL || ev->type == KFD_EVENT_TYPE_DEBUG; + } + + static bool event_can_be_cpu_signaled(const struct kfd_event *ev) +@@ -407,11 +503,12 @@ static bool event_can_be_cpu_signaled(const struct kfd_event *ev) + int kfd_event_create(struct file *devkfd, struct kfd_process *p, + uint32_t event_type, bool auto_reset, uint32_t node_id, + uint32_t *event_id, uint32_t *event_trigger_data, +- uint64_t *event_page_offset, uint32_t *event_slot_index) ++ uint64_t *event_page_offset, uint32_t *event_slot_index, ++ void *kern_addr) + { + int ret = 0; +- struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL); + ++ struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) + return -ENOMEM; + +@@ -421,17 +518,20 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, + + INIT_LIST_HEAD(&ev->waiters); + +- *event_page_offset = 0; +- + mutex_lock(&p->event_mutex); + ++ if (kern_addr && list_empty(&p->signal_event_pages)) ++ allocate_signal_page_dgpu(p, kern_addr, *event_page_offset); ++ ++ *event_page_offset = 0; ++ + switch (event_type) { + case KFD_EVENT_TYPE_SIGNAL: + case KFD_EVENT_TYPE_DEBUG: + ret = create_signal_event(devkfd, p, ev); + if (!ret) { + *event_page_offset = (ev->signal_page->page_index | +- KFD_MMAP_EVENTS_MASK); ++ KFD_MMAP_TYPE_EVENTS); + *event_page_offset <<= PAGE_SHIFT; + *event_slot_index = ev->signal_slot_index; + } +@@ -538,8 +638,7 @@ int kfd_reset_event(struct kfd_process *p, uint32_t event_id) + + static void acknowledge_signal(struct kfd_process *p, struct kfd_event *ev) + { +- page_slots(ev->signal_page)[ev->signal_slot_index] = +- UNSIGNALED_EVENT_SLOT; ++ page_slots(ev->signal_page)[ev->signal_slot_index] = UNSIGNALED_EVENT_SLOT; + } + + static bool is_slot_signaled(struct signal_page *page, unsigned int index) +@@ -547,8 +646,7 @@ static bool is_slot_signaled(struct signal_page *page, unsigned int index) + return page_slots(page)[index] != UNSIGNALED_EVENT_SLOT; + } + +-static void set_event_from_interrupt(struct kfd_process *p, +- struct kfd_event *ev) ++static void set_event_from_interrupt(struct kfd_process *p, struct kfd_event *ev) + { + if (ev && event_can_be_gpu_signaled(ev)) { + acknowledge_signal(p, ev); +@@ -561,42 +659,39 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, + { + struct kfd_event *ev; + +- /* +- * Because we are called from arbitrary context (workqueue) as opposed ++ /* Because we are called from arbitrary context (workqueue) as opposed + * to process context, kfd_process could attempt to exit while we are +- * running so the lookup function returns a locked process. +- */ ++ * running so the lookup function returns a read-locked process. 
*/ + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); +- + if (!p) + return; /* Presumably process exited. */ + + mutex_lock(&p->event_mutex); + +- if (valid_id_bits >= INTERRUPT_DATA_BITS) { ++ if ((valid_id_bits >= INTERRUPT_DATA_BITS) && ++ ((partial_id & KFD_DEBUG_EVENT_MASK) == ++ KFD_DEBUG_EVENT_MASK)) { + /* Partial ID is a full ID. */ + ev = lookup_event_by_id(p, partial_id); + set_event_from_interrupt(p, ev); + } else { +- /* +- * Partial ID is in fact partial. For now we completely +- * ignore it, but we could use any bits we did receive to +- * search faster. +- */ ++ /* Partial ID is in fact partial. For now we completely ignore it, ++ * but we could use any bits we did receive to search faster. */ + struct signal_page *page; + unsigned i; + +- list_for_each_entry(page, &p->signal_event_pages, event_pages) +- for (i = 0; i < SLOTS_PER_PAGE; i++) ++ list_for_each_entry(page, &p->signal_event_pages, event_pages) { ++ for (i = 0; i < SLOTS_PER_PAGE; i++) { + if (is_slot_signaled(page, i)) { +- ev = lookup_event_by_page_slot(p, +- page, i); ++ ev = lookup_event_by_page_slot(p, page, i); + set_event_from_interrupt(p, ev); + } ++ } ++ } + } + + mutex_unlock(&p->event_mutex); +- mutex_unlock(&p->mutex); ++ up_read(&p->lock); + } + + static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) +@@ -604,20 +699,20 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) + struct kfd_event_waiter *event_waiters; + uint32_t i; + +- event_waiters = kmalloc_array(num_events, +- sizeof(struct kfd_event_waiter), +- GFP_KERNEL); ++ event_waiters = kmalloc(num_events * sizeof(struct kfd_event_waiter), GFP_KERNEL); + +- for (i = 0; (event_waiters) && (i < num_events) ; i++) { +- INIT_LIST_HEAD(&event_waiters[i].waiters); +- event_waiters[i].sleeping_task = current; +- event_waiters[i].activated = false; ++ if (event_waiters) { ++ for (i = 0; i < num_events; i++) { ++ INIT_LIST_HEAD(&event_waiters[i].waiters); ++ event_waiters[i].sleeping_task = current; ++ event_waiters[i].activated = false; ++ } + } + + return event_waiters; + } + +-static int init_event_waiter(struct kfd_process *p, ++static int init_event_waiter_get_status(struct kfd_process *p, + struct kfd_event_waiter *waiter, + uint32_t event_id, + uint32_t input_index) +@@ -632,13 +727,21 @@ static int init_event_waiter(struct kfd_process *p, + waiter->activated = ev->signaled; + ev->signaled = ev->signaled && !ev->auto_reset; + +- list_add(&waiter->waiters, &ev->waiters); +- + return 0; + } + ++static void init_event_waiter_add_to_waitlist(struct kfd_event_waiter *waiter) ++{ ++ struct kfd_event *ev = waiter->event; ++ ++ /* Only add to the wait list if we actually need to ++ * wait on this event. 
*/ ++ if (!waiter->activated) ++ list_add(&waiter->waiters, &ev->waiters); ++} ++ + static bool test_event_condition(bool all, uint32_t num_events, +- struct kfd_event_waiter *event_waiters) ++ struct kfd_event_waiter *event_waiters) + { + uint32_t i; + uint32_t activated_count = 0; +@@ -663,23 +766,15 @@ static bool copy_signaled_event_data(uint32_t num_events, + struct kfd_event_waiter *event_waiters, + struct kfd_event_data __user *data) + { +- struct kfd_hsa_memory_exception_data *src; +- struct kfd_hsa_memory_exception_data __user *dst; +- struct kfd_event_waiter *waiter; +- struct kfd_event *event; + uint32_t i; + +- for (i = 0; i < num_events; i++) { +- waiter = &event_waiters[i]; +- event = waiter->event; +- if (waiter->activated && event->type == KFD_EVENT_TYPE_MEMORY) { +- dst = &data[waiter->input_index].memory_exception_data; +- src = &event->memory_exception_data; +- if (copy_to_user(dst, src, +- sizeof(struct kfd_hsa_memory_exception_data))) ++ for (i = 0; i < num_events; i++) ++ if (event_waiters[i].activated && ++ event_waiters[i].event->type == KFD_EVENT_TYPE_MEMORY) ++ if (copy_to_user(&data[event_waiters[i].input_index].memory_exception_data, ++ &event_waiters[i].event->memory_exception_data, ++ sizeof(struct kfd_hsa_memory_exception_data))) + return false; +- } +- } + + return true; + +@@ -695,11 +790,9 @@ static long user_timeout_to_jiffies(uint32_t user_timeout_ms) + if (user_timeout_ms == KFD_EVENT_TIMEOUT_INFINITE) + return MAX_SCHEDULE_TIMEOUT; + +- /* +- * msecs_to_jiffies interprets all values above 2^31-1 as infinite, ++ /* msecs_to_jiffies interprets all values above 2^31-1 as infinite, + * but we consider them finite. +- * This hack is wrong, but nobody is likely to notice. +- */ ++ * This hack is wrong, but nobody is likely to notice. */ + user_timeout_ms = min_t(uint32_t, user_timeout_ms, 0x7FFFFFFF); + + return msecs_to_jiffies(user_timeout_ms) + 1; +@@ -724,11 +817,16 @@ int kfd_wait_on_events(struct kfd_process *p, + (struct kfd_event_data __user *) data; + uint32_t i; + int ret = 0; ++ + struct kfd_event_waiter *event_waiters = NULL; + long timeout = user_timeout_to_jiffies(user_timeout_ms); + + mutex_lock(&p->event_mutex); + ++ /* Set to something unreasonable - this is really ++ * just a bool for now. */ ++ *wait_result = KFD_WAIT_TIMEOUT; ++ + event_waiters = alloc_event_waiters(num_events); + if (!event_waiters) { + ret = -ENOMEM; +@@ -744,14 +842,34 @@ int kfd_wait_on_events(struct kfd_process *p, + goto fail; + } + +- ret = init_event_waiter(p, &event_waiters[i], ++ ret = init_event_waiter_get_status(p, &event_waiters[i], + event_data.event_id, i); + if (ret) + goto fail; + } + ++ /* Check condition once. */ ++ if (test_event_condition(all, num_events, event_waiters)) { ++ if (copy_signaled_event_data(num_events, ++ event_waiters, events)) ++ *wait_result = KFD_WAIT_COMPLETE; ++ else ++ *wait_result = KFD_WAIT_ERROR; ++ free_waiters(num_events, event_waiters); ++ } else { ++ /* Add to wait lists if we need to wait. */ ++ for (i = 0; i < num_events; i++) ++ init_event_waiter_add_to_waitlist(&event_waiters[i]); ++ } ++ + mutex_unlock(&p->event_mutex); + ++ /* Return if all waits were already satisfied. 
*/ ++ if (*wait_result != KFD_WAIT_TIMEOUT) { ++ __set_current_state(TASK_RUNNING); ++ return ret; ++ } ++ + while (true) { + if (fatal_signal_pending(current)) { + ret = -EINTR; +@@ -760,17 +878,17 @@ int kfd_wait_on_events(struct kfd_process *p, + + if (signal_pending(current)) { + /* +- * This is wrong when a nonzero, non-infinite timeout +- * is specified. We need to use +- * ERESTARTSYS_RESTARTBLOCK, but struct restart_block +- * contains a union with data for each user and it's +- * in generic kernel code that I don't want to +- * touch yet. ++ * This is wrong when a nonzero, non-infinite timeout is specified. ++ * We need to use ERESTARTSYS_RESTARTBLOCK, but struct restart_block ++ * contains a union with data for each user and it's in generic ++ * kernel code that I don't want to touch yet. + */ + ret = -ERESTARTSYS; + break; + } + ++ set_current_state(TASK_INTERRUPTIBLE); ++ + if (test_event_condition(all, num_events, event_waiters)) { + if (copy_signaled_event_data(num_events, + event_waiters, events)) +@@ -785,7 +903,7 @@ int kfd_wait_on_events(struct kfd_process *p, + break; + } + +- timeout = schedule_timeout_interruptible(timeout); ++ timeout = schedule_timeout(timeout); + } + __set_current_state(TASK_RUNNING); + +@@ -825,8 +943,7 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) + page = lookup_signal_page_by_index(p, page_index); + if (!page) { + /* Probably KFD bug, but mmap is user-accessible. */ +- pr_debug("signal page could not be found for page_index %u\n", +- page_index); ++ pr_debug("signal page could not be found for page_index %u\n", page_index); + return -EINVAL; + } + +@@ -858,23 +975,29 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) + static void lookup_events_by_type_and_signal(struct kfd_process *p, + int type, void *event_data) + { +- struct kfd_hsa_memory_exception_data *ev_data; + struct kfd_event *ev; + int bkt; + bool send_signal = true; + +- ev_data = (struct kfd_hsa_memory_exception_data *) event_data; +- +- hash_for_each(p->events, bkt, ev, events) ++ hash_for_each(p->events, bkt, ev, events) { + if (ev->type == type) { + send_signal = false; + dev_dbg(kfd_device, + "Event found: id %X type %d", + ev->event_id, ev->type); + set_event(ev); +- if (ev->type == KFD_EVENT_TYPE_MEMORY && ev_data) +- ev->memory_exception_data = *ev_data; ++ if (ev->type == KFD_EVENT_TYPE_MEMORY && event_data) ++ ev->memory_exception_data = ++ *(struct kfd_hsa_memory_exception_data *)event_data; + } ++ } ++ ++ if (type == KFD_EVENT_TYPE_MEMORY) { ++ dev_warn(kfd_device, ++ "Sending SIGSEGV to HSA Process with PID %d ", ++ p->lead_thread->pid); ++ send_sig(SIGSEGV, p->lead_thread, 0); ++ } + + /* Send SIGTERM no event of type "type" has been found*/ + if (send_signal) { +@@ -901,7 +1024,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, + /* + * Because we are called from arbitrary context (workqueue) as opposed + * to process context, kfd_process could attempt to exit while we are +- * running so the lookup function returns a locked process. ++ * running so the lookup function returns a read-locked process. 
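The reworked wait loop above follows the canonical kernel sleep pattern: set_current_state(TASK_INTERRUPTIBLE) is issued before test_event_condition(), so a wakeup arriving between the condition check and schedule_timeout() flips the task back to TASK_RUNNING and the subsequent schedule_timeout() returns without sleeping; no wakeup can be lost. A compressed sketch of the wait side (a pattern illustration, not a standalone program; condition_is_true() stands in for test_event_condition() and timeout is assumed declared as long):

    for (;;) {
            set_current_state(TASK_INTERRUPTIBLE);  /* 1: publish intent to sleep */
            if (condition_is_true())                /* 2: only then test          */
                    break;
            timeout = schedule_timeout(timeout);    /* 3: sleep until wake/timer  */
            if (!timeout)
                    break;                          /* timed out                  */
    }
    __set_current_state(TASK_RUNNING);

Switching from schedule_timeout_interruptible() to plain schedule_timeout() belongs to the same fix: the _interruptible variant sets TASK_INTERRUPTIBLE itself, which would overwrite the TASK_RUNNING state a waker may have just set after the condition test, reopening the lost-wakeup window.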
+ */ + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + +@@ -916,24 +1039,24 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, + memory_exception_data.gpu_id = dev->id; + memory_exception_data.va = address; + /* Set failure reason */ +- memory_exception_data.failure.NotPresent = 1; +- memory_exception_data.failure.NoExecute = 0; +- memory_exception_data.failure.ReadOnly = 0; ++ memory_exception_data.failure.NotPresent = true; ++ memory_exception_data.failure.NoExecute = false; ++ memory_exception_data.failure.ReadOnly = false; + if (vma) { + if (vma->vm_start > address) { +- memory_exception_data.failure.NotPresent = 1; +- memory_exception_data.failure.NoExecute = 0; +- memory_exception_data.failure.ReadOnly = 0; ++ memory_exception_data.failure.NotPresent = true; ++ memory_exception_data.failure.NoExecute = false; ++ memory_exception_data.failure.ReadOnly = false; + } else { +- memory_exception_data.failure.NotPresent = 0; ++ memory_exception_data.failure.NotPresent = false; + if (is_write_requested && !(vma->vm_flags & VM_WRITE)) +- memory_exception_data.failure.ReadOnly = 1; ++ memory_exception_data.failure.ReadOnly = true; + else +- memory_exception_data.failure.ReadOnly = 0; ++ memory_exception_data.failure.ReadOnly = false; + if (is_execute_requested && !(vma->vm_flags & VM_EXEC)) +- memory_exception_data.failure.NoExecute = 1; ++ memory_exception_data.failure.NoExecute = true; + else +- memory_exception_data.failure.NoExecute = 0; ++ memory_exception_data.failure.NoExecute = false; + } + } + +@@ -946,7 +1069,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, + &memory_exception_data); + + mutex_unlock(&p->event_mutex); +- mutex_unlock(&p->mutex); ++ up_read(&p->lock); + } + + void kfd_signal_hw_exception_event(unsigned int pasid) +@@ -954,7 +1077,7 @@ void kfd_signal_hw_exception_event(unsigned int pasid) + /* + * Because we are called from arbitrary context (workqueue) as opposed + * to process context, kfd_process could attempt to exit while we are +- * running so the lookup function returns a locked process. ++ * running so the lookup function returns a read-locked process. + */ + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + +@@ -967,5 +1090,42 @@ void kfd_signal_hw_exception_event(unsigned int pasid) + lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_HW_EXCEPTION, NULL); + + mutex_unlock(&p->event_mutex); +- mutex_unlock(&p->mutex); ++ up_read(&p->lock); ++} ++ ++void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, ++ struct kfd_vm_fault_info *info) ++{ ++ struct kfd_event *ev; ++ int bkt; ++ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); ++ struct kfd_hsa_memory_exception_data memory_exception_data; ++ ++ if (!p) ++ return; /* Presumably process exited. */ ++ memset(&memory_exception_data, 0, sizeof(memory_exception_data)); ++ memory_exception_data.gpu_id = dev->id; ++ /* Set failure reason */ ++ if (info) { ++ memory_exception_data.va = (info->page_addr) << PAGE_SHIFT; ++ memory_exception_data.failure.NotPresent = ++ info->prot_valid ? true : false; ++ memory_exception_data.failure.NoExecute = ++ info->prot_exec ? true : false; ++ memory_exception_data.failure.ReadOnly = ++ info->prot_write ? 
true : false; ++ } ++ mutex_lock(&p->event_mutex); ++ ++ hash_for_each(p->events, bkt, ev, events) { ++ if (ev->type == KFD_EVENT_TYPE_MEMORY) { ++ ev->memory_exception_data = memory_exception_data; ++ set_event(ev); ++ } ++ } ++ ++ mutex_unlock(&p->event_mutex); ++ up_read(&p->lock); ++ + } ++ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_events.h +index 28f6838..d7987eb 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.h +@@ -34,8 +34,7 @@ + #define KFD_FIRST_NONSIGNAL_EVENT_ID KFD_EVENT_ID_NONSIGNAL_MASK + #define KFD_LAST_NONSIGNAL_EVENT_ID UINT_MAX + +-/* +- * Written into kfd_signal_slot_t to indicate that the event is not signaled. ++/* Written into kfd_signal_slot_t to indicate that the event is not signaled. + * Since the event protocol may need to write the event ID into memory, this + * must not be a valid event ID. + * For the sake of easy memset-ing, this must be a byte pattern. +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +index 2b65510..587f847 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +@@ -33,7 +33,7 @@ + #include <linux/time.h> + #include "kfd_priv.h" + #include <linux/mm.h> +-#include <linux/mman.h> ++#include <uapi/asm-generic/mman-common.h> + #include <asm/processor.h> + + /* +@@ -278,21 +278,36 @@ + #define MAKE_GPUVM_APP_BASE(gpu_num) \ + (((uint64_t)(gpu_num) << 61) + 0x1000000000000L) + +-#define MAKE_GPUVM_APP_LIMIT(base) \ +- (((uint64_t)(base) & \ +- 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL) ++#define MAKE_GPUVM_APP_LIMIT(base, size) \ ++ (((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1) + +-#define MAKE_SCRATCH_APP_BASE(gpu_num) \ +- (((uint64_t)(gpu_num) << 61) + 0x100000000L) ++#define MAKE_SCRATCH_APP_BASE() \ ++ (((uint64_t)(0x1UL) << 61) + 0x100000000L) + + #define MAKE_SCRATCH_APP_LIMIT(base) \ + (((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) + +-#define MAKE_LDS_APP_BASE(gpu_num) \ +- (((uint64_t)(gpu_num) << 61) + 0x0) ++#define MAKE_LDS_APP_BASE() \ ++ (((uint64_t)(0x1UL) << 61) + 0x0) ++ + #define MAKE_LDS_APP_LIMIT(base) \ + (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) + ++ ++#define DGPU_VM_BASE_DEFAULT 0x100000 ++ ++int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd, ++ uint64_t base, uint64_t limit) ++{ ++ if (base < (pdd->qpd.cwsr_base + pdd->dev->cwsr_size)) { ++ pr_err("Set dgpu vm base 0x%llx failed.\n", base); ++ return -EINVAL; ++ } ++ pdd->dgpu_base = base; ++ pdd->dgpu_limit = limit; ++ return 0; ++} ++ + int kfd_init_apertures(struct kfd_process *process) + { + uint8_t id = 0; +@@ -300,13 +315,16 @@ int kfd_init_apertures(struct kfd_process *process) + struct kfd_process_device *pdd; + + /*Iterating over all devices*/ +- while ((dev = kfd_topology_enum_kfd_devices(id)) != NULL && +- id < NUM_OF_SUPPORTED_GPUS) { ++ while (kfd_topology_enum_kfd_devices(id, &dev) == 0) { ++ if (!dev) { ++ id++; /* Skip non GPU devices */ ++ continue; ++ } + + pdd = kfd_create_process_device_data(dev, process); + if (pdd == NULL) { + pr_err("Failed to create process device data\n"); +- return -1; ++ goto err; + } + /* + * For 64 bit process aperture will be statically reserved in +@@ -322,19 +340,24 @@ int kfd_init_apertures(struct kfd_process *process) + * node id couldn't be 0 - the three MSB bits of + * aperture shoudn't be 0 + */ +- pdd->lds_base = MAKE_LDS_APP_BASE(id + 1); ++ pdd->lds_base = 
MAKE_LDS_APP_BASE(); + + pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); + + pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1); + +- pdd->gpuvm_limit = +- MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base); ++ pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT( ++ pdd->gpuvm_base, ++ dev->shared_resources.gpuvm_size); + +- pdd->scratch_base = MAKE_SCRATCH_APP_BASE(id + 1); ++ pdd->scratch_base = MAKE_SCRATCH_APP_BASE(); + + pdd->scratch_limit = + MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); ++ ++ if (KFD_IS_DGPU(dev->device_info->asic_family)) ++ pdd->qpd.cwsr_base = DGPU_VM_BASE_DEFAULT; ++ + } + + dev_dbg(kfd_device, "node id %u\n", id); +@@ -350,6 +373,32 @@ int kfd_init_apertures(struct kfd_process *process) + } + + return 0; ++ ++err: ++ return -1; + } + ++void radeon_flush_tlb(struct kfd_dev *dev, uint32_t pasid) ++{ ++ uint8_t vmid; ++ int first_vmid_to_scan = 8; ++ int last_vmid_to_scan = 15; + ++ const struct kfd2kgd_calls *f2g = dev->kfd2kgd; ++ /* Scan all registers in the range ATC_VMID8_PASID_MAPPING .. ATC_VMID15_PASID_MAPPING ++ * to check which VMID the current process is mapped to ++ * and flush TLB for this VMID if found*/ ++ for (vmid = first_vmid_to_scan; vmid <= last_vmid_to_scan; vmid++) { ++ if (f2g->get_atc_vmid_pasid_mapping_valid( ++ dev->kgd, vmid)) { ++ if (f2g->get_atc_vmid_pasid_mapping_pasid( ++ dev->kgd, vmid) == pasid) { ++ dev_dbg(kfd_device, ++ "TLB of vmid %u", vmid); ++ f2g->write_vmid_invalidate_request( ++ dev->kgd, vmid); ++ break; ++ } ++ } ++ } ++} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +index 7f134aa..a8cdbc8 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +@@ -172,8 +172,7 @@ static void interrupt_wq(struct work_struct *work) + sizeof(uint32_t))]; + + while (dequeue_ih_ring_entry(dev, ih_ring_entry)) +- dev->device_info->event_interrupt_class->interrupt_wq(dev, +- ih_ring_entry); ++ dev->device_info->event_interrupt_class->interrupt_wq(dev, ih_ring_entry); + } + + bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry) +@@ -181,8 +180,7 @@ bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry) + /* integer and bitwise OR so there is no boolean short-circuiting */ + unsigned wanted = 0; + +- wanted |= dev->device_info->event_interrupt_class->interrupt_isr(dev, +- ih_ring_entry); ++ wanted |= dev->device_info->event_interrupt_class->interrupt_isr(dev, ih_ring_entry); + + return wanted != 0; + } +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +index d135cd0..513cfe6 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +@@ -143,7 +143,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, + kq->queue->pipe = KFD_CIK_HIQ_PIPE; + kq->queue->queue = KFD_CIK_HIQ_QUEUE; + kq->mqd->load_mqd(kq->mqd, kq->queue->mqd, kq->queue->pipe, +- kq->queue->queue, NULL); ++ kq->queue->queue, NULL, 0); + } else { + /* allocate fence for DIQ */ + +@@ -213,20 +213,23 @@ static int acquire_packet_buffer(struct kernel_queue *kq, + + BUG_ON(!kq || !buffer_ptr); + ++ /* When rptr == wptr, the buffer is empty. ++ * When rptr == wptr + 1, the buffer is full. ++ * It is always rptr that advances to the position of wptr, rather than ++ * the opposite. So we can only use up to queue_size_dwords - 1 dwords. 
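The empty/full convention spelled out in this comment yields the standard ring-buffer free-space formula used right below, available = (rptr + size - 1 - wptr) % size, with one slot always kept open so that rptr == wptr unambiguously means empty. A standalone check of the arithmetic (the 16-dword size is made up, purely to exercise the formula):

    #include <stdio.h>

    /* Free dwords in a ring that keeps one slot open:
     * rptr == wptr     -> empty (size - 1 usable)
     * rptr == wptr + 1 -> full  (0 usable)          */
    static unsigned int ring_free(unsigned int rptr, unsigned int wptr,
                                  unsigned int size)
    {
            return (rptr + size - 1 - wptr) % size;
    }

    int main(void)
    {
            printf("%u\n", ring_free(0, 0, 16));    /* empty   -> 15 */
            printf("%u\n", ring_free(5, 4, 16));    /* full    -> 0  */
            printf("%u\n", ring_free(2, 9, 16));    /* wrapped -> 8  */
            return 0;
    }

The wrap-around guard that follows (rejecting packet_size_in_dwords >= rptr before rolling wptr back to position 0) keeps the relocated packet from running into the reader after the nop fill.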
++ */ + rptr = *kq->rptr_kernel; + wptr = *kq->wptr_kernel; + queue_address = (unsigned int *)kq->pq_kernel_addr; + queue_size_dwords = kq->queue->properties.queue_size / sizeof(uint32_t); + +- pr_debug("rptr: %d\n", rptr); +- pr_debug("wptr: %d\n", wptr); +- pr_debug("queue_address 0x%p\n", queue_address); ++ pr_debug("amdkfd: In func %s\n rptr: %d\n wptr: %d\n queue_address 0x%p\n", ++ __func__, rptr, wptr, queue_address); + +- available_size = (rptr - 1 - wptr + queue_size_dwords) % ++ available_size = (rptr + queue_size_dwords - 1 - wptr) % + queue_size_dwords; + +- if (packet_size_in_dwords >= queue_size_dwords || +- packet_size_in_dwords >= available_size) { ++ if (packet_size_in_dwords > available_size) { + /* + * make sure calling functions know + * acquire_packet_buffer() failed +@@ -236,6 +239,13 @@ static int acquire_packet_buffer(struct kernel_queue *kq, + } + + if (wptr + packet_size_in_dwords >= queue_size_dwords) { ++ /* make sure after rolling back to position 0, there is ++ * still enough space. */ ++ if (packet_size_in_dwords >= rptr) { ++ *buffer_ptr = NULL; ++ return -ENOMEM; ++ } ++ /* fill nops, roll back and start at position 0 */ + while (wptr > 0) { + queue_address[wptr] = kq->nop_packet; + wptr = (wptr + 1) % queue_size_dwords; +@@ -295,6 +305,8 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, + + switch (dev->device_info->asic_family) { + case CHIP_CARRIZO: ++ case CHIP_TONGA: ++ case CHIP_FIJI: + kernel_queue_init_vi(&kq->ops_asic_specific); + break; + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c +index 850a562..e9b886d 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c +@@ -29,10 +29,11 @@ + #define KFD_DRIVER_AUTHOR "AMD Inc. 
and others" + + #define KFD_DRIVER_DESC "Standalone HSA driver for AMD's GPUs" +-#define KFD_DRIVER_DATE "20150421" +-#define KFD_DRIVER_MAJOR 0 +-#define KFD_DRIVER_MINOR 7 +-#define KFD_DRIVER_PATCHLEVEL 2 ++#define KFD_DRIVER_DATE "20160129" ++#define KFD_DRIVER_MAJOR 1 ++#define KFD_DRIVER_MINOR 8 ++#define KFD_DRIVER_PATCHLEVEL 1 ++#define KFD_DRIVER_RC_LEVEL "" + + static const struct kgd2kfd_calls kgd2kfd = { + .exit = kgd2kfd_exit, +@@ -42,6 +43,10 @@ static const struct kgd2kfd_calls kgd2kfd = { + .interrupt = kgd2kfd_interrupt, + .suspend = kgd2kfd_suspend, + .resume = kgd2kfd_resume, ++ .evict_bo = kgd2kfd_evict_bo, ++ .restore = kgd2kfd_restore, ++ .quiesce_mm = kgd2kfd_quiesce_mm, ++ .resume_mm = kgd2kfd_resume_mm, + }; + + int sched_policy = KFD_SCHED_POLICY_HWS; +@@ -49,6 +54,15 @@ module_param(sched_policy, int, 0444); + MODULE_PARM_DESC(sched_policy, + "Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)"); + ++int hws_max_conc_proc = 0; ++module_param(hws_max_conc_proc, int, 0444); ++MODULE_PARM_DESC(hws_max_conc_proc, ++ "Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency (Default), #VMIDs for KFD = Maximum)"); ++ ++int cwsr_enable = 1; ++module_param(cwsr_enable, int, 0444); ++MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))"); ++ + int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT; + module_param(max_num_of_queues_per_device, int, 0444); + MODULE_PARM_DESC(max_num_of_queues_per_device, +@@ -61,6 +75,11 @@ MODULE_PARM_DESC(send_sigterm, + + static int amdkfd_init_completed; + ++int debug_largebar = 0; ++module_param(debug_largebar, int, 0444); ++MODULE_PARM_DESC(debug_largebar, ++ "Debug large-bar flag used to simulate large-bar capability on non-large bar machine (0 = disable, 1 = enable)"); ++ + int kgd2kfd_init(unsigned interface_version, const struct kgd2kfd_calls **g2f) + { + if (!amdkfd_init_completed) +@@ -149,4 +168,5 @@ MODULE_DESCRIPTION(KFD_DRIVER_DESC); + MODULE_LICENSE("GPL and additional rights"); + MODULE_VERSION(__stringify(KFD_DRIVER_MAJOR) "." + __stringify(KFD_DRIVER_MINOR) "." 
+- __stringify(KFD_DRIVER_PATCHLEVEL)); ++ __stringify(KFD_DRIVER_PATCHLEVEL) ++ KFD_DRIVER_RC_LEVEL); +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +index b1ef136..ef1dc9b 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +@@ -31,6 +31,9 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, + return mqd_manager_init_cik(type, dev); + case CHIP_CARRIZO: + return mqd_manager_init_vi(type, dev); ++ case CHIP_TONGA: ++ case CHIP_FIJI: ++ return mqd_manager_init_vi_tonga(type, dev); + } + + return NULL; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h +index 213a71e..eb60192 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h +@@ -67,7 +67,8 @@ struct mqd_manager { + + int (*load_mqd)(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, +- uint32_t __user *wptr); ++ uint32_t __user *wptr, ++ uint32_t page_table_base); + + int (*update_mqd)(struct mqd_manager *mm, void *mqd, + struct queue_properties *q); +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +index 6acc431..62dbdca 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +@@ -31,11 +31,71 @@ + #include "cik_structs.h" + #include "oss/oss_2_4_sh_mask.h" + ++#define AQL_ENABLE 1 ++ + static inline struct cik_mqd *get_mqd(void *mqd) + { + return (struct cik_mqd *)mqd; + } + ++static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) ++{ ++ return (struct cik_sdma_rlc_registers *)mqd; ++} ++ ++static void update_cu_mask(struct mqd_manager *mm, void *mqd, ++ struct queue_properties *q) ++{ ++ struct cik_mqd *m; ++ struct kfd_cu_info cu_info; ++ uint32_t mgmt_se_mask; ++ uint32_t cu_sh_mask, cu_sh_shift; ++ uint32_t cu_mask; ++ int se, sh; ++ ++ if (q->cu_mask == 0) ++ return; ++ ++ m = get_mqd(mqd); ++ m->compute_static_thread_mgmt_se0 = 0; ++ m->compute_static_thread_mgmt_se1 = 0; ++ m->compute_static_thread_mgmt_se2 = 0; ++ m->compute_static_thread_mgmt_se3 = 0; ++ ++ mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); ++ cu_mask = q->cu_mask; ++ for (se = 0; se < cu_info.num_shader_engines && cu_mask; se++) { ++ mgmt_se_mask = 0; ++ for (sh = 0; sh < 2 && cu_mask; sh++) { ++ cu_sh_shift = hweight32(cu_info.cu_bitmap[se][sh]); ++ cu_sh_mask = (1 << cu_sh_shift) - 1; ++ mgmt_se_mask |= (cu_mask & cu_sh_mask) << (sh * 16); ++ cu_mask >>= cu_sh_shift; ++ } ++ switch (se) { ++ case 0: ++ m->compute_static_thread_mgmt_se0 = mgmt_se_mask; ++ break; ++ case 1: ++ m->compute_static_thread_mgmt_se1 = mgmt_se_mask; ++ break; ++ case 2: ++ m->compute_static_thread_mgmt_se2 = mgmt_se_mask; ++ break; ++ case 3: ++ m->compute_static_thread_mgmt_se3 = mgmt_se_mask; ++ break; ++ default: ++ break; ++ } ++ } ++ pr_debug("kfd: update cu mask to %#x %#x %#x %#x\n", ++ m->compute_static_thread_mgmt_se0, ++ m->compute_static_thread_mgmt_se1, ++ m->compute_static_thread_mgmt_se2, ++ m->compute_static_thread_mgmt_se3); ++} ++ + static int init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +@@ -152,15 +212,16 @@ static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, + } + + static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, +- uint32_t queue_id, 
uint32_t __user *wptr) ++ uint32_t queue_id, uint32_t __user *wptr, ++ uint32_t page_table_base) + { + return mm->dev->kfd2kgd->hqd_load +- (mm->dev->kgd, mqd, pipe_id, queue_id, wptr); ++ (mm->dev->kgd, mqd, pipe_id, queue_id, wptr, page_table_base); + } + + static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, +- uint32_t __user *wptr) ++ uint32_t __user *wptr, uint32_t page_table_base) + { + return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd); + } +@@ -197,11 +258,14 @@ static int update_mqd(struct mqd_manager *mm, void *mqd, + m->cp_hqd_pq_control |= NO_UPDATE_RPTR; + } + ++ update_cu_mask(mm, mqd, q); ++ + m->cp_hqd_active = 0; + q->is_active = false; + if (q->queue_size > 0 && + q->queue_address != 0 && +- q->queue_percent > 0) { ++ q->queue_percent > 0 && ++ !q->is_evicted) { + m->cp_hqd_active = 1; + q->is_active = true; + } +@@ -217,8 +281,8 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, + BUG_ON(!mm || !mqd || !q); + + m = get_sdma_mqd(mqd); +- m->sdma_rlc_rb_cntl = ffs(q->queue_size / sizeof(unsigned int)) << +- SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | ++ m->sdma_rlc_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) ++ << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | + q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | + 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | + 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; +@@ -239,7 +303,8 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, + q->is_active = false; + if (q->queue_size > 0 && + q->queue_address != 0 && +- q->queue_percent > 0) { ++ q->queue_percent > 0 && ++ !q->is_evicted) { + m->sdma_rlc_rb_cntl |= + 1 << SDMA0_RLC0_RB_CNTL__RB_ENABLE__SHIFT; + +@@ -388,7 +453,8 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, + q->is_active = false; + if (q->queue_size > 0 && + q->queue_address != 0 && +- q->queue_percent > 0) { ++ q->queue_percent > 0 && ++ !q->is_evicted) { + m->cp_hqd_active = 1; + q->is_active = true; + } +@@ -396,16 +462,6 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, + return 0; + } + +-struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) +-{ +- struct cik_sdma_rlc_registers *m; +- +- BUG_ON(!mqd); +- +- m = (struct cik_sdma_rlc_registers *)mqd; +- +- return m; +-} + + struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +index a9b9882..4260c2f 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +@@ -28,9 +28,9 @@ + #include "kfd_priv.h" + #include "kfd_mqd_manager.h" + #include "vi_structs.h" +-#include "gca/gfx_8_0_sh_mask.h" +-#include "gca/gfx_8_0_enum.h" +- ++#include "asic_reg/gca/gfx_8_0_sh_mask.h" ++#include "asic_reg/gca/gfx_8_0_enum.h" ++#include "oss/oss_3_0_sh_mask.h" + #define CP_MQD_CONTROL__PRIV_STATE__SHIFT 0x8 + + static inline struct vi_mqd *get_mqd(void *mqd) +@@ -38,6 +38,64 @@ static inline struct vi_mqd *get_mqd(void *mqd) + return (struct vi_mqd *)mqd; + } + ++static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) ++{ ++ return (struct vi_sdma_mqd *)mqd; ++} ++ ++static void update_cu_mask(struct mqd_manager *mm, void *mqd, ++ struct queue_properties *q) ++{ ++ struct vi_mqd *m; ++ struct kfd_cu_info cu_info; ++ uint32_t mgmt_se_mask; ++ uint32_t cu_sh_mask, cu_sh_shift; ++ uint32_t cu_mask; ++ int se, sh; ++ ++ if (q->cu_mask == 0) ++ return; ++ ++ m = 
get_mqd(mqd); ++ m->compute_static_thread_mgmt_se0 = 0; ++ m->compute_static_thread_mgmt_se1 = 0; ++ m->compute_static_thread_mgmt_se2 = 0; ++ m->compute_static_thread_mgmt_se3 = 0; ++ ++ mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); ++ cu_mask = q->cu_mask; ++ for (se = 0; se < cu_info.num_shader_engines && cu_mask; se++) { ++ mgmt_se_mask = 0; ++ for (sh = 0; sh < 2 && cu_mask; sh++) { ++ cu_sh_shift = hweight32(cu_info.cu_bitmap[se][sh]); ++ cu_sh_mask = (1 << cu_sh_shift) - 1; ++ mgmt_se_mask |= (cu_mask & cu_sh_mask) << (sh * 16); ++ cu_mask >>= cu_sh_shift; ++ } ++ switch (se) { ++ case 0: ++ m->compute_static_thread_mgmt_se0 = mgmt_se_mask; ++ break; ++ case 1: ++ m->compute_static_thread_mgmt_se1 = mgmt_se_mask; ++ break; ++ case 2: ++ m->compute_static_thread_mgmt_se2 = mgmt_se_mask; ++ break; ++ case 3: ++ m->compute_static_thread_mgmt_se3 = mgmt_se_mask; ++ break; ++ default: ++ break; ++ } ++ } ++ pr_debug("kfd: update cu mask to %#x %#x %#x %#x\n", ++ m->compute_static_thread_mgmt_se0, ++ m->compute_static_thread_mgmt_se1, ++ m->compute_static_thread_mgmt_se2, ++ m->compute_static_thread_mgmt_se3); ++} ++ + static int init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +@@ -84,6 +142,25 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, + if (q->format == KFD_QUEUE_FORMAT_AQL) + m->cp_hqd_iq_rptr = 1; + ++ if (q->tba_addr) { ++ m->cp_hqd_persistent_state |= ++ (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); ++ m->compute_pgm_rsrc2 |= ++ (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); ++ m->cp_hqd_ctx_save_base_addr_lo = ++ lower_32_bits(q->ctx_save_restore_area_address); ++ m->cp_hqd_ctx_save_base_addr_hi = ++ upper_32_bits(q->ctx_save_restore_area_address); ++ m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; ++ m->cp_hqd_cntl_stack_size = q->ctl_stack_size; ++ m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; ++ m->cp_hqd_wg_state_offset = q->ctl_stack_size; ++ m->compute_tba_lo = lower_32_bits(q->tba_addr >> 8); ++ m->compute_tba_hi = upper_32_bits(q->tba_addr >> 8); ++ m->compute_tma_lo = lower_32_bits(q->tma_addr >> 8); ++ m->compute_tma_hi = upper_32_bits(q->tma_addr >> 8); ++ } ++ + *mqd = m; + if (gart_addr != NULL) + *gart_addr = addr; +@@ -94,10 +171,10 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, + + static int load_mqd(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, +- uint32_t __user *wptr) ++ uint32_t __user *wptr, uint32_t page_table_base) + { + return mm->dev->kfd2kgd->hqd_load +- (mm->dev->kgd, mqd, pipe_id, queue_id, wptr); ++ (mm->dev->kgd, mqd, pipe_id, queue_id, wptr, page_table_base); + } + + static int __update_mqd(struct mqd_manager *mm, void *mqd, +@@ -155,12 +232,19 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | + 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT; + } ++ if (q->tba_addr) ++ m->cp_hqd_ctx_save_control = ++ atc_bit << CP_HQD_CTX_SAVE_CONTROL__ATC__SHIFT | ++ mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT; ++ ++ update_cu_mask(mm, mqd, q); + + m->cp_hqd_active = 0; + q->is_active = false; + if (q->queue_size > 0 && + q->queue_address != 0 && +- q->queue_percent > 0) { ++ q->queue_percent > 0 && ++ !q->is_evicted) { + m->cp_hqd_active = 1; + q->is_active = true; + } +@@ -175,6 +259,12 @@ static int update_mqd(struct mqd_manager *mm, void *mqd, + return __update_mqd(mm, mqd, q, MTYPE_CC, 1); + } + ++static int 
update_mqd_tonga(struct mqd_manager *mm, void *mqd, ++ struct queue_properties *q) ++{ ++ return __update_mqd(mm, mqd, q, MTYPE_UC, 0); ++} ++ + static int destroy_mqd(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, +@@ -233,6 +323,111 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, + return retval; + } + ++static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, ++ struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, ++ struct queue_properties *q) ++{ ++ int retval; ++ struct vi_sdma_mqd *m; ++ ++ ++ BUG_ON(!mm || !mqd || !mqd_mem_obj); ++ ++ retval = kfd_gtt_sa_allocate(mm->dev, ++ sizeof(struct vi_sdma_mqd), ++ mqd_mem_obj); ++ ++ if (retval != 0) ++ return -ENOMEM; ++ ++ m = (struct vi_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; ++ ++ memset(m, 0, sizeof(struct vi_sdma_mqd)); ++ ++ *mqd = m; ++ if (gart_addr != NULL) ++ *gart_addr = (*mqd_mem_obj)->gpu_addr; ++ ++ retval = mm->update_mqd(mm, m, q); ++ ++ return retval; ++} ++ ++static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, ++ struct kfd_mem_obj *mqd_mem_obj) ++{ ++ BUG_ON(!mm || !mqd); ++ kfd_gtt_sa_free(mm->dev, mqd_mem_obj); ++} ++ ++static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, ++ uint32_t pipe_id, uint32_t queue_id, ++ uint32_t __user *wptr, uint32_t page_table_base) ++{ ++ return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd); ++} ++ ++static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, ++ struct queue_properties *q) ++{ ++ struct vi_sdma_mqd *m; ++ BUG_ON(!mm || !mqd || !q); ++ ++ m = get_sdma_mqd(mqd); ++ m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) ++ << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | ++ q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | ++ 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | ++ 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; ++ ++ m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); ++ m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); ++ m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); ++ m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); ++ m->sdmax_rlcx_doorbell = q->doorbell_off << ++ SDMA0_RLC0_DOORBELL__OFFSET__SHIFT | ++ 1 << SDMA0_RLC0_DOORBELL__ENABLE__SHIFT; ++ ++ m->sdmax_rlcx_virtual_addr = q->sdma_vm_addr; ++ ++ m->sdma_engine_id = q->sdma_engine_id; ++ m->sdma_queue_id = q->sdma_queue_id; ++ ++ q->is_active = false; ++ if (q->queue_size > 0 && ++ q->queue_address != 0 && ++ q->queue_percent > 0 && ++ !q->is_evicted) { ++ m->sdmax_rlcx_rb_cntl |= ++ 1 << SDMA0_RLC0_RB_CNTL__RB_ENABLE__SHIFT; ++ ++ q->is_active = true; ++ } ++ ++ return 0; ++} ++ ++/* ++ * * preempt type here is ignored because there is only one way ++ * * to preempt sdma queue ++ */ ++static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, ++ enum kfd_preempt_type type, ++ unsigned int timeout, uint32_t pipe_id, ++ uint32_t queue_id) ++{ ++ return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); ++} ++ ++static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, ++ uint64_t queue_address, uint32_t pipe_id, ++ uint32_t queue_id) ++{ ++ return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); ++} ++ ++ ++ + struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) + { +@@ -268,6 +463,12 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, + mqd->is_occupied = is_occupied; + break; + case KFD_MQD_TYPE_SDMA: ++ mqd->init_mqd = 
init_mqd_sdma; ++ mqd->uninit_mqd = uninit_mqd_sdma; ++ mqd->load_mqd = load_mqd_sdma; ++ mqd->update_mqd = update_mqd_sdma; ++ mqd->destroy_mqd = destroy_mqd_sdma; ++ mqd->is_occupied = is_occupied_sdma; + break; + default: + kfree(mqd); +@@ -276,3 +477,17 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, + + return mqd; + } ++ ++struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, ++ struct kfd_dev *dev) ++{ ++ struct mqd_manager *mqd; ++ ++ mqd = mqd_manager_init_vi(type, dev); ++ if (!mqd) ++ return NULL; ++ if ((type == KFD_MQD_TYPE_CP) || (type == KFD_MQD_TYPE_COMPUTE)) ++ mqd->update_mqd = update_mqd_tonga; ++ return mqd; ++} ++ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +index 7e92921..55f7098 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +@@ -57,25 +57,37 @@ static void pm_calc_rlib_size(struct packet_manager *pm, + { + unsigned int process_count, queue_count; + unsigned int map_queue_size; ++ unsigned int max_proc_per_quantum = 1; + +- BUG_ON(!pm || !rlib_size || !over_subscription); ++ struct kfd_dev *dev = pm->dqm->dev; ++ ++ BUG_ON(!pm || !rlib_size || !over_subscription || !dev); + + process_count = pm->dqm->processes_count; + queue_count = pm->dqm->queue_count; + +- /* check if there is over subscription*/ ++ /* check if there is over subscription ++ * Note: the arbitration between the number of VMIDs and ++ * hws_max_conc_proc has been done in ++ * kgd2kfd_device_init(). ++ */ ++ + *over_subscription = false; +- if ((process_count > 1) || queue_count > get_queues_num(pm->dqm)) { ++ ++ if (dev->max_proc_per_quantum > 1) ++ max_proc_per_quantum = dev->max_proc_per_quantum; ++ ++ if ((process_count > max_proc_per_quantum) || ++ queue_count > get_queues_num(pm->dqm)) { + *over_subscription = true; + pr_debug("kfd: over subscribed runlist\n"); + } + +- map_queue_size = +- (pm->dqm->dev->device_info->asic_family == CHIP_CARRIZO) ? ++ map_queue_size = KFD_IS_VI(pm->dqm->dev->device_info->asic_family) ? 
+ sizeof(struct pm4_mes_map_queues) : + sizeof(struct pm4_map_queues); + /* calculate run list ib allocation size */ +- *rlib_size = process_count * sizeof(struct pm4_map_process) + ++ *rlib_size = process_count * pm->pmf->get_map_process_packet_size() + + queue_count * map_queue_size; + + /* +@@ -102,11 +114,14 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, + + pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription); + ++ mutex_lock(&pm->lock); ++ + retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size, + &pm->ib_buffer_obj); + + if (retval != 0) { + pr_err("kfd: failed to allocate runlist IB\n"); ++ mutex_unlock(&pm->lock); + return retval; + } + +@@ -115,6 +130,8 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, + + memset(*rl_buffer, 0, *rl_buffer_size); + pm->allocated = true; ++ ++ mutex_unlock(&pm->lock); + return retval; + } + +@@ -122,9 +139,24 @@ static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer, + uint64_t ib, size_t ib_size_in_dwords, bool chain) + { + struct pm4_runlist *packet; ++ int concurrent_proc_cnt = 0; ++ struct kfd_dev *kfd = pm->dqm->dev; + + BUG_ON(!pm || !buffer || !ib); + ++ /* Determine the number of processes to map together to HW: ++ * it can not exceed the number of VMIDs available to the ++ * scheduler, and it is determined by the smaller of the number ++ * of processes in the runlist and kfd module parameter ++ * hws_max_conc_proc. ++ * Note: the arbitration between the number of VMIDs and ++ * hws_max_conc_proc has been done in ++ * kgd2kfd_device_init(). ++ */ ++ concurrent_proc_cnt = min(pm->dqm->processes_count, ++ kfd->max_proc_per_quantum); ++ ++ + packet = (struct pm4_runlist *)buffer; + + memset(buffer, 0, sizeof(struct pm4_runlist)); +@@ -135,6 +167,7 @@ static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer, + packet->bitfields4.chain = chain ? 1 : 0; + packet->bitfields4.offload_polling = 0; + packet->bitfields4.valid = 1; ++ packet->bitfields4.process_cnt = concurrent_proc_cnt; + packet->ordinal2 = lower_32_bits(ib); + packet->bitfields3.ib_base_hi = upper_32_bits(ib); + +@@ -181,6 +214,90 @@ static int pm_create_map_process(struct packet_manager *pm, uint32_t *buffer, + return 0; + } + ++static int pm_create_map_process_scratch_kv(struct packet_manager *pm, ++ uint32_t *buffer, struct qcm_process_device *qpd) ++{ ++ struct pm4_map_process_scratch_kv *packet; ++ struct queue *cur; ++ uint32_t num_queues; ++ ++ BUG_ON(!pm || !buffer || !qpd); ++ ++ packet = (struct pm4_map_process_scratch_kv *)buffer; ++ ++ pr_debug("kfd: In func %s\n", __func__); ++ ++ memset(buffer, 0, sizeof(struct pm4_map_process_scratch_kv)); ++ ++ packet->header.u32all = build_pm4_header(IT_MAP_PROCESS, ++ sizeof(struct pm4_map_process_scratch_kv)); ++ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; ++ packet->bitfields2.process_quantum = 1; ++ packet->bitfields2.pasid = qpd->pqm->process->pasid; ++ packet->bitfields3.page_table_base = qpd->page_table_base; ++ packet->bitfields14.gds_size = qpd->gds_size; ++ packet->bitfields14.num_gws = qpd->num_gws; ++ packet->bitfields14.num_oac = qpd->num_oac; ++ num_queues = 0; ++ list_for_each_entry(cur, &qpd->queues_list, list) ++ num_queues++; ++ packet->bitfields14.num_queues = (qpd->is_debug) ? 
0 : num_queues; ++ ++ packet->sh_mem_config = qpd->sh_mem_config; ++ packet->sh_mem_bases = qpd->sh_mem_bases; ++ packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; ++ packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; ++ ++ packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; ++ ++ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); ++ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); ++ ++ return 0; ++} ++ ++static int pm_create_map_process_scratch(struct packet_manager *pm, ++ uint32_t *buffer, struct qcm_process_device *qpd) ++{ ++ struct pm4_map_process_scratch *packet; ++ struct queue *cur; ++ uint32_t num_queues; ++ ++ BUG_ON(!pm || !buffer || !qpd); ++ ++ packet = (struct pm4_map_process_scratch *)buffer; ++ ++ pr_debug("kfd: In func %s\n", __func__); ++ ++ memset(buffer, 0, sizeof(struct pm4_map_process_scratch)); ++ ++ packet->header.u32all = build_pm4_header(IT_MAP_PROCESS, ++ sizeof(struct pm4_map_process_scratch)); ++ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; ++ packet->bitfields2.process_quantum = 1; ++ packet->bitfields2.pasid = qpd->pqm->process->pasid; ++ packet->bitfields3.page_table_base = qpd->page_table_base; ++ packet->bitfields10.gds_size = qpd->gds_size; ++ packet->bitfields10.num_gws = qpd->num_gws; ++ packet->bitfields10.num_oac = qpd->num_oac; ++ num_queues = 0; ++ list_for_each_entry(cur, &qpd->queues_list, list) ++ num_queues++; ++ packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : num_queues; ++ ++ packet->sh_mem_config = qpd->sh_mem_config; ++ packet->sh_mem_bases = qpd->sh_mem_bases; ++ packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; ++ packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; ++ ++ packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; ++ ++ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); ++ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); ++ ++ return 0; ++} ++ + static int pm_create_map_queue_vi(struct packet_manager *pm, uint32_t *buffer, + struct queue *q, bool is_static) + { +@@ -218,7 +335,7 @@ static int pm_create_map_queue_vi(struct packet_manager *pm, uint32_t *buffer, + queue_type__mes_map_queues__debug_interface_queue_vi; + break; + case KFD_QUEUE_TYPE_SDMA: +- packet->bitfields2.engine_sel = ++ packet->bitfields2.engine_sel = q->properties.sdma_engine_id + + engine_sel__mes_map_queues__sdma0_vi; + use_static = false; /* no static queues under SDMA */ + break; +@@ -278,7 +395,7 @@ static int pm_create_map_queue(struct packet_manager *pm, uint32_t *buffer, + engine_sel__mes_map_queues__compute; + break; + case KFD_QUEUE_TYPE_SDMA: +- packet->bitfields2.engine_sel = ++ packet->bitfields2.engine_sel = q->properties.sdma_engine_id + + engine_sel__mes_map_queues__sdma0; + use_static = false; /* no static queues under SDMA */ + break; +@@ -347,12 +464,12 @@ static int pm_create_runlist_ib(struct packet_manager *pm, + return -ENOMEM; + } + +- retval = pm_create_map_process(pm, &rl_buffer[rl_wptr], qpd); ++ retval = pm->pmf->map_process(pm, &rl_buffer[rl_wptr], qpd); + if (retval != 0) + return retval; + + proccesses_mapped++; +- inc_wptr(&rl_wptr, sizeof(struct pm4_map_process), ++ inc_wptr(&rl_wptr, pm->pmf->get_map_process_packet_size(), + alloc_size_bytes); + + list_for_each_entry(kq, &qpd->priv_queue_list, list) { +@@ -362,8 +479,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, + pr_debug("kfd: static_queue, mapping kernel q %d, is debug status %d\n", + kq->queue->queue, qpd->is_debug); + +- if (pm->dqm->dev->device_info->asic_family == 
+- CHIP_CARRIZO) ++ if (KFD_IS_VI(pm->dqm->dev->device_info->asic_family)) + retval = pm_create_map_queue_vi(pm, + &rl_buffer[rl_wptr], + kq->queue, +@@ -388,8 +504,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, + pr_debug("kfd: static_queue, mapping user queue %d, is debug status %d\n", + q->queue, qpd->is_debug); + +- if (pm->dqm->dev->device_info->asic_family == +- CHIP_CARRIZO) ++ if (KFD_IS_VI(pm->dqm->dev->device_info->asic_family)) + retval = pm_create_map_queue_vi(pm, + &rl_buffer[rl_wptr], + q, +@@ -422,7 +537,23 @@ static int pm_create_runlist_ib(struct packet_manager *pm, + return 0; + } + +-int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) ++static int get_map_process_packet_size(void) ++{ ++ return sizeof(struct pm4_map_process); ++} ++ ++static int get_map_process_packet_size_scratch_kv(void) ++{ ++ return sizeof(struct pm4_map_process_scratch_kv); ++} ++ ++static int get_map_process_packet_size_scratch(void) ++{ ++ return sizeof(struct pm4_map_process_scratch); ++} ++ ++int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, ++ uint16_t fw_ver) + { + BUG_ON(!dqm); + +@@ -433,8 +564,37 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) + mutex_destroy(&pm->lock); + return -ENOMEM; + } ++ pm->pmf = kzalloc(sizeof(struct packet_manager_firmware), GFP_KERNEL); + pm->allocated = false; + ++ switch (pm->dqm->dev->device_info->asic_family) { ++ case CHIP_KAVERI: ++ if (fw_ver >= KFD_SCRATCH_KV_FW_VER) { ++ pm->pmf->map_process = pm_create_map_process_scratch_kv; ++ pm->pmf->get_map_process_packet_size = ++ get_map_process_packet_size_scratch_kv; ++ } else { ++ pm->pmf->map_process = pm_create_map_process; ++ pm->pmf->get_map_process_packet_size = ++ get_map_process_packet_size; ++ } ++ break; ++ case CHIP_CARRIZO: ++ case CHIP_TONGA: ++ case CHIP_FIJI: ++ if (fw_ver >= KFD_SCRATCH_CZ_FW_VER) { ++ pm->pmf->map_process = pm_create_map_process_scratch; ++ pm->pmf->get_map_process_packet_size = ++ get_map_process_packet_size_scratch; ++ } else { ++ pm->pmf->map_process = pm_create_map_process; ++ pm->pmf->get_map_process_packet_size = ++ get_map_process_packet_size; ++ } ++ break; ++ ++ } ++ + return 0; + } + +@@ -444,6 +604,7 @@ void pm_uninit(struct packet_manager *pm) + + mutex_destroy(&pm->lock); + kernel_queue_uninit(pm->priv_queue); ++ kfree(pm->pmf); + } + + int pm_send_set_resources(struct packet_manager *pm, +@@ -576,7 +737,7 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, + } + + int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, +- enum kfd_preempt_type_filter mode, ++ enum kfd_unmap_queues_filter filter, + uint32_t filter_param, bool reset, + unsigned int sdma_engine) + { +@@ -596,8 +757,8 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, + + packet = (struct pm4_unmap_queues *)buffer; + memset(buffer, 0, sizeof(struct pm4_unmap_queues)); +- pr_debug("kfd: static_queue: unmapping queues: mode is %d , reset is %d , type is %d\n", +- mode, reset, type); ++ pr_debug("kfd: static_queue: unmapping queues: filter is %d , reset is %d , type is %d\n", ++ filter, reset, type); + packet->header.u32all = build_pm4_header(IT_UNMAP_QUEUES, + sizeof(struct pm4_unmap_queues)); + switch (type) { +@@ -622,26 +783,26 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, + packet->bitfields2.action = + action__mes_unmap_queues__preempt_queues; + +- switch (mode) { +- case 
KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE: ++ switch (filter) { ++ case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_specified_queues; + packet->bitfields2.num_queues = 1; + packet->bitfields3b.doorbell_offset0 = filter_param; + break; +- case KFD_PREEMPT_TYPE_FILTER_BY_PASID: ++ case KFD_UNMAP_QUEUES_FILTER_BY_PASID: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; + packet->bitfields3a.pasid = filter_param; + break; +- case KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES: ++ case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_all_active_queues; + break; +- case KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES: ++ case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: + /* in this case, we do not preempt static queues */ +- packet->bitfields2.queue_sel = +- queue_sel__mes_unmap_queues__perform_request_on_dynamic_queues_only; ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_unmap_queues__perform_request_on_dynamic_queues_only; + break; + default: + BUG(); +@@ -669,3 +830,4 @@ void pm_release_ib(struct packet_manager *pm) + } + mutex_unlock(&pm->lock); + } ++ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h +index 5b393f3..e7570cc 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h +@@ -127,7 +127,8 @@ struct pm4_runlist { + uint32_t offload_polling:1; + uint32_t reserved3:1; + uint32_t valid:1; +- uint32_t reserved4:8; ++ uint32_t process_cnt:4; ++ uint32_t reserved4:4; + } bitfields4; + uint32_t ordinal4; + }; +@@ -186,6 +187,123 @@ struct pm4_map_process { + }; + #endif + ++/*--------------------MES_MAP_PROCESS_SCRATCH-------------------- */ ++ ++#ifndef PM4_MES_MAP_PROCESS_SCRATCH_DEFINED ++#define PM4_MES_MAP_PROCESS_SCRATCH_DEFINED ++ ++struct pm4_map_process_scratch { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; ++ }; ++ ++ union { ++ struct { ++ uint32_t pasid:16; ++ uint32_t reserved1:8; ++ uint32_t diq_enable:1; ++ uint32_t process_quantum:7; ++ } bitfields2; ++ uint32_t ordinal2; ++ }; ++ ++ union { ++ struct { ++ uint32_t page_table_base:28; ++ uint32_t reserved3:4; ++ } bitfields3; ++ uint32_t ordinal3; ++ }; ++ ++ uint32_t reserved; ++ ++ uint32_t sh_mem_bases; ++ uint32_t sh_mem_config; ++ uint32_t sh_mem_ape1_base; ++ uint32_t sh_mem_ape1_limit; ++ ++ uint32_t sh_hidden_private_base_vmid; ++ ++ uint32_t reserved2; ++ uint32_t reserved3; ++ ++ uint32_t gds_addr_lo; ++ uint32_t gds_addr_hi; ++ ++ union { ++ struct { ++ uint32_t num_gws:6; ++ uint32_t reserved4:2; ++ uint32_t num_oac:4; ++ uint32_t reserved5:4; ++ uint32_t gds_size:6; ++ uint32_t num_queues:10; ++ } bitfields10; ++ uint32_t ordinal10; ++ }; ++ ++ uint32_t completion_signal_lo; ++ uint32_t completion_signal_hi; ++ ++}; ++#endif ++ ++#ifndef PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH ++#define PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH ++ ++struct pm4_map_process_scratch_kv { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; ++ }; ++ ++ union { ++ struct { ++ uint32_t pasid:16; ++ uint32_t reserved1:8; ++ uint32_t diq_enable:1; ++ uint32_t process_quantum:7; ++ } bitfields2; ++ uint32_t ordinal2; ++ }; ++ ++ union { ++ struct { ++ uint32_t page_table_base:28; ++ uint32_t reserved2:4; ++ } bitfields3; ++ uint32_t ordinal3; ++ }; ++ ++ uint32_t reserved3; ++ uint32_t 
sh_mem_bases; ++ uint32_t sh_mem_config; ++ uint32_t sh_mem_ape1_base; ++ uint32_t sh_mem_ape1_limit; ++ uint32_t sh_hidden_private_base_vmid; ++ uint32_t reserved4; ++ uint32_t reserved5; ++ uint32_t gds_addr_lo; ++ uint32_t gds_addr_hi; ++ ++ union { ++ struct { ++ uint32_t num_gws:6; ++ uint32_t reserved6:2; ++ uint32_t num_oac:4; ++ uint32_t reserved7:4; ++ uint32_t gds_size:6; ++ uint32_t num_queues:10; ++ } bitfields14; ++ uint32_t ordinal14; ++ }; ++ ++ uint32_t completion_signal_lo32; ++uint32_t completion_signal_hi32; ++}; ++#endif ++ + /*--------------------MES_MAP_QUEUES--------------------*/ + + #ifndef PM4_MES_MAP_QUEUES_DEFINED +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +index 4750cab..c654471 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +@@ -30,13 +30,45 @@ + #include <linux/atomic.h> + #include <linux/workqueue.h> + #include <linux/spinlock.h> ++#include <linux/idr.h> + #include <linux/kfd_ioctl.h> ++#include <linux/pid.h> ++#include <linux/interval_tree.h> + #include <kgd_kfd_interface.h> + ++#include <drm/amd_rdma.h> ++ + #define KFD_SYSFS_FILE_MODE 0444 + +-#define KFD_MMAP_DOORBELL_MASK 0x8000000000000 +-#define KFD_MMAP_EVENTS_MASK 0x4000000000000 ++/* GPU ID hash width in bits */ ++#define KFD_GPU_ID_HASH_WIDTH 16 ++ ++/* Use upper bits of mmap offset to store KFD driver specific information. ++ * BITS[63:62] - Encode MMAP type ++ * BITS[61:46] - Encode gpu_id. To identify to which GPU the offset belongs to ++ * BITS[45:40] - Reserved. Not Used. ++ * BITS[39:0] - MMAP offset value. Used by TTM. ++ * ++ * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. Hence, these ++ * defines are w.r.t to PAGE_SIZE ++ */ ++#define KFD_MMAP_TYPE_SHIFT (62 - PAGE_SHIFT) ++#define KFD_MMAP_TYPE_MASK (0x3ULL << KFD_MMAP_TYPE_SHIFT) ++#define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT) ++#define KFD_MMAP_TYPE_EVENTS (0x2ULL << KFD_MMAP_TYPE_SHIFT) ++#define KFD_MMAP_TYPE_MAP_BO (0x1ULL << KFD_MMAP_TYPE_SHIFT) ++#define KFD_MMAP_TYPE_RESERVED_MEM (0x0ULL << KFD_MMAP_TYPE_SHIFT) ++ ++#define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT) ++#define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \ ++ << KFD_MMAP_GPU_ID_SHIFT) ++#define KFD_MMAP_GPU_ID(gpu_id) ((((uint64_t)gpu_id) << KFD_MMAP_GPU_ID_SHIFT)\ ++ & KFD_MMAP_GPU_ID_MASK) ++#define KFD_MMAP_GPU_ID_GET(offset) ((offset & KFD_MMAP_GPU_ID_MASK) \ ++ >> KFD_MMAP_GPU_ID_SHIFT) ++ ++#define KFD_MMAP_OFFSET_VALUE_MASK (0xFFFFFFFFFFULL >> PAGE_SHIFT) ++#define KFD_MMAP_OFFSET_VALUE_GET(offset) (offset & KFD_MMAP_OFFSET_VALUE_MASK) + + /* + * When working with cp scheduler we should assign the HIQ manually or via +@@ -48,8 +80,6 @@ + #define KFD_CIK_HIQ_PIPE 4 + #define KFD_CIK_HIQ_QUEUE 0 + +-/* GPU ID hash width in bits */ +-#define KFD_GPU_ID_HASH_WIDTH 16 + + /* Macro for allocating structures */ + #define kfd_alloc_struct(ptr_to_struct) \ +@@ -74,12 +104,26 @@ extern int max_num_of_queues_per_device; + /* Kernel module parameter to specify the scheduling policy */ + extern int sched_policy; + ++extern int cwsr_enable; ++ ++/* ++ * Kernel module parameter to specify the maximum process ++ * number per HW scheduler ++ */ ++extern int hws_max_conc_proc; ++ + /* + * Kernel module parameter to specify whether to send sigterm to HSA process on + * unhandled exception + */ + extern int send_sigterm; + ++/* ++ * This kernel module is used to simulate large bar machine on non-large bar ++ * enabled machines. 
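++ * It only fakes the capability reported to user mode; CPU access to
++ * memory that is not really host-accessible will still fail, so the
++ * flag is meant for driver debugging only.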
++ */ ++extern int debug_largebar; ++ + /** + * enum kfd_sched_policy + * +@@ -114,14 +158,17 @@ enum cache_policy { + + enum asic_family_type { + CHIP_KAVERI = 0, +- CHIP_CARRIZO ++ CHIP_CARRIZO, ++ CHIP_TONGA, ++ CHIP_FIJI + }; + ++#define KFD_IS_VI(chip) ((chip) >= CHIP_CARRIZO && (chip) <= CHIP_FIJI) ++#define KFD_IS_DGPU(chip) ((chip) >= CHIP_TONGA && (chip) <= CHIP_FIJI) ++ + struct kfd_event_interrupt_class { +- bool (*interrupt_isr)(struct kfd_dev *dev, +- const uint32_t *ih_ring_entry); +- void (*interrupt_wq)(struct kfd_dev *dev, +- const uint32_t *ih_ring_entry); ++ bool (*interrupt_isr)(struct kfd_dev *dev, const uint32_t *ih_ring_entry); ++ void (*interrupt_wq)(struct kfd_dev *dev, const uint32_t *ih_ring_entry); + }; + + struct kfd_device_info { +@@ -132,6 +179,7 @@ struct kfd_device_info { + size_t ih_ring_entry_size; + uint8_t num_of_watch_points; + uint16_t mqd_size_aligned; ++ bool is_need_iommu_device; + }; + + struct kfd_mem_obj { +@@ -141,6 +189,12 @@ struct kfd_mem_obj { + uint32_t *cpu_ptr; + }; + ++struct kfd_vmid_info { ++ uint32_t first_vmid_kfd; ++ uint32_t last_vmid_kfd; ++ uint32_t vmid_num_kfd; ++}; ++ + struct kfd_dev { + struct kgd_dev *kgd; + +@@ -165,11 +219,12 @@ struct kfd_dev { + */ + + struct kgd2kfd_shared_resources shared_resources; ++ struct kfd_vmid_info vm_info; + + const struct kfd2kgd_calls *kfd2kgd; + struct mutex doorbell_mutex; +- DECLARE_BITMAP(doorbell_available_index, +- KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); ++ unsigned long doorbell_available_index[DIV_ROUND_UP( ++ KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, BITS_PER_LONG)]; + + void *gtt_mem; + uint64_t gtt_start_gpu_addr; +@@ -179,6 +234,11 @@ struct kfd_dev { + unsigned int gtt_sa_chunk_size; + unsigned int gtt_sa_num_of_chunks; + ++ /* QCM Device instance */ ++ struct device_queue_manager *dqm; ++ ++ bool init_complete; ++ + /* Interrupts */ + void *interrupt_ring; + size_t interrupt_ring_size; +@@ -187,10 +247,6 @@ struct kfd_dev { + struct work_struct interrupt_work; + spinlock_t interrupt_lock; + +- /* QCM Device instance */ +- struct device_queue_manager *dqm; +- +- bool init_complete; + /* + * Interrupts of interest to KFD are copied + * from the HW ring into a SW ring. +@@ -198,7 +254,26 @@ struct kfd_dev { + bool interrupts_active; + + /* Debug manager */ +- struct kfd_dbgmgr *dbgmgr; ++ struct kfd_dbgmgr *dbgmgr; ++ ++ /* MEC firmware version*/ ++ uint16_t mec_fw_version; ++ ++ /* Maximum process number mapped to HW scheduler */ ++ unsigned int max_proc_per_quantum; ++ ++ /* cwsr */ ++ bool cwsr_enabled; ++ struct page *cwsr_pages; ++ uint32_t cwsr_size; ++ uint32_t tma_offset; /*Offset for TMA from the start of cwsr_mem*/ ++}; ++ ++struct kfd_bo { ++ void *mem; ++ struct interval_tree_node it; ++ struct kfd_dev *dev; ++ struct list_head cb_data_head; + }; + + /* KGD2KFD callbacks */ +@@ -221,22 +296,22 @@ void kfd_chardev_exit(void); + struct device *kfd_chardev(void); + + /** +- * enum kfd_preempt_type_filter ++ * enum kfd_unmap_queues_filter + * +- * @KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE: Preempts single queue. ++ * @KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: Preempts single queue. + * +- * @KFD_PRERMPT_TYPE_FILTER_ALL_QUEUES: Preempts all queues in the ++ * @KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: Preempts all queues in the + * running queues list. + * +- * @KFD_PRERMPT_TYPE_FILTER_BY_PASID: Preempts queues that belongs to ++ * @KFD_UNMAP_QUEUES_FILTER_BY_PASID: Preempts queues that belongs to + * specific process. 
+ * + */ +-enum kfd_preempt_type_filter { +- KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE, +- KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES, +- KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES, +- KFD_PREEMPT_TYPE_FILTER_BY_PASID ++enum kfd_unmap_queues_filter { ++ KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE, ++ KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, ++ KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, ++ KFD_UNMAP_QUEUES_FILTER_BY_PASID + }; + + enum kfd_preempt_type { +@@ -324,6 +399,7 @@ struct queue_properties { + uint32_t __iomem *doorbell_ptr; + uint32_t doorbell_off; + bool is_interop; ++ bool is_evicted; /* true -> queue is evicted */ + bool is_active; + /* Not relevant for user mode queues in cp scheduling */ + unsigned int vmid; +@@ -336,6 +412,11 @@ struct queue_properties { + uint32_t eop_ring_buffer_size; + uint64_t ctx_save_restore_area_address; + uint32_t ctx_save_restore_area_size; ++ uint32_t ctl_stack_size; ++ uint64_t tba_addr; ++ uint64_t tma_addr; ++ /* Relevant for CU */ ++ uint32_t cu_mask; + }; + + /** +@@ -424,6 +505,7 @@ struct qcm_process_device { + unsigned int queue_count; + unsigned int vmid; + bool is_debug; ++ unsigned evicted; /* eviction counter, 0=active */ + /* + * All the memory management data should be here too + */ +@@ -436,8 +518,22 @@ struct qcm_process_device { + uint32_t gds_size; + uint32_t num_gws; + uint32_t num_oac; ++ uint32_t sh_hidden_private_base; ++ ++ /*cwsr memory*/ ++ int cwsr_mem_handle; ++ uint64_t cwsr_base; ++ uint64_t tba_addr; ++ uint64_t tma_addr; ++ void *cwsr_kaddr; + }; + ++/*8 byte handle containing GPU ID in the most significant 4 bytes and ++ * idr_handle in the least significant 4 bytes*/ ++#define MAKE_HANDLE(gpu_id, idr_handle) (((uint64_t)(gpu_id) << 32) + idr_handle) ++#define GET_GPU_ID(handle) (handle >> 32) ++#define GET_IDR_HANDLE(handle) (handle & 0xFFFFFFFF) ++ + /* Data that is per-process-per device. */ + struct kfd_process_device { + /* +@@ -449,6 +545,8 @@ struct kfd_process_device { + /* The device that owns this data. */ + struct kfd_dev *dev; + ++ /* The process that owns this kfd_process_device. */ ++ struct kfd_process *process; + + /* per-process-per device QCM data structure */ + struct qcm_process_device qpd; +@@ -460,10 +558,23 @@ struct kfd_process_device { + uint64_t gpuvm_limit; + uint64_t scratch_base; + uint64_t scratch_limit; ++ uint64_t dgpu_base; ++ uint64_t dgpu_limit; ++ uint64_t mapped_size; ++ uint64_t last_eviction; ++ bool evicted; ++ ++ uint64_t sh_hidden_private_base_vmid; + + /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */ + bool bound; + ++ /* VM context for GPUVM allocations */ ++ void *vm; ++ ++ /* GPUVM allocations storage */ ++ struct idr alloc_idr; ++ + /* This flag tells if we should reset all + * wavefronts on process termination + */ +@@ -482,7 +593,7 @@ struct kfd_process { + + struct mm_struct *mm; + +- struct mutex mutex; ++ struct rw_semaphore lock; + + /* + * In any process, the thread that started main() is the lead +@@ -513,6 +624,8 @@ struct kfd_process { + /* Size is queue_array_size, up to MAX_PROCESS_QUEUES. */ + struct kfd_queue **queues; + ++ unsigned long allocated_queue_bitmap[DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, BITS_PER_LONG)]; ++ + /*Is the user space process 32 bit?*/ + bool is_32bit_user_mode; + +@@ -520,10 +633,12 @@ struct kfd_process { + struct mutex event_mutex; + /* All events in process hashed by ID, linked on kfd_event.events. */ + DECLARE_HASHTABLE(events, 4); +- struct list_head signal_event_pages; /* struct slot_page_header. 
+- event_pages */ ++ struct list_head signal_event_pages; /* struct slot_page_header.event_pages */ + u32 next_nonsignal_event_id; + size_t signal_event_count; ++ size_t debug_event_count; ++ ++ struct rb_root bo_interval_tree; + }; + + /** +@@ -546,9 +661,10 @@ struct amdkfd_ioctl_desc { + + void kfd_process_create_wq(void); + void kfd_process_destroy_wq(void); +-struct kfd_process *kfd_create_process(const struct task_struct *); ++struct kfd_process *kfd_create_process(struct file *filep); + struct kfd_process *kfd_get_process(const struct task_struct *); + struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); ++struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm); + + struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, + struct kfd_process *p); +@@ -558,6 +674,29 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, + struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, + struct kfd_process *p); + ++int kfd_reserved_mem_mmap(struct kfd_process *process, struct vm_area_struct *vma); ++ ++/* KFD process API for creating and translating handles */ ++int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, ++ void *mem, uint64_t start, ++ uint64_t length); ++void *kfd_process_device_translate_handle(struct kfd_process_device *p, ++ int handle); ++struct kfd_bo *kfd_process_device_find_bo(struct kfd_process_device *pdd, ++ int handle); ++void *kfd_process_find_bo_from_interval(struct kfd_process *p, ++ uint64_t start_addr, ++ uint64_t last_addr); ++void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, ++ int handle); ++ ++void run_rdma_free_callback(struct kfd_bo *buf_obj); ++struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid); ++ ++/* kfd dgpu memory */ ++int kfd_map_memory_to_gpu(struct kfd_dev *dev, void *mem, ++ struct kfd_process *p, struct kfd_process_device *pdd); ++ + /* Process device data iterator */ + struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p); + struct kfd_process_device *kfd_get_next_process_device_data(struct kfd_process *p, +@@ -600,7 +739,11 @@ int kfd_topology_add_device(struct kfd_dev *gpu); + int kfd_topology_remove_device(struct kfd_dev *gpu); + struct kfd_dev *kfd_device_by_id(uint32_t gpu_id); + struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev); +-struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx); ++struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd); ++uint32_t kfd_get_gpu_id(struct kfd_dev *dev); ++int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev); ++int kfd_numa_node_to_apic_id(int numa_node_id); ++int kfd_get_proximity_domain(const struct pci_bus *bus); + + /* Interrupts */ + int kfd_interrupt_init(struct kfd_dev *dev); +@@ -615,9 +758,12 @@ int kgd2kfd_resume(struct kfd_dev *kfd); + + /* amdkfd Apertures */ + int kfd_init_apertures(struct kfd_process *process); ++int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd, ++ uint64_t base, uint64_t limit); + + /* Queue Context Management */ +-struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd); ++inline uint32_t lower_32(uint64_t x); ++inline uint32_t upper_32(uint64_t x); + + int init_queue(struct queue **q, const struct queue_properties *properties); + void uninit_queue(struct queue *q); +@@ -630,11 +776,15 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); + struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE 
type, + struct kfd_dev *dev); ++struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, ++ struct kfd_dev *dev); + struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); + void device_queue_manager_uninit(struct device_queue_manager *dqm); + struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, + enum kfd_queue_type type); + void kernel_queue_uninit(struct kernel_queue *kq); ++int kfd_process_vm_fault(struct device_queue_manager *dqm, ++ unsigned int pasid); + + /* Process Queue Manager */ + struct process_queue_node { +@@ -649,18 +799,16 @@ int pqm_create_queue(struct process_queue_manager *pqm, + struct kfd_dev *dev, + struct file *f, + struct queue_properties *properties, +- unsigned int flags, +- enum kfd_queue_type type, + unsigned int *qid); + int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid); + int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, + struct queue_properties *p); ++int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, ++ struct queue_properties *p); + struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm, + unsigned int qid); +- +-int amdkfd_fence_wait_timeout(unsigned int *fence_addr, +- unsigned int fence_value, +- unsigned long timeout); ++int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm); ++int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm); + + /* Packet Manager */ + +@@ -668,7 +816,9 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr, + + #define KFD_FENCE_COMPLETED (100) + #define KFD_FENCE_INIT (10) +-#define KFD_UNMAP_LATENCY (150) ++#define KFD_UNMAP_LATENCY (40) ++ ++struct packet_manager_firmware; + + struct packet_manager { + struct device_queue_manager *dqm; +@@ -676,9 +826,19 @@ struct packet_manager { + struct mutex lock; + bool allocated; + struct kfd_mem_obj *ib_buffer_obj; ++ ++ struct packet_manager_firmware *pmf; + }; + +-int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); ++struct packet_manager_firmware { ++ /* Support different firmware versions for map process packet */ ++ int (*map_process)(struct packet_manager *pm, uint32_t *buffer, ++ struct qcm_process_device *qpd); ++ int (*get_map_process_packet_size)(void); ++}; ++ ++int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, ++ uint16_t fw_ver); + void pm_uninit(struct packet_manager *pm); + int pm_send_set_resources(struct packet_manager *pm, + struct scheduling_resources *res); +@@ -687,7 +847,7 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, + uint32_t fence_value); + + int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, +- enum kfd_preempt_type_filter mode, ++ enum kfd_unmap_queues_filter mode, + uint32_t filter_param, bool reset, + unsigned int sdma_engine); + +@@ -696,6 +856,9 @@ void pm_release_ib(struct packet_manager *pm); + uint64_t kfd_get_number_elems(struct kfd_dev *kfd); + phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, + struct kfd_process *process); ++int amdkfd_fence_wait_timeout(unsigned int *fence_addr, ++ unsigned int fence_value, ++ unsigned long timeout); + + /* Events */ + extern const struct kfd_event_interrupt_class event_interrupt_class_cik; +@@ -714,8 +877,7 @@ int kfd_wait_on_events(struct kfd_process *p, + uint32_t num_events, void __user *data, + bool all, uint32_t user_timeout_ms, + enum kfd_event_wait_result *wait_result); +-void kfd_signal_event_interrupt(unsigned int pasid, uint32_t 
partial_id, +- uint32_t valid_id_bits); ++void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, uint32_t valid_id_bits); + void kfd_signal_iommu_event(struct kfd_dev *dev, + unsigned int pasid, unsigned long address, + bool is_write_requested, bool is_execute_requested); +@@ -723,11 +885,28 @@ void kfd_signal_hw_exception_event(unsigned int pasid); + int kfd_set_event(struct kfd_process *p, uint32_t event_id); + int kfd_reset_event(struct kfd_process *p, uint32_t event_id); + int kfd_event_create(struct file *devkfd, struct kfd_process *p, +- uint32_t event_type, bool auto_reset, uint32_t node_id, +- uint32_t *event_id, uint32_t *event_trigger_data, +- uint64_t *event_page_offset, uint32_t *event_slot_index); ++ uint32_t event_type, bool auto_reset, uint32_t node_id, ++ uint32_t *event_id, uint32_t *event_trigger_data, ++ uint64_t *event_page_offset, uint32_t *event_slot_index, ++ void *kern_addr); + int kfd_event_destroy(struct kfd_process *p, uint32_t event_id); ++void kfd_free_signal_page_dgpu(struct kfd_process *p, uint64_t handle); ++ ++void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, ++ struct kfd_vm_fault_info *info); ++ ++void radeon_flush_tlb(struct kfd_dev *dev, uint32_t pasid); + + int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); ++int kgd2kfd_evict_bo(struct kfd_dev *dev, void *mem); ++int kgd2kfd_restore(struct kfd_dev *kfd); ++int evict_size(struct kfd_process *p, int size, int type); ++int evict_bo(struct kfd_dev *dev, void *mem); ++int restore(struct kfd_dev *kfd); ++ ++#define KFD_SCRATCH_CZ_FW_VER 600 ++#define KFD_SCRATCH_KV_FW_VER 413 ++#define KFD_MULTI_PROC_MAPPING_HWS_SUPPORT 600 ++#define KFD_CWSR_CZ_FW_VER 625 + + #endif +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c +index 035bbc9..a069c3d 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c +@@ -28,6 +28,10 @@ + #include <linux/amd-iommu.h> + #include <linux/notifier.h> + #include <linux/compat.h> ++#include <linux/mm.h> ++#include <asm/tlb.h> ++#include <linux/highmem.h> ++#include <uapi/asm-generic/mman-common.h> + + struct mm_struct; + +@@ -41,6 +45,7 @@ struct mm_struct; + */ + #define INITIAL_QUEUE_ARRAY_SIZE 16 + ++static int evict_pdd(struct kfd_process_device *pdd); + /* + * List of struct kfd_process (field kfd_process). + * Unique/indexed by mm_struct* +@@ -58,8 +63,14 @@ struct kfd_process_release_work { + struct kfd_process *p; + }; + +-static struct kfd_process *find_process(const struct task_struct *thread); ++#define MIN_IDR_ID 1 ++#define MAX_IDR_ID 0 /*0 - for unlimited*/ ++ ++static struct kfd_process *find_process(const struct task_struct *thread, ++ bool lock); + static struct kfd_process *create_process(const struct task_struct *thread); ++static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep); ++ + + void kfd_process_create_wq(void) + { +@@ -75,10 +86,12 @@ void kfd_process_destroy_wq(void) + } + } + +-struct kfd_process *kfd_create_process(const struct task_struct *thread) ++struct kfd_process *kfd_create_process(struct file *filep) + { + struct kfd_process *process; + ++ struct task_struct *thread = current; ++ + BUG_ON(!kfd_process_wq); + + if (thread->mm == NULL) +@@ -99,7 +112,7 @@ struct kfd_process *kfd_create_process(const struct task_struct *thread) + mutex_lock(&kfd_processes_mutex); + + /* A prior open of /dev/kfd could have already created the process. 
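+ * If it did, that existing kfd_process is found and reused instead of
+ * creating a second one for the same mm_struct.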
*/ +- process = find_process(thread); ++ process = find_process(thread, false); + if (process) + pr_debug("kfd: process already found\n"); + +@@ -110,6 +123,8 @@ struct kfd_process *kfd_create_process(const struct task_struct *thread) + + up_write(&thread->mm->mmap_sem); + ++ kfd_process_init_cwsr(process, filep); ++ + return process; + } + +@@ -124,7 +139,7 @@ struct kfd_process *kfd_get_process(const struct task_struct *thread) + if (thread->group_leader->mm != thread->mm) + return ERR_PTR(-EINVAL); + +- process = find_process(thread); ++ process = find_process(thread, false); + + return process; + } +@@ -141,23 +156,164 @@ static struct kfd_process *find_process_by_mm(const struct mm_struct *mm) + return NULL; + } + +-static struct kfd_process *find_process(const struct task_struct *thread) ++static struct kfd_process *find_process(const struct task_struct *thread, ++ bool lock) + { + struct kfd_process *p; + int idx; + + idx = srcu_read_lock(&kfd_processes_srcu); + p = find_process_by_mm(thread->mm); ++ if (p && lock) ++ down_read(&p->lock); + srcu_read_unlock(&kfd_processes_srcu, idx); + + return p; + } + ++/* This returns with process->lock read-locked. */ ++struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid) ++{ ++ struct task_struct *task = NULL; ++ struct kfd_process *p = NULL; ++ ++ if (!pid) ++ task = current; ++ else ++ task = get_pid_task(pid, PIDTYPE_PID); ++ ++ if (task) ++ p = find_process(task, true); ++ ++ return p; ++} ++ ++int evict_size(struct kfd_process *process, int size, int type) ++{ ++ struct kfd_process_device *pdd, *temp_pdd = NULL; ++ struct kfd_process *p = process; ++ int temp = 0; ++ ++ down_write(&p->lock); ++ ++ if (type == EVICT_FIRST_PDD) { ++ ++ list_for_each_entry(pdd, &p->per_device_data, per_device_list) { ++ pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", ++ pdd->dev->id, p->pasid); ++ if (pdd->mapped_size >= size) { ++ evict_pdd(pdd); ++ return 0; ++ } ++ ++ } ++ } else if (type == EVICT_BIGGEST_PDD) { ++ ++ list_for_each_entry(pdd, &p->per_device_data, per_device_list) { ++ pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", ++ pdd->dev->id, p->pasid); ++ if (pdd->mapped_size >= temp) { ++ temp = pdd->mapped_size; ++ temp_pdd = pdd; ++ } ++ ++ } ++ if (temp_pdd->mapped_size > size) { ++ evict_pdd(temp_pdd); ++ return 0; ++ } ++ ++ } ++ up_write(&p->lock); ++ return 0; ++ ++} ++ ++int evict_bo(struct kfd_dev *dev, void *mem) ++{ ++ struct kfd_process_device *pdd; ++ ++ pdd = dev->kfd2kgd->get_pdd_from_buffer_object(dev->kgd, ++ ((struct kgd_mem *)mem)); ++ ++ if (pdd) ++ evict_pdd(pdd); ++ ++ return 0; ++} ++ ++static int evict_pdd(struct kfd_process_device *pdd) ++{ ++ void *mem; ++ int id; ++ ++ /*process_evict_queues(struct device_queue_manager *dqm, pdd->qpd)*/ ++ /* ++ * Remove all handles from idr and release appropriate ++ * local memory object ++ */ ++ idr_for_each_entry(&pdd->alloc_idr, mem, id) { ++ pdd->dev->kfd2kgd->unmap_memory_to_gpu( ++ pdd->dev->kgd, mem, pdd->vm); ++ } ++ pdd->last_eviction = jiffies; ++ pdd->mapped_size = 0; ++ pdd->evicted = true; ++ ++ /*flush_tlb_all();*/ ++ ++ return 0; ++} ++ ++int restore(struct kfd_dev *kfd) ++{ ++ struct kfd_process *p = NULL; ++ /* TODO still working on how to get the process */ ++ struct kfd_process_device *pdd = kfd_get_process_device_data(kfd, p); ++ void *mem; ++ int id; ++ ++ /* need to run on all processes*/ ++ down_write(&p->lock); ++ ++ list_for_each_entry(pdd, &p->per_device_data, per_device_list) { ++ 
pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", ++ pdd->dev->id, p->pasid); ++ ++ /* ++ * Remove all handles from idr and release appropriate ++ * local memory object ++ */ ++ if (pdd->evicted) { ++ idr_for_each_entry(&pdd->alloc_idr, mem, id) { ++ pdd->dev->kfd2kgd->map_memory_to_gpu( ++ pdd->dev->kgd, ++ mem, pdd->vm); ++ pdd->last_eviction = 0; ++ pdd->mapped_size = 0; ++ } ++ ++ /*process_restore_queues ++ * (struct device_queue_manager *dqm, pdd->qpd)*/ ++ } else { ++ pdd->evicted = false; ++ } ++ } ++ up_write(&p->lock); ++ return 0; ++} ++ ++/* No process locking is needed in this function, because the process ++ * is not findable any more. We must assume that no other thread is ++ * using it any more, otherwise we couldn't safely free the process ++ * stucture in the end. */ + static void kfd_process_wq_release(struct work_struct *work) + { + struct kfd_process_release_work *my_work; +- struct kfd_process_device *pdd, *temp; ++ struct kfd_process_device *pdd, *temp, *peer_pdd; + struct kfd_process *p; ++ struct kfd_bo *buf_obj; ++ int id; + + my_work = (struct kfd_process_release_work *) work; + +@@ -166,19 +322,40 @@ static void kfd_process_wq_release(struct work_struct *work) + pr_debug("Releasing process (pasid %d) in workqueue\n", + p->pasid); + +- mutex_lock(&p->mutex); +- +- list_for_each_entry_safe(pdd, temp, &p->per_device_data, +- per_device_list) { ++ list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", + pdd->dev->id, p->pasid); + +- if (pdd->reset_wavefronts) +- dbgdev_wave_reset_wavefronts(pdd->dev, p); ++ if (pdd->dev->device_info->is_need_iommu_device) ++ amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); ++ ++ /* ++ * Remove all handles from idr and release appropriate ++ * local memory object ++ */ ++ idr_for_each_entry(&pdd->alloc_idr, buf_obj, id) { ++ list_for_each_entry(peer_pdd, ++ &p->per_device_data, per_device_list) { ++ pdd->dev->kfd2kgd->unmap_memory_to_gpu( ++ peer_pdd->dev->kgd, ++ buf_obj->mem, peer_pdd->vm); ++ } ++ ++ run_rdma_free_callback(buf_obj); ++ pdd->dev->kfd2kgd->free_memory_of_gpu( ++ pdd->dev->kgd, buf_obj->mem); ++ kfd_process_device_remove_obj_handle(pdd, id); ++ } ++ } + +- amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); ++ list_for_each_entry_safe(pdd, temp, &p->per_device_data, ++ per_device_list) { ++ radeon_flush_tlb(pdd->dev, p->pasid); ++ /* Destroy the GPUVM VM context */ ++ if (pdd->vm) ++ pdd->dev->kfd2kgd->destroy_process_vm( ++ pdd->dev->kgd, pdd->vm); + list_del(&pdd->per_device_list); +- + kfree(pdd); + } + +@@ -186,15 +363,11 @@ static void kfd_process_wq_release(struct work_struct *work) + + kfd_pasid_free(p->pasid); + +- mutex_unlock(&p->mutex); +- +- mutex_destroy(&p->mutex); +- + kfree(p->queues); + + kfree(p); + +- kfree(work); ++ kfree((void *)work); + } + + static void kfd_process_destroy_delayed(struct rcu_head *rcu) +@@ -223,6 +396,8 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, + { + struct kfd_process *p; + struct kfd_process_device *pdd = NULL; ++ struct kfd_dev *dev = NULL; ++ long status = -EFAULT; + + /* + * The kfd_process structure can not be free because the +@@ -236,9 +411,31 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, + mutex_unlock(&kfd_processes_mutex); + synchronize_srcu(&kfd_processes_srcu); + +- mutex_lock(&p->mutex); ++ down_write(&p->lock); ++ ++ /* Iterate over all process device data structures and if the pdd is in ++ * 
debug mode,we should first force unregistration, then we will be ++ * able to destroy the queues */ ++ list_for_each_entry(pdd, &p->per_device_data, per_device_list) { ++ dev = pdd->dev; ++ mutex_lock(get_dbgmgr_mutex()); ++ ++ if ((dev != NULL) && ++ (dev->dbgmgr) && ++ (dev->dbgmgr->pasid == p->pasid)) { ++ ++ status = kfd_dbgmgr_unregister(dev->dbgmgr, p); ++ if (status == 0) { ++ kfd_dbgmgr_destroy(dev->dbgmgr); ++ dev->dbgmgr = NULL; ++ } ++ } ++ mutex_unlock(get_dbgmgr_mutex()); ++ } ++ ++ ++ /* now we can uninit the pqm: */ + +- /* In case our notifier is called before IOMMU notifier */ + pqm_uninit(&p->pqm); + + /* Iterate over all process device data structure and check +@@ -256,7 +453,7 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, + } + } + +- mutex_unlock(&p->mutex); ++ up_write(&p->lock); + + /* + * Because we drop mm_count inside kfd_process_destroy_delayed +@@ -272,6 +469,94 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = { + .release = kfd_process_notifier_release, + }; + ++static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep) ++{ ++ int err; ++ unsigned long offset; ++ struct kfd_process_device *temp, *pdd = NULL; ++ void *mem = NULL; ++ struct kfd_dev *dev = NULL; ++ struct qcm_process_device *qpd = NULL; ++ ++ down_write(&p->lock); ++ list_for_each_entry_safe(pdd, temp, &p->per_device_data, ++ per_device_list) { ++ dev = pdd->dev; ++ qpd = &pdd->qpd; ++ if (!dev->cwsr_enabled || qpd->tba_addr) ++ continue; ++ if (qpd->cwsr_base) { ++ /* cwsr_base is only set for DGPU */ ++ ++ /* can't hold the process lock while ++ * allocating from KGD */ ++ up_write(&p->lock); ++ ++ err = dev->kfd2kgd->alloc_memory_of_gpu( ++ dev->kgd, qpd->cwsr_base, dev->cwsr_size, ++ pdd->vm, (struct kgd_mem **)&mem, ++ NULL, &qpd->cwsr_kaddr, pdd, ++ ALLOC_MEM_FLAGS_GTT | ++ ALLOC_MEM_FLAGS_NONPAGED | ++ ALLOC_MEM_FLAGS_EXECUTE_ACCESS | ++ ALLOC_MEM_FLAGS_NO_SUBSTITUTE); ++ if (err) ++ goto err_alloc_tba; ++ err = kfd_map_memory_to_gpu(dev, mem, p, pdd); ++ if (err) ++ goto err_map_tba; ++ ++ down_write(&p->lock); ++ /* Check if someone else allocated the memory ++ * while we weren't looking */ ++ if (qpd->tba_addr) { ++ up_write(&p->lock); ++ dev->kfd2kgd->unmap_memory_to_gpu(dev->kgd, ++ (struct kgd_mem *)mem, pdd->vm); ++ dev->kfd2kgd->free_memory_of_gpu(dev->kgd, mem); ++ down_write(&p->lock); ++ } else { ++ qpd->cwsr_mem_handle = ++ kfd_process_device_create_obj_handle( ++ pdd, mem, qpd->cwsr_base, ++ dev->cwsr_size); ++ if (qpd->cwsr_mem_handle < 0) ++ goto err_create_handle; ++ ++ memcpy(qpd->cwsr_kaddr, kmap(dev->cwsr_pages), ++ PAGE_SIZE); ++ kunmap(dev->cwsr_pages); ++ qpd->tba_addr = qpd->cwsr_base; ++ } ++ } else { ++ offset = (kfd_get_gpu_id(dev) | ++ KFD_MMAP_TYPE_RESERVED_MEM) << PAGE_SHIFT; ++ qpd->tba_addr = (uint64_t)vm_mmap(filep, 0, ++ dev->cwsr_size, PROT_READ | PROT_EXEC, ++ MAP_SHARED, offset); ++ qpd->cwsr_kaddr = (void *)qpd->tba_addr; ++ } ++ if (IS_ERR_VALUE(qpd->tba_addr)) { ++ pr_err("Failure to set tba address. 
error -%d.\n", ++ (int)qpd->tba_addr); ++ qpd->tba_addr = 0; ++ qpd->cwsr_kaddr = NULL; ++ } else ++ qpd->tma_addr = qpd->tba_addr + dev->tma_offset; ++ pr_debug("set tba :0x%llx, tma:0x%llx for pqm.\n", ++ qpd->tba_addr, qpd->tma_addr); ++ } ++ ++err_create_handle: ++ up_write(&p->lock); ++ return err; ++ ++err_map_tba: ++ dev->kfd2kgd->free_memory_of_gpu(dev->kgd, mem); ++err_alloc_tba: ++ return err; ++} ++ + static struct kfd_process *create_process(const struct task_struct *thread) + { + struct kfd_process *process; +@@ -282,6 +567,8 @@ static struct kfd_process *create_process(const struct task_struct *thread) + if (!process) + goto err_alloc_process; + ++ process->bo_interval_tree = RB_ROOT; ++ + process->queues = kmalloc_array(INITIAL_QUEUE_ARRAY_SIZE, + sizeof(process->queues[0]), GFP_KERNEL); + if (!process->queues) +@@ -291,7 +578,7 @@ static struct kfd_process *create_process(const struct task_struct *thread) + if (process->pasid == 0) + goto err_alloc_pasid; + +- mutex_init(&process->mutex); ++ init_rwsem(&process->lock); + + process->mm = thread->mm; + +@@ -364,8 +651,22 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, + INIT_LIST_HEAD(&pdd->qpd.queues_list); + INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); + pdd->qpd.dqm = dev->dqm; ++ pdd->qpd.pqm = &p->pqm; ++ pdd->qpd.evicted = 0; + pdd->reset_wavefronts = false; ++ pdd->process = p; + list_add(&pdd->per_device_list, &p->per_device_data); ++ ++ /* Init idr used for memory handle translation */ ++ idr_init(&pdd->alloc_idr); ++ ++ /* Create the GPUVM context for this specific device */ ++ if (dev->kfd2kgd->create_process_vm(dev->kgd, &pdd->vm)) { ++ pr_err("Failed to create process VM object\n"); ++ list_del(&pdd->per_device_list); ++ kfree(pdd); ++ pdd = NULL; ++ } + } + + return pdd; +@@ -393,9 +694,11 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, + if (pdd->bound) + return pdd; + +- err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread); +- if (err < 0) +- return ERR_PTR(err); ++ if (dev->device_info->is_need_iommu_device) { ++ err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread); ++ if (err < 0) ++ return ERR_PTR(err); ++ } + + pdd->bound = true; + +@@ -420,18 +723,21 @@ void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid) + + pr_debug("Unbinding process %d from IOMMU\n", pasid); + +- if ((dev->dbgmgr) && (dev->dbgmgr->pasid == p->pasid)) +- kfd_dbgmgr_destroy(dev->dbgmgr); +- +- pqm_uninit(&p->pqm); ++ mutex_lock(get_dbgmgr_mutex()); + +- pdd = kfd_get_process_device_data(dev, p); ++ if ((dev->dbgmgr) && (dev->dbgmgr->pasid == p->pasid)) { + +- if (!pdd) { +- mutex_unlock(&p->mutex); +- return; ++ if (kfd_dbgmgr_unregister(dev->dbgmgr, p) == 0) { ++ kfd_dbgmgr_destroy(dev->dbgmgr); ++ dev->dbgmgr = NULL; ++ } + } + ++ mutex_unlock(get_dbgmgr_mutex()); ++ ++ pqm_uninit(&p->pqm); ++ ++ pdd = kfd_get_process_device_data(dev, p); + if (pdd->reset_wavefronts) { + dbgdev_wave_reset_wavefronts(pdd->dev, p); + pdd->reset_wavefronts = false; +@@ -444,9 +750,10 @@ void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid) + * We don't call amd_iommu_unbind_pasid() here + * because the IOMMU called us. 
+ */ +- pdd->bound = false; ++ if (pdd) ++ pdd->bound = false; + +- mutex_unlock(&p->mutex); ++ up_write(&p->lock); + } + + struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p) +@@ -469,7 +776,116 @@ bool kfd_has_process_device_data(struct kfd_process *p) + return !(list_empty(&p->per_device_data)); + } + +-/* This returns with process->mutex locked. */ ++/* Create specific handle mapped to mem from process local memory idr ++ * Assumes that the process lock is held. */ ++int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, ++ void *mem, uint64_t start, ++ uint64_t length) ++{ ++ int handle; ++ struct kfd_bo *buf_obj; ++ struct kfd_process *p; ++ ++ BUG_ON(pdd == NULL); ++ BUG_ON(mem == NULL); ++ ++ p = pdd->process; ++ ++ buf_obj = kmalloc(sizeof(*buf_obj), GFP_KERNEL); ++ ++ if (!buf_obj) ++ return -ENOMEM; ++ ++ buf_obj->it.start = start; ++ buf_obj->it.last = start + length - 1; ++ interval_tree_insert(&buf_obj->it, &p->bo_interval_tree); ++ ++ buf_obj->mem = mem; ++ buf_obj->dev = pdd->dev; ++ ++ INIT_LIST_HEAD(&buf_obj->cb_data_head); ++ ++ idr_preload(GFP_KERNEL); ++ ++ handle = idr_alloc(&pdd->alloc_idr, buf_obj, MIN_IDR_ID, MAX_IDR_ID, ++ GFP_NOWAIT); ++ ++ idr_preload_end(); ++ ++ return handle; ++} ++ ++struct kfd_bo *kfd_process_device_find_bo(struct kfd_process_device *pdd, ++ int handle) ++{ ++ BUG_ON(pdd == NULL); ++ ++ if (handle < 0) ++ return NULL; ++ ++ return (struct kfd_bo *)idr_find(&pdd->alloc_idr, handle); ++} ++ ++/* Translate specific handle from process local memory idr ++ * Assumes that the process lock is held. */ ++void *kfd_process_device_translate_handle(struct kfd_process_device *pdd, ++ int handle) ++{ ++ struct kfd_bo *buf_obj; ++ ++ buf_obj = kfd_process_device_find_bo(pdd, handle); ++ ++ return buf_obj->mem; ++} ++ ++void *kfd_process_find_bo_from_interval(struct kfd_process *p, ++ uint64_t start_addr, ++ uint64_t last_addr) ++{ ++ struct interval_tree_node *it_node; ++ struct kfd_bo *buf_obj; ++ ++ it_node = interval_tree_iter_first(&p->bo_interval_tree, ++ start_addr, last_addr); ++ if (!it_node) { ++ pr_err("%llu - %llu does not relate to an existing buffer\n", ++ start_addr, last_addr); ++ return NULL; ++ } ++ ++ BUG_ON(NULL != interval_tree_iter_next(it_node, ++ start_addr, last_addr)); ++ ++ buf_obj = container_of(it_node, struct kfd_bo, it); ++ ++ return buf_obj; ++} ++ ++/* Remove specific handle from process local memory idr ++ * Assumes that the process lock is held. */ ++void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, ++ int handle) ++{ ++ struct kfd_bo *buf_obj; ++ struct kfd_process *p; ++ ++ BUG_ON(pdd == NULL); ++ ++ p = pdd->process; ++ ++ if (handle < 0) ++ return; ++ ++ buf_obj = kfd_process_device_find_bo(pdd, handle); ++ ++ idr_remove(&pdd->alloc_idr, handle); ++ ++ interval_tree_remove(&buf_obj->it, &p->bo_interval_tree); ++ ++ kfree(buf_obj); ++} ++ ++/* This returns with process->lock read-locked. */ + struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) + { + struct kfd_process *p; +@@ -479,7 +895,7 @@ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) + + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + if (p->pasid == pasid) { +- mutex_lock(&p->mutex); ++ down_read(&p->lock); + break; + } + } +@@ -488,3 +904,53 @@ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) + + return p; + } ++ ++/* This returns with process->lock read-locked. 
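++ * The caller is responsible for dropping the lock with up_read(&p->lock).
++ * A minimal sketch of the expected calling pattern:
++ *
++ *	p = kfd_lookup_process_by_mm(mm);
++ *	if (p) {
++ *		... operate on the process ...
++ *		up_read(&p->lock);
++ *	}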
++ */
++struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm)
++{
++	struct kfd_process *p;
++
++	int idx = srcu_read_lock(&kfd_processes_srcu);
++
++	p = find_process_by_mm(mm);
++	if (p != NULL)
++		down_read(&p->lock);
++
++	srcu_read_unlock(&kfd_processes_srcu, idx);
++
++	return p;
++}
++
++int kfd_reserved_mem_mmap(struct kfd_process *process, struct vm_area_struct *vma)
++{
++	unsigned long pfn, i;
++	int ret = 0;
++	struct kfd_dev *dev = kfd_device_by_id(vma->vm_pgoff);
++
++	if (dev == NULL)
++		return -EINVAL;
++	if ((vma->vm_start & (PAGE_SIZE - 1)) ||
++		(vma->vm_end & (PAGE_SIZE - 1))) {
++		pr_err("KFD only supports page-aligned memory mappings.\n");
++		return -EINVAL;
++	}
++
++	pr_debug("kfd reserved mem mmap called.\n");
++	/* Two kinds of reserved memory may be mapped here in the future:
++	 * 1. Trap handler code and parameters (TBA and TMA, 2 pages total)
++	 * 2. Relaunch stack (control block, 1 page for Carrizo)
++	 */
++
++	for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); ++i) {
++		pfn = page_to_pfn(&dev->cwsr_pages[i]);
++		vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND
++			| VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP;
++		/* mapping the page to user process */
++		ret = remap_pfn_range(vma, vma->vm_start + (i << PAGE_SHIFT),
++				pfn, PAGE_SIZE, vma->vm_page_prot);
++		if (ret)
++			break;
++	}
++	return ret;
++}
++
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+index 46f497e..e79cd42 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+@@ -89,23 +89,36 @@ void pqm_uninit(struct process_queue_manager *pqm)
+ {
+ 	int retval;
+ 	struct process_queue_node *pqn, *next;
++	struct kfd_process_device *pdd;
++	struct kfd_dev *dev = NULL;
+ 
+ 	BUG_ON(!pqm);
+ 
+ 	pr_debug("In func %s\n", __func__);
+ 
+ 	list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) {
+-		retval = pqm_destroy_queue(
+-			pqm,
+-			(pqn->q != NULL) ?
+- pqn->q->properties.queue_id : +- pqn->kq->queue->properties.queue_id); +- +- if (retval != 0) { +- pr_err("kfd: failed to destroy queue\n"); +- return; ++ if (pqn->q) ++ dev = pqn->q->device; ++ else if (pqn->kq) ++ dev = pqn->kq->dev; ++ else ++ BUG(); ++ ++ pdd = kfd_get_process_device_data(dev, pqm->process); ++ if (pdd) { ++ retval = dev->dqm->ops.process_termination ++ (dev->dqm, &pdd->qpd); ++ if (retval != 0) ++ pdd->reset_wavefronts = true; + } + } ++ ++ list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) { ++ uninit_queue(pqn->q); ++ list_del(&pqn->process_queue_list); ++ kfree(pqn); ++ } ++ + kfree(pqm->queue_slot_bitmap); + pqm->queue_slot_bitmap = NULL; + } +@@ -148,23 +161,19 @@ int pqm_create_queue(struct process_queue_manager *pqm, + struct kfd_dev *dev, + struct file *f, + struct queue_properties *properties, +- unsigned int flags, +- enum kfd_queue_type type, + unsigned int *qid) + { + int retval; + struct kfd_process_device *pdd; +- struct queue_properties q_properties; + struct queue *q; + struct process_queue_node *pqn; + struct kernel_queue *kq; + int num_queues = 0; + struct queue *cur; ++ enum kfd_queue_type type = properties->type; + + BUG_ON(!pqm || !dev || !properties || !qid); + +- memset(&q_properties, 0, sizeof(struct queue_properties)); +- memcpy(&q_properties, properties, sizeof(struct queue_properties)); + q = NULL; + kq = NULL; + +@@ -192,10 +201,9 @@ int pqm_create_queue(struct process_queue_manager *pqm, + if (retval != 0) + return retval; + +- if (list_empty(&pqm->queues)) { +- pdd->qpd.pqm = pqm; ++ if (list_empty(&pdd->qpd.queues_list) && ++ list_empty(&pdd->qpd.priv_queue_list)) + dev->dqm->ops.register_process(dev->dqm, &pdd->qpd); +- } + + pqn = kzalloc(sizeof(struct process_queue_node), GFP_KERNEL); + if (!pqn) { +@@ -205,18 +213,34 @@ int pqm_create_queue(struct process_queue_manager *pqm, + + switch (type) { + case KFD_QUEUE_TYPE_SDMA: ++ if (dev->dqm->sdma_queue_count >= CIK_SDMA_QUEUES) { ++ pr_err("kfd: over-subscription is not allowed for SDMA.\n"); ++ retval = -EPERM; ++ goto err_create_queue; ++ } ++ ++ retval = create_cp_queue(pqm, dev, &q, properties, f, *qid); ++ if (retval != 0) ++ goto err_create_queue; ++ pqn->q = q; ++ pqn->kq = NULL; ++ retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd, ++ &q->properties.vmid); ++ pr_debug("DQM returned %d for create_queue\n", retval); ++ print_queue(q); ++ break; + + case KFD_QUEUE_TYPE_COMPUTE: + /* check if there is over subscription */ + if ((sched_policy == KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) && +- ((dev->dqm->processes_count >= VMID_PER_DEVICE) || ++ ((dev->dqm->processes_count >= dev->vm_info.vmid_num_kfd) || + (dev->dqm->queue_count >= get_queues_num(dev->dqm)))) { + pr_err("kfd: over-subscription is not allowed in radeon_kfd.sched_policy == 1\n"); + retval = -EPERM; + goto err_create_queue; + } + +- retval = create_cp_queue(pqm, dev, &q, &q_properties, f, *qid); ++ retval = create_cp_queue(pqm, dev, &q, properties, f, *qid); + if (retval != 0) + goto err_create_queue; + pqn->q = q; +@@ -253,9 +277,8 @@ int pqm_create_queue(struct process_queue_manager *pqm, + list_add(&pqn->process_queue_list, &pqm->queues); + + if (q) { +- *properties = q->properties; + pr_debug("kfd: PQM done creating queue\n"); +- print_queue_properties(properties); ++ print_queue_properties(&q->properties); + } + + return retval; +@@ -265,7 +288,8 @@ int pqm_create_queue(struct process_queue_manager *pqm, + err_allocate_pqn: + /* check if queues list is empty unregister process from 
device */ + clear_bit(*qid, pqm->queue_slot_bitmap); +- if (list_empty(&pqm->queues)) ++ if (list_empty(&pdd->qpd.queues_list) && ++ list_empty(&pdd->qpd.priv_queue_list)) + dev->dqm->ops.unregister_process(dev->dqm, &pdd->qpd); + return retval; + } +@@ -314,9 +338,11 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) + if (pqn->q) { + dqm = pqn->q->device->dqm; + retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); +- if (retval != 0) ++ if (retval != 0) { ++ if (retval == -ETIME) ++ pdd->reset_wavefronts = true; + return retval; +- ++ } + uninit_queue(pqn->q); + } + +@@ -324,7 +350,8 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) + kfree(pqn); + clear_bit(qid, pqm->queue_slot_bitmap); + +- if (list_empty(&pqm->queues)) ++ if (list_empty(&pdd->qpd.queues_list) && ++ list_empty(&pdd->qpd.priv_queue_list)) + dqm->ops.unregister_process(dqm, &pdd->qpd); + + return retval; +@@ -358,6 +385,31 @@ int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, + return 0; + } + ++int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, ++ struct queue_properties *p) ++{ ++ int retval; ++ struct process_queue_node *pqn; ++ ++ BUG_ON(!pqm); ++ ++ pqn = get_queue_by_qid(pqm, qid); ++ if (!pqn) { ++ pr_debug("amdkfd: No queue %d exists for update operation\n", ++ qid); ++ return -EFAULT; ++ } ++ ++ pqn->q->properties.cu_mask = p->cu_mask; ++ ++ retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm, ++ pqn->q); ++ if (retval != 0) ++ return retval; ++ ++ return 0; ++} ++ + struct kernel_queue *pqm_get_kernel_queue( + struct process_queue_manager *pqm, + unsigned int qid) +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c b/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c +new file mode 100644 +index 0000000..69bdaf1 +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c +@@ -0,0 +1,296 @@ ++/* ++ * Copyright 2015 Advanced Micro Devices, Inc. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. 
++ */ ++ ++#include <linux/device.h> ++#include <linux/export.h> ++#include <linux/pid.h> ++#include <linux/err.h> ++#include <linux/slab.h> ++#include "amd_rdma.h" ++#include "kfd_priv.h" ++ ++ ++struct rdma_cb { ++ struct list_head node; ++ struct amd_p2p_info amd_p2p_data; ++ void (*free_callback)(void *client_priv); ++ void *client_priv; ++}; ++ ++/** ++ * This function makes the pages underlying a range of GPU virtual memory ++ * accessible for DMA operations from another PCIe device ++ * ++ * \param address - The start address in the Unified Virtual Address ++ * space in the specified process ++ * \param length - The length of requested mapping ++ * \param pid - Pointer to structure pid to which address belongs. ++ * Could be NULL for current process address space. ++ * \param p2p_data - On return: Pointer to structure describing ++ * underlying pages/locations ++ * \param free_callback - Pointer to callback which will be called when access ++ * to such memory must be stopped immediately: Memory ++ * was freed, GECC events, etc. ++ * Client should immediately stop any transfer ++ * operations and returned as soon as possible. ++ * After return all resources associated with address ++ * will be release and no access will be allowed. ++ * \param client_priv - Pointer to be passed as parameter on ++ * 'free_callback; ++ * ++ * \return 0 if operation was successful ++ */ ++static int get_pages(uint64_t address, uint64_t length, struct pid *pid, ++ struct amd_p2p_info **amd_p2p_data, ++ void (*free_callback)(void *client_priv), ++ void *client_priv) ++{ ++ struct kfd_bo *buf_obj; ++ struct kgd_mem *mem; ++ struct sg_table *sg_table_tmp; ++ struct kfd_dev *dev; ++ uint64_t last = address + length - 1; ++ uint64_t offset; ++ struct kfd_process *p; ++ struct rdma_cb *rdma_cb_data; ++ int ret = 0; ++ ++ p = kfd_lookup_process_by_pid(pid); ++ if (!p) { ++ pr_err("could not find the process in %s.\n", ++ __func__); ++ return -EINVAL; ++ } ++ ++ buf_obj = kfd_process_find_bo_from_interval(p, address, last); ++ if (!buf_obj) { ++ pr_err("can not find a kfd_bo for the range\n"); ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ rdma_cb_data = kmalloc(sizeof(*rdma_cb_data), GFP_KERNEL); ++ if (!rdma_cb_data) { ++ *amd_p2p_data = NULL; ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ mem = buf_obj->mem; ++ dev = buf_obj->dev; ++ offset = address - buf_obj->it.start; ++ ++ ret = dev->kfd2kgd->pin_get_sg_table_bo(dev->kgd, mem, ++ offset, length, &sg_table_tmp); ++ ++ if (ret) { ++ pr_err("pin_get_sg_table_bo failed.\n"); ++ *amd_p2p_data = NULL; ++ goto free_mem; ++ } ++ ++ rdma_cb_data->amd_p2p_data.va = address; ++ rdma_cb_data->amd_p2p_data.size = length; ++ rdma_cb_data->amd_p2p_data.pid = pid; ++ rdma_cb_data->amd_p2p_data.priv = buf_obj; ++ rdma_cb_data->amd_p2p_data.pages = sg_table_tmp; ++ ++ rdma_cb_data->free_callback = free_callback; ++ rdma_cb_data->client_priv = client_priv; ++ ++ list_add(&rdma_cb_data->node, &buf_obj->cb_data_head); ++ ++ *amd_p2p_data = &rdma_cb_data->amd_p2p_data; ++ ++ goto out; ++ ++free_mem: ++ kfree(rdma_cb_data); ++out: ++ up_read(&p->lock); ++ ++ return ret; ++} ++ ++static int put_pages_helper(struct amd_p2p_info *p2p_data) ++{ ++ struct kfd_bo *buf_obj; ++ struct kfd_dev *dev; ++ struct sg_table *sg_table_tmp; ++ struct rdma_cb *rdma_cb_data; ++ ++ if (!p2p_data) { ++ pr_err("amd_p2p_info pointer is invalid.\n"); ++ return -EINVAL; ++ } ++ ++ rdma_cb_data = container_of(p2p_data, struct rdma_cb, amd_p2p_data); ++ ++ buf_obj = p2p_data->priv; ++ dev = buf_obj->dev; ++ 
sg_table_tmp = p2p_data->pages; ++ ++ list_del(&rdma_cb_data->node); ++ kfree(rdma_cb_data); ++ ++ dev->kfd2kgd->unpin_put_sg_table_bo(buf_obj->mem, sg_table_tmp); ++ ++ ++ return 0; ++} ++ ++void run_rdma_free_callback(struct kfd_bo *buf_obj) ++{ ++ struct rdma_cb *tmp, *rdma_cb_data; ++ ++ list_for_each_entry_safe(rdma_cb_data, tmp, ++ &buf_obj->cb_data_head, node) { ++ if (rdma_cb_data->free_callback) ++ rdma_cb_data->free_callback( ++ rdma_cb_data->client_priv); ++ ++ put_pages_helper(&rdma_cb_data->amd_p2p_data); ++ } ++} ++ ++/** ++ * ++ * This function release resources previously allocated by get_pages() call. ++ * ++ * \param p_p2p_data - A pointer to pointer to amd_p2p_info entries ++ * allocated by get_pages() call. ++ * ++ * \return 0 if operation was successful ++ */ ++static int put_pages(struct amd_p2p_info **p_p2p_data) ++{ ++ struct kfd_process *p = NULL; ++ int ret = 0; ++ ++ if (!(*p_p2p_data)) { ++ pr_err("amd_p2p_info pointer is invalid.\n"); ++ return -EINVAL; ++ } ++ ++ p = kfd_lookup_process_by_pid((*p_p2p_data)->pid); ++ if (!p) { ++ pr_err("could not find the process in %s\n", ++ __func__); ++ return -EINVAL; ++ } ++ ++ ret = put_pages_helper(*p_p2p_data); ++ ++ if (!ret) ++ *p_p2p_data = NULL; ++ ++ up_read(&p->lock); ++ ++ return ret; ++} ++ ++/** ++ * Check if given address belongs to GPU address space. ++ * ++ * \param address - Address to check ++ * \param pid - Process to which given address belongs. ++ * Could be NULL if current one. ++ * ++ * \return 0 - This is not GPU address managed by AMD driver ++ * 1 - This is GPU address managed by AMD driver ++ */ ++static int is_gpu_address(uint64_t address, struct pid *pid) ++{ ++ struct kfd_bo *buf_obj; ++ struct kfd_process *p; ++ ++ p = kfd_lookup_process_by_pid(pid); ++ if (!p) { ++ pr_err("could not find the process in %s.\n", ++ __func__); ++ return 0; ++ } ++ ++ buf_obj = kfd_process_find_bo_from_interval(p, address, address); ++ ++ up_read(&p->lock); ++ if (!buf_obj) ++ return 0; ++ else ++ return 1; ++} ++ ++/** ++ * Return the single page size to be used when building scatter/gather table ++ * for given range. ++ * ++ * \param address - Address ++ * \param length - Range length ++ * \param pid - Process id structure. Could be NULL if current one. ++ * \param page_size - On return: Page size ++ * ++ * \return 0 if operation was successful ++ */ ++static int get_page_size(uint64_t address, uint64_t length, struct pid *pid, ++ unsigned long *page_size) ++{ ++ /* ++ * As local memory is always consecutive, we can assume the local ++ * memory page size to be arbitrary. ++ * Currently we assume the local memory page size to be the same ++ * as system memory, which is 4KB. ++ */ ++ *page_size = PAGE_SIZE; ++ ++ return 0; ++} ++ ++ ++/** ++ * Singleton object: rdma interface function pointers ++ */ ++static const struct amd_rdma_interface rdma_ops = { ++ .get_pages = get_pages, ++ .put_pages = put_pages, ++ .is_gpu_address = is_gpu_address, ++ .get_page_size = get_page_size, ++}; ++ ++/** ++ * amdkfd_query_rdma_interface - Return interface (function pointers table) for ++ * rdma interface ++ * ++ * ++ * \param interace - OUT: Pointer to interface ++ * ++ * \return 0 if operation was successful. 
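++ *
++ * A minimal usage sketch for a hypothetical peer-driver client (the
++ * client-side names va, size, pid, client_free_cb and client_priv are
++ * illustrative, not part of this API):
++ *
++ *	const struct amd_rdma_interface *rdma_ops;
++ *	struct amd_p2p_info *p2p_info;
++ *
++ *	if (amdkfd_query_rdma_interface(&rdma_ops))
++ *		return -ENODEV;
++ *	if (rdma_ops->is_gpu_address(va, pid) &&
++ *	    rdma_ops->get_pages(va, size, pid, &p2p_info,
++ *				client_free_cb, client_priv) == 0) {
++ *		... DMA to/from the buffer described by p2p_info->pages ...
++ *		rdma_ops->put_pages(&p2p_info);
++ *	}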
++ */ ++int amdkfd_query_rdma_interface(const struct amd_rdma_interface **ops) ++{ ++ *ops = &rdma_ops; ++ ++ return 0; ++} ++EXPORT_SYMBOL(amdkfd_query_rdma_interface); ++ ++ ++ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +index 1e50647..ba1c61c 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +@@ -28,16 +28,19 @@ + #include <linux/hash.h> + #include <linux/cpufreq.h> + #include <linux/log2.h> ++#include <linux/dmi.h> ++#include <linux/atomic.h> + + #include "kfd_priv.h" + #include "kfd_crat.h" + #include "kfd_topology.h" + +-static struct list_head topology_device_list; +-static int topology_crat_parsed; ++/* topology_device_list - Master list of all topology devices */ ++struct list_head topology_device_list; + static struct kfd_system_properties sys_props; + + static DECLARE_RWSEM(topology_lock); ++static atomic_t topology_crat_proximity_domain; + + struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) + { +@@ -57,311 +60,61 @@ struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) + return device; + } + +-struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) ++uint32_t kfd_get_gpu_id(struct kfd_dev *dev) + { + struct kfd_topology_device *top_dev; +- struct kfd_dev *device = NULL; ++ uint32_t gpu_id = 0; + + down_read(&topology_lock); + + list_for_each_entry(top_dev, &topology_device_list, list) +- if (top_dev->gpu->pdev == pdev) { +- device = top_dev->gpu; ++ if (top_dev->gpu == dev) { ++ gpu_id = top_dev->gpu_id; + break; + } + + up_read(&topology_lock); + +- return device; +-} +- +-static int kfd_topology_get_crat_acpi(void *crat_image, size_t *size) +-{ +- struct acpi_table_header *crat_table; +- acpi_status status; +- +- if (!size) +- return -EINVAL; +- +- /* +- * Fetch the CRAT table from ACPI +- */ +- status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); +- if (status == AE_NOT_FOUND) { +- pr_warn("CRAT table not found\n"); +- return -ENODATA; +- } else if (ACPI_FAILURE(status)) { +- const char *err = acpi_format_exception(status); +- +- pr_err("CRAT table error: %s\n", err); +- return -EINVAL; +- } +- +- if (*size >= crat_table->length && crat_image != NULL) +- memcpy(crat_image, crat_table, crat_table->length); +- +- *size = crat_table->length; +- +- return 0; ++ return gpu_id; + } + +-static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, +- struct crat_subtype_computeunit *cu) +-{ +- BUG_ON(!dev); +- BUG_ON(!cu); +- +- dev->node_props.cpu_cores_count = cu->num_cpu_cores; +- dev->node_props.cpu_core_id_base = cu->processor_id_low; +- if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) +- dev->node_props.capability |= HSA_CAP_ATS_PRESENT; +- +- pr_info("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, +- cu->processor_id_low); +-} +- +-static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, +- struct crat_subtype_computeunit *cu) +-{ +- BUG_ON(!dev); +- BUG_ON(!cu); +- +- dev->node_props.simd_id_base = cu->processor_id_low; +- dev->node_props.simd_count = cu->num_simd_cores; +- dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; +- dev->node_props.max_waves_per_simd = cu->max_waves_simd; +- dev->node_props.wave_front_size = cu->wave_front_size; +- dev->node_props.mem_banks_count = cu->num_banks; +- dev->node_props.array_count = cu->num_arrays; +- dev->node_props.cu_per_simd_array = cu->num_cu_per_array; +- dev->node_props.simd_per_cu = cu->num_simd_per_cu; +- dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; +- 
if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) +- dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; +- pr_info("CU GPU: simds=%d id_base=%d\n", cu->num_simd_cores, +- cu->processor_id_low); +-} +- +-/* kfd_parse_subtype_cu is called when the topology mutex is already acquired */ +-static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu) +-{ +- struct kfd_topology_device *dev; +- int i = 0; +- +- BUG_ON(!cu); +- +- pr_info("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", +- cu->proximity_domain, cu->hsa_capability); +- list_for_each_entry(dev, &topology_device_list, list) { +- if (cu->proximity_domain == i) { +- if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) +- kfd_populated_cu_info_cpu(dev, cu); +- +- if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) +- kfd_populated_cu_info_gpu(dev, cu); +- break; +- } +- i++; +- } +- +- return 0; +-} +- +-/* +- * kfd_parse_subtype_mem is called when the topology mutex is +- * already acquired +- */ +-static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem) ++struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) + { +- struct kfd_mem_properties *props; +- struct kfd_topology_device *dev; +- int i = 0; +- +- BUG_ON(!mem); +- +- pr_info("Found memory entry in CRAT table with proximity_domain=%d\n", +- mem->promixity_domain); +- list_for_each_entry(dev, &topology_device_list, list) { +- if (mem->promixity_domain == i) { +- props = kfd_alloc_struct(props); +- if (props == NULL) +- return -ENOMEM; +- +- if (dev->node_props.cpu_cores_count == 0) +- props->heap_type = HSA_MEM_HEAP_TYPE_FB_PRIVATE; +- else +- props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; +- +- if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) +- props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; +- if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) +- props->flags |= HSA_MEM_FLAGS_NON_VOLATILE; +- +- props->size_in_bytes = +- ((uint64_t)mem->length_high << 32) + +- mem->length_low; +- props->width = mem->width; ++ struct kfd_topology_device *top_dev; ++ struct kfd_dev *device = NULL; + +- dev->mem_bank_count++; +- list_add_tail(&props->list, &dev->mem_props); ++ down_read(&topology_lock); + ++ list_for_each_entry(top_dev, &topology_device_list, list) ++ if (top_dev->gpu && top_dev->gpu->pdev == pdev) { ++ device = top_dev->gpu; + break; + } +- i++; +- } +- +- return 0; +-} +- +-/* +- * kfd_parse_subtype_cache is called when the topology mutex +- * is already acquired +- */ +-static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache) +-{ +- struct kfd_cache_properties *props; +- struct kfd_topology_device *dev; +- uint32_t id; +- +- BUG_ON(!cache); +- +- id = cache->processor_id_low; + +- pr_info("Found cache entry in CRAT table with processor_id=%d\n", id); +- list_for_each_entry(dev, &topology_device_list, list) +- if (id == dev->node_props.cpu_core_id_base || +- id == dev->node_props.simd_id_base) { +- props = kfd_alloc_struct(props); +- if (props == NULL) +- return -ENOMEM; +- +- props->processor_id_low = id; +- props->cache_level = cache->cache_level; +- props->cache_size = cache->cache_size; +- props->cacheline_size = cache->cache_line_size; +- props->cachelines_per_tag = cache->lines_per_tag; +- props->cache_assoc = cache->associativity; +- props->cache_latency = cache->cache_latency; +- +- if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) +- props->cache_type |= HSA_CACHE_TYPE_DATA; +- if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) +- props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; +- if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) +- 
props->cache_type |= HSA_CACHE_TYPE_CPU; +- if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) +- props->cache_type |= HSA_CACHE_TYPE_HSACU; +- +- dev->cache_count++; +- dev->node_props.caches_count++; +- list_add_tail(&props->list, &dev->cache_props); +- +- break; +- } ++ up_read(&topology_lock); + +- return 0; ++ return device; + } + +-/* +- * kfd_parse_subtype_iolink is called when the topology mutex +- * is already acquired +- */ +-static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink) ++struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd) + { +- struct kfd_iolink_properties *props; +- struct kfd_topology_device *dev; +- uint32_t i = 0; +- uint32_t id_from; +- uint32_t id_to; +- +- BUG_ON(!iolink); +- +- id_from = iolink->proximity_domain_from; +- id_to = iolink->proximity_domain_to; ++ struct kfd_topology_device *top_dev; ++ struct kfd_dev *device = NULL; + +- pr_info("Found IO link entry in CRAT table with id_from=%d\n", id_from); +- list_for_each_entry(dev, &topology_device_list, list) { +- if (id_from == i) { +- props = kfd_alloc_struct(props); +- if (props == NULL) +- return -ENOMEM; +- +- props->node_from = id_from; +- props->node_to = id_to; +- props->ver_maj = iolink->version_major; +- props->ver_min = iolink->version_minor; +- +- /* +- * weight factor (derived from CDIR), currently always 1 +- */ +- props->weight = 1; +- +- props->min_latency = iolink->minimum_latency; +- props->max_latency = iolink->maximum_latency; +- props->min_bandwidth = iolink->minimum_bandwidth_mbs; +- props->max_bandwidth = iolink->maximum_bandwidth_mbs; +- props->rec_transfer_size = +- iolink->recommended_transfer_size; +- +- dev->io_link_count++; +- dev->node_props.io_links_count++; +- list_add_tail(&props->list, &dev->io_link_props); ++ down_read(&topology_lock); + ++ list_for_each_entry(top_dev, &topology_device_list, list) ++ if (top_dev->gpu && top_dev->gpu->kgd == kgd) { ++ device = top_dev->gpu; + break; + } +- i++; +- } + +- return 0; +-} +- +-static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr) +-{ +- struct crat_subtype_computeunit *cu; +- struct crat_subtype_memory *mem; +- struct crat_subtype_cache *cache; +- struct crat_subtype_iolink *iolink; +- int ret = 0; +- +- BUG_ON(!sub_type_hdr); +- +- switch (sub_type_hdr->type) { +- case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: +- cu = (struct crat_subtype_computeunit *)sub_type_hdr; +- ret = kfd_parse_subtype_cu(cu); +- break; +- case CRAT_SUBTYPE_MEMORY_AFFINITY: +- mem = (struct crat_subtype_memory *)sub_type_hdr; +- ret = kfd_parse_subtype_mem(mem); +- break; +- case CRAT_SUBTYPE_CACHE_AFFINITY: +- cache = (struct crat_subtype_cache *)sub_type_hdr; +- ret = kfd_parse_subtype_cache(cache); +- break; +- case CRAT_SUBTYPE_TLB_AFFINITY: +- /* +- * For now, nothing to do here +- */ +- pr_info("Found TLB entry in CRAT table (not processing)\n"); +- break; +- case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: +- /* +- * For now, nothing to do here +- */ +- pr_info("Found CCOMPUTE entry in CRAT table (not processing)\n"); +- break; +- case CRAT_SUBTYPE_IOLINK_AFFINITY: +- iolink = (struct crat_subtype_iolink *)sub_type_hdr; +- ret = kfd_parse_subtype_iolink(iolink); +- break; +- default: +- pr_warn("Unknown subtype (%d) in CRAT\n", +- sub_type_hdr->type); +- } ++ up_read(&topology_lock); + +- return ret; ++ return device; + } + ++/* Called with write topology_lock acquired */ + static void kfd_release_topology_device(struct kfd_topology_device *dev) + { + struct kfd_mem_properties *mem; +@@ -398,20 +151,22 @@ static void 
kfd_release_topology_device(struct kfd_topology_device *dev) + sys_props.num_devices--; + } + +-static void kfd_release_live_view(void) ++void kfd_release_live_view(void) + { + struct kfd_topology_device *dev; + ++ down_write(&topology_lock); + while (topology_device_list.next != &topology_device_list) { + dev = container_of(topology_device_list.next, + struct kfd_topology_device, list); + kfd_release_topology_device(dev); +-} +- ++ } ++ up_write(&topology_lock); + memset(&sys_props, 0, sizeof(sys_props)); + } + +-static struct kfd_topology_device *kfd_create_topology_device(void) ++struct kfd_topology_device *kfd_create_topology_device( ++ struct list_head *device_list) + { + struct kfd_topology_device *dev; + +@@ -425,65 +180,12 @@ static struct kfd_topology_device *kfd_create_topology_device(void) + INIT_LIST_HEAD(&dev->cache_props); + INIT_LIST_HEAD(&dev->io_link_props); + +- list_add_tail(&dev->list, &topology_device_list); ++ list_add_tail(&dev->list, device_list); + sys_props.num_devices++; + + return dev; + } + +-static int kfd_parse_crat_table(void *crat_image) +-{ +- struct kfd_topology_device *top_dev; +- struct crat_subtype_generic *sub_type_hdr; +- uint16_t node_id; +- int ret; +- struct crat_header *crat_table = (struct crat_header *)crat_image; +- uint16_t num_nodes; +- uint32_t image_len; +- +- if (!crat_image) +- return -EINVAL; +- +- num_nodes = crat_table->num_domains; +- image_len = crat_table->length; +- +- pr_info("Parsing CRAT table with %d nodes\n", num_nodes); +- +- for (node_id = 0; node_id < num_nodes; node_id++) { +- top_dev = kfd_create_topology_device(); +- if (!top_dev) { +- kfd_release_live_view(); +- return -ENOMEM; +- } +- } +- +- sys_props.platform_id = +- (*((uint64_t *)crat_table->oem_id)) & CRAT_OEMID_64BIT_MASK; +- sys_props.platform_oem = *((uint64_t *)crat_table->oem_table_id); +- sys_props.platform_rev = crat_table->revision; +- +- sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); +- while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) < +- ((char *)crat_image) + image_len) { +- if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) { +- ret = kfd_parse_subtype(sub_type_hdr); +- if (ret != 0) { +- kfd_release_live_view(); +- return ret; +- } +- } +- +- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + +- sub_type_hdr->length); +- } +- +- sys_props.generation_count++; +- topology_crat_parsed = 1; +- +- return 0; +-} +- +- + #define sysfs_show_gen_prop(buffer, fmt, ...) \ + snprintf(buffer, PAGE_SIZE, "%s"fmt, buffer, __VA_ARGS__) + #define sysfs_show_32bit_prop(buffer, name, value) \ +@@ -593,7 +295,7 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, + char *buffer) + { + ssize_t ret; +- uint32_t i; ++ uint32_t i, j; + struct kfd_cache_properties *cache; + + /* Making sure that the buffer is an empty string */ +@@ -611,12 +313,18 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, + sysfs_show_32bit_prop(buffer, "latency", cache->cache_latency); + sysfs_show_32bit_prop(buffer, "type", cache->cache_type); + snprintf(buffer, PAGE_SIZE, "%ssibling_map ", buffer); +- for (i = 0; i < KFD_TOPOLOGY_CPU_SIBLINGS; i++) +- ret = snprintf(buffer, PAGE_SIZE, "%s%d%s", +- buffer, cache->sibling_map[i], +- (i == KFD_TOPOLOGY_CPU_SIBLINGS-1) ? 
+- "\n" : ","); +- ++ for (i = 0; i < CRAT_SIBLINGMAP_SIZE; i++) ++ for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++) { ++ /* Check each bit */ ++ if (cache->sibling_map[i] & (1 << j)) ++ ret = snprintf(buffer, PAGE_SIZE, ++ "%s%d%s", buffer, 1, ","); ++ else ++ ret = snprintf(buffer, PAGE_SIZE, ++ "%s%d%s", buffer, 0, ","); ++ } ++ /* Replace the last "," with end of line */ ++ *(buffer + strlen(buffer) - 1) = 0xA; + return ret; + } + +@@ -635,6 +343,7 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, + char public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; + uint32_t i; + uint32_t log_max_watch_addr; ++ struct kfd_local_mem_info local_mem_info; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; +@@ -674,7 +383,6 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, + } else { + sysfs_show_32bit_prop(buffer, "mem_banks_count", + dev->node_props.mem_banks_count); +- } + + sysfs_show_32bit_prop(buffer, "caches_count", + dev->node_props.caches_count); +@@ -723,17 +431,30 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, + HSA_CAP_WATCH_POINTS_TOTALBITS_MASK); + } + ++ if (dev->gpu->device_info->asic_family == CHIP_TONGA) ++ dev->node_props.capability |= ++ HSA_CAP_AQL_QUEUE_DOUBLE_MAP; ++ + sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute", +- dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz( +- dev->gpu->kgd)); ++ dev->node_props.max_engine_clk_fcompute); + +- sysfs_show_64bit_prop(buffer, "local_mem_size", +- (unsigned long long int) 0); ++ /* ++ * If the ASIC is CZ, set local memory size to 0 to disable ++ * local memory support ++ */ ++ if (dev->gpu->device_info->asic_family != CHIP_CARRIZO) { ++ dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd, ++ &local_mem_info); ++ sysfs_show_64bit_prop(buffer, "local_mem_size", ++ local_mem_info.local_mem_size_private + ++ local_mem_info.local_mem_size_public); ++ } ++ else ++ sysfs_show_64bit_prop(buffer, "local_mem_size", ++ (unsigned long long int) 0); + + sysfs_show_32bit_prop(buffer, "fw_version", +- dev->gpu->kfd2kgd->get_fw_version( +- dev->gpu->kgd, +- KGD_ENGINE_MEC1)); ++ dev->gpu->mec_fw_version); + sysfs_show_32bit_prop(buffer, "capability", + dev->node_props.capability); + } +@@ -928,6 +649,7 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, + return 0; + } + ++/* Called with write topology lock acquired */ + static int kfd_build_sysfs_node_tree(void) + { + struct kfd_topology_device *dev; +@@ -944,6 +666,7 @@ static int kfd_build_sysfs_node_tree(void) + return 0; + } + ++/* Called with write topology lock acquired */ + static void kfd_remove_sysfs_node_tree(void) + { + struct kfd_topology_device *dev; +@@ -1015,88 +738,200 @@ static void kfd_topology_release_sysfs(void) + } + } + ++/* Called with write topology_lock acquired */ ++static int kfd_topology_update_device_list(struct list_head *temp_list, ++ struct list_head *master_list) ++{ ++ int num = 0; ++ ++ while (!list_empty(temp_list)) { ++ list_move_tail(temp_list->next, master_list); ++ num++; ++ } ++ return num; ++} ++ ++static void kfd_debug_print_topology(void) ++{ ++ struct kfd_topology_device *dev; ++ ++ down_read(&topology_lock); ++ ++ dev = list_last_entry(&topology_device_list, struct kfd_topology_device, list); ++ if (dev) { ++ if (dev->node_props.cpu_cores_count && dev->node_props.simd_count) { ++ pr_info("Topology: Add APU node [0x%0x:0x%0x]\n", ++ dev->node_props.device_id, dev->node_props.vendor_id); ++ } ++ else if (dev->node_props.cpu_cores_count) 
++ pr_info("Topology: Add CPU node\n"); ++ else if (dev->node_props.simd_count) ++ pr_info("Topology: Add dGPU node [0x%0x:0x%0x]\n", ++ dev->node_props.device_id, dev->node_props.vendor_id); ++ } ++ up_read(&topology_lock); ++} ++ ++/* Helper function for intializing platform_xx members of kfd_system_properties ++ */ ++static void kfd_update_system_properties(void) ++{ ++ struct kfd_topology_device *dev; ++ ++ down_read(&topology_lock); ++ dev = list_last_entry(&topology_device_list, struct kfd_topology_device, list); ++ if (dev) { ++ sys_props.platform_id = ++ (*((uint64_t *)dev->oem_id)) & CRAT_OEMID_64BIT_MASK; ++ sys_props.platform_oem = *((uint64_t *)dev->oem_table_id); ++ sys_props.platform_rev = dev->oem_revision; ++ } ++ up_read(&topology_lock); ++} ++ ++static void find_system_memory(const struct dmi_header *dm, ++ void *private) ++{ ++ struct kfd_mem_properties *mem; ++ u16 mem_width, mem_clock; ++ struct kfd_topology_device *kdev = ++ (struct kfd_topology_device *)private; ++ const u8 *dmi_data = (const u8 *)(dm + 1); ++ ++ if (dm->type == DMI_ENTRY_MEM_DEVICE && dm->length >= 0x15) { ++ mem_width = (u16)(*(const u16 *)(dmi_data + 0x6)); ++ mem_clock = (u16)(*(const u16 *)(dmi_data + 0x11)); ++ list_for_each_entry(mem, &kdev->mem_props, list) { ++ if (mem_width != 0xFFFF && mem_width != 0) ++ mem->width = mem_width; ++ if (mem_clock != 0) ++ mem->mem_clk_max = mem_clock; ++ } ++ } ++} ++/* kfd_add_non_crat_information - Add information that is not currently ++ * defined in CRAT but is necessary for KFD topology ++ * @dev - topology device to which addition info is added ++ */ ++static void kfd_add_non_crat_information(struct kfd_topology_device *kdev) ++{ ++ /* Check if CPU only node. */ ++ if (kdev->gpu == NULL) { ++ /* Add system memory information */ ++ dmi_walk(find_system_memory, kdev); ++ } ++ /* TODO: For GPU node, rearrange code from kfd_topology_add_device */ ++} ++ + int kfd_topology_init(void) + { + void *crat_image = NULL; + size_t image_size = 0; + int ret; ++ struct list_head temp_topology_device_list; ++ int cpu_only_node = 0; ++ struct kfd_topology_device *kdev; ++ int proximity_domain; ++ int num_nodes; ++ ++ /* topology_device_list - Master list of all topology devices ++ * temp_topology_device_list - temporary list created while parsing CRAT ++ * or VCRAT. Once parsing is complete the contents of list is moved to ++ * topology_device_list ++ */ + +- /* +- * Initialize the head for the topology device list ++ /* Initialize the head for the both the lists + */ + INIT_LIST_HEAD(&topology_device_list); ++ INIT_LIST_HEAD(&temp_topology_device_list); + init_rwsem(&topology_lock); +- topology_crat_parsed = 0; + + memset(&sys_props, 0, sizeof(sys_props)); + ++ /* Proximity domains in ACPI CRAT tables start counting at ++ * 0. The same should be true for virtual CRAT tables created ++ * at this stage. GPUs added later in kfd_topology_add_device ++ * use a counter. */ ++ proximity_domain = 0; ++ + /* +- * Get the CRAT image from the ACPI ++ * Get the CRAT image from the ACPI. If ACPI doesn't have one ++ * create a virtual CRAT. ++ * NOTE: The current implementation expects all AMD APUs to have ++ * CRAT. 
If no CRAT is available, it is assumed to be a CPU + */ +- ret = kfd_topology_get_crat_acpi(crat_image, &image_size); +- if (ret == 0 && image_size > 0) { +- pr_info("Found CRAT image with size=%zd\n", image_size); +- crat_image = kmalloc(image_size, GFP_KERNEL); +- if (!crat_image) { +- ret = -ENOMEM; +- pr_err("No memory for allocating CRAT image\n"); +- goto err; +- } +- ret = kfd_topology_get_crat_acpi(crat_image, &image_size); +- +- if (ret == 0) { +- down_write(&topology_lock); +- ret = kfd_parse_crat_table(crat_image); +- if (ret == 0) +- ret = kfd_topology_update_sysfs(); +- up_write(&topology_lock); +- } else { +- pr_err("Couldn't get CRAT table size from ACPI\n"); +- } +- kfree(crat_image); +- } else if (ret == -ENODATA) { +- ret = 0; +- } else { +- pr_err("Couldn't get CRAT table size from ACPI\n"); ++ ret = kfd_create_crat_image_acpi(&crat_image, &image_size); ++ if (ret != 0) { ++ ret = kfd_create_crat_image_virtual(&crat_image, &image_size, ++ COMPUTE_UNIT_CPU, NULL, ++ proximity_domain); ++ cpu_only_node = 1; ++ } ++ ++ if (ret == 0) ++ ret = kfd_parse_crat_table(crat_image, ++ &temp_topology_device_list, ++ proximity_domain); ++ else { ++ pr_err("Error getting/creating CRAT table\n"); ++ goto err; ++ } ++ ++ down_write(&topology_lock); ++ num_nodes = kfd_topology_update_device_list(&temp_topology_device_list, ++ &topology_device_list); ++ atomic_set(&topology_crat_proximity_domain, num_nodes-1); ++ ret = kfd_topology_update_sysfs(); ++ up_write(&topology_lock); ++ ++ if (ret == 0) { ++ sys_props.generation_count++; ++ kfd_update_system_properties(); ++ kfd_debug_print_topology(); ++ pr_info("Finished initializing topology\n"); ++ } ++ else ++ pr_err("Failed to update topology in sysfs ret=%d\n", ret); ++ ++ /* For nodes with GPU, this information gets added ++ * when GPU is detected (kfd_topology_add_device). */ ++ if (cpu_only_node) { ++ /* Add additional information to CPU only node created above */ ++ down_write(&topology_lock); ++ kdev = list_first_entry(&topology_device_list, ++ struct kfd_topology_device, list); ++ up_write(&topology_lock); ++ kfd_add_non_crat_information(kdev); + } + + err: +- pr_info("Finished initializing topology ret=%d\n", ret); ++ kfd_destroy_crat_image(crat_image); + return ret; + } + + void kfd_topology_shutdown(void) + { ++ down_write(&topology_lock); + kfd_topology_release_sysfs(); ++ up_write(&topology_lock); + kfd_release_live_view(); + } + +-static void kfd_debug_print_topology(void) +-{ +- struct kfd_topology_device *dev; +- uint32_t i = 0; +- +- pr_info("DEBUG PRINT OF TOPOLOGY:"); +- list_for_each_entry(dev, &topology_device_list, list) { +- pr_info("Node: %d\n", i); +- pr_info("\tGPU assigned: %s\n", (dev->gpu ? 
"yes" : "no")); +- pr_info("\tCPU count: %d\n", dev->node_props.cpu_cores_count); +- pr_info("\tSIMD count: %d", dev->node_props.simd_count); +- i++; +- } +-} +- + static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) + { + uint32_t hashout; + uint32_t buf[7]; + uint64_t local_mem_size; + int i; ++ struct kfd_local_mem_info local_mem_info; + + if (!gpu) + return 0; + +- local_mem_size = gpu->kfd2kgd->get_vmem_size(gpu->kgd); ++ gpu->kfd2kgd->get_local_mem_info(gpu->kgd, &local_mem_info); ++ ++ local_mem_size = local_mem_info.local_mem_size_private + ++ local_mem_info.local_mem_size_public; + + buf[0] = gpu->pdev->devfn; + buf[1] = gpu->pdev->subsystem_vendor; +@@ -1111,7 +946,13 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) + + return hashout; + } +- ++/* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If ++ * the GPU device is not already present in the topology device list ++ * then return NULL. This means a new topology device has to be ++ * created for this GPU. ++ * TODO: Rather than assiging @gpu to first topology device withtout ++ * gpu attached, it will better to have more stringent check. ++ */ + static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu) + { + struct kfd_topology_device *dev; +@@ -1119,13 +960,14 @@ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu) + + BUG_ON(!gpu); + ++ down_write(&topology_lock); + list_for_each_entry(dev, &topology_device_list, list) + if (dev->gpu == NULL && dev->node_props.simd_count > 0) { + dev->gpu = gpu; + out_dev = dev; + break; + } +- ++ up_write(&topology_lock); + return out_dev; + } + +@@ -1137,70 +979,146 @@ static void kfd_notify_gpu_change(uint32_t gpu_id, int arrival) + */ + } + ++/* kfd_fill_mem_clk_max_info - Since CRAT doesn't have memory clock info, ++ * patch this after CRAT parsing. ++ */ ++static void kfd_fill_mem_clk_max_info(struct kfd_topology_device *dev) ++{ ++ struct kfd_mem_properties *mem; ++ struct kfd_local_mem_info local_mem_info; ++ ++ if (dev == NULL) ++ return; ++ ++ /* Currently, amdgpu driver (amdgpu_mc) deals only with GPUs with ++ * single bank of VRAM local memory. ++ * for dGPUs - VCRAT reports only one bank of Local Memory ++ * for APUs - If CRAT from ACPI reports more than one bank, then ++ * all the banks will report the same mem_clk_max information ++ */ ++ dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd, ++ &local_mem_info); ++ ++ list_for_each_entry(mem, &dev->mem_props, list) ++ mem->mem_clk_max = local_mem_info.mem_clk_max; ++} ++ + int kfd_topology_add_device(struct kfd_dev *gpu) + { + uint32_t gpu_id; + struct kfd_topology_device *dev; +- int res; ++ struct kfd_cu_info cu_info; ++ int res = 0; ++ struct list_head temp_topology_device_list; ++ void *crat_image = NULL; ++ size_t image_size = 0; ++ int proximity_domain; + + BUG_ON(!gpu); + ++ INIT_LIST_HEAD(&temp_topology_device_list); ++ + gpu_id = kfd_generate_gpu_id(gpu); + + pr_debug("kfd: Adding new GPU (ID: 0x%x) to topology\n", gpu_id); + +- down_write(&topology_lock); +- /* +- * Try to assign the GPU to existing topology device (generated from +- * CRAT table ++ proximity_domain = atomic_inc_return(& ++ topology_crat_proximity_domain); ++ ++ /* Check to see if this gpu device exists in the topology_device_list. ++ * If so, assign the gpu to that device, ++ * else create a Virtual CRAT for this gpu device and then parse that CRAT ++ * to create a new topology device. 
Once created assign the gpu to that ++ * topology device + */ + dev = kfd_assign_gpu(gpu); + if (!dev) { +- pr_info("GPU was not found in the current topology. Extending.\n"); +- kfd_debug_print_topology(); +- dev = kfd_create_topology_device(); +- if (!dev) { +- res = -ENOMEM; ++ res = kfd_create_crat_image_virtual(&crat_image, &image_size, ++ COMPUTE_UNIT_GPU, ++ gpu, proximity_domain); ++ if (res == 0) ++ res = kfd_parse_crat_table(crat_image, ++ &temp_topology_device_list, proximity_domain); ++ else { ++ pr_err("Error in VCRAT for GPU (ID: 0x%x)\n", gpu_id); + goto err; + } +- dev->gpu = gpu; + +- /* +- * TODO: Make a call to retrieve topology information from the +- * GPU vBIOS +- */ ++ down_write(&topology_lock); ++ kfd_topology_update_device_list(&temp_topology_device_list, ++ &topology_device_list); + + /* + * Update the SYSFS tree, since we added another topology device + */ +- if (kfd_topology_update_sysfs() < 0) +- kfd_topology_release_sysfs(); +- ++ res = kfd_topology_update_sysfs(); ++ up_write(&topology_lock); ++ ++ if (res == 0) ++ sys_props.generation_count++; ++ else ++ pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n", ++ gpu_id, res); ++ dev = kfd_assign_gpu(gpu); ++ BUG_ON(!dev); + } + + dev->gpu_id = gpu_id; + gpu->id = gpu_id; ++ ++ /* TODO: Move the following lines to function ++ * kfd_add_non_crat_information */ ++ ++ /* Fill-in additional information that is not available in CRAT but ++ * needed for the topology */ ++ ++ dev->gpu->kfd2kgd->get_cu_info(dev->gpu->kgd, &cu_info); ++ dev->node_props.simd_arrays_per_engine = cu_info.num_shader_arrays_per_engine; ++ + dev->node_props.vendor_id = gpu->pdev->vendor; + dev->node_props.device_id = gpu->pdev->device; +- dev->node_props.location_id = (gpu->pdev->bus->number << 24) + +- (gpu->pdev->devfn & 0xffffff); +- /* +- * TODO: Retrieve max engine clock values from KGD +- */ +- +- if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) { +- dev->node_props.capability |= HSA_CAP_DOORBELL_PACKET_TYPE; +- pr_info("amdkfd: adding doorbell packet type capability\n"); ++ dev->node_props.location_id = PCI_DEVID(gpu->pdev->bus->number, ++ gpu->pdev->devfn); ++ dev->node_props.max_engine_clk_fcompute = ++ dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz(dev->gpu->kgd); ++ dev->node_props.max_engine_clk_ccompute = ++ cpufreq_quick_get_max(0) / 1000; ++ ++ kfd_fill_mem_clk_max_info(dev); ++ ++ switch (dev->gpu->device_info->asic_family) { ++ case CHIP_KAVERI: ++ dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_PRE_1_0 << ++ HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & ++ HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); ++ break; ++ case CHIP_CARRIZO: ++ case CHIP_TONGA: ++ case CHIP_FIJI: ++ pr_debug("amdkfd: adding doorbell packet type capability\n"); ++ dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_1_0 << ++ HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & ++ HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); ++ break; + } + +- res = 0; ++ /* Fix errors in CZ CRAT. ++ * simd_count: Carrizo CRAT reports wrong simd_count, probably because it ++ * doesn't consider masked out CUs ++ * capability flag: Carrizo CRAT doesn't report IOMMU flags. 
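++ * Both values are patched up below; the SIMD count is recomputed from
++ * the live cu_info queried from KGD.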
++ */
++ if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) {
++ dev->node_props.simd_count =
++ cu_info.simd_per_cu * cu_info.cu_active_number;
++ dev->node_props.capability |= HSA_CAP_ATS_PRESENT;
++ }
+
++ kfd_debug_print_topology();
+ err:
+- up_write(&topology_lock);
+-
+ if (res == 0)
+ kfd_notify_gpu_change(gpu_id, 1);
+
++ kfd_destroy_crat_image(crat_image);
+ return res;
+ }
+
+@@ -1233,22 +1151,26 @@ int kfd_topology_remove_device(struct kfd_dev *gpu)
+ return res;
+ }
+
+-/*
+- * When idx is out of bounds, the function will return NULL
++/* kfd_topology_enum_kfd_devices - Enumerate through all devices in KFD
++ * topology. If a GPU device is found at @idx, then a valid kfd_dev pointer
++ * is returned through @kdev
++ * Return - 0: On success (@kdev will be NULL for non-GPU nodes)
++ * -1: If end of list
+ */
+-struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx)
++int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev)
+ {
+
+ struct kfd_topology_device *top_dev;
+- struct kfd_dev *device = NULL;
+ uint8_t device_idx = 0;
+
++ *kdev = NULL;
+ down_read(&topology_lock);
+
+ list_for_each_entry(top_dev, &topology_device_list, list) {
+ if (device_idx == idx) {
+- device = top_dev->gpu;
+- break;
++ *kdev = top_dev->gpu;
++ up_read(&topology_lock);
++ return 0;
+ }
+
+ device_idx++;
+@@ -1256,6 +1178,57 @@ struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx)
+
+ up_read(&topology_lock);
+
+- return device;
++ return -1;
++
++}
++
++static int kfd_cpumask_to_apic_id(const struct cpumask *cpumask)
++{
++ const struct cpuinfo_x86 *cpuinfo;
++ int first_cpu_of_numa_node;
++
++ if (cpumask == NULL || cpumask == cpu_none_mask)
++ return -1;
++ first_cpu_of_numa_node = cpumask_first(cpumask);
++ cpuinfo = &cpu_data(first_cpu_of_numa_node);
++
++ return cpuinfo->apicid;
++}
++
++/* kfd_numa_node_to_apic_id - Returns the APIC ID of the first logical
++ * processor of the given NUMA node (numa_node_id)
++ * Return -1 on failure
++ */
++int kfd_numa_node_to_apic_id(int numa_node_id)
++{
++ if (numa_node_id == -1) {
++ pr_warn("Invalid NUMA Node. Use online CPU mask\n");
++ return kfd_cpumask_to_apic_id(cpu_online_mask);
++ }
++ return kfd_cpumask_to_apic_id(cpumask_of_node(numa_node_id));
++}
++
++/* kfd_get_proximity_domain - Find the proximity_domain (node id) to which
++ * the given PCI bus belongs. The CRAT table contains only the APIC ID
++ * of the parent NUMA node, so use that as the search parameter.
++ * Return -1 on failure
++ */
++int kfd_get_proximity_domain(const struct pci_bus *bus)
++{
++ struct kfd_topology_device *dev;
++ int proximity_domain = -1;
++
++ down_read(&topology_lock);
++
++ list_for_each_entry(dev, &topology_device_list, list)
++ if (dev->node_props.cpu_cores_count &&
++ dev->node_props.cpu_core_id_base ==
++ kfd_cpumask_to_apic_id(cpumask_of_pcibus(bus))) {
++ proximity_domain = dev->proximity_domain;
++ break;
++ }
++
++ up_read(&topology_lock);
+
++ return proximity_domain;
+ }
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+index c3ddb9b..ab28188 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+@@ -39,8 +39,16 @@
+ #define HSA_CAP_WATCH_POINTS_SUPPORTED 0x00000080
+ #define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00
+ #define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8
+-#define HSA_CAP_RESERVED 0xfffff000
++#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK 0x00003000
++#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT 12
++#define HSA_CAP_RESERVED 0xffffc000
++
++#define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0
++#define HSA_CAP_DOORBELL_TYPE_1_0 0x1
++#define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00
++#define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8
+ #define HSA_CAP_DOORBELL_PACKET_TYPE 0x00001000
++#define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000
+
+ struct kfd_node_properties {
+ uint32_t cpu_cores_count;
+@@ -91,8 +99,6 @@ struct kfd_mem_properties {
+ struct attribute attr;
+ };
+
+-#define KFD_TOPOLOGY_CPU_SIBLINGS 256
+-
+ #define HSA_CACHE_TYPE_DATA 0x00000001
+ #define HSA_CACHE_TYPE_INSTRUCTION 0x00000002
+ #define HSA_CACHE_TYPE_CPU 0x00000004
+@@ -109,7 +115,7 @@ struct kfd_cache_properties {
+ uint32_t cache_assoc;
+ uint32_t cache_latency;
+ uint32_t cache_type;
+- uint8_t sibling_map[KFD_TOPOLOGY_CPU_SIBLINGS];
++ uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE];
+ struct kobject *kobj;
+ struct attribute attr;
+ };
+@@ -135,8 +141,8 @@ struct kfd_iolink_properties {
+ struct kfd_topology_device {
+ struct list_head list;
+ uint32_t gpu_id;
++ uint32_t proximity_domain;
+ struct kfd_node_properties node_props;
+- uint32_t mem_bank_count;
+ struct list_head mem_props;
+ uint32_t cache_count;
+ struct list_head cache_props;
+@@ -150,6 +156,9 @@ struct kfd_topology_device {
+ struct attribute attr_gpuid;
+ struct attribute attr_name;
+ struct attribute attr_props;
++ uint8_t oem_id[CRAT_OEMID_LENGTH];
++ uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH];
++ uint32_t oem_revision;
+ };
+
+ struct kfd_system_properties {
+@@ -164,6 +173,8 @@ struct kfd_system_properties {
+ struct attribute attr_props;
+ };
+
+-
++struct kfd_topology_device *kfd_create_topology_device(
++ struct list_head *device_list);
++void kfd_release_live_view(void);
+
+ #endif /* __KFD_TOPOLOGY_H__ */
+diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+index 36f3766..5403164 100644
+--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
++++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+@@ -40,6 +40,41 @@ struct kfd_dev;
+ struct kgd_dev;
+
+ struct kgd_mem;
++struct kfd_process_device;
++struct amdgpu_bo;
++
++struct kfd_vm_fault_info {
++ uint64_t page_addr;
++ uint32_t vmid;
++ uint32_t mc_id;
++ uint32_t status;
++ bool prot_valid;
++ bool prot_read;
++ bool prot_write;
++ bool prot_exec;
++};
++
++struct kfd_cu_info {
++ uint32_t num_shader_engines;
++ uint32_t num_shader_arrays_per_engine;
++ uint32_t num_cu_per_sh;
++ uint32_t cu_active_number;
++ uint32_t cu_ao_mask;
++ uint32_t simd_per_cu;
++ uint32_t max_waves_per_simd;
++ uint32_t wave_front_size;
++ uint32_t max_scratch_slots_per_cu;
++ uint32_t lds_size;
++ uint32_t cu_bitmap[4][4];
++};
++
++/* For getting GPU local memory information from KGD */
++struct kfd_local_mem_info {
++ uint64_t local_mem_size_private;
++ uint64_t local_mem_size_public;
++ uint32_t vram_width;
++ uint32_t mem_clk_max;
++};
+
+ enum kgd_memory_pool {
+ KGD_POOL_SYSTEM_CACHEABLE = 1,
+@@ -80,8 +115,28 @@ struct kgd2kfd_shared_resources {
+
+ /* Number of bytes at start of aperture reserved for KGD. */
+ size_t doorbell_start_offset;
++
++ /* GPUVM address space size in bytes */
++ uint64_t gpuvm_size;
+ };
+
++/*
++ * Allocation flag domains; currently only the VRAM and GTT domains
++ * are supported
++ */
++#define ALLOC_MEM_FLAGS_VRAM (1 << 0)
++#define ALLOC_MEM_FLAGS_GTT (1 << 1)
++#define ALLOC_MEM_FLAGS_USERPTR (1 << 2)
++
++/*
++ * Allocation flags attributes/access options.
++ */
++#define ALLOC_MEM_FLAGS_NONPAGED (1 << 31)
++#define ALLOC_MEM_FLAGS_READONLY (1 << 30)
++#define ALLOC_MEM_FLAGS_PUBLIC (1 << 29)
++#define ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28)
++#define ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27)
++#define ALLOC_MEM_FLAGS_EXECUTE_ACCESS (1 << 26)
++
+ /**
+ * struct kfd2kgd_calls
+ *
+@@ -90,7 +145,7 @@ struct kgd2kfd_shared_resources
+ *
+ * @free_gtt_mem: Frees a buffer that was allocated on the gart aperture
+ *
+- * @get_vmem_size: Retrieves (physical) size of VRAM
++ * @get_local_mem_info: Retrieves information about GPU local memory
+ *
+ * @get_gpu_clock_counter: Retrieves GPU clock counter
+ *
+@@ -121,8 +176,23 @@ struct kgd2kfd_shared_resources
+ * @hqd_sdma_destroy: Destructs and preempts the SDMA queue assigned to that
+ * SDMA hqd slot.
+ *
++ * @map_memory_to_gpu: Allocates and pins BO, PD and all related PTs
++ *
++ * @unmap_memory_to_gpu: Releases and unpins BO, PD and all related PTs
++ *
+ * @get_fw_version: Returns FW versions from the header
+ *
++ * @set_num_of_requests: Sets the number of Peripheral Page Requests (PPRs)
++ * sent to the IOMMU when address translation fails
++ *
++ * @get_cu_info: Retrieves info about the activated CUs
++ *
++ * @get_dmabuf_info: Returns information about a dmabuf if it was
++ * created by the GPU driver
++ *
++ * @import_dmabuf: Imports a DMA buffer, creating a new kgd_mem object
++ * Supports only DMA buffers created by GPU driver on the same GPU
++ *
+ * This structure contains function pointers to services that the kgd driver
+ * provides to amdkfd driver.
+ *
+@@ -134,11 +204,23 @@ struct kfd2kgd_calls {
+
+ void (*free_gtt_mem)(struct kgd_dev *kgd, void *mem_obj);
+
++ void (*get_local_mem_info)(struct kgd_dev *kgd,
++ struct kfd_local_mem_info *mem_info);
+ uint64_t (*get_vmem_size)(struct kgd_dev *kgd);
+ uint64_t (*get_gpu_clock_counter)(struct kgd_dev *kgd);
+
+ uint32_t (*get_max_engine_clock_in_mhz)(struct kgd_dev *kgd);
+
++ int (*create_process_vm)(struct kgd_dev *kgd, void **vm);
++ void (*destroy_process_vm)(struct kgd_dev *kgd, void *vm);
++
++ int (*create_process_gpumem)(struct kgd_dev *kgd, uint64_t va, size_t size, void *vm, struct kgd_mem **mem);
++ void (*destroy_process_gpumem)(struct kgd_dev *kgd, struct kgd_mem *mem);
++
++ uint32_t (*get_process_page_dir)(void *vm);
++
++ int (*open_graphic_handle)(struct kgd_dev *kgd, uint64_t va, void *vm, int fd, uint32_t handle, struct kgd_mem **mem);
++
+ /* Register access functions */
+ void (*program_sh_mem_settings)(struct kgd_dev *kgd, uint32_t vmid,
+ uint32_t sh_mem_config, uint32_t sh_mem_ape1_base,
+@@ -151,9 +233,11 @@ struct kfd2kgd_calls {
+ uint32_t hpd_size, uint64_t hpd_gpu_addr);
+
+ int (*init_interrupts)(struct kgd_dev *kgd, uint32_t pipe_id);
++
+
+ int (*hqd_load)(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
+- uint32_t queue_id, uint32_t __user *wptr);
++ uint32_t queue_id, uint32_t __user *wptr,
++ uint32_t page_table_base);
+
+ int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd);
+
+@@ -168,7 +252,7 @@ struct kfd2kgd_calls {
+
+ int (*hqd_sdma_destroy)(struct kgd_dev *kgd, void *mqd,
+ unsigned int timeout);
+-
++
+ int (*address_watch_disable)(struct kgd_dev *kgd);
+ int (*address_watch_execute)(struct kgd_dev *kgd,
+ unsigned int watch_point_id,
+@@ -189,9 +273,53 @@ struct kfd2kgd_calls {
+ uint8_t vmid);
+ void (*write_vmid_invalidate_request)(struct kgd_dev *kgd,
+ uint8_t vmid);
++ int (*alloc_memory_of_gpu)(struct kgd_dev *kgd, uint64_t va,
++ size_t size, void *vm,
++ struct kgd_mem **mem, uint64_t *offset,
++ void **kptr, struct kfd_process_device *pdd,
++ uint32_t flags);
++ int (*free_memory_of_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem);
++ int (*map_memory_to_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem,
++ void *vm);
++ int (*unmap_memory_to_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem,
++ void *vm);
+
+ uint16_t (*get_fw_version)(struct kgd_dev *kgd,
+ enum kgd_engine_type type);
++
++ void (*set_num_of_requests)(struct kgd_dev *kgd,
++ uint8_t num_of_requests);
++ int (*alloc_memory_of_scratch)(struct kgd_dev *kgd,
++ uint64_t va, uint32_t vmid);
++ int (*write_config_static_mem)(struct kgd_dev *kgd, bool swizzle_enable,
++ uint8_t element_size, uint8_t index_stride, uint8_t mtype);
++ void (*get_cu_info)(struct kgd_dev *kgd,
++ struct kfd_cu_info *cu_info);
++ int (*mmap_bo)(struct kgd_dev *kgd, struct vm_area_struct *vma);
++ int (*map_gtt_bo_to_kernel)(struct kgd_dev *kgd,
++ struct kgd_mem *mem, void **kptr);
++ void (*set_vm_context_page_table_base)(struct kgd_dev *kgd, uint32_t vmid,
++ uint32_t page_table_base);
++ struct kfd_process_device* (*get_pdd_from_buffer_object)
++ (struct kgd_dev *kgd, struct kgd_mem *mem);
++ int (*return_bo_size)(struct kgd_dev *kgd, struct kgd_mem *mem);
++
++ int (*pin_get_sg_table_bo)(struct kgd_dev *kgd,
++ struct kgd_mem *mem, uint64_t offset,
++ uint64_t size, struct sg_table **ret_sg);
++ void (*unpin_put_sg_table_bo)(struct kgd_mem *mem,
++ struct sg_table *sg);
++
++ int (*get_dmabuf_info)(struct kgd_dev *kgd, int dma_buf_fd,
++ struct kgd_dev **dma_buf_kgd, uint64_t *bo_size,
++ void *metadata_buffer, size_t buffer_size,
++ uint32_t *metadata_size, uint32_t *flags);
++ int (*import_dmabuf)(struct kgd_dev *kgd, int dma_buf_fd, uint64_t va,
++ void *vm, struct kgd_mem **mem, uint64_t *size);
++
++ int (*get_vm_fault_info)(struct kgd_dev *kgd,
++ struct kfd_vm_fault_info *info);
++
+ };
+
+ /**
+@@ -210,6 +338,10 @@ struct kfd2kgd_calls {
+ *
+ * @resume: Notifies amdkfd about a resume action done to a kgd device
+ *
++ * @quiesce_mm: Quiesce all user queue access to the specified MM address space
++ *
++ * @resume_mm: Resume user queue access to the specified MM address space
++ *
+ * This structure contains function callback pointers so the kgd driver
+ * will notify to the amdkfd about certain status changes.
+ *
+@@ -224,9 +356,13 @@ struct kgd2kfd_calls {
+ void (*interrupt)(struct kfd_dev *kfd, const void *ih_ring_entry);
+ void (*suspend)(struct kfd_dev *kfd);
+ int (*resume)(struct kfd_dev *kfd);
++ int (*evict_bo)(struct kfd_dev *dev, void *ptr);
++ int (*restore)(struct kfd_dev *kfd);
++ int (*quiesce_mm)(struct kfd_dev *kfd, struct mm_struct *mm);
++ int (*resume_mm)(struct kfd_dev *kfd, struct mm_struct *mm);
+ };
+
+ int kgd2kfd_init(unsigned interface_version,
+ const struct kgd2kfd_calls **g2f);
+
+-#endif /* KGD_KFD_INTERFACE_H_INCLUDED */
++#endif /* KGD_KFD_INTERFACE_H_INCLUDED */
+--
+2.7.4
+
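
The doorbell-type capability added to kfd_topology.h above is a two-bit field packed into node_props.capability with the TOTALBITS mask/shift pair, exactly as the CHIP_KAVERI/CHIP_CARRIZO switch in kfd_topology_add_device() does. A minimal standalone sketch of that packing, assuming only the #define values quoted from the hunk (the helper functions and main() are invented for illustration, not driver code):

#include <stdint.h>
#include <stdio.h>

/* Values copied from the kfd_topology.h hunk above. */
#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK	0x00003000
#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT	12
#define HSA_CAP_DOORBELL_TYPE_PRE_1_0		0x0
#define HSA_CAP_DOORBELL_TYPE_1_0		0x1

/* Pack the doorbell type the same way kfd_topology_add_device() does. */
static uint32_t set_doorbell_type(uint32_t capability, uint32_t type)
{
	return capability |
		((type << HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
		 HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
}

/* Recover the field, e.g. for a tool parsing the sysfs capability word. */
static uint32_t get_doorbell_type(uint32_t capability)
{
	return (capability & HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK) >>
		HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT;
}

int main(void)
{
	uint32_t cap = 0;

	/* Carrizo/Tonga/Fiji take the 1.0 doorbell type in the patch. */
	cap = set_doorbell_type(cap, HSA_CAP_DOORBELL_TYPE_1_0);
	printf("capability=0x%08x doorbell_type=%u\n",
	       (unsigned)cap, (unsigned)get_doorbell_type(cap));
	return 0;
}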
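
The location_id change in kfd_topology_add_device() swaps the old open-coded packing, (bus->number << 24) + (devfn & 0xffffff), for PCI_DEVID(), which packs the bus number into bits 15:8 and devfn into bits 7:0. A small sketch contrasting the two encodings; PCI_DEVID is defined locally to match the kernel's <linux/pci.h> so this builds in userspace, and the bus/devfn values are made up:

#include <stdint.h>
#include <stdio.h>

/* Local stand-in matching the kernel's PCI_DEVID() from <linux/pci.h>. */
#define PCI_DEVID(bus, devfn)	((((uint16_t)(bus)) << 8) | (devfn))

int main(void)
{
	uint8_t bus = 0x03;	/* hypothetical values: */
	uint8_t devfn = 0x08;	/* device 1, function 0 on bus 3 */

	/* Old packing removed by the patch: bus number in the top byte. */
	uint32_t old_id = ((uint32_t)bus << 24) + (devfn & 0xffffff);
	/* New packing: a compact 16-bit bus/devfn identifier. */
	uint32_t new_id = PCI_DEVID(bus, devfn);

	printf("old location_id=0x%08x new location_id=0x%04x\n",
	       (unsigned)old_id, (unsigned)new_id);
	return 0;
}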
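
The kfd_topology_enum_kfd_devices() rework changes the caller contract: instead of returning a kfd_dev pointer (NULL when @idx is out of bounds), it now returns 0 with the device passed back through @kdev (left NULL for CPU-only nodes) and -1 past the end of the list. A self-contained mock of that contract; the stub node table and id values below are invented, not kernel code:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Minimal stand-ins so the calling pattern compiles in userspace. */
struct kfd_dev { uint32_t id; };

static struct kfd_dev gpu0 = { 0x1234 };
static struct kfd_dev *nodes[] = { NULL /* CPU-only node */, &gpu0 };

/* Mock mirroring the new return convention of the patched function. */
static int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev)
{
	*kdev = NULL;
	if (idx >= sizeof(nodes) / sizeof(nodes[0]))
		return -1;		/* end of list */
	*kdev = nodes[idx];		/* stays NULL for CPU-only nodes */
	return 0;
}

int main(void)
{
	struct kfd_dev *kdev;
	uint8_t idx;

	for (idx = 0; kfd_topology_enum_kfd_devices(idx, &kdev) == 0; idx++) {
		if (!kdev) {
			printf("node %u: CPU-only, no kfd_dev\n", (unsigned)idx);
			continue;
		}
		printf("node %u: GPU id 0x%x\n", (unsigned)idx, (unsigned)kdev->id);
	}
	return 0;
}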