Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1353-compilation-fix-for-amdkfd-porting.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1353-compilation-fix-for-amdkfd-porting.patch | 8695
1 file changed, 0 insertions, 8695 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1353-compilation-fix-for-amdkfd-porting.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1353-compilation-fix-for-amdkfd-porting.patch
deleted file mode 100644
index a27db153..00000000
--- a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1353-compilation-fix-for-amdkfd-porting.patch
+++ /dev/null
@@ -1,8695 +0,0 @@
-From 817ccd6f0987f83ddbf989602f0fbf320157f0a9 Mon Sep 17 00:00:00 2001
-From: Chaudhary Amit Kumar <chaudharyamit.kumar@amd.com>
-Date: Thu, 18 Oct 2018 12:42:04 +0530
-Subject: [PATCH 1353/4131] compilation fix for amdkfd porting
-
-Signed-off-by: Sanjay R Mehta <sanju.mehta@amd.com>
-Signed-off-by: Chaudhary Amit Kumar <chaudharyamit.kumar@amd.com>
----
- drivers/gpu/drm/amd/amdgpu/Makefile | 8 +-
- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 +
- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 346 ++-
- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 185 +-
- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c | 196 ++
- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 537 ++++-
- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 590 ++++-
- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h | 62 +
- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 1227 ++++++++++
- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 2578 +++++++++++++++++++++
- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 -
- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 3 +
- drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 1 +
- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 +
- drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c | 46 +-
- drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h | 2 +
- drivers/gpu/drm/amd/amdgpu/soc15d.h | 1 +
- drivers/gpu/drm/amd/amdgpu/vid.h | 2 +
- drivers/gpu/drm/amd/amdkfd/Makefile | 2 -
- drivers/gpu/drm/amd/amdkfd/backport/backport.h | 7 -
- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 -
- drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 4 -
- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 12 -
- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 28 -
- drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 10 -
- drivers/gpu/drm/amd/amdkfd/kfd_ipc.c | 8 -
- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 4 -
- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 4 -
- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 4 -
- drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 2 -
- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 6 +-
- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 52 +-
- drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 22 -
- drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 6 -
- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 4 -
- drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 231 +-
- drivers/gpu/drm/amd/include/v9_structs.h | 48 +-
- drivers/gpu/drm/amd/include/vi_structs.h | 2 +
- drivers/pci/pci.c | 81 +
- include/drm/amd_rdma.h | 70 +
- include/linux/pci.h | 1 +
- include/uapi/linux/kfd_ioctl.h | 442 +++-
- include/uapi/linux/pci_regs.h | 3 +
- kernel/fork.c | 1 +
- 44 files changed, 6315 insertions(+), 537 deletions(-)
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/Makefile
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu.h
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
- create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
- create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h
- create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
- create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/soc15d.h
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/vid.h
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdkfd/Makefile
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdkfd/kfd_priv.h
- mode change 100644 => 100755 drivers/gpu/drm/amd/include/kgd_kfd_interface.h
- mode change 100644 => 100755 drivers/gpu/drm/amd/include/v9_structs.h
- mode change 100644 => 100755 drivers/gpu/drm/amd/include/vi_structs.h
- mode change 100644 => 100755 drivers/pci/pci.c
- create mode 100644 include/drm/amd_rdma.h
- mode change 100644 => 100755 include/linux/pci.h
- mode change 100644 => 100755 include/uapi/linux/pci_regs.h
-
-diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
-old mode 100644
-new mode 100755
-index 57b8d5f..6b373d0
---- a/drivers/gpu/drm/amd/amdgpu/Makefile
-+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
-@@ -32,12 +32,11 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
- amdgpu_prime.o amdgpu_vm.o amdgpu_ib.o amdgpu_pll.o \
- amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
- amdgpu_gtt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o amdgpu_atomfirmware.o \
-- amdgpu_queue_mgr.o amdgpu_vf_error.o amdgpu_sem.o
-+ amdgpu_queue_mgr.o amdgpu_vf_error.o amdgpu_sem.o amdgpu_amdkfd_fence.o
-
- # add asic specific block
- amdgpu-$(CONFIG_DRM_AMDGPU_CIK)+= cik.o cik_ih.o kv_smc.o kv_dpm.o \
- ci_smc.o ci_dpm.o dce_v8_0.o gfx_v7_0.o cik_sdma.o uvd_v4_2.o vce_v2_0.o \
-- amdgpu_amdkfd_gfx_v7.o
-
- amdgpu-$(CONFIG_DRM_AMDGPU_SI)+= si.o gmc_v6_0.o gfx_v6_0.o si_ih.o si_dma.o dce_v6_0.o si_dpm.o si_smc.o
-
-@@ -109,7 +108,10 @@ amdgpu-y += \
- # add amdkfd interfaces
- amdgpu-y += \
- amdgpu_amdkfd.o \
-- amdgpu_amdkfd_gfx_v8.o
-+ amdgpu_amdkfd_gfx_v7.o \
-+ amdgpu_amdkfd_gfx_v8.o \
-+ amdgpu_amdkfd_gfx_v9.o \
-+ amdgpu_amdkfd_gpuvm.o
-
- # add cgs
- amdgpu-y += amdgpu_cgs.o
-diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
-old mode 100644
-new mode 100755
-index fe23de8..bcf95e7
---- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
-+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
-@@ -184,6 +184,7 @@ struct amdgpu_cs_parser;
- struct amdgpu_job;
- struct amdgpu_irq_src;
- struct amdgpu_fpriv;
-+struct kfd_vm_fault_info;
- struct amdgpu_bo_va_mapping;
-
- enum amdgpu_cp_irq {
-@@ -403,6 +404,7 @@ struct amdgpu_gem_object {
- struct amdgpu_bo *bo;
- };
-
-+struct kgd_mem;
- #define gem_to_amdgpu_bo(gobj) container_of((gobj), struct amdgpu_gem_object, base)->bo
-
- void amdgpu_gem_object_free(struct drm_gem_object *obj);
-@@ -543,6 +545,9 @@ struct amdgpu_mc {
- u64 private_aperture_end;
- /* protects concurrent invalidation */
- spinlock_t invalidate_lock;
-+
-+ struct kfd_vm_fault_info *vm_fault_info;
-+ atomic_t vm_fault_info_updated;
- };
-
- /*
-@@ -961,6 +966,7 @@ struct amdgpu_gfx_config {
- };
-
- struct amdgpu_cu_info {
-+ uint32_t simd_per_cu;
- uint32_t max_waves_per_simd;
- uint32_t wave_front_size;
- uint32_t max_scratch_slots_per_cu;
-@@ -1649,6 +1655,7 @@ struct amdgpu_device {
- /* record hw reset is performed */
- bool has_hw_reset;
- u8 reset_magic[AMDGPU_RESET_MAGIC_NUM];
-+ spinlock_t tlb_invalidation_lock;
-
- /* record last mm index being written through WREG32*/
- unsigned long last_mm_index;
-diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
-old mode 100644
-new mode 100755
-index 7ec1915..ec8141f
---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
-+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
-@@ -20,23 +20,29 @@
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-+#undef pr_fmt
-+#define pr_fmt(fmt) "kfd2kgd: " fmt
-+
- #include "amdgpu_amdkfd.h"
--#include "amd_shared.h"
-+#include <linux/dma-buf.h>
- #include <drm/drmP.h>
- #include "amdgpu.h"
- #include "amdgpu_gfx.h"
- #include <linux/module.h>
-
--const struct kfd2kgd_calls *kfd2kgd;
-+#define AMDKFD_SKIP_UNCOMPILED_CODE 1
-+
- const struct kgd2kfd_calls *kgd2kfd;
--bool (*kgd2kfd_init_p)(unsigned, const struct kgd2kfd_calls**);
-+bool (*kgd2kfd_init_p)(unsigned int, const struct kgd2kfd_calls**);
-+
-+unsigned int global_compute_vmid_bitmap = 0xFF00;
-
- int amdgpu_amdkfd_init(void)
- {
- int ret;
-
- #if defined(CONFIG_HSA_AMD_MODULE)
-- int (*kgd2kfd_init_p)(unsigned, const struct kgd2kfd_calls**);
-+ int (*kgd2kfd_init_p)(unsigned int, const struct kgd2kfd_calls**);
-
- kgd2kfd_init_p = symbol_request(kgd2kfd_init);
-
-@@ -57,56 +63,68 @@ int amdgpu_amdkfd_init(void)
- #else
- ret = -ENOENT;
- #endif
--
-+ amdgpu_amdkfd_gpuvm_init_mem_limits();
- return ret;
- }
-
--bool amdgpu_amdkfd_load_interface(struct amdgpu_device *adev)
-+void amdgpu_amdkfd_fini(void)
- {
-+ if (kgd2kfd) {
-+ kgd2kfd->exit();
-+ symbol_put(kgd2kfd_init);
-+ }
-+}
-+
-+void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev)
-+{
-+ const struct kfd2kgd_calls *kfd2kgd;
-+
-+ if (!kgd2kfd)
-+ return;
-+
- switch (adev->asic_type) {
- #ifdef CONFIG_DRM_AMDGPU_CIK
- case CHIP_KAVERI:
-+ case CHIP_HAWAII:
- kfd2kgd = amdgpu_amdkfd_gfx_7_get_functions();
- break;
- #endif
- case CHIP_CARRIZO:
-+ case CHIP_TONGA:
-+ case CHIP_FIJI:
-+ case CHIP_POLARIS10:
-+ case CHIP_POLARIS11:
- kfd2kgd = amdgpu_amdkfd_gfx_8_0_get_functions();
- break;
-+ case CHIP_VEGA10:
-+ case CHIP_RAVEN:
-+ kfd2kgd = amdgpu_amdkfd_gfx_9_0_get_functions();
-+ break;
- default:
-- return false;
-- }
--
-- return true;
--}
--
--void amdgpu_amdkfd_fini(void)
--{
-- if (kgd2kfd) {
-- kgd2kfd->exit();
-- symbol_put(kgd2kfd_init);
-+ dev_info(adev->dev, "kfd not supported on this ASIC\n");
-+ return;
- }
--}
-
--void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev)
--{
-- if (kgd2kfd)
-- adev->kfd = kgd2kfd->probe((struct kgd_dev *)adev,
-- adev->pdev, kfd2kgd);
-+ adev->kfd = kgd2kfd->probe((struct kgd_dev *)adev,
-+ adev->pdev, kfd2kgd);
- }
-
- void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
- {
- int i;
- int last_valid_bit;
-+
- if (adev->kfd) {
- struct kgd2kfd_shared_resources gpu_resources = {
-- .compute_vmid_bitmap = 0xFF00,
-+ .compute_vmid_bitmap = global_compute_vmid_bitmap,
- .num_pipe_per_mec = adev->gfx.mec.num_pipe_per_mec,
-- .num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe
-+ .num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe,
-+ .gpuvm_size = (uint64_t)amdgpu_vm_size << 30
- };
-
- /* this is going to have a few of the MSBs set that we need to
-- * clear */
-+ * clear
-+ */
- bitmap_complement(gpu_resources.queue_bitmap,
- adev->gfx.mec.queue_bitmap,
- KGD_MAX_QUEUES);
-@@ -120,7 +138,8 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
- gpu_resources.queue_bitmap);
-
- /* According to linux/bitmap.h we shouldn't use bitmap_clear if
-- * nbits is not compile time constant */
-+ * nbits is not compile time constant
-+ */
- last_valid_bit = 1 /* only first MEC can have compute queues */
- * adev->gfx.mec.num_pipe_per_mec
- * adev->gfx.mec.num_queue_per_pipe;
-@@ -131,6 +150,28 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
- &gpu_resources.doorbell_physical_address,
- &gpu_resources.doorbell_aperture_size,
- &gpu_resources.doorbell_start_offset);
-+ if (adev->asic_type >= CHIP_VEGA10) {
-+ /* On SOC15 the BIF is involved in routing
-+ * doorbells using the low 12 bits of the
-+ * address. Communicate the assignments to
-+ * KFD. KFD uses two doorbell pages per
-+ * process in case of 64-bit doorbells so we
-+ * can use each doorbell assignment twice.
-+ */
-+ gpu_resources.sdma_doorbell[0][0] =
-+ AMDGPU_DOORBELL64_sDMA_ENGINE0;
-+ gpu_resources.sdma_doorbell[0][1] =
-+ AMDGPU_DOORBELL64_sDMA_ENGINE0 + 0x200;
-+ gpu_resources.sdma_doorbell[1][0] =
-+ AMDGPU_DOORBELL64_sDMA_ENGINE1;
-+ gpu_resources.sdma_doorbell[1][1] =
-+ AMDGPU_DOORBELL64_sDMA_ENGINE1 + 0x200;
-+ /* Doorbells 0x0f0-0ff and 0x2f0-2ff are reserved for
-+ * SDMA, IH and VCN. So don't use them for the CP.
-+ */
-+ gpu_resources.reserved_doorbell_mask = 0x1f0;
-+ gpu_resources.reserved_doorbell_val = 0x0f0;
-+ }
-
- kgd2kfd->device_init(adev->kfd, &gpu_resources);
- }
-@@ -167,24 +208,81 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev)
- return r;
- }
-
-+int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
-+ uint32_t vmid, uint64_t gpu_addr,
-+ uint32_t *ib_cmd, uint32_t ib_len)
-+{
-+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-+ struct amdgpu_job *job;
-+ struct amdgpu_ib *ib;
-+ struct amdgpu_ring *ring;
-+ struct dma_fence *f = NULL;
-+ int ret;
-+
-+ switch (engine) {
-+ case KGD_ENGINE_MEC1:
-+ ring = &adev->gfx.compute_ring[0];
-+ break;
-+ case KGD_ENGINE_SDMA1:
-+ ring = &adev->sdma.instance[0].ring;
-+ break;
-+ case KGD_ENGINE_SDMA2:
-+ ring = &adev->sdma.instance[1].ring;
-+ break;
-+ default:
-+ pr_err("Invalid engine in IB submission: %d\n", engine);
-+ ret = -EINVAL;
-+ goto err;
-+ }
-+
-+ ret = amdgpu_job_alloc(adev, 1, &job, NULL);
-+ if (ret)
-+ goto err;
-+
-+ ib = &job->ibs[0];
-+ memset(ib, 0, sizeof(struct amdgpu_ib));
-+
-+ ib->gpu_addr = gpu_addr;
-+ ib->ptr = ib_cmd;
-+ ib->length_dw = ib_len;
-+ /* This works for NO_HWS. TODO: need to handle without knowing VMID */
-+ job->vm_id = vmid;
-+
-+ ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);
-+ if (ret) {
-+ DRM_ERROR("amdgpu: failed to schedule IB.\n");
-+ goto err_ib_sched;
-+ }
-+
-+ ret = dma_fence_wait(f, false);
-+
-+err_ib_sched:
-+ dma_fence_put(f);
-+ amdgpu_job_free(job);
-+err:
-+ return ret;
-+}
-+
-+u32 pool_to_domain(enum kgd_memory_pool p)
-+{
-+ switch (p) {
-+ case KGD_POOL_FRAMEBUFFER: return AMDGPU_GEM_DOMAIN_VRAM;
-+ default: return AMDGPU_GEM_DOMAIN_GTT;
-+ }
-+}
-+
- int alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
- void **mem_obj, uint64_t *gpu_addr,
- void **cpu_ptr)
- {
- struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-- struct kgd_mem **mem = (struct kgd_mem **) mem_obj;
-+ struct amdgpu_bo *bo = NULL;
- int r;
--
-- BUG_ON(kgd == NULL);
-- BUG_ON(gpu_addr == NULL);
-- BUG_ON(cpu_ptr == NULL);
--
-- *mem = kmalloc(sizeof(struct kgd_mem), GFP_KERNEL);
-- if ((*mem) == NULL)
-- return -ENOMEM;
-+ uint64_t gpu_addr_tmp = 0;
-+ void *cpu_ptr_tmp = NULL;
-
- r = amdgpu_bo_create(adev, size, PAGE_SIZE, true, AMDGPU_GEM_DOMAIN_GTT,
-- AMDGPU_GEM_CREATE_CPU_GTT_USWC, NULL, NULL, &(*mem)->bo);
-+ AMDGPU_GEM_CREATE_CPU_GTT_USWC, NULL, NULL, 0, &bo);
- if (r) {
- dev_err(adev->dev,
- "failed to allocate BO for amdkfd (%d)\n", r);
-@@ -192,64 +290,87 @@ int alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
- }
-
- /* map the buffer */
-- r = amdgpu_bo_reserve((*mem)->bo, true);
-+ r = amdgpu_bo_reserve(bo, true);
- if (r) {
- dev_err(adev->dev, "(%d) failed to reserve bo for amdkfd\n", r);
- goto allocate_mem_reserve_bo_failed;
- }
-
-- r = amdgpu_bo_pin((*mem)->bo, AMDGPU_GEM_DOMAIN_GTT,
-- &(*mem)->gpu_addr);
-+ r = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT,
-+ &gpu_addr_tmp);
- if (r) {
- dev_err(adev->dev, "(%d) failed to pin bo for amdkfd\n", r);
- goto allocate_mem_pin_bo_failed;
- }
-- *gpu_addr = (*mem)->gpu_addr;
-
-- r = amdgpu_bo_kmap((*mem)->bo, &(*mem)->cpu_ptr);
-+ r = amdgpu_bo_kmap(bo, &cpu_ptr_tmp);
- if (r) {
- dev_err(adev->dev,
- "(%d) failed to map bo to kernel for amdkfd\n", r);
- goto allocate_mem_kmap_bo_failed;
- }
-- *cpu_ptr = (*mem)->cpu_ptr;
-
-- amdgpu_bo_unreserve((*mem)->bo);
-+ *mem_obj = bo;
-+ *gpu_addr = gpu_addr_tmp;
-+ *cpu_ptr = cpu_ptr_tmp;
-+
-+ amdgpu_bo_unreserve(bo);
-
- return 0;
-
- allocate_mem_kmap_bo_failed:
-- amdgpu_bo_unpin((*mem)->bo);
-+ amdgpu_bo_unpin(bo);
- allocate_mem_pin_bo_failed:
-- amdgpu_bo_unreserve((*mem)->bo);
-+ amdgpu_bo_unreserve(bo);
- allocate_mem_reserve_bo_failed:
-- amdgpu_bo_unref(&(*mem)->bo);
-+ amdgpu_bo_unref(&bo);
-
- return r;
- }
-
- void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj)
- {
-- struct kgd_mem *mem = (struct kgd_mem *) mem_obj;
-+ struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj;
-
-- BUG_ON(mem == NULL);
--
-- amdgpu_bo_reserve(mem->bo, true);
-- amdgpu_bo_kunmap(mem->bo);
-- amdgpu_bo_unpin(mem->bo);
-- amdgpu_bo_unreserve(mem->bo);
-- amdgpu_bo_unref(&(mem->bo));
-- kfree(mem);
-+ amdgpu_bo_reserve(bo, true);
-+ amdgpu_bo_kunmap(bo);
-+ amdgpu_bo_unpin(bo);
-+ amdgpu_bo_unreserve(bo);
-+ amdgpu_bo_unref(&(bo));
- }
-
--uint64_t get_vmem_size(struct kgd_dev *kgd)
-+void get_local_mem_info(struct kgd_dev *kgd,
-+ struct kfd_local_mem_info *mem_info)
- {
-- struct amdgpu_device *adev =
-- (struct amdgpu_device *)kgd;
-+ uint64_t address_mask;
-+ resource_size_t aper_limit;
-+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-
-- BUG_ON(kgd == NULL);
-+ address_mask = adev->dev->dma_mask ? ~*adev->dev->dma_mask :
-+ ~((1ULL << 32) - 1);
-+ aper_limit = adev->mc.aper_base + adev->mc.aper_size;
-+
-+ memset(mem_info, 0, sizeof(*mem_info));
-+ if (!(adev->mc.aper_base & address_mask ||
-+ aper_limit & address_mask)) {
-+ mem_info->local_mem_size_public = adev->mc.visible_vram_size;
-+ mem_info->local_mem_size_private = adev->mc.real_vram_size -
-+ adev->mc.visible_vram_size;
-+ } else {
-+ mem_info->local_mem_size_public = 0;
-+ mem_info->local_mem_size_private = adev->mc.real_vram_size;
-+ }
-+ mem_info->vram_width = adev->mc.vram_width;
-
-- return adev->mc.real_vram_size;
-+ pr_debug("Address base: 0x%llx limit 0x%llx public 0x%llx private 0x%llx\n",
-+ adev->mc.aper_base, aper_limit,
-+ mem_info->local_mem_size_public,
-+ mem_info->local_mem_size_private);
-+
-+ if (amdgpu_sriov_vf(adev))
-+ mem_info->mem_clk_max = adev->clock.default_mclk / 100;
-+ else
-+ mem_info->mem_clk_max = amdgpu_dpm_get_mclk(adev, false) / 100;
- }
-
- uint64_t get_gpu_clock_counter(struct kgd_dev *kgd)
-@@ -271,3 +392,106 @@ uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd)
-
- return amdgpu_dpm_get_sclk(adev, false) / 100;
- }
-+
-+void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info)
-+{
-+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-+ struct amdgpu_cu_info acu_info = adev->gfx.cu_info;
-+
-+ memset(cu_info, 0, sizeof(*cu_info));
-+ if (sizeof(cu_info->cu_bitmap) != sizeof(acu_info.bitmap))
-+ return;
-+
-+ cu_info->cu_active_number = acu_info.number;
-+ cu_info->cu_ao_mask = acu_info.ao_cu_mask;
-+ memcpy(&cu_info->cu_bitmap[0], &acu_info.bitmap[0],
-+ sizeof(acu_info.bitmap));
-+ cu_info->num_shader_engines = adev->gfx.config.max_shader_engines;
-+ cu_info->num_shader_arrays_per_engine = adev->gfx.config.max_sh_per_se;
-+ cu_info->num_cu_per_sh = adev->gfx.config.max_cu_per_sh;
-+ cu_info->simd_per_cu = acu_info.simd_per_cu;
-+ cu_info->max_waves_per_simd = acu_info.max_waves_per_simd;
-+ cu_info->wave_front_size = acu_info.wave_front_size;
-+ cu_info->max_scratch_slots_per_cu = acu_info.max_scratch_slots_per_cu;
-+ cu_info->lds_size = acu_info.lds_size;
-+}
-+
-+int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd,
-+ struct kgd_dev **dma_buf_kgd,
-+ uint64_t *bo_size, void *metadata_buffer,
-+ size_t buffer_size, uint32_t *metadata_size,
-+ uint32_t *flags)
-+{
-+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-+ struct dma_buf *dma_buf;
-+ struct drm_gem_object *obj;
-+ struct amdgpu_bo *bo;
-+ uint64_t metadata_flags;
-+ int r = -EINVAL;
-+
-+ dma_buf = dma_buf_get(dma_buf_fd);
-+ if (IS_ERR(dma_buf))
-+ return PTR_ERR(dma_buf);
-+
-+ if (dma_buf->ops != &drm_gem_prime_dmabuf_ops)
-+ /* Can't handle non-graphics buffers */
-+ goto out_put;
-+
-+ obj = dma_buf->priv;
-+ if (obj->dev->driver != adev->ddev->driver)
-+ /* Can't handle buffers from different drivers */
-+ goto out_put;
-+
-+ adev = obj->dev->dev_private;
-+ bo = gem_to_amdgpu_bo(obj);
-+ if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM |
-+ AMDGPU_GEM_DOMAIN_GTT |
-+ AMDGPU_GEM_DOMAIN_DGMA)))
-+ /* Only VRAM, GTT and DGMA BOs are supported */
-+ goto out_put;
-+
-+ r = 0;
-+ if (dma_buf_kgd)
-+ *dma_buf_kgd = (struct kgd_dev *)adev;
-+ if (bo_size)
-+ *bo_size = amdgpu_bo_size(bo);
-+ if (metadata_size)
-+ *metadata_size = bo->metadata_size;
-+ if (metadata_buffer)
-+ r = amdgpu_bo_get_metadata(bo, metadata_buffer, buffer_size,
-+ metadata_size, &metadata_flags);
-+ if (flags) {
-+ /* If the preferred domain is DGMA, set flags to VRAM because
-+ * KFD doesn't support allocating DGMA memory
-+ */
-+ *flags = (bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM |
-+ AMDGPU_GEM_DOMAIN_DGMA)) ?
-+ ALLOC_MEM_FLAGS_VRAM : ALLOC_MEM_FLAGS_GTT;
-+
-+ if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)
-+ *flags |= ALLOC_MEM_FLAGS_PUBLIC;
-+ }
-+
-+out_put:
-+ dma_buf_put(dma_buf);
-+ return r;
-+}
-+
-+uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd)
-+{
-+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-+ uint64_t usage =
-+ amdgpu_vram_mgr_usage(&adev->mman.bdev.man[TTM_PL_VRAM]);
-+ return usage;
-+}
-+
-+bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev,
-+ u32 vmid)
-+{
-+ if (adev->kfd) {
-+ if ((1 << vmid) & global_compute_vmid_bitmap)
-+ return true;
-+ }
-+
-+ return false;
-+}
-diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
-old mode 100644
-new mode 100755
-index 6d3a10b..b259ba7
---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
-+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
-@@ -27,20 +27,109 @@
-
- #include <linux/types.h>
- #include <linux/mm.h>
-+#include <linux/workqueue.h>
-+#include <linux/mmu_context.h>
- #include <kgd_kfd_interface.h>
-+#include "amdgpu.h"
-+
-+extern const struct kgd2kfd_calls *kgd2kfd;
-
- struct amdgpu_device;
-
-+struct kfd_bo_va_list {
-+ struct list_head bo_list;
-+ struct amdgpu_bo_va *bo_va;
-+ void *kgd_dev;
-+ bool is_mapped;
-+ bool map_fail;
-+ uint64_t va;
-+ uint64_t pte_flags;
-+};
-+
- struct kgd_mem {
-+ struct mutex lock;
- struct amdgpu_bo *bo;
-- uint64_t gpu_addr;
-- void *cpu_ptr;
-+ struct list_head bo_va_list;
-+ /* protected by amdkfd_process_info.lock */
-+ struct ttm_validate_buffer validate_list;
-+ struct ttm_validate_buffer resv_list;
-+ uint32_t domain;
-+ unsigned int mapped_to_gpu_memory;
-+ void *kptr;
-+ uint64_t va;
-+
-+ uint32_t mapping_flags;
-+
-+ atomic_t invalid;
-+ struct amdkfd_process_info *process_info;
-+ struct page **user_pages;
-+
-+ struct amdgpu_sync sync;
-+
-+ /* flags bitfield */
-+ bool coherent : 1;
-+ bool no_substitute : 1;
-+ bool aql_queue : 1;
-+};
-+
-+/* KFD Memory Eviction */
-+struct amdgpu_amdkfd_fence {
-+ struct dma_fence base;
-+ void *mm;
-+ spinlock_t lock;
-+ char timeline_name[TASK_COMM_LEN];
-+};
-+
-+struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context,
-+ void *mm);
-+bool amd_kfd_fence_check_mm(struct dma_fence *f, void *mm);
-+struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f);
-+
-+struct amdkfd_process_info {
-+ /* List head of all VMs that belong to a KFD process */
-+ struct list_head vm_list_head;
-+ /* List head for all KFD BOs that belong to a KFD process. */
-+ struct list_head kfd_bo_list;
-+ /* List of userptr BOs that are valid or invalid */
-+ struct list_head userptr_valid_list;
-+ struct list_head userptr_inval_list;
-+ /* Lock to protect kfd_bo_list */
-+ struct mutex lock;
-+
-+ /* Number of VMs */
-+ unsigned int n_vms;
-+ /* Eviction Fence */
-+ struct amdgpu_amdkfd_fence *eviction_fence;
-+
-+ /* MMU-notifier related fields */
-+ atomic_t evicted_bos;
-+ struct delayed_work work;
-+ struct pid *pid;
-+};
-+
-+/* struct amdkfd_vm -
-+ * For Memory Eviction KGD requires a mechanism to keep track of all KFD BOs
-+ * belonging to a KFD process. All the VMs belonging to the same process point
-+ * to the same amdkfd_process_info.
-+ */
-+struct amdkfd_vm {
-+ /* Keep base as the first parameter for pointer compatibility between
-+ * amdkfd_vm and amdgpu_vm.
-+ */
-+ struct amdgpu_vm base;
-+
-+ /* List node in amdkfd_process_info.vm_list_head*/
-+ struct list_head vm_list_node;
-+
-+ struct amdgpu_device *adev;
-+ /* Points to the KFD process VM info*/
-+ struct amdkfd_process_info *process_info;
- };
-
-+
- int amdgpu_amdkfd_init(void);
- void amdgpu_amdkfd_fini(void);
-
--bool amdgpu_amdkfd_load_interface(struct amdgpu_device *adev);
-
- void amdgpu_amdkfd_suspend(struct amdgpu_device *adev);
- int amdgpu_amdkfd_resume(struct amdgpu_device *adev);
-@@ -50,17 +139,105 @@ void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev);
- void amdgpu_amdkfd_device_init(struct amdgpu_device *adev);
- void amdgpu_amdkfd_device_fini(struct amdgpu_device *adev);
-
-+int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, struct mm_struct *mm);
-+int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
-+ uint32_t vmid, uint64_t gpu_addr,
-+ uint32_t *ib_cmd, uint32_t ib_len);
-+int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
-+ struct dma_fence **ef);
- struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void);
- struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void);
-+struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void);
-+int amdgpu_amdkfd_copy_mem_to_mem(struct kgd_dev *kgd, struct kgd_mem *src_mem,
-+ uint64_t src_offset, struct kgd_mem *dst_mem,
-+ uint64_t dest_offset, uint64_t size, struct dma_fence **f,
-+ uint64_t *actual_size);
-+
-+bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev,
-+ u32 vmid);
-
- /* Shared API */
-+int map_bo(struct amdgpu_device *rdev, uint64_t va, void *vm,
-+ struct amdgpu_bo *bo, struct amdgpu_bo_va **bo_va);
- int alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
- void **mem_obj, uint64_t *gpu_addr,
- void **cpu_ptr);
- void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj);
--uint64_t get_vmem_size(struct kgd_dev *kgd);
-+void get_local_mem_info(struct kgd_dev *kgd,
-+ struct kfd_local_mem_info *mem_info);
- uint64_t get_gpu_clock_counter(struct kgd_dev *kgd);
-
- uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd);
-+void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info);
-+int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd,
-+ struct kgd_dev **dmabuf_kgd,
-+ uint64_t *bo_size, void *metadata_buffer,
-+ size_t buffer_size, uint32_t *metadata_size,
-+ uint32_t *flags);
-+uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd);
-+
-+#define read_user_wptr(mmptr, wptr, dst) \
-+ ({ \
-+ bool valid = false; \
-+ if ((mmptr) && (wptr)) { \
-+ if ((mmptr) == current->mm) { \
-+ valid = !get_user((dst), (wptr)); \
-+ } else if (current->mm == NULL) { \
-+ use_mm(mmptr); \
-+ valid = !get_user((dst), (wptr)); \
-+ unuse_mm(mmptr); \
-+ } \
-+ } \
-+ valid; \
-+ })
-+
-+/* GPUVM API */
-+int amdgpu_amdkfd_gpuvm_sync_memory(
-+ struct kgd_dev *kgd, struct kgd_mem *mem, bool intr);
-+int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
-+ struct kgd_dev *kgd, uint64_t va, uint64_t size,
-+ void *vm, struct kgd_mem **mem,
-+ uint64_t *offset, uint32_t flags);
-+int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
-+ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm);
-+int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
-+ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm);
-+int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
-+ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm);
-
-+int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm,
-+ void **process_info,
-+ struct dma_fence **ef);
-+void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm);
-+
-+uint32_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm);
-+
-+int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd,
-+ struct kfd_vm_fault_info *info);
-+
-+int amdgpu_amdkfd_gpuvm_mmap_bo(
-+ struct kgd_dev *kgd, struct vm_area_struct *vma);
-+
-+int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd,
-+ struct kgd_mem *mem, void **kptr);
-+
-+int amdgpu_amdkfd_gpuvm_pin_get_sg_table(struct kgd_dev *kgd,
-+ struct kgd_mem *mem, uint64_t offset,
-+ uint64_t size, struct sg_table **ret_sg);
-+void amdgpu_amdkfd_gpuvm_unpin_put_sg_table(
-+ struct kgd_mem *mem, struct sg_table *sg);
-+int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd,
-+ struct dma_buf *dmabuf,
-+ uint64_t va, void *vm,
-+ struct kgd_mem **mem, uint64_t *size,
-+ uint64_t *mmap_offset);
-+int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_dev *kgd, void *vm,
-+ struct kgd_mem *mem,
-+ struct dma_buf **dmabuf);
-+int amdgpu_amdkfd_gpuvm_evict_mem(struct kgd_mem *mem, struct mm_struct *mm);
-+int amdgpu_amdkfd_gpuvm_restore_mem(struct kgd_mem *mem, struct mm_struct *mm);
-+
-+void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
-+void amdgpu_amdkfd_unreserve_system_memory_limit(struct amdgpu_bo *bo);
- #endif /* AMDGPU_AMDKFD_H_INCLUDED */
-+
-diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
-new file mode 100644
-index 0000000..3961937
---- /dev/null
-+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
-@@ -0,0 +1,196 @@
-+/*
-+ * Copyright 2016 Advanced Micro Devices, Inc.
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a
-+ * copy of this software and associated documentation files (the "Software"),
-+ * to deal in the Software without restriction, including without limitation
-+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
-+ * and/or sell copies of the Software, and to permit persons to whom the
-+ * Software is furnished to do so, subject to the following conditions:
-+ *
-+ * The above copyright notice and this permission notice shall be included in
-+ * all copies or substantial portions of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
-+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-+ * OTHER DEALINGS IN THE SOFTWARE.
-+ */
-+
-+#include <linux/spinlock.h>
-+#include <linux/atomic.h>
-+#include <linux/stacktrace.h>
-+#include <linux/sched.h>
-+#include <linux/slab.h>
-+#include "amdgpu_amdkfd.h"
-+
-+const struct dma_fence_ops amd_kfd_fence_ops;
-+static atomic_t fence_seq = ATOMIC_INIT(0);
-+
-+static int amd_kfd_fence_signal(struct dma_fence *f);
-+
-+/* Eviction Fence
-+ * Fence helper functions to deal with KFD memory eviction.
-+ * Big Idea - Since KFD submissions are done by user queues, a BO cannot be
-+ * evicted unless all the user queues for that process are evicted.
-+ *
-+ * All the BOs in a process share an eviction fence. When process X wants
-+ * to map VRAM memory but TTM can't find enough space, TTM will attempt to
-+ * evict BOs from its LRU list. TTM checks if the BO is valuable to evict
-+ * by calling ttm_bo_driver->eviction_valuable().
-+ *
-+ * ttm_bo_driver->eviction_valuable() - will return false if the BO belongs
-+ * to process X. Otherwise, it will return true to indicate BO can be
-+ * evicted by TTM.
-+ *
-+ * If ttm_bo_driver->eviction_valuable returns true, then TTM will continue
-+ * the eviction process for that BO by calling ttm_bo_evict --> amdgpu_bo_move
-+ * --> amdgpu_copy_buffer(). This sets up job in GPU scheduler.
-+ *
-+ * GPU Scheduler (amd_sched_main) - sets up a cb (fence_add_callback) to
-+ * notify when the BO is free to move. fence_add_callback --> enable_signaling
-+ * --> amdgpu_amdkfd_fence.enable_signaling
-+ *
-+ * amdgpu_amdkfd_fence.enable_signaling - Start a work item that will quiesce
-+ * user queues and signal fence. The work item will also start another delayed
-+ * work item to restore BOs
-+ */
-+
-+struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context,
-+ void *mm)
-+{
-+ struct amdgpu_amdkfd_fence *fence = NULL;
-+
-+ fence = kzalloc(sizeof(*fence), GFP_KERNEL);
-+ if (fence == NULL)
-+ return NULL;
-+
-+ /* mm_struct mm is used as void pointer to identify the parent
-+ * KFD process. Don't dereference it. Fence and any threads using
-+ * mm is guaranteed to be released before process termination.
-+ */
-+ fence->mm = mm;
-+ get_task_comm(fence->timeline_name, current);
-+ spin_lock_init(&fence->lock);
-+
-+ dma_fence_init(&fence->base, &amd_kfd_fence_ops, &fence->lock,
-+ context, atomic_inc_return(&fence_seq));
-+
-+ return fence;
-+}
-+
-+struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f)
-+{
-+ struct amdgpu_amdkfd_fence *fence;
-+
-+ if (!f)
-+ return NULL;
-+
-+ fence = container_of(f, struct amdgpu_amdkfd_fence, base);
-+ if (fence && f->ops == &amd_kfd_fence_ops)
-+ return fence;
-+
-+ return NULL;
-+}
-+
-+static const char *amd_kfd_fence_get_driver_name(struct dma_fence *f)
-+{
-+ return "amdgpu_amdkfd_fence";
-+}
-+
-+static const char *amd_kfd_fence_get_timeline_name(struct dma_fence *f)
-+{
-+ struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
-+
-+ return fence->timeline_name;
-+}
-+
-+/**
-+ * amd_kfd_fence_enable_signaling - This gets called when TTM wants to evict
-+ * a KFD BO and schedules a job to move the BO.
-+ * If fence is already signaled return true.
-+ * If fence is not signaled schedule an evict KFD process work item.
-+ */
-+static bool amd_kfd_fence_enable_signaling(struct dma_fence *f)
-+{
-+ struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
-+
-+ if (!fence)
-+ return false;
-+
-+ if (dma_fence_is_signaled(f))
-+ return true;
-+
-+ if (!kgd2kfd->schedule_evict_and_restore_process(
-+ (struct mm_struct *)fence->mm, f))
-+ return true;
-+
-+ return false;
-+}
-+
-+static int amd_kfd_fence_signal(struct dma_fence *f)
-+{
-+ unsigned long flags;
-+ int ret;
-+
-+ spin_lock_irqsave(f->lock, flags);
-+ /* Set enabled bit so cb will be called */
-+ set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &f->flags);
-+ ret = dma_fence_signal_locked(f);
-+ spin_unlock_irqrestore(f->lock, flags);
-+
-+ return ret;
-+}
-+
-+/**
-+ * amd_kfd_fence_release - callback that fence can be freed
-+ *
-+ * @fence: fence
-+ *
-+ * This function is called when the reference count becomes zero.
-+ * It just RCU schedules freeing up the fence.
-+*/
-+static void amd_kfd_fence_release(struct dma_fence *f)
-+{
-+ struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
-+ /* Unconditionally signal the fence. The process is getting
-+ * terminated.
-+ */
-+ if (WARN_ON(!fence))
-+ return; /* Not an amdgpu_amdkfd_fence */
-+
-+ amd_kfd_fence_signal(f);
-+ kfree_rcu(f, rcu);
-+}
-+
-+/**
-+ * amd_kfd_fence_check_mm - Check if @mm is same as that of the fence @f
-+ * if same return TRUE else return FALSE.
-+ *
-+ * @f: [IN] fence
-+ * @mm: [IN] mm that needs to be verified
-+*/
-+bool amd_kfd_fence_check_mm(struct dma_fence *f, void *mm)
-+{
-+ struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
-+
-+ if (!fence)
-+ return false;
-+ else if (fence->mm == mm)
-+ return true;
-+
-+ return false;
-+}
-+
-+const struct dma_fence_ops amd_kfd_fence_ops = {
-+ .get_driver_name = amd_kfd_fence_get_driver_name,
-+ .get_timeline_name = amd_kfd_fence_get_timeline_name,
-+ .enable_signaling = amd_kfd_fence_enable_signaling,
-+ .signaled = NULL,
-+ .wait = dma_fence_default_wait,
-+ .release = amd_kfd_fence_release,
-+};
-+
-diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
-old mode 100644
-new mode 100755
-index 5748504..6964ece
---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
-+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
-@@ -20,6 +20,9 @@
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-+#undef pr_fmt
-+#define pr_fmt(fmt) "kfd2kgd: " fmt
-+
- #include <linux/fdtable.h>
- #include <linux/uaccess.h>
- #include <linux/firmware.h>
-@@ -39,6 +42,14 @@
- #include "gmc/gmc_7_1_sh_mask.h"
- #include "cik_structs.h"
-
-+#define AMDKFD_SKIP_UNCOMPILED_CODE 1
-+
-+enum hqd_dequeue_request_type {
-+ NO_ACTION = 0,
-+ DRAIN_PIPE,
-+ RESET_WAVES
-+};
-+
- enum {
- MAX_TRAPID = 8, /* 3 bits in the bitfield. */
- MAX_WATCH_ADDRESSES = 4
-@@ -55,8 +66,8 @@ enum {
- enum {
- ADDRESS_WATCH_REG_CNTL_ATC_BIT = 0x10000000UL,
- ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK = 0x00FFFFFF,
-- ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENSION = 0x03000000,
-- /* extend the mask to 26 bits to match the low address field */
-+ ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENTION = 0x03000000,
-+ /* extend the mask to 26 bits in order to match the low address field */
- ADDRESS_WATCH_REG_ADDLOW_SHIFT = 6,
- ADDRESS_WATCH_REG_ADDHIGH_MASK = 0xFFFF
- };
-@@ -81,30 +92,42 @@ union TCP_WATCH_CNTL_BITS {
- float f32All;
- };
-
-+static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm,
-+ int fd, uint32_t handle, struct kgd_mem **mem);
-+
-+static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type);
-+
- /*
- * Register access functions
- */
-
- static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
-- uint32_t sh_mem_config, uint32_t sh_mem_ape1_base,
-- uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases);
--
-+ uint32_t sh_mem_config, uint32_t sh_mem_ape1_base,
-+ uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases);
- static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
-- unsigned int vmid);
--
-+ unsigned int vmid);
- static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id,
-- uint32_t hpd_size, uint64_t hpd_gpu_addr);
-+ uint32_t hpd_size, uint64_t hpd_gpu_addr);
- static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id);
- static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
-- uint32_t queue_id, uint32_t __user *wptr);
--static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd);
-+ uint32_t queue_id, uint32_t __user *wptr,
-+ uint32_t wptr_shift, uint32_t wptr_mask,
-+ struct mm_struct *mm);
-+static int kgd_hqd_dump(struct kgd_dev *kgd,
-+ uint32_t pipe_id, uint32_t queue_id,
-+ uint32_t (**dump)[2], uint32_t *n_regs);
-+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
-+ uint32_t __user *wptr, struct mm_struct *mm);
-+static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
-+ uint32_t engine_id, uint32_t queue_id,
-+ uint32_t (**dump)[2], uint32_t *n_regs);
- static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
-- uint32_t pipe_id, uint32_t queue_id);
--
--static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type,
-+ uint32_t pipe_id, uint32_t queue_id);
-+static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd);
-+static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
-+ enum kfd_preempt_type reset_type,
- unsigned int utimeout, uint32_t pipe_id,
- uint32_t queue_id);
--static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd);
- static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
- unsigned int utimeout);
- static int kgd_address_watch_disable(struct kgd_dev *kgd);
-@@ -124,21 +147,60 @@ static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, uint8_t vmid);
- static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd,
- uint8_t vmid);
- static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid);
-+static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid);
-+static void set_num_of_requests(struct kgd_dev *dev, uint8_t num_of_req);
-+static int alloc_memory_of_scratch(struct kgd_dev *kgd,
-+ uint64_t va, uint32_t vmid);
-+static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable,
-+ uint8_t element_size, uint8_t index_stride, uint8_t mtype);
-+static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
-+ uint32_t page_table_base);
-+static uint32_t read_vmid_from_vmfault_reg(struct kgd_dev *kgd);
-+
-+/* Because of REG_GET_FIELD() being used, we put this function in the
-+ * asic specific file.
-+ */
-+static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd,
-+ struct tile_config *config)
-+{
-+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-
--static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type);
-+ config->gb_addr_config = adev->gfx.config.gb_addr_config;
-+ config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg,
-+ MC_ARB_RAMCFG, NOOFBANK);
-+ config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg,
-+ MC_ARB_RAMCFG, NOOFRANKS);
-+
-+ config->tile_config_ptr = adev->gfx.config.tile_mode_array;
-+ config->num_tile_configs =
-+ ARRAY_SIZE(adev->gfx.config.tile_mode_array);
-+ config->macro_tile_config_ptr =
-+ adev->gfx.config.macrotile_mode_array;
-+ config->num_macro_tile_configs =
-+ ARRAY_SIZE(adev->gfx.config.macrotile_mode_array);
-+
-+
-+ return 0;
-+}
-
- static const struct kfd2kgd_calls kfd2kgd = {
- .init_gtt_mem_allocation = alloc_gtt_mem,
- .free_gtt_mem = free_gtt_mem,
-- .get_vmem_size = get_vmem_size,
-+ .get_local_mem_info = get_local_mem_info,
- .get_gpu_clock_counter = get_gpu_clock_counter,
- .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz,
-+ .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm,
-+ .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm,
-+ .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir,
-+ .open_graphic_handle = open_graphic_handle,
- .program_sh_mem_settings = kgd_program_sh_mem_settings,
- .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
- .init_pipeline = kgd_init_pipeline,
- .init_interrupts = kgd_init_interrupts,
- .hqd_load = kgd_hqd_load,
- .hqd_sdma_load = kgd_hqd_sdma_load,
-+ .hqd_dump = kgd_hqd_dump,
-+ .hqd_sdma_dump = kgd_hqd_sdma_dump,
- .hqd_is_occupied = kgd_hqd_is_occupied,
- .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
- .hqd_destroy = kgd_hqd_destroy,
-@@ -147,17 +209,50 @@ static const struct kfd2kgd_calls kfd2kgd = {
- .address_watch_execute = kgd_address_watch_execute,
- .wave_control_execute = kgd_wave_control_execute,
- .address_watch_get_offset = kgd_address_watch_get_offset,
-- .get_atc_vmid_pasid_mapping_pasid = get_atc_vmid_pasid_mapping_pasid,
-- .get_atc_vmid_pasid_mapping_valid = get_atc_vmid_pasid_mapping_valid,
-+ .get_atc_vmid_pasid_mapping_pasid =
-+ get_atc_vmid_pasid_mapping_pasid,
-+ .get_atc_vmid_pasid_mapping_valid =
-+ get_atc_vmid_pasid_mapping_valid,
-+ .read_vmid_from_vmfault_reg = read_vmid_from_vmfault_reg,
- .write_vmid_invalidate_request = write_vmid_invalidate_request,
-- .get_fw_version = get_fw_version
-+ .invalidate_tlbs = invalidate_tlbs,
-+ .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory,
-+ .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu,
-+ .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu,
-+ .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu,
-+ .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu,
-+ .get_fw_version = get_fw_version,
-+ .set_num_of_requests = set_num_of_requests,
-+ .get_cu_info = get_cu_info,
-+ .alloc_memory_of_scratch = alloc_memory_of_scratch,
-+ .write_config_static_mem = write_config_static_mem,
-+ .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo,
-+ .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel,
-+ .set_vm_context_page_table_base = set_vm_context_page_table_base,
-+ .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table,
-+ .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table,
-+ .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info,
-+ .import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf,
-+ .export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf,
-+ .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info,
-+ .submit_ib = amdgpu_amdkfd_submit_ib,
-+ .get_tile_config = amdgpu_amdkfd_get_tile_config,
-+ .restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos,
-+ .copy_mem_to_mem = amdgpu_amdkfd_copy_mem_to_mem,
-+ .get_vram_usage = amdgpu_amdkfd_get_vram_usage
- };
-
--struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void)
-+struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions()
- {
- return (struct kfd2kgd_calls *)&kfd2kgd;
- }
-
-+static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm,
-+ int fd, uint32_t handle, struct kgd_mem **mem)
-+{
-+ return 0;
-+}
-+
- static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd)
- {
- return (struct amdgpu_device *)kgd;
-@@ -186,7 +281,7 @@ static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id,
- {
- struct amdgpu_device *adev = get_amdgpu_device(kgd);
-
-- uint32_t mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
-+ uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
- uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
-
- lock_srbm(kgd, mec, pipe, queue_id, 0);
-@@ -222,12 +317,12 @@ static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
-
- /*
- * We have to assume that there is no outstanding mapping.
-- * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because
-- * a mapping is in progress or because a mapping finished and the
-- * SW cleared it. So the protocol is to always wait & clear.
-+ * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because a
-+ * mapping is in progress or because a mapping finished and the SW
-+ * cleared it. So the protocol is to always wait & clear.
- */
-- uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid |
-- ATC_VMID0_PASID_MAPPING__VALID_MASK;
-+ uint32_t pasid_mapping = (pasid == 0) ? 0 :
-+ (uint32_t)pasid | ATC_VMID0_PASID_MAPPING__VALID_MASK;
-
- WREG32(mmATC_VMID0_PASID_MAPPING + vmid, pasid_mapping);
-
-@@ -273,8 +368,7 @@ static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m)
-
- retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET +
- m->sdma_queue_id * KFD_CIK_SDMA_QUEUE_OFFSET;
--
-- pr_debug("kfd: sdma base address: 0x%x\n", retval);
-+ pr_debug("sdma base address: 0x%x\n", retval);
-
- return retval;
- }
-@@ -290,26 +384,91 @@ static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd)
- }
-
- static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
-- uint32_t queue_id, uint32_t __user *wptr)
-+ uint32_t queue_id, uint32_t __user *wptr,
-+ uint32_t wptr_shift, uint32_t wptr_mask,
-+ struct mm_struct *mm)
- {
- struct amdgpu_device *adev = get_amdgpu_device(kgd);
-- uint32_t wptr_shadow, is_wptr_shadow_valid;
- struct cik_mqd *m;
-+ uint32_t *mqd_hqd;
-+ uint32_t reg, wptr_val, data;
-+ bool valid_wptr = false;
-
- m = get_mqd(mqd);
-
-- is_wptr_shadow_valid = !get_user(wptr_shadow, wptr);
-- if (is_wptr_shadow_valid)
-- m->cp_hqd_pq_wptr = wptr_shadow;
-+ acquire_queue(kgd, pipe_id, queue_id);
-+
-+ /* HQD registers extend from CP_MQD_BASE_ADDR to CP_MQD_CONTROL. */
-+ mqd_hqd = &m->cp_mqd_base_addr_lo;
-+
-+ for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_MQD_CONTROL; reg++)
-+ WREG32(reg, mqd_hqd[reg - mmCP_MQD_BASE_ADDR]);
-+
-+ /* Copy userspace write pointer value to register.
-+ * Activate doorbell logic to monitor subsequent changes.
-+ */
-+ data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control,
-+ CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
-+ WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, data);
-+
-+ /* read_user_ptr may take the mm->mmap_sem.
-+ * release srbm_mutex to avoid circular dependency between
-+ * srbm_mutex->mm_sem->reservation_ww_class_mutex->srbm_mutex.
-+ */
-+ release_queue(kgd);
-+ valid_wptr = read_user_wptr(mm, wptr, wptr_val);
-
- acquire_queue(kgd, pipe_id, queue_id);
-- gfx_v7_0_mqd_commit(adev, m);
-+ if (valid_wptr)
-+ WREG32(mmCP_HQD_PQ_WPTR, (wptr_val << wptr_shift) & wptr_mask);
-+
-+ data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);
-+ WREG32(mmCP_HQD_ACTIVE, data);
-+
-+
- release_queue(kgd);
-
- return 0;
- }
-
--static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd)
-+static int kgd_hqd_dump(struct kgd_dev *kgd,
-+ uint32_t pipe_id, uint32_t queue_id,
-+ uint32_t (**dump)[2], uint32_t *n_regs)
-+{
-+ struct amdgpu_device *adev = get_amdgpu_device(kgd);
-+ uint32_t i = 0, reg;
-+#define HQD_N_REGS (35+4)
-+#define DUMP_REG(addr) do { \
-+ if (WARN_ON_ONCE(i >= HQD_N_REGS)) \
-+ break; \
-+ (*dump)[i][0] = (addr) << 2; \
-+ (*dump)[i++][1] = RREG32(addr); \
-+ } while (0)
-+
-+ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
-+ if (*dump == NULL)
-+ return -ENOMEM;
-+
-+ acquire_queue(kgd, pipe_id, queue_id);
-+
-+ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE0);
-+ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE1);
-+ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE2);
-+ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE3);
-+
-+ for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_MQD_CONTROL; reg++)
-+ DUMP_REG(reg);
-+
-+ release_queue(kgd);
-+
-+ WARN_ON_ONCE(i != HQD_N_REGS);
-+ *n_regs = i;
-+
-+ return 0;
-+}
-+
-+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
-+ uint32_t __user *wptr, struct mm_struct *mm)
- {
- struct amdgpu_device *adev = get_amdgpu_device(kgd);
- struct cik_sdma_rlc_registers *m;
-@@ -320,17 +479,17 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd)
- m = get_sdma_mqd(mqd);
- sdma_base_addr = get_sdma_base_addr(m);
-
-- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
-- m->sdma_rlc_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));
-+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
-+ m->sdma_rlc_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));
-
-- end_jiffies = msecs_to_jiffies(2000) + jiffies;
- while (true) {
-- data = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
-- if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
-- break;
-- if (time_after(jiffies, end_jiffies))
-- return -ETIME;
-- usleep_range(500, 1000);
-+ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
-+ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
-+ break;
-+ if (timeout == 0)
-+ return -ETIME;
-+ msleep(10);
-+ timeout -= 10;
- }
- if (m->sdma_engine_id) {
- data = RREG32(mmSDMA1_GFX_CONTEXT_CNTL);
-@@ -344,25 +503,59 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd)
- WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data);
- }
-
-- WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL,
-- m->sdma_rlc_doorbell);
-- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0);
-- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0);
-- WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR,
-- m->sdma_rlc_virtual_addr);
-- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdma_rlc_rb_base);
-+ data = REG_SET_FIELD(m->sdma_rlc_doorbell, SDMA0_RLC0_DOORBELL,
-+ ENABLE, 1);
-+ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data);
-+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdma_rlc_rb_rptr);
-+ if (read_user_wptr(mm, wptr, data))
-+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, data);
-+ else
-+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
-+ m->sdma_rlc_rb_rptr);
-+
-+ WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR,
-+ m->sdma_rlc_virtual_addr);
-+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdma_rlc_rb_base);
-+
- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI,
- m->sdma_rlc_rb_base_hi);
- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO,
- m->sdma_rlc_rb_rptr_addr_lo);
- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
- m->sdma_rlc_rb_rptr_addr_hi);
-- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
-- m->sdma_rlc_rb_cntl);
--
-+ data = REG_SET_FIELD(m->sdma_rlc_rb_cntl, SDMA0_RLC0_RB_CNTL,
-+ RB_ENABLE, 1);
-+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data);
- return 0;
- }
-
-+static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
-+ uint32_t engine_id, uint32_t queue_id,
-+ uint32_t (**dump)[2], uint32_t *n_regs)
-+{
-+ struct amdgpu_device *adev = get_amdgpu_device(kgd);
-+ uint32_t sdma_offset = engine_id * SDMA1_REGISTER_OFFSET +
-+ queue_id * KFD_CIK_SDMA_QUEUE_OFFSET;
-+ uint32_t i = 0, reg;
-+#undef HQD_N_REGS
-+#define HQD_N_REGS (19+4)
-+
-+ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
-+ if (*dump == NULL)
-+ return -ENOMEM;
-+
-+ for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
-+ DUMP_REG(sdma_offset + reg);
-+ for (reg = mmSDMA0_RLC0_VIRTUAL_ADDR; reg <= mmSDMA0_RLC0_WATERMARK;
-+ reg++)
-+ DUMP_REG(sdma_offset + reg);
-+
-+ WARN_ON_ONCE(i != HQD_N_REGS);
-+ *n_regs = i;
-+
-+ return 0;
-+}
-+
- static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
- uint32_t pipe_id, uint32_t queue_id)
- {
-@@ -403,30 +596,99 @@ static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd)
- return false;
- }
-
--static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type,
-+static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
-+ enum kfd_preempt_type reset_type,
- unsigned int utimeout, uint32_t pipe_id,
- uint32_t queue_id)
- {
- struct amdgpu_device *adev = get_amdgpu_device(kgd);
- uint32_t temp;
-- int timeout = utimeout;
-+ enum hqd_dequeue_request_type type;
-+ unsigned long flags, end_jiffies;
-+ int retry;
-
- acquire_queue(kgd, pipe_id, queue_id);
- WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, 0);
-
-- WREG32(mmCP_HQD_DEQUEUE_REQUEST, reset_type);
-+ switch (reset_type) {
-+ case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:
-+ type = DRAIN_PIPE;
-+ break;
-+ case KFD_PREEMPT_TYPE_WAVEFRONT_RESET:
-+ type = RESET_WAVES;
-+ break;
-+ default:
-+ type = DRAIN_PIPE;
-+ break;
-+ }
-+
-+ /* Workaround: If IQ timer is active and the wait time is close to or
-+ * equal to 0, dequeueing is not safe. Wait until either the wait time
-+ * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is
-+ * cleared before continuing. Also, ensure wait times are set to at
-+ * least 0x3.
-+ */ -+ local_irq_save(flags); -+ preempt_disable(); -+ retry = 5000; /* wait for 500 usecs at maximum */ -+ while (true) { -+ temp = RREG32(mmCP_HQD_IQ_TIMER); -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) { -+ pr_debug("HW is processing IQ\n"); -+ goto loop; -+ } -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) { -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE) -+ == 3) /* SEM-rearm is safe */ -+ break; -+ /* Wait time 3 is safe for CP, but our MMIO read/write -+ * time is close to 1 microsecond, so check for 10 to -+ * leave more buffer room -+ */ -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME) -+ >= 10) -+ break; -+ pr_debug("IQ timer is active\n"); -+ } else -+ break; -+loop: -+ if (!retry) { -+ pr_err("CP HQD IQ timer status time out\n"); -+ break; -+ } -+ ndelay(100); -+ --retry; -+ } -+ retry = 1000; -+ while (true) { -+ temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST); -+ if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK)) -+ break; -+ pr_debug("Dequeue request is pending\n"); - -+ if (!retry) { -+ pr_err("CP HQD dequeue request time out\n"); -+ break; -+ } -+ ndelay(100); -+ --retry; -+ } -+ local_irq_restore(flags); -+ preempt_enable(); -+ -+ WREG32(mmCP_HQD_DEQUEUE_REQUEST, type); -+ -+ end_jiffies = (utimeout * HZ / 1000) + jiffies; - while (true) { - temp = RREG32(mmCP_HQD_ACTIVE); -- if (temp & CP_HQD_ACTIVE__ACTIVE_MASK) -+ if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) - break; -- if (timeout <= 0) { -- pr_err("kfd: cp queue preemption time out.\n"); -+ if (time_after(jiffies, end_jiffies)) { -+ pr_err("cp queue preemption time out\n"); - release_queue(kgd); - return -ETIME; - } -- msleep(20); -- timeout -= 20; -+ usleep_range(500, 1000); - } - - release_queue(kgd); -@@ -440,7 +702,7 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - struct cik_sdma_rlc_registers *m; - uint32_t sdma_base_addr; - uint32_t temp; -- int timeout = utimeout; -+ unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; - - m = get_sdma_mqd(mqd); - sdma_base_addr = get_sdma_base_addr(m); -@@ -451,12 +713,11 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - - while (true) { - temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); -- if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT) -+ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) - break; -- if (timeout <= 0) -+ if (time_after(jiffies, end_jiffies)) - return -ETIME; -- msleep(20); -- timeout -= 20; -+ usleep_range(500, 1000); - } - - WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); -@@ -464,6 +725,8 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | - SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); - -+ m->sdma_rlc_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); -+ - return 0; - } - -@@ -481,8 +744,9 @@ static int kgd_address_watch_disable(struct kgd_dev *kgd) - - /* Turning off this address until we set all the registers */ - for (i = 0; i < MAX_WATCH_ADDRESSES; i++) -- WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX + -- ADDRESS_WATCH_REG_CNTL], cntl.u32All); -+ WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX -+ + ADDRESS_WATCH_REG_CNTL], -+ cntl.u32All); - - return 0; - } -@@ -500,20 +764,24 @@ static int kgd_address_watch_execute(struct kgd_dev *kgd, - - /* Turning off this watch point until we set all the registers */ - cntl.bitfields.valid = 0; -- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + -- ADDRESS_WATCH_REG_CNTL], cntl.u32All); -+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX -+ + 
ADDRESS_WATCH_REG_CNTL], -+ cntl.u32All); - -- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + -- ADDRESS_WATCH_REG_ADDR_HI], addr_hi); -+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX -+ + ADDRESS_WATCH_REG_ADDR_HI], -+ addr_hi); - -- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + -- ADDRESS_WATCH_REG_ADDR_LO], addr_lo); -+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX -+ + ADDRESS_WATCH_REG_ADDR_LO], -+ addr_lo); - - /* Enable the watch point */ - cntl.bitfields.valid = 1; - -- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + -- ADDRESS_WATCH_REG_CNTL], cntl.u32All); -+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX -+ + ADDRESS_WATCH_REG_CNTL], -+ cntl.u32All); - - return 0; - } -@@ -567,7 +835,7 @@ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - - reg = RREG32(mmATC_VMID0_PASID_MAPPING + vmid); -- return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; -+ return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; - } - - static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) -@@ -577,52 +845,90 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) - WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); - } - -+static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid) -+{ -+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -+ int vmid; -+ -+ for (vmid = 0; vmid < 16; vmid++) { -+ if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) -+ continue; -+ if (RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & -+ ATC_VMID0_PASID_MAPPING__VALID_MASK) { -+ if ((RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & -+ ATC_VMID0_PASID_MAPPING__PASID_MASK) == pasid) { -+ WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); -+ break; -+ } -+ } -+ } -+ -+ return 0; -+} -+ -+static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, -+ uint8_t element_size, uint8_t index_stride, uint8_t mtype) -+{ -+ uint32_t reg; -+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -+ -+ reg = swizzle_enable << SH_STATIC_MEM_CONFIG__SWIZZLE_ENABLE__SHIFT | -+ element_size << SH_STATIC_MEM_CONFIG__ELEMENT_SIZE__SHIFT | -+ index_stride << SH_STATIC_MEM_CONFIG__INDEX_STRIDE__SHIFT | -+ mtype << SH_STATIC_MEM_CONFIG__PRIVATE_MTYPE__SHIFT; -+ -+ WREG32(mmSH_STATIC_MEM_CONFIG, reg); -+ return 0; -+} -+static int alloc_memory_of_scratch(struct kgd_dev *kgd, -+ uint64_t va, uint32_t vmid) -+{ -+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -+ -+ lock_srbm(kgd, 0, 0, 0, vmid); -+ WREG32(mmSH_HIDDEN_PRIVATE_BASE_VMID, va); -+ unlock_srbm(kgd); -+ -+ return 0; -+} -+ -+ - static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) - { - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - const union amdgpu_firmware_header *hdr; - -- BUG_ON(kgd == NULL); -- - switch (type) { - case KGD_ENGINE_PFP: -- hdr = (const union amdgpu_firmware_header *) -- adev->gfx.pfp_fw->data; -+ hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data; - break; - - case KGD_ENGINE_ME: -- hdr = (const union amdgpu_firmware_header *) -- adev->gfx.me_fw->data; -+ hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data; - break; - - case KGD_ENGINE_CE: -- hdr = (const union amdgpu_firmware_header *) -- adev->gfx.ce_fw->data; -+ hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data; - break; - - case KGD_ENGINE_MEC1: -- hdr = (const union amdgpu_firmware_header *) -- adev->gfx.mec_fw->data; -+ hdr = (const union 
amdgpu_firmware_header *)adev->gfx.mec_fw->data; - break; - - case KGD_ENGINE_MEC2: -- hdr = (const union amdgpu_firmware_header *) -- adev->gfx.mec2_fw->data; -+ hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data; - break; - - case KGD_ENGINE_RLC: -- hdr = (const union amdgpu_firmware_header *) -- adev->gfx.rlc_fw->data; -+ hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data; - break; - - case KGD_ENGINE_SDMA1: -- hdr = (const union amdgpu_firmware_header *) -- adev->sdma.instance[0].fw->data; -+ hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data; - break; - - case KGD_ENGINE_SDMA2: -- hdr = (const union amdgpu_firmware_header *) -- adev->sdma.instance[1].fw->data; -+ hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data; - break; - - default: -@@ -636,3 +942,42 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) - return hdr->common.ucode_version; - } - -+static void set_num_of_requests(struct kgd_dev *dev, uint8_t num_of_req) -+{ -+ uint32_t value; -+ struct amdgpu_device *adev = get_amdgpu_device(dev); -+ -+ value = RREG32(mmATC_ATS_DEBUG); -+ value &= ~ATC_ATS_DEBUG__NUM_REQUESTS_AT_ERR_MASK; -+ value |= (num_of_req << ATC_ATS_DEBUG__NUM_REQUESTS_AT_ERR__SHIFT); -+ -+ WREG32(mmATC_ATS_DEBUG, value); -+} -+ -+static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, -+ uint32_t page_table_base) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ /* TODO: Don't use hardcoded VMIDs */ -+ if (vmid < 8 || vmid > 15) { -+ pr_err("trying to set page table base for wrong VMID\n"); -+ return; -+ } -+ WREG32(mmVM_CONTEXT8_PAGE_TABLE_BASE_ADDR + vmid - 8, page_table_base); -+} -+ -+ /** -+ * read_vmid_from_vmfault_reg - read vmid from register -+ * -+ * adev: amdgpu_device pointer -+ * @vmid: vmid pointer -+ * read vmid from register (CIK). -+ */ -+static uint32_t read_vmid_from_vmfault_reg(struct kgd_dev *kgd) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ -+ uint32_t status = RREG32(mmVM_CONTEXT1_PROTECTION_FAULT_STATUS); -+ -+ return REG_GET_FIELD(status, VM_CONTEXT1_PROTECTION_FAULT_STATUS, VMID); -+} -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c -old mode 100644 -new mode 100755 -index c5044d5..2ff10e9 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c -@@ -20,6 +20,9 @@ - * OTHER DEALINGS IN THE SOFTWARE. 
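The first hunk of the gfx_v8 file below adds a pr_fmt prefix so every pr_err()/pr_debug() in the file is tagged with its subsystem. This is the stock kernel idiom; a small self-contained sketch (the message text is illustrative):

/* printk.h supplies a default pr_fmt, so #undef it first; pr_fmt is
 * expanded at each pr_*() call site, giving every message in the
 * file the "kfd2kgd: " prefix.
 */
#undef pr_fmt
#define pr_fmt(fmt) "kfd2kgd: " fmt

#include <linux/printk.h>

static void report_timeout(void)
{
	/* logs "kfd2kgd: cp queue preemption time out" */
	pr_err("cp queue preemption time out\n");
}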
- */ - -+#undef pr_fmt -+#define pr_fmt(fmt) "kfd2kgd: " fmt -+ - #include <linux/module.h> - #include <linux/fdtable.h> - #include <linux/uaccess.h> -@@ -28,7 +31,7 @@ - #include "amdgpu.h" - #include "amdgpu_amdkfd.h" - #include "amdgpu_ucode.h" --#include "gfx_v8_0.h" -+#include "amdgpu_amdkfd_gfx_v8.h" - #include "gca/gfx_8_0_sh_mask.h" - #include "gca/gfx_8_0_d.h" - #include "gca/gfx_8_0_enum.h" -@@ -39,7 +42,31 @@ - #include "vi_structs.h" - #include "vid.h" - --struct cik_sdma_rlc_registers; -+enum hqd_dequeue_request_type { -+ NO_ACTION = 0, -+ DRAIN_PIPE, -+ RESET_WAVES, -+ SAVE_WAVES -+}; -+ -+static const uint32_t watchRegs[MAX_WATCH_ADDRESSES * ADDRESS_WATCH_REG_MAX] = { -+ mmTCP_WATCH0_ADDR_H, mmTCP_WATCH0_ADDR_L, mmTCP_WATCH0_CNTL, -+ mmTCP_WATCH1_ADDR_H, mmTCP_WATCH1_ADDR_L, mmTCP_WATCH1_CNTL, -+ mmTCP_WATCH2_ADDR_H, mmTCP_WATCH2_ADDR_L, mmTCP_WATCH2_CNTL, -+ mmTCP_WATCH3_ADDR_H, mmTCP_WATCH3_ADDR_L, mmTCP_WATCH3_CNTL -+}; -+ -+ -+struct vi_sdma_mqd; -+ -+static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, -+ void *vm, struct kgd_mem **mem); -+static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem); -+ -+static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, -+ int fd, uint32_t handle, struct kgd_mem **mem); -+ -+static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); - - /* - * Register access functions -@@ -55,17 +82,26 @@ static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, - uint32_t hpd_size, uint64_t hpd_gpu_addr); - static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); - static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, -- uint32_t queue_id, uint32_t __user *wptr); --static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd); -+ uint32_t queue_id, uint32_t __user *wptr, -+ uint32_t wptr_shift, uint32_t wptr_mask, -+ struct mm_struct *mm); -+static int kgd_hqd_dump(struct kgd_dev *kgd, -+ uint32_t pipe_id, uint32_t queue_id, -+ uint32_t (**dump)[2], uint32_t *n_regs); -+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, -+ uint32_t __user *wptr, struct mm_struct *mm); -+static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, -+ uint32_t engine_id, uint32_t queue_id, -+ uint32_t (**dump)[2], uint32_t *n_regs); - static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, - uint32_t pipe_id, uint32_t queue_id); - static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); --static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, -+static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, -+ enum kfd_preempt_type reset_type, - unsigned int utimeout, uint32_t pipe_id, - uint32_t queue_id); - static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - unsigned int utimeout); --static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); - static int kgd_address_watch_disable(struct kgd_dev *kgd); - static int kgd_address_watch_execute(struct kgd_dev *kgd, - unsigned int watch_point_id, -@@ -84,20 +120,61 @@ static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, - static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, - uint8_t vmid); - static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); --static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); -+static void set_num_of_requests(struct kgd_dev *kgd, -+ uint8_t num_of_requests); -+static int alloc_memory_of_scratch(struct kgd_dev *kgd, 
-+ uint64_t va, uint32_t vmid); -+static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, -+ uint8_t element_size, uint8_t index_stride, uint8_t mtype); -+static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, -+ uint32_t page_table_base); -+static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid); -+ -+/* Because of REG_GET_FIELD() being used, we put this function in the -+ * asic specific file. -+ */ -+static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, -+ struct tile_config *config) -+{ -+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd; -+ -+ config->gb_addr_config = adev->gfx.config.gb_addr_config; -+ config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, -+ MC_ARB_RAMCFG, NOOFBANK); -+ config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, -+ MC_ARB_RAMCFG, NOOFRANKS); -+ -+ config->tile_config_ptr = adev->gfx.config.tile_mode_array; -+ config->num_tile_configs = -+ ARRAY_SIZE(adev->gfx.config.tile_mode_array); -+ config->macro_tile_config_ptr = -+ adev->gfx.config.macrotile_mode_array; -+ config->num_macro_tile_configs = -+ ARRAY_SIZE(adev->gfx.config.macrotile_mode_array); -+ -+ return 0; -+} - - static const struct kfd2kgd_calls kfd2kgd = { - .init_gtt_mem_allocation = alloc_gtt_mem, - .free_gtt_mem = free_gtt_mem, -- .get_vmem_size = get_vmem_size, -+ .get_local_mem_info = get_local_mem_info, - .get_gpu_clock_counter = get_gpu_clock_counter, - .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, -+ .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm, -+ .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm, -+ .create_process_gpumem = create_process_gpumem, -+ .destroy_process_gpumem = destroy_process_gpumem, -+ .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir, -+ .open_graphic_handle = open_graphic_handle, - .program_sh_mem_settings = kgd_program_sh_mem_settings, - .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, - .init_pipeline = kgd_init_pipeline, - .init_interrupts = kgd_init_interrupts, - .hqd_load = kgd_hqd_load, - .hqd_sdma_load = kgd_hqd_sdma_load, -+ .hqd_dump = kgd_hqd_dump, -+ .hqd_sdma_dump = kgd_hqd_sdma_dump, - .hqd_is_occupied = kgd_hqd_is_occupied, - .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, - .hqd_destroy = kgd_hqd_destroy, -@@ -111,14 +188,56 @@ static const struct kfd2kgd_calls kfd2kgd = { - .get_atc_vmid_pasid_mapping_valid = - get_atc_vmid_pasid_mapping_valid, - .write_vmid_invalidate_request = write_vmid_invalidate_request, -- .get_fw_version = get_fw_version -+ .invalidate_tlbs = invalidate_tlbs, -+ .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory, -+ .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, -+ .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, -+ .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, -+ .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu, -+ .get_fw_version = get_fw_version, -+ .set_num_of_requests = set_num_of_requests, -+ .get_cu_info = get_cu_info, -+ .alloc_memory_of_scratch = alloc_memory_of_scratch, -+ .write_config_static_mem = write_config_static_mem, -+ .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo, -+ .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel, -+ .set_vm_context_page_table_base = set_vm_context_page_table_base, -+ .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table, -+ .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table, -+ .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info, -+ .import_dmabuf = 
amdgpu_amdkfd_gpuvm_import_dmabuf, -+ .export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf, -+ .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info, -+ .submit_ib = amdgpu_amdkfd_submit_ib, -+ .get_tile_config = amdgpu_amdkfd_get_tile_config, -+ .restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos, -+ .copy_mem_to_mem = amdgpu_amdkfd_copy_mem_to_mem, -+ .get_vram_usage = amdgpu_amdkfd_get_vram_usage - }; - --struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void) -+struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions() - { - return (struct kfd2kgd_calls *)&kfd2kgd; - } - -+static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, -+ void *vm, struct kgd_mem **mem) -+{ -+ return 0; -+} -+ -+/* Destroys the GPU allocation and frees the kgd_mem structure */ -+static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem) -+{ -+ -+} -+ -+static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, -+ int fd, uint32_t handle, struct kgd_mem **mem) -+{ -+ return 0; -+} -+ - static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) - { - return (struct amdgpu_device *)kgd; -@@ -147,7 +266,7 @@ static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, - { - struct amdgpu_device *adev = get_amdgpu_device(kgd); - -- uint32_t mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; -+ uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; - uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); - - lock_srbm(kgd, mec, pipe, queue_id, 0); -@@ -216,21 +335,28 @@ static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) - uint32_t mec; - uint32_t pipe; - -- mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; -+ mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; - pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); - - lock_srbm(kgd, mec, pipe, 0, 0); - -- WREG32(mmCPC_INT_CNTL, CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK); -+ WREG32(mmCPC_INT_CNTL, CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | -+ CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); - - unlock_srbm(kgd); - - return 0; - } - --static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m) -+static inline uint32_t get_sdma_base_addr(struct vi_sdma_mqd *m) - { -- return 0; -+ uint32_t retval; -+ -+ retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET + -+ m->sdma_queue_id * KFD_VI_SDMA_QUEUE_OFFSET; -+ pr_debug("sdma base address: 0x%x\n", retval); -+ -+ return retval; - } - - static inline struct vi_mqd *get_mqd(void *mqd) -@@ -238,9 +364,9 @@ static inline struct vi_mqd *get_mqd(void *mqd) - return (struct vi_mqd *)mqd; - } - --static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) -+static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) - { -- return (struct cik_sdma_rlc_registers *)mqd; -+ return (struct vi_sdma_mqd *)mqd; - } - - static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, -@@ -252,16 +378,18 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - struct vi_mqd *m; - uint32_t *mqd_hqd; - uint32_t reg, wptr_val, data; -+ bool valid_wptr = false; - - m = get_mqd(mqd); - - acquire_queue(kgd, pipe_id, queue_id); -- /*HIQ is set during driver init period with vmid set to 0. For SRIOV -- * world switching support let the RLC know about the HIQ. -- * -- * Workaround: This causes reboots on CZ. Disable this on CZ, which -- * doesn't support SRIOV anyway. -- */ -+ -+ /* HIQ is set during driver init period with vmid set to 0. 
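Every per-queue register access in these files is bracketed by acquire_queue()/release_queue(), which take adev->srbm_mutex and select the MEC/pipe/queue in SRBM before any CP_HQD_* register is touched. A sketch of the bracketing, modeled on kgd_hqd_is_occupied() further down; the helper name is hypothetical:

/* Sketch: read a per-queue register safely. acquire_queue() locks
 * srbm_mutex and programs SRBM to address this MEC/pipe/queue;
 * release_queue() restores the selection and drops the lock.
 */
static uint32_t read_hqd_active(struct kgd_dev *kgd, uint32_t pipe_id,
				uint32_t queue_id)
{
	uint32_t active;

	acquire_queue(kgd, pipe_id, queue_id);
	active = RREG32(mmCP_HQD_ACTIVE);	/* queue-relative register */
	release_queue(kgd);

	return active;
}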
For SRIOV -+ * world switching support let the RLC know about the HIQ. -+ * -+ * Workaround: This causes reboots on CZ. Disable this on CZ, which -+ * doesn't support SRIOV anyway. -+ */ - if (m->cp_hqd_vmid == 0 && - adev->asic_type != CHIP_CARRIZO) { - uint32_t value, mec, pipe; -@@ -304,7 +432,14 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); - WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, data); - -- if (read_user_wptr(mm, wptr, wptr_val)) -+ /* read_user_ptr may take the mm->mmap_sem. -+ * release srbm_mutex to avoid circular dependency between -+ * srbm_mutex->mm_sem->reservation_ww_class_mutex->srbm_mutex. -+ */ -+ release_queue(kgd); -+ valid_wptr = read_user_wptr(mm, wptr, wptr_val); -+ acquire_queue(kgd, pipe_id, queue_id); -+ if (valid_wptr) - WREG32(mmCP_HQD_PQ_WPTR, (wptr_val << wptr_shift) & wptr_mask); - - data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); -@@ -315,8 +450,138 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - return 0; - } - --static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) -+static int kgd_hqd_dump(struct kgd_dev *kgd, -+ uint32_t pipe_id, uint32_t queue_id, -+ uint32_t (**dump)[2], uint32_t *n_regs) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ uint32_t i = 0, reg; -+#define HQD_N_REGS (54+4) -+#define DUMP_REG(addr) do { \ -+ if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ -+ break; \ -+ (*dump)[i][0] = (addr) << 2; \ -+ (*dump)[i++][1] = RREG32(addr); \ -+ } while (0) -+ -+ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); -+ if (*dump == NULL) -+ return -ENOMEM; -+ -+ acquire_queue(kgd, pipe_id, queue_id); -+ -+ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE0); -+ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE1); -+ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE2); -+ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE3); -+ -+ for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_HQD_EOP_DONES; reg++) -+ DUMP_REG(reg); -+ -+ release_queue(kgd); -+ -+ WARN_ON_ONCE(i != HQD_N_REGS); -+ *n_regs = i; -+ -+ return 0; -+} -+ -+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, -+ uint32_t __user *wptr, struct mm_struct *mm) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ struct vi_sdma_mqd *m; -+ uint32_t sdma_base_addr; -+ uint32_t temp, timeout = 2000; -+ uint32_t data; -+ -+ m = get_sdma_mqd(mqd); -+ sdma_base_addr = get_sdma_base_addr(m); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, -+ m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); -+ -+ while (true) { -+ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); -+ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) -+ break; -+ if (timeout == 0) -+ return -ETIME; -+ msleep(10); -+ timeout -= 10; -+ } -+ if (m->sdma_engine_id) { -+ data = RREG32(mmSDMA1_GFX_CONTEXT_CNTL); -+ data = REG_SET_FIELD(data, SDMA1_GFX_CONTEXT_CNTL, -+ RESUME_CTX, 0); -+ WREG32(mmSDMA1_GFX_CONTEXT_CNTL, data); -+ } else { -+ data = RREG32(mmSDMA0_GFX_CONTEXT_CNTL); -+ data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, -+ RESUME_CTX, 0); -+ WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data); -+ } -+ -+ data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL, -+ ENABLE, 1); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr); -+ -+ if (read_user_wptr(mm, wptr, data)) -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, data); -+ else -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, -+ m->sdmax_rlcx_rb_rptr); -+ -+ 
WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, -+ m->sdmax_rlcx_virtual_addr); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, -+ m->sdmax_rlcx_rb_base_hi); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, -+ m->sdmax_rlcx_rb_rptr_addr_lo); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, -+ m->sdmax_rlcx_rb_rptr_addr_hi); -+ -+ data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL, -+ RB_ENABLE, 1); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data); -+ -+ return 0; -+} -+ -+static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, -+ uint32_t engine_id, uint32_t queue_id, -+ uint32_t (**dump)[2], uint32_t *n_regs) - { -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ uint32_t sdma_offset = engine_id * SDMA1_REGISTER_OFFSET + -+ queue_id * KFD_VI_SDMA_QUEUE_OFFSET; -+ uint32_t i = 0, reg; -+#undef HQD_N_REGS -+#define HQD_N_REGS (19+4+2+3+7) -+ -+ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); -+ if (*dump == NULL) -+ return -ENOMEM; -+ -+ for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) -+ DUMP_REG(sdma_offset + reg); -+ for (reg = mmSDMA0_RLC0_VIRTUAL_ADDR; reg <= mmSDMA0_RLC0_WATERMARK; -+ reg++) -+ DUMP_REG(sdma_offset + reg); -+ for (reg = mmSDMA0_RLC0_CSA_ADDR_LO; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; -+ reg++) -+ DUMP_REG(sdma_offset + reg); -+ for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; reg <= mmSDMA0_RLC0_DUMMY_REG; -+ reg++) -+ DUMP_REG(sdma_offset + reg); -+ for (reg = mmSDMA0_RLC0_MIDCMD_DATA0; reg <= mmSDMA0_RLC0_MIDCMD_CNTL; -+ reg++) -+ DUMP_REG(sdma_offset + reg); -+ -+ WARN_ON_ONCE(i != HQD_N_REGS); -+ *n_regs = i; -+ - return 0; - } - -@@ -345,7 +610,7 @@ static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, - static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) - { - struct amdgpu_device *adev = get_amdgpu_device(kgd); -- struct cik_sdma_rlc_registers *m; -+ struct vi_sdma_mqd *m; - uint32_t sdma_base_addr; - uint32_t sdma_rlc_rb_cntl; - -@@ -360,29 +625,102 @@ static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) - return false; - } - --static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, -+static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, -+ enum kfd_preempt_type reset_type, - unsigned int utimeout, uint32_t pipe_id, - uint32_t queue_id) - { - struct amdgpu_device *adev = get_amdgpu_device(kgd); - uint32_t temp; -- int timeout = utimeout; -+ enum hqd_dequeue_request_type type; -+ unsigned long flags, end_jiffies; -+ int retry; -+ struct vi_mqd *m = get_mqd(mqd); - - acquire_queue(kgd, pipe_id, queue_id); - -- WREG32(mmCP_HQD_DEQUEUE_REQUEST, reset_type); -+ if (m->cp_hqd_vmid == 0) -+ WREG32_FIELD(RLC_CP_SCHEDULERS, scheduler1, 0); - -+ switch (reset_type) { -+ case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: -+ type = DRAIN_PIPE; -+ break; -+ case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: -+ type = RESET_WAVES; -+ break; -+ default: -+ type = DRAIN_PIPE; -+ break; -+ } -+ -+ /* Workaround: If IQ timer is active and the wait time is close to or -+ * equal to 0, dequeueing is not safe. Wait until either the wait time -+ * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is -+ * cleared before continuing. Also, ensure wait times are set to at -+ * least 0x3. 
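The new hqd_dump/hqd_sdma_dump callbacks hand back a kmalloc'd array of {byte offset, value} pairs; DUMP_REG stores the dword offset shifted left by 2, so each entry carries a byte address. A sketch of a consumer, assuming the caller owns and frees the buffer as the KFD debugfs code does:

#include <linux/printk.h>
#include <linux/slab.h>

/* Sketch: walk a dump produced by the hqd_sdma_dump() above.
 * dump[i][0] is the register's byte address (dword offset << 2),
 * dump[i][1] the value read through RREG32().
 */
static void print_hqd_dump(uint32_t (*dump)[2], uint32_t n_regs)
{
	uint32_t i;

	for (i = 0; i < n_regs; i++)
		pr_info("reg 0x%05x = 0x%08x\n", dump[i][0], dump[i][1]);

	kfree(dump);	/* buffer was kmalloc'd by the dump callback */
}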
-+ */ -+ local_irq_save(flags); -+ preempt_disable(); -+ retry = 5000; /* wait for 500 usecs at maximum */ -+ while (true) { -+ temp = RREG32(mmCP_HQD_IQ_TIMER); -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) { -+ pr_debug("HW is processing IQ\n"); -+ goto loop; -+ } -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) { -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE) -+ == 3) /* SEM-rearm is safe */ -+ break; -+ /* Wait time 3 is safe for CP, but our MMIO read/write -+ * time is close to 1 microsecond, so check for 10 to -+ * leave more buffer room -+ */ -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME) -+ >= 10) -+ break; -+ pr_debug("IQ timer is active\n"); -+ } else -+ break; -+loop: -+ if (!retry) { -+ pr_err("CP HQD IQ timer status time out\n"); -+ break; -+ } -+ ndelay(100); -+ --retry; -+ } -+ retry = 1000; -+ while (true) { -+ temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST); -+ if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK)) -+ break; -+ pr_debug("Dequeue request is pending\n"); -+ -+ if (!retry) { -+ pr_err("CP HQD dequeue request time out\n"); -+ break; -+ } -+ ndelay(100); -+ --retry; -+ } -+ local_irq_restore(flags); -+ preempt_enable(); -+ -+ WREG32(mmCP_HQD_DEQUEUE_REQUEST, type); -+ -+ end_jiffies = (utimeout * HZ / 1000) + jiffies; - while (true) { - temp = RREG32(mmCP_HQD_ACTIVE); -- if (temp & CP_HQD_ACTIVE__ACTIVE_MASK) -+ if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) - break; -- if (timeout <= 0) { -- pr_err("kfd: cp queue preemption time out.\n"); -+ if (time_after(jiffies, end_jiffies)) { -+ pr_err("cp queue preemption time out.\n"); - release_queue(kgd); - return -ETIME; - } -- msleep(20); -- timeout -= 20; -+ usleep_range(500, 1000); - } - - release_queue(kgd); -@@ -393,10 +731,10 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - unsigned int utimeout) - { - struct amdgpu_device *adev = get_amdgpu_device(kgd); -- struct cik_sdma_rlc_registers *m; -+ struct vi_sdma_mqd *m; - uint32_t sdma_base_addr; - uint32_t temp; -- int timeout = utimeout; -+ unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; - - m = get_sdma_mqd(mqd); - sdma_base_addr = get_sdma_base_addr(m); -@@ -407,18 +745,19 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - - while (true) { - temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); -- if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT) -+ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) - break; -- if (timeout <= 0) -+ if (time_after(jiffies, end_jiffies)) - return -ETIME; -- msleep(20); -- timeout -= 20; -+ usleep_range(500, 1000); - } - - WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, 0); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, -+ RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | -+ SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); -+ -+ m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); - - return 0; - } -@@ -440,7 +779,7 @@ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - - reg = RREG32(mmATC_VMID0_PASID_MAPPING + vmid); -- return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; -+ return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; - } - - static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) -@@ -450,8 +789,83 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) - 
WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid);
- }
- 
-+/*
-+ * FIXME: Polaris test failed with this packet, FIJI works fine
-+ * Per the CP spec, invalidation with a specified pasid in the
-+ * packet is not officially supported, so disable it for V8
-+ *
-+ */
-+#ifdef V8_SUPPORT_IT_OFFICIAL
-+static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid)
-+{
-+ signed long r;
-+ struct dma_fence *f;
-+ struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
-+
-+ mutex_lock(&adev->gfx.kiq.ring_mutex);
-+ amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs packet */
-+ amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0));
-+ amdgpu_ring_write(ring,
-+ PACKET3_INVALIDATE_TLBS_DST_SEL(1) |
-+ PACKET3_INVALIDATE_TLBS_PASID(pasid));
-+ amdgpu_fence_emit(ring, &f);
-+ amdgpu_ring_commit(ring);
-+ mutex_unlock(&adev->gfx.kiq.ring_mutex);
-+
-+ r = dma_fence_wait(f, false);
-+ if (r)
-+ DRM_ERROR("wait for kiq fence error: %ld.\n", r);
-+ dma_fence_put(f);
-+
-+ return r;
-+}
-+#endif
-+static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid)
-+{
-+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
-+ int vmid;
-+
-+#ifdef V8_SUPPORT_IT_OFFICIAL
-+ struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
-+
-+ if (ring->ready)
-+ return invalidate_tlbs_with_kiq(adev, pasid);
-+#endif
-+
-+ for (vmid = 0; vmid < 16; vmid++) {
-+ if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid))
-+ continue;
-+ if (RREG32(mmATC_VMID0_PASID_MAPPING + vmid) &
-+ ATC_VMID0_PASID_MAPPING__VALID_MASK) {
-+ if ((RREG32(mmATC_VMID0_PASID_MAPPING + vmid) &
-+ ATC_VMID0_PASID_MAPPING__PASID_MASK) == pasid) {
-+ WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid);
-+ break;
-+ }
-+ }
-+ }
-+
-+ return 0;
-+}
-+
- static int kgd_address_watch_disable(struct kgd_dev *kgd)
- {
-+ struct amdgpu_device *adev = get_amdgpu_device(kgd);
-+ union TCP_WATCH_CNTL_BITS cntl;
-+ unsigned int i;
-+
-+ cntl.u32All = 0;
-+
-+ cntl.bitfields.valid = 0;
-+ cntl.bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK;
-+ cntl.bitfields.atc = 1;
-+
-+ /* Turning off this address until we set all the registers */
-+ for (i = 0; i < MAX_WATCH_ADDRESSES; i++)
-+ WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX
-+ + ADDRESS_WATCH_REG_CNTL],
-+ cntl.u32All);
-+
- return 0;
- }
- 
-@@ -461,6 +875,32 @@ static int kgd_address_watch_execute(struct kgd_dev *kgd,
- uint32_t addr_hi,
- uint32_t addr_lo)
- {
-+ struct amdgpu_device *adev = get_amdgpu_device(kgd);
-+ union TCP_WATCH_CNTL_BITS cntl;
-+
-+ cntl.u32All = cntl_val;
-+
-+ /* Turning off this watch point until we set all the registers */
-+ cntl.bitfields.valid = 0;
-+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX
-+ + ADDRESS_WATCH_REG_CNTL],
-+ cntl.u32All);
-+
-+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX
-+ + ADDRESS_WATCH_REG_ADDR_HI],
-+ addr_hi);
-+
-+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX
-+ + ADDRESS_WATCH_REG_ADDR_LO],
-+ addr_lo);
-+
-+ /* Enable the watch point */
-+ cntl.bitfields.valid = 1;
-+
-+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX
-+ + ADDRESS_WATCH_REG_CNTL],
-+ cntl.u32All);
-+
- return 0;
- }
- 
-@@ -493,6 +933,32 @@ static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd,
- unsigned int watch_point_id,
- unsigned int reg_offset)
- {
-+ return watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + reg_offset];
-+}
-+
-+static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable,
-+ uint8_t element_size, uint8_t index_stride, uint8_t mtype)
-+{
-+ uint32_t reg;
-+ struct amdgpu_device *adev 
= (struct amdgpu_device *) kgd; -+ -+ reg = swizzle_enable << SH_STATIC_MEM_CONFIG__SWIZZLE_ENABLE__SHIFT | -+ element_size << SH_STATIC_MEM_CONFIG__ELEMENT_SIZE__SHIFT | -+ index_stride << SH_STATIC_MEM_CONFIG__INDEX_STRIDE__SHIFT | -+ mtype << SH_STATIC_MEM_CONFIG__PRIVATE_MTYPE__SHIFT; -+ -+ WREG32(mmSH_STATIC_MEM_CONFIG, reg); -+ return 0; -+} -+static int alloc_memory_of_scratch(struct kgd_dev *kgd, -+ uint64_t va, uint32_t vmid) -+{ -+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -+ -+ lock_srbm(kgd, 0, 0, 0, vmid); -+ WREG32(mmSH_HIDDEN_PRIVATE_BASE_VMID, va); -+ unlock_srbm(kgd); -+ - return 0; - } - -@@ -501,47 +967,45 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - const union amdgpu_firmware_header *hdr; - -- BUG_ON(kgd == NULL); -- - switch (type) { - case KGD_ENGINE_PFP: - hdr = (const union amdgpu_firmware_header *) -- adev->gfx.pfp_fw->data; -+ adev->gfx.pfp_fw->data; - break; - - case KGD_ENGINE_ME: - hdr = (const union amdgpu_firmware_header *) -- adev->gfx.me_fw->data; -+ adev->gfx.me_fw->data; - break; - - case KGD_ENGINE_CE: - hdr = (const union amdgpu_firmware_header *) -- adev->gfx.ce_fw->data; -+ adev->gfx.ce_fw->data; - break; - - case KGD_ENGINE_MEC1: - hdr = (const union amdgpu_firmware_header *) -- adev->gfx.mec_fw->data; -+ adev->gfx.mec_fw->data; - break; - - case KGD_ENGINE_MEC2: - hdr = (const union amdgpu_firmware_header *) -- adev->gfx.mec2_fw->data; -+ adev->gfx.mec2_fw->data; - break; - - case KGD_ENGINE_RLC: - hdr = (const union amdgpu_firmware_header *) -- adev->gfx.rlc_fw->data; -+ adev->gfx.rlc_fw->data; - break; - - case KGD_ENGINE_SDMA1: - hdr = (const union amdgpu_firmware_header *) -- adev->sdma.instance[0].fw->data; -+ adev->sdma.instance[0].fw->data; - break; - - case KGD_ENGINE_SDMA2: - hdr = (const union amdgpu_firmware_header *) -- adev->sdma.instance[1].fw->data; -+ adev->sdma.instance[1].fw->data; - break; - - default: -@@ -554,3 +1018,21 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) - /* Only 12 bit in use*/ - return hdr->common.ucode_version; - } -+ -+static void set_num_of_requests(struct kgd_dev *kgd, -+ uint8_t num_of_requests) -+{ -+ pr_debug("This is a stub\n"); -+} -+ -+static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, -+ uint32_t page_table_base) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ /* TODO: Don't use hardcoded VMIDs */ -+ if (vmid < 8 || vmid > 15) { -+ pr_err("trying to set page table base for wrong VMID\n"); -+ return; -+ } -+ WREG32(mmVM_CONTEXT8_PAGE_TABLE_BASE_ADDR + vmid - 8, page_table_base); -+} -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h -new file mode 100644 -index 0000000..3c94919 ---- /dev/null -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h -@@ -0,0 +1,62 @@ -+/* -+ * Copyright 2015 Advanced Micro Devices, Inc. 
-+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. -+ */ -+ -+#ifndef AMDGPU_AMDKFD_GFX_V8_H_INCLUDED -+#define AMDGPU_AMDKFD_GFX_V8_H_INCLUDED -+ -+#include <linux/types.h> -+ -+enum { -+ MAX_TRAPID = 8, /* 3 bits in the bitfield. */ -+ MAX_WATCH_ADDRESSES = 4 -+}; -+ -+enum { -+ ADDRESS_WATCH_REG_ADDR_HI = 0, -+ ADDRESS_WATCH_REG_ADDR_LO, -+ ADDRESS_WATCH_REG_CNTL, -+ ADDRESS_WATCH_REG_MAX -+}; -+ -+/* not defined in the VI reg file */ -+enum { -+ ADDRESS_WATCH_REG_CNTL_ATC_BIT = 0x10000000UL, -+ ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK = 0x00FFFFFF, -+ ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENSION = 0x03000000, -+ /* extend the mask to 26 bits in order to match the low address field */ -+ ADDRESS_WATCH_REG_ADDLOW_SHIFT = 6, -+ ADDRESS_WATCH_REG_ADDHIGH_MASK = 0xFFFF -+}; -+ -+union TCP_WATCH_CNTL_BITS { -+ struct { -+ uint32_t mask:24; -+ uint32_t vmid:4; -+ uint32_t atc:1; -+ uint32_t mode:2; -+ uint32_t valid:1; -+ } bitfields, bits; -+ uint32_t u32All; -+ signed int i32All; -+ float f32All; -+}; -+#endif -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c -new file mode 100644 -index 0000000..edbae19 ---- /dev/null -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c -@@ -0,0 +1,1227 @@ -+/* -+ * Copyright 2014 Advanced Micro Devices, Inc. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. 
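The TCP_WATCH_CNTL_BITS union in the gfx_v8 header above overlays named bitfields on the raw 32-bit register image, so one field can be flipped and the whole word written back. A sketch of the disarm/program/re-arm sequence it enables, as used by the v8 kgd_address_watch_execute(); the helper name is hypothetical and the address programming is elided:

/* Sketch: toggle only the valid bit while preserving the caller's
 * cntl_val; see kgd_address_watch_execute() for the full sequence.
 */
static void arm_watch_cntl(struct amdgpu_device *adev, uint32_t cntl_reg,
			   uint32_t cntl_val)
{
	union TCP_WATCH_CNTL_BITS cntl;

	cntl.u32All = cntl_val;

	cntl.bitfields.valid = 0;	/* disarm while reprogramming */
	WREG32(cntl_reg, cntl.u32All);

	/* ...write ADDR_HI/ADDR_LO for the watch point here... */

	cntl.bitfields.valid = 1;	/* re-arm with the same settings */
	WREG32(cntl_reg, cntl.u32All);
}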
-+ */ -+#undef pr_fmt -+#define pr_fmt(fmt) "kfd2kgd: " fmt -+ -+#include <linux/module.h> -+#include <linux/fdtable.h> -+#include <linux/uaccess.h> -+#include <linux/firmware.h> -+#include <drm/drmP.h> -+#include "amdgpu.h" -+#include "amdgpu_amdkfd.h" -+#include "amdgpu_ucode.h" -+#include "amdgpu_amdkfd_gfx_v8.h" -+#include "vega10/soc15ip.h" -+#include "vega10/GC/gc_9_0_offset.h" -+#include "vega10/GC/gc_9_0_sh_mask.h" -+#include "vega10/vega10_enum.h" -+#include "vega10/SDMA0/sdma0_4_0_offset.h" -+#include "vega10/SDMA0/sdma0_4_0_sh_mask.h" -+#include "vega10/SDMA1/sdma1_4_0_offset.h" -+#include "vega10/SDMA1/sdma1_4_0_sh_mask.h" -+#include "vega10/ATHUB/athub_1_0_offset.h" -+#include "vega10/ATHUB/athub_1_0_sh_mask.h" -+#include "vega10/OSSSYS/osssys_4_0_offset.h" -+#include "vega10/OSSSYS/osssys_4_0_sh_mask.h" -+#include "soc15_common.h" -+#include "v9_structs.h" -+#include "soc15.h" -+#include "soc15d.h" -+ -+/* HACK: MMHUB and GC both have VM-related register with the same -+ * names but different offsets. Define the MMHUB register we need here -+ * with a prefix. A proper solution would be to move the functions -+ * programming these registers into gfx_v9_0.c and mmhub_v1_0.c -+ * respectively. -+ */ -+#define mmMMHUB_VM_INVALIDATE_ENG16_REQ 0x06f3 -+#define mmMMHUB_VM_INVALIDATE_ENG16_REQ_BASE_IDX 0 -+ -+#define mmMMHUB_VM_INVALIDATE_ENG16_ACK 0x0705 -+#define mmMMHUB_VM_INVALIDATE_ENG16_ACK_BASE_IDX 0 -+ -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32 0x072b -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32_BASE_IDX 0 -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32 0x072c -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32_BASE_IDX 0 -+ -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32 0x074b -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32_BASE_IDX 0 -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32 0x074c -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32_BASE_IDX 0 -+ -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32 0x076b -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32_BASE_IDX 0 -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32 0x076c -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32_BASE_IDX 0 -+ -+#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32 0x0727 -+#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32_BASE_IDX 0 -+#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32 0x0728 -+#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32_BASE_IDX 0 -+ -+enum hqd_dequeue_request_type { -+ NO_ACTION = 0, -+ DRAIN_PIPE, -+ RESET_WAVES, -+ SAVE_WAVES -+}; -+ -+static const uint32_t watchRegs[MAX_WATCH_ADDRESSES * ADDRESS_WATCH_REG_MAX] = { -+ mmTCP_WATCH0_ADDR_H, mmTCP_WATCH0_ADDR_L, mmTCP_WATCH0_CNTL, -+ mmTCP_WATCH1_ADDR_H, mmTCP_WATCH1_ADDR_L, mmTCP_WATCH1_CNTL, -+ mmTCP_WATCH2_ADDR_H, mmTCP_WATCH2_ADDR_L, mmTCP_WATCH2_CNTL, -+ mmTCP_WATCH3_ADDR_H, mmTCP_WATCH3_ADDR_L, mmTCP_WATCH3_CNTL -+}; -+ -+ -+static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, -+ void *vm, struct kgd_mem **mem); -+static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem); -+ -+static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, -+ int fd, uint32_t handle, struct kgd_mem **mem); -+ -+static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); -+ -+/* -+ * Register access functions -+ */ -+ -+static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, -+ uint32_t sh_mem_config, -+ uint32_t sh_mem_ape1_base, uint32_t 
sh_mem_ape1_limit, -+ uint32_t sh_mem_bases); -+static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, -+ unsigned int vmid); -+static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, -+ uint32_t hpd_size, uint64_t hpd_gpu_addr); -+static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); -+static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, -+ uint32_t queue_id, uint32_t __user *wptr, -+ uint32_t wptr_shift, uint32_t wptr_mask, -+ struct mm_struct *mm); -+static int kgd_hqd_dump(struct kgd_dev *kgd, -+ uint32_t pipe_id, uint32_t queue_id, -+ uint32_t (**dump)[2], uint32_t *n_regs); -+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, -+ uint32_t __user *wptr, struct mm_struct *mm); -+static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, -+ uint32_t engine_id, uint32_t queue_id, -+ uint32_t (**dump)[2], uint32_t *n_regs); -+static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, -+ uint32_t pipe_id, uint32_t queue_id); -+static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); -+static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, -+ enum kfd_preempt_type reset_type, -+ unsigned int utimeout, uint32_t pipe_id, -+ uint32_t queue_id); -+static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, -+ unsigned int utimeout); -+static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); -+static uint32_t get_watch_base_addr(void); -+static int kgd_address_watch_disable(struct kgd_dev *kgd); -+static int kgd_address_watch_execute(struct kgd_dev *kgd, -+ unsigned int watch_point_id, -+ uint32_t cntl_val, -+ uint32_t addr_hi, -+ uint32_t addr_lo); -+static int kgd_wave_control_execute(struct kgd_dev *kgd, -+ uint32_t gfx_index_val, -+ uint32_t sq_cmd); -+static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, -+ unsigned int watch_point_id, -+ unsigned int reg_offset); -+ -+static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, -+ uint8_t vmid); -+static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, -+ uint8_t vmid); -+static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); -+static void set_num_of_requests(struct kgd_dev *kgd, -+ uint8_t num_of_requests); -+static int alloc_memory_of_scratch(struct kgd_dev *kgd, -+ uint64_t va, uint32_t vmid); -+static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, -+ uint8_t element_size, uint8_t index_stride, uint8_t mtype); -+static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, -+ uint32_t page_table_base); -+static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid); -+ -+/* Because of REG_GET_FIELD() being used, we put this function in the -+ * asic specific file. -+ */ -+static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, -+ struct tile_config *config) -+{ -+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd; -+ -+ config->gb_addr_config = adev->gfx.config.gb_addr_config; -+#if 0 -+/* TODO - confirm REG_GET_FIELD x2, should be OK as is... 
but -+ * MC_ARB_RAMCFG register doesn't exist on Vega10 - initial amdgpu -+ * changes commented out related code, doing the same here for now but -+ * need to sync with Ken et al -+ */ -+ config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, -+ MC_ARB_RAMCFG, NOOFBANK); -+ config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, -+ MC_ARB_RAMCFG, NOOFRANKS); -+#endif -+ -+ config->tile_config_ptr = adev->gfx.config.tile_mode_array; -+ config->num_tile_configs = -+ ARRAY_SIZE(adev->gfx.config.tile_mode_array); -+ config->macro_tile_config_ptr = -+ adev->gfx.config.macrotile_mode_array; -+ config->num_macro_tile_configs = -+ ARRAY_SIZE(adev->gfx.config.macrotile_mode_array); -+ -+ return 0; -+} -+ -+static const struct kfd2kgd_calls kfd2kgd = { -+ .init_gtt_mem_allocation = alloc_gtt_mem, -+ .free_gtt_mem = free_gtt_mem, -+ .get_local_mem_info = get_local_mem_info, -+ .get_gpu_clock_counter = get_gpu_clock_counter, -+ .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, -+ .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm, -+ .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm, -+ .create_process_gpumem = create_process_gpumem, -+ .destroy_process_gpumem = destroy_process_gpumem, -+ .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir, -+ .open_graphic_handle = open_graphic_handle, -+ .program_sh_mem_settings = kgd_program_sh_mem_settings, -+ .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, -+ .init_pipeline = kgd_init_pipeline, -+ .init_interrupts = kgd_init_interrupts, -+ .hqd_load = kgd_hqd_load, -+ .hqd_sdma_load = kgd_hqd_sdma_load, -+ .hqd_dump = kgd_hqd_dump, -+ .hqd_sdma_dump = kgd_hqd_sdma_dump, -+ .hqd_is_occupied = kgd_hqd_is_occupied, -+ .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, -+ .hqd_destroy = kgd_hqd_destroy, -+ .hqd_sdma_destroy = kgd_hqd_sdma_destroy, -+ .address_watch_disable = kgd_address_watch_disable, -+ .address_watch_execute = kgd_address_watch_execute, -+ .wave_control_execute = kgd_wave_control_execute, -+ .address_watch_get_offset = kgd_address_watch_get_offset, -+ .get_atc_vmid_pasid_mapping_pasid = -+ get_atc_vmid_pasid_mapping_pasid, -+ .get_atc_vmid_pasid_mapping_valid = -+ get_atc_vmid_pasid_mapping_valid, -+ .write_vmid_invalidate_request = write_vmid_invalidate_request, -+ .invalidate_tlbs = invalidate_tlbs, -+ .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory, -+ .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, -+ .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, -+ .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, -+ .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu, -+ .get_fw_version = get_fw_version, -+ .set_num_of_requests = set_num_of_requests, -+ .get_cu_info = get_cu_info, -+ .alloc_memory_of_scratch = alloc_memory_of_scratch, -+ .write_config_static_mem = write_config_static_mem, -+ .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo, -+ .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel, -+ .set_vm_context_page_table_base = set_vm_context_page_table_base, -+ .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table, -+ .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table, -+ .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info, -+ .import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf, -+ .export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf, -+ .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info, -+ .submit_ib = amdgpu_amdkfd_submit_ib, -+ .get_tile_config = amdgpu_amdkfd_get_tile_config, -+ .restore_process_bos = 
amdgpu_amdkfd_gpuvm_restore_process_bos, -+ .copy_mem_to_mem = amdgpu_amdkfd_copy_mem_to_mem, -+ .get_vram_usage = amdgpu_amdkfd_get_vram_usage -+}; -+ -+struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions() -+{ -+ return (struct kfd2kgd_calls *)&kfd2kgd; -+} -+ -+static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, -+ void *vm, struct kgd_mem **mem) -+{ -+ return 0; -+} -+ -+/* Destroys the GPU allocation and frees the kgd_mem structure */ -+static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem) -+{ -+ -+} -+ -+static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, -+ int fd, uint32_t handle, struct kgd_mem **mem) -+{ -+ return 0; -+} -+ -+static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) -+{ -+ return (struct amdgpu_device *)kgd; -+} -+ -+static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe, -+ uint32_t queue, uint32_t vmid) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ -+ mutex_lock(&adev->srbm_mutex); -+ soc15_grbm_select(adev, mec, pipe, queue, vmid); -+} -+ -+static void unlock_srbm(struct kgd_dev *kgd) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ -+ soc15_grbm_select(adev, 0, 0, 0, 0); -+ mutex_unlock(&adev->srbm_mutex); -+} -+ -+static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, -+ uint32_t queue_id) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ -+ uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; -+ uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); -+ -+ lock_srbm(kgd, mec, pipe, queue_id, 0); -+} -+ -+static uint32_t get_queue_mask(struct amdgpu_device *adev, -+ uint32_t pipe_id, uint32_t queue_id) -+{ -+ unsigned int bit = (pipe_id * adev->gfx.mec.num_pipe_per_mec + -+ queue_id) & 31; -+ -+ return ((uint32_t)1) << bit; -+} -+ -+static void release_queue(struct kgd_dev *kgd) -+{ -+ unlock_srbm(kgd); -+} -+ -+static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, -+ uint32_t sh_mem_config, -+ uint32_t sh_mem_ape1_base, -+ uint32_t sh_mem_ape1_limit, -+ uint32_t sh_mem_bases) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ -+ lock_srbm(kgd, 0, 0, 0, vmid); -+ -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases); -+ /* APE1 no longer exists on GFX9 */ -+ -+ unlock_srbm(kgd); -+} -+ -+static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, -+ unsigned int vmid) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ -+ /* -+ * We have to assume that there is no outstanding mapping. -+ * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because -+ * a mapping is in progress or because a mapping finished -+ * and the SW cleared it. -+ * So the protocol is to always wait & clear. -+ */ -+ uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | -+ ATC_VMID0_PASID_MAPPING__VALID_MASK; -+ -+ /* -+ * need to do this twice, once for gfx and once for mmhub -+ * for ATC add 16 to VMID for mmhub, for IH different registers. -+ * ATC_VMID0..15 registers are separate from ATC_VMID16..31. 
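On GFX9 the mapping has to land in both copies, and each write is confirmed by the same handshake: spin until the per-VMID bit of ATC_VMID_PASID_MAPPING_UPDATE_STATUS is set, then clear it by writing it back. A sketch of that handshake as a hypothetical helper (bit vmid for the GC/ATC 0..15 copy, vmid + 16 for the MMHUB copy, matching the code below):

/* Sketch: wait for the hardware to latch a VMID<->PASID mapping,
 * then acknowledge it, exactly as set_pasid_vmid_mapping() does
 * twice below.
 */
static void wait_and_ack_mapping(struct amdgpu_device *adev, unsigned int bit)
{
	while (!(RREG32(SOC15_REG_OFFSET(ATHUB, 0,
			mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) &
		 (1U << bit)))
		cpu_relax();	/* update is posted asynchronously */

	WREG32(SOC15_REG_OFFSET(ATHUB, 0,
		mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), 1U << bit);
}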
-+ */ -+ -+ WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid, -+ pasid_mapping); -+ -+ while (!(RREG32(SOC15_REG_OFFSET( -+ ATHUB, 0, -+ mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & -+ (1U << vmid))) -+ cpu_relax(); -+ -+ WREG32(SOC15_REG_OFFSET(ATHUB, 0, -+ mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), -+ 1U << vmid); -+ -+ /* Mapping vmid to pasid also for IH block */ -+ WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid, -+ pasid_mapping); -+ -+ WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid, -+ pasid_mapping); -+ -+ while (!(RREG32(SOC15_REG_OFFSET( -+ ATHUB, 0, -+ mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & -+ (1U << (vmid + 16)))) -+ cpu_relax(); -+ -+ WREG32(SOC15_REG_OFFSET(ATHUB, 0, -+ mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), -+ 1U << (vmid + 16)); -+ -+ /* Mapping vmid to pasid also for IH block */ -+ WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid, -+ pasid_mapping); -+ return 0; -+} -+ -+static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, -+ uint32_t hpd_size, uint64_t hpd_gpu_addr) -+{ -+ /* amdgpu owns the per-pipe state */ -+ return 0; -+} -+ -+/* TODO - RING0 form of field is obsolete, seems to date back to SI -+ * but still works -+ */ -+ -+static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ uint32_t mec; -+ uint32_t pipe; -+ -+ mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; -+ pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); -+ -+ lock_srbm(kgd, mec, pipe, 0, 0); -+ -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmCPC_INT_CNTL), -+ CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | -+ CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); -+ -+ unlock_srbm(kgd); -+ -+ return 0; -+} -+ -+static uint32_t get_sdma_base_addr(unsigned int engine_id, -+ unsigned int queue_id) -+{ -+ static const uint32_t base[2] = { -+ SOC15_REG_OFFSET(SDMA0, 0, -+ mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL, -+ SOC15_REG_OFFSET(SDMA1, 0, -+ mmSDMA1_RLC0_RB_CNTL) - mmSDMA1_RLC0_RB_CNTL -+ }; -+ uint32_t retval; -+ -+ retval = base[engine_id] + queue_id * (mmSDMA0_RLC1_RB_CNTL - -+ mmSDMA0_RLC0_RB_CNTL); -+ -+ pr_debug("sdma base address: 0x%x\n", retval); -+ -+ return retval; -+} -+ -+static uint32_t get_watch_base_addr(void) -+{ -+ uint32_t retval = SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) - -+ mmTCP_WATCH0_ADDR_H; -+ -+ pr_debug("kfd: reg watch base address: 0x%x\n", retval); -+ -+ return retval; -+} -+ -+static inline struct v9_mqd *get_mqd(void *mqd) -+{ -+ return (struct v9_mqd *)mqd; -+} -+ -+static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) -+{ -+ return (struct v9_sdma_mqd *)mqd; -+} -+ -+static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, -+ uint32_t queue_id, uint32_t __user *wptr, -+ uint32_t wptr_shift, uint32_t wptr_mask, -+ struct mm_struct *mm) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ struct v9_mqd *m; -+ uint32_t *mqd_hqd; -+ uint32_t reg, hqd_base, data; -+ -+ m = get_mqd(mqd); -+ -+ acquire_queue(kgd, pipe_id, queue_id); -+ -+ /* HIQ is set during driver init period with vmid set to 0*/ -+ if (m->cp_hqd_vmid == 0) { -+ uint32_t value, mec, pipe; -+ -+ mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; -+ pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); -+ -+ pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n", -+ mec, pipe, queue_id); -+ value = RREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS)); -+ value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1, -+ ((mec << 5) | 
(pipe << 3) | queue_id | 0x80)); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS), value); -+ } -+ -+ /* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */ -+ mqd_hqd = &m->cp_mqd_base_addr_lo; -+ hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); -+ -+ for (reg = hqd_base; -+ reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) -+ WREG32(reg, mqd_hqd[reg - hqd_base]); -+ -+ -+ /* Activate doorbell logic before triggering WPTR poll. */ -+ data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control, -+ CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data); -+ -+ if (wptr) { -+ /* Don't read wptr with get_user because the user -+ * context may not be accessible (if this function -+ * runs in a work queue). Instead trigger a one-shot -+ * polling read from memory in the CP. This assumes -+ * that wptr is GPU-accessible in the queue's VMID via -+ * ATC or SVM. WPTR==RPTR before starting the poll so -+ * the CP starts fetching new commands from the right -+ * place. -+ * -+ * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit -+ * tricky. Assume that the queue didn't overflow. The -+ * number of valid bits in the 32-bit RPTR depends on -+ * the queue size. The remaining bits are taken from -+ * the saved 64-bit WPTR. If the WPTR wrapped, add the -+ * queue size. -+ */ -+ uint32_t queue_size = -+ 2 << REG_GET_FIELD(m->cp_hqd_pq_control, -+ CP_HQD_PQ_CONTROL, QUEUE_SIZE); -+ uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1); -+ -+ if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr) -+ guessed_wptr += queue_size; -+ guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1); -+ guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32; -+ -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO), -+ lower_32_bits(guessed_wptr)); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI), -+ upper_32_bits(guessed_wptr)); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR), -+ lower_32_bits((uint64_t)wptr)); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI), -+ upper_32_bits((uint64_t)wptr)); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1), -+ get_queue_mask(adev, pipe_id, queue_id)); -+ } -+ -+ /* Start the EOP fetcher */ -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR), -+ REG_SET_FIELD(m->cp_hqd_eop_rptr, -+ CP_HQD_EOP_RPTR, INIT_FETCHER, 1)); -+ -+ data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data); -+ -+ release_queue(kgd); -+ -+ return 0; -+} -+ -+static int kgd_hqd_dump(struct kgd_dev *kgd, -+ uint32_t pipe_id, uint32_t queue_id, -+ uint32_t (**dump)[2], uint32_t *n_regs) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ uint32_t i = 0, reg; -+#define HQD_N_REGS 56 -+#define DUMP_REG(addr) do { \ -+ if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ -+ break; \ -+ (*dump)[i][0] = (addr) << 2; \ -+ (*dump)[i++][1] = RREG32(addr); \ -+ } while (0) -+ -+ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); -+ if (*dump == NULL) -+ return -ENOMEM; -+ -+ acquire_queue(kgd, pipe_id, queue_id); -+ -+ for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); -+ reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) -+ DUMP_REG(reg); -+ -+ release_queue(kgd); -+ -+ WARN_ON_ONCE(i != HQD_N_REGS); -+ *n_regs = i; -+ -+ return 0; -+} -+ -+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, -+ uint32_t __user *wptr, struct mm_struct *mm) -+{ -+ struct amdgpu_device *adev = 
get_amdgpu_device(kgd); -+ struct v9_sdma_mqd *m; -+ uint32_t sdma_base_addr, sdmax_gfx_context_cntl; -+ uint32_t temp, timeout = 2000; -+ uint32_t data; -+ uint64_t data64; -+ uint64_t __user *wptr64 = (uint64_t __user *)wptr; -+ -+ m = get_sdma_mqd(mqd); -+ sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id, -+ m->sdma_queue_id); -+ sdmax_gfx_context_cntl = m->sdma_engine_id ? -+ SOC15_REG_OFFSET(SDMA1, 0, mmSDMA1_GFX_CONTEXT_CNTL) : -+ SOC15_REG_OFFSET(SDMA0, 0, mmSDMA0_GFX_CONTEXT_CNTL); -+ -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, -+ m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); -+ -+ while (true) { -+ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); -+ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) -+ break; -+ if (timeout == 0) -+ return -ETIME; -+ msleep(10); -+ timeout -= 10; -+ } -+ data = RREG32(sdmax_gfx_context_cntl); -+ data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, -+ RESUME_CTX, 0); -+ WREG32(sdmax_gfx_context_cntl, data); -+ -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL_OFFSET, -+ m->sdmax_rlcx_doorbell_offset); -+ -+ data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL, -+ ENABLE, 1); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI, -+ m->sdmax_rlcx_rb_rptr_hi); -+ -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1); -+ if (read_user_wptr(mm, wptr64, data64)) { -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, -+ lower_32_bits(data64)); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI, -+ upper_32_bits(data64)); -+ } else { -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, -+ m->sdmax_rlcx_rb_rptr); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI, -+ m->sdmax_rlcx_rb_rptr_hi); -+ } -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0); -+ -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, -+ m->sdmax_rlcx_rb_base_hi); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, -+ m->sdmax_rlcx_rb_rptr_addr_lo); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, -+ m->sdmax_rlcx_rb_rptr_addr_hi); -+ -+ data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL, -+ RB_ENABLE, 1); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data); -+ -+ return 0; -+} -+ -+static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, -+ uint32_t engine_id, uint32_t queue_id, -+ uint32_t (**dump)[2], uint32_t *n_regs) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ uint32_t sdma_base_addr = get_sdma_base_addr(engine_id, queue_id); -+ uint32_t i = 0, reg; -+#undef HQD_N_REGS -+#define HQD_N_REGS (19+6+7+10) -+ -+ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); -+ if (*dump == NULL) -+ return -ENOMEM; -+ -+ for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) -+ DUMP_REG(sdma_base_addr + reg); -+ for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++) -+ DUMP_REG(sdma_base_addr + reg); -+ for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; -+ reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++) -+ DUMP_REG(sdma_base_addr + reg); -+ for (reg = mmSDMA0_RLC0_MIDCMD_DATA0; -+ reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++) -+ DUMP_REG(sdma_base_addr + reg); -+ -+ WARN_ON_ONCE(i != HQD_N_REGS); -+ *n_regs = i; -+ -+ return 0; -+} -+ -+static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, -+ uint32_t pipe_id, uint32_t queue_id) -+{ -+ struct amdgpu_device *adev = 
get_amdgpu_device(kgd); -+ uint32_t act; -+ bool retval = false; -+ uint32_t low, high; -+ -+ acquire_queue(kgd, pipe_id, queue_id); -+ act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); -+ if (act) { -+ low = lower_32_bits(queue_address >> 8); -+ high = upper_32_bits(queue_address >> 8); -+ -+ if (low == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE)) && -+ high == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE_HI))) -+ retval = true; -+ } -+ release_queue(kgd); -+ return retval; -+} -+ -+static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ struct v9_sdma_mqd *m; -+ uint32_t sdma_base_addr; -+ uint32_t sdma_rlc_rb_cntl; -+ -+ m = get_sdma_mqd(mqd); -+ sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id, -+ m->sdma_queue_id); -+ -+ sdma_rlc_rb_cntl = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL); -+ -+ if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK) -+ return true; -+ -+ return false; -+} -+ -+static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, -+ enum kfd_preempt_type reset_type, -+ unsigned int utimeout, uint32_t pipe_id, -+ uint32_t queue_id) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ enum hqd_dequeue_request_type type; -+ unsigned long end_jiffies; -+ uint32_t temp; -+ struct v9_mqd *m = get_mqd(mqd); -+ -+#if 0 -+ unsigned long flags; -+ int retry; -+#endif -+ -+ acquire_queue(kgd, pipe_id, queue_id); -+ -+ if (m->cp_hqd_vmid == 0) -+ WREG32_FIELD15(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0); -+ -+ switch (reset_type) { -+ case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: -+ type = DRAIN_PIPE; -+ break; -+ case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: -+ type = RESET_WAVES; -+ break; -+ default: -+ type = DRAIN_PIPE; -+ break; -+ } -+ -+#if 0 /* Is this still needed? */ -+ /* Workaround: If IQ timer is active and the wait time is close to or -+ * equal to 0, dequeueing is not safe. Wait until either the wait time -+ * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is -+ * cleared before continuing. Also, ensure wait times are set to at -+ * least 0x3. 
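-+ * As a rough bound on the polling below: retry is 5000 iterations of
-+ * ndelay(100), i.e. at most about 5000 * 100 ns = 500 usecs of
-+ * busy-waiting before the loop gives up.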
-+ */ -+ local_irq_save(flags); -+ preempt_disable(); -+ retry = 5000; /* wait for 500 usecs at maximum */ -+ while (true) { -+ temp = RREG32(mmCP_HQD_IQ_TIMER); -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) { -+ pr_debug("HW is processing IQ\n"); -+ goto loop; -+ } -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) { -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE) -+ == 3) /* SEM-rearm is safe */ -+ break; -+ /* Wait time 3 is safe for CP, but our MMIO read/write -+ * time is close to 1 microsecond, so check for 10 to -+ * leave more buffer room -+ */ -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME) -+ >= 10) -+ break; -+ pr_debug("IQ timer is active\n"); -+ } else -+ break; -+loop: -+ if (!retry) { -+ pr_err("CP HQD IQ timer status time out\n"); -+ break; -+ } -+ ndelay(100); -+ --retry; -+ } -+ retry = 1000; -+ while (true) { -+ temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST); -+ if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK)) -+ break; -+ pr_debug("Dequeue request is pending\n"); -+ -+ if (!retry) { -+ pr_err("CP HQD dequeue request time out\n"); -+ break; -+ } -+ ndelay(100); -+ --retry; -+ } -+ local_irq_restore(flags); -+ preempt_enable(); -+#endif -+ -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type); -+ -+ end_jiffies = (utimeout * HZ / 1000) + jiffies; -+ while (true) { -+ temp = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); -+ if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) -+ break; -+ if (time_after(jiffies, end_jiffies)) { -+ pr_err("cp queue preemption time out.\n"); -+ release_queue(kgd); -+ return -ETIME; -+ } -+ usleep_range(500, 1000); -+ } -+ -+ release_queue(kgd); -+ return 0; -+} -+ -+static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, -+ unsigned int utimeout) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ struct v9_sdma_mqd *m; -+ uint32_t sdma_base_addr; -+ uint32_t temp; -+ unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; -+ -+ m = get_sdma_mqd(mqd); -+ sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id, -+ m->sdma_queue_id); -+ -+ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL); -+ temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK; -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, temp); -+ -+ while (true) { -+ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); -+ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) -+ break; -+ if (time_after(jiffies, end_jiffies)) -+ return -ETIME; -+ usleep_range(500, 1000); -+ } -+ -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, -+ RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | -+ SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); -+ -+ m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); -+ m->sdmax_rlcx_rb_rptr_hi = -+ RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI); -+ -+ return 0; -+} -+ -+static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, -+ uint8_t vmid) -+{ -+ uint32_t reg; -+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -+ -+ reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) -+ + vmid); -+ return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; -+} -+ -+static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, -+ uint8_t vmid) -+{ -+ uint32_t reg; -+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -+ -+ reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) -+ + vmid); -+ return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; -+} -+ -+static void write_vmid_invalidate_request(struct kgd_dev 
*kgd, uint8_t vmid)
-+{
-+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
-+ uint32_t req = (1 << vmid) |
-+ (1 << VM_INVALIDATE_ENG16_REQ__FLUSH_TYPE__SHIFT) | /* light */
-+ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PTES_MASK |
-+ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE0_MASK |
-+ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE1_MASK |
-+ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE2_MASK |
-+ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L1_PTES_MASK;
-+
-+ spin_lock(&adev->tlb_invalidation_lock);
-+
-+ /* Use lightweight invalidation.
-+ *
-+ * TODO 1: agree on the right set of invalidation registers for
-+ * KFD use. Use the last one for now. Invalidate both GC and
-+ * MMHUB.
-+ *
-+ * TODO 2: support range-based invalidation, requires kfd2kgd
-+ * interface change
-+ */
-+ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_LO32),
-+ 0xffffffff);
-+ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_HI32),
-+ 0x0000001f);
-+
-+ WREG32(SOC15_REG_OFFSET(MMHUB, 0,
-+ mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32),
-+ 0xffffffff);
-+ WREG32(SOC15_REG_OFFSET(MMHUB, 0,
-+ mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32),
-+ 0x0000001f);
-+
-+ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_REQ), req);
-+
-+ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_INVALIDATE_ENG16_REQ),
-+ req);
-+
-+ while (!(RREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ACK)) &
-+ (1 << vmid)))
-+ cpu_relax();
-+
-+ while (!(RREG32(SOC15_REG_OFFSET(MMHUB, 0,
-+ mmMMHUB_VM_INVALIDATE_ENG16_ACK)) &
-+ (1 << vmid)))
-+ cpu_relax();
-+
-+ spin_unlock(&adev->tlb_invalidation_lock);
-+
-+}
-+
-+static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid)
-+{
-+ signed long r;
-+ struct dma_fence *f;
-+ struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
-+
-+ mutex_lock(&adev->gfx.kiq.ring_mutex);
-+ amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs package */
-+ amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0));
-+ amdgpu_ring_write(ring,
-+ PACKET3_INVALIDATE_TLBS_DST_SEL(1) |
-+ PACKET3_INVALIDATE_TLBS_ALL_HUB(1) |
-+ PACKET3_INVALIDATE_TLBS_PASID(pasid) |
-+ PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(2));
-+ amdgpu_fence_emit(ring, &f);
-+ amdgpu_ring_commit(ring);
-+ mutex_unlock(&adev->gfx.kiq.ring_mutex);
-+
-+ r = dma_fence_wait(f, false);
-+ if (r)
-+ DRM_ERROR("wait for kiq fence error: %ld.\n", r);
-+ dma_fence_put(f);
-+
-+ return r;
-+}
-+
-+static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid)
-+{
-+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
-+ int vmid;
-+ struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
-+
-+ if (ring->ready)
-+ return invalidate_tlbs_with_kiq(adev, pasid);
-+
-+ for (vmid = 0; vmid < 16; vmid++) {
-+ if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid))
-+ continue;
-+ if (get_atc_vmid_pasid_mapping_valid(kgd, vmid)) {
-+ if (get_atc_vmid_pasid_mapping_pasid(kgd, vmid)
-+ == pasid) {
-+ write_vmid_invalidate_request(kgd, vmid);
-+ break;
-+ }
-+ }
-+ }
-+
-+ return 0;
-+}
-+
-+static int kgd_address_watch_disable(struct kgd_dev *kgd)
-+{
-+ struct amdgpu_device *adev = get_amdgpu_device(kgd);
-+ union TCP_WATCH_CNTL_BITS cntl;
-+ unsigned int i;
-+ uint32_t watch_base_addr;
-+
-+ cntl.u32All = 0;
-+
-+ cntl.bitfields.valid = 0;
-+ cntl.bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK;
-+ cntl.bitfields.atc = 1;
-+
-+ watch_base_addr = get_watch_base_addr();
-+ /* Turning off this address until we set all the registers */
-+ for (i = 0; i < MAX_WATCH_ADDRESSES; i++)
-+ WREG32(watch_base_addr +
-+ watchRegs[i *
ADDRESS_WATCH_REG_MAX + -+ ADDRESS_WATCH_REG_CNTL], -+ cntl.u32All); -+ -+ return 0; -+} -+ -+static int kgd_address_watch_execute(struct kgd_dev *kgd, -+ unsigned int watch_point_id, -+ uint32_t cntl_val, -+ uint32_t addr_hi, -+ uint32_t addr_lo) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ union TCP_WATCH_CNTL_BITS cntl; -+ uint32_t watch_base_addr; -+ -+ watch_base_addr = get_watch_base_addr(); -+ cntl.u32All = cntl_val; -+ -+ /* Turning off this watch point until we set all the registers */ -+ cntl.bitfields.valid = 0; -+ WREG32(watch_base_addr + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], -+ cntl.u32All); -+ -+ WREG32(watch_base_addr + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_HI], -+ addr_hi); -+ -+ WREG32(watch_base_addr + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_LO], -+ addr_lo); -+ -+ /* Enable the watch point */ -+ cntl.bitfields.valid = 1; -+ -+ WREG32(watch_base_addr + -+ watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + -+ ADDRESS_WATCH_REG_CNTL], -+ cntl.u32All); -+ -+ return 0; -+} -+ -+static int kgd_wave_control_execute(struct kgd_dev *kgd, -+ uint32_t gfx_index_val, -+ uint32_t sq_cmd) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ uint32_t data = 0; -+ -+ mutex_lock(&adev->grbm_idx_mutex); -+ -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), gfx_index_val); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CMD), sq_cmd); -+ -+ data = REG_SET_FIELD(data, GRBM_GFX_INDEX, -+ INSTANCE_BROADCAST_WRITES, 1); -+ data = REG_SET_FIELD(data, GRBM_GFX_INDEX, -+ SH_BROADCAST_WRITES, 1); -+ data = REG_SET_FIELD(data, GRBM_GFX_INDEX, -+ SE_BROADCAST_WRITES, 1); -+ -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data); -+ mutex_unlock(&adev->grbm_idx_mutex); -+ -+ return 0; -+} -+ -+static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, -+ unsigned int watch_point_id, -+ unsigned int reg_offset) -+{ -+ return get_watch_base_addr() + -+ watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + reg_offset]; -+} -+ -+static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, -+ uint8_t element_size, uint8_t index_stride, uint8_t mtype) -+{ -+ /* No longer needed on GFXv9. These values are now hard-coded, -+ * except for the MTYPE which comes from the page table. -+ */ -+ -+ return 0; -+} -+static int alloc_memory_of_scratch(struct kgd_dev *kgd, -+ uint64_t va, uint32_t vmid) -+{ -+ /* No longer needed on GFXv9. The scratch base address is -+ * passed to the shader by the CP. It's the user mode driver's -+ * responsibility. -+ */ -+ -+ return 0; -+} -+ -+/* FIXME: Does this need to be ASIC-specific code? 
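Probably not, since the code below only reads the common header fields shared by every engine firmware, so a single generic implementation would likely suffice.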
*/ -+static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) -+{ -+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -+ const union amdgpu_firmware_header *hdr; -+ -+ switch (type) { -+ case KGD_ENGINE_PFP: -+ hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data; -+ break; -+ -+ case KGD_ENGINE_ME: -+ hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data; -+ break; -+ -+ case KGD_ENGINE_CE: -+ hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data; -+ break; -+ -+ case KGD_ENGINE_MEC1: -+ hdr = (const union amdgpu_firmware_header *)adev->gfx.mec_fw->data; -+ break; -+ -+ case KGD_ENGINE_MEC2: -+ hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data; -+ break; -+ -+ case KGD_ENGINE_RLC: -+ hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data; -+ break; -+ -+ case KGD_ENGINE_SDMA1: -+ hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data; -+ break; -+ -+ case KGD_ENGINE_SDMA2: -+ hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data; -+ break; -+ -+ default: -+ return 0; -+ } -+ -+ if (hdr == NULL) -+ return 0; -+ -+ /* Only 12 bit in use*/ -+ return hdr->common.ucode_version; -+} -+ -+static void set_num_of_requests(struct kgd_dev *kgd, -+ uint8_t num_of_requests) -+{ -+ pr_debug("This is a stub\n"); -+} -+ -+static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, -+ uint32_t page_table_base) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ uint64_t base = (uint64_t)page_table_base << PAGE_SHIFT | -+ AMDGPU_PTE_VALID; -+ -+ /* TODO: Don't use hardcoded VMIDs */ -+ if (vmid < 8 || vmid > 15) { -+ pr_err("trying to set page table base for wrong VMID %u\n", -+ vmid); -+ return; -+ } -+ -+ /* TODO: take advantage of per-process address space size. For -+ * now, all processes share the same address space size, like -+ * on GFX8 and older. 
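-+ * Note the (vmid*2) stride in the register offsets below: each VM
-+ * context owns a LO32/HI32 pair of registers, and the same base and
-+ * 0..max_pfn-1 range is programmed into both the MMHUB and GC copies.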
-+ */ -+ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0); -+ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0); -+ -+ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2), -+ lower_32_bits(adev->vm_manager.max_pfn - 1)); -+ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2), -+ upper_32_bits(adev->vm_manager.max_pfn - 1)); -+ -+ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base)); -+ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base)); -+ -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0); -+ -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2), -+ lower_32_bits(adev->vm_manager.max_pfn - 1)); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2), -+ upper_32_bits(adev->vm_manager.max_pfn - 1)); -+ -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base)); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base)); -+} -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c -new file mode 100644 -index 0000000..7df892d ---- /dev/null -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c -@@ -0,0 +1,2578 @@ -+/* -+ * Copyright 2014 Advanced Micro Devices, Inc. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. -+ */ -+ -+#undef pr_fmt -+#define pr_fmt(fmt) "kfd2kgd: " fmt -+ -+#include <linux/module.h> -+#include <linux/fdtable.h> -+#include <linux/uaccess.h> -+#include <linux/firmware.h> -+#include <linux/list.h> -+#include <linux/sched/mm.h> -+#include <drm/drmP.h> -+#include <linux/dma-buf.h> -+#include <linux/pagemap.h> -+#include "amdgpu_amdkfd.h" -+#include "amdgpu_ucode.h" -+#include "gca/gfx_8_0_sh_mask.h" -+#include "gca/gfx_8_0_d.h" -+#include "gca/gfx_8_0_enum.h" -+#include "oss/oss_3_0_sh_mask.h" -+#include "oss/oss_3_0_d.h" -+#include "gmc/gmc_8_1_sh_mask.h" -+#include "gmc/gmc_8_1_d.h" -+ -+/* Special VM and GART address alignment needed for VI pre-Fiji due to -+ * a HW bug. 
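-+ * (0x8000 below corresponds to a 32KB alignment.)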
-+ */
-+#define VI_BO_SIZE_ALIGN (0x8000)
-+
-+/* BO flag to indicate a KFD userptr BO */
-+#define AMDGPU_AMDKFD_USERPTR_BO (1ULL << 63)
-+
-+/* Impose limit on how much memory KFD can use */
-+struct kfd_mem_usage_limit {
-+ uint64_t max_system_mem_limit;
-+ uint64_t max_userptr_mem_limit;
-+ int64_t system_mem_used;
-+ int64_t userptr_mem_used;
-+ spinlock_t mem_limit_lock;
-+};
-+
-+static struct kfd_mem_usage_limit kfd_mem_limit;
-+
-+/* Struct used for amdgpu_amdkfd_bo_validate */
-+struct amdgpu_vm_parser {
-+ uint32_t domain;
-+ bool wait;
-+};
-+
-+static const char * const domain_bit_to_string[] = {
-+ "CPU",
-+ "GTT",
-+ "VRAM",
-+ "GDS",
-+ "GWS",
-+ "OA"
-+};
-+
-+#define domain_string(domain) domain_bit_to_string[ffs(domain)-1]
-+
-+static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work);
-+
-+
-+static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd)
-+{
-+ return (struct amdgpu_device *)kgd;
-+}
-+
-+static bool check_if_add_bo_to_vm(struct amdgpu_vm *avm,
-+ struct kgd_mem *mem)
-+{
-+ struct kfd_bo_va_list *entry;
-+
-+ list_for_each_entry(entry, &mem->bo_va_list, bo_list)
-+ if (entry->bo_va->base.vm == avm)
-+ return false;
-+
-+ return true;
-+}
-+
-+/* Set memory usage limits. Currently, the limits are:
-+ * System (kernel) memory - 15/16th System RAM
-+ * Userptr memory - 15/16th System RAM
-+ */
-+void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
-+{
-+ struct sysinfo si;
-+ uint64_t mem;
-+
-+ si_meminfo(&si);
-+ mem = si.totalram - si.totalhigh;
-+ mem *= si.mem_unit;
-+
-+ spin_lock_init(&kfd_mem_limit.mem_limit_lock);
-+ kfd_mem_limit.max_system_mem_limit = mem - (mem >> 4); /* 15/16 */
-+ kfd_mem_limit.max_userptr_mem_limit = mem - (mem >> 4); /* 15/16 */
-+ pr_debug("Kernel memory limit %lluM, userptr limit %lluM\n",
-+ (kfd_mem_limit.max_system_mem_limit >> 20),
-+ (kfd_mem_limit.max_userptr_mem_limit >> 20));
-+}
-+
-+static int amdgpu_amdkfd_reserve_system_mem_limit(struct amdgpu_device *adev,
-+ uint64_t size, u32 domain)
-+{
-+ size_t acc_size;
-+ int ret = 0;
-+
-+ acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size,
-+ sizeof(struct amdgpu_bo));
-+
-+ spin_lock(&kfd_mem_limit.mem_limit_lock);
-+ if (domain == AMDGPU_GEM_DOMAIN_GTT) {
-+ if (kfd_mem_limit.system_mem_used + (acc_size + size) >
-+ kfd_mem_limit.max_system_mem_limit) {
-+ ret = -ENOMEM;
-+ goto err_no_mem;
-+ }
-+ kfd_mem_limit.system_mem_used += (acc_size + size);
-+ } else if (domain == AMDGPU_GEM_DOMAIN_CPU) {
-+ if ((kfd_mem_limit.system_mem_used + acc_size >
-+ kfd_mem_limit.max_system_mem_limit) ||
-+ (kfd_mem_limit.userptr_mem_used + (size + acc_size) >
-+ kfd_mem_limit.max_userptr_mem_limit)) {
-+ ret = -ENOMEM;
-+ goto err_no_mem;
-+ }
-+ kfd_mem_limit.system_mem_used += acc_size;
-+ kfd_mem_limit.userptr_mem_used += size;
-+ }
-+err_no_mem:
-+ spin_unlock(&kfd_mem_limit.mem_limit_lock);
-+ return ret;
-+}
-+
-+static void unreserve_system_mem_limit(struct amdgpu_device *adev,
-+ uint64_t size, u32 domain)
-+{
-+ size_t acc_size;
-+
-+ acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size,
-+ sizeof(struct amdgpu_bo));
-+
-+ spin_lock(&kfd_mem_limit.mem_limit_lock);
-+ if (domain == AMDGPU_GEM_DOMAIN_GTT) {
-+ kfd_mem_limit.system_mem_used -= (acc_size + size);
-+ } else if (domain == AMDGPU_GEM_DOMAIN_CPU) {
-+ kfd_mem_limit.system_mem_used -= acc_size;
-+ kfd_mem_limit.userptr_mem_used -= size;
-+ }
-+ WARN_ONCE(kfd_mem_limit.system_mem_used < 0,
-+ "kfd system memory accounting unbalanced");
-+ WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0,
-+ "kfd
userptr memory accounting unbalanced");
-+
-+ spin_unlock(&kfd_mem_limit.mem_limit_lock);
-+}
-+
-+void amdgpu_amdkfd_unreserve_system_memory_limit(struct amdgpu_bo *bo)
-+{
-+ spin_lock(&kfd_mem_limit.mem_limit_lock);
-+
-+ if (bo->flags & AMDGPU_AMDKFD_USERPTR_BO) {
-+ kfd_mem_limit.system_mem_used -= bo->tbo.acc_size;
-+ kfd_mem_limit.userptr_mem_used -= amdgpu_bo_size(bo);
-+ } else if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT) {
-+ kfd_mem_limit.system_mem_used -=
-+ (bo->tbo.acc_size + amdgpu_bo_size(bo));
-+ }
-+ WARN_ONCE(kfd_mem_limit.system_mem_used < 0,
-+ "kfd system memory accounting unbalanced");
-+ WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0,
-+ "kfd userptr memory accounting unbalanced");
-+
-+ spin_unlock(&kfd_mem_limit.mem_limit_lock);
-+}
-+
-+
-+/* amdgpu_amdkfd_remove_eviction_fence - Removes eviction fence(s) from BO's
-+ * reservation object.
-+ *
-+ * @bo: [IN] Remove eviction fence(s) from this BO
-+ * @ef: [IN] If ef is specified, then this eviction fence is removed if it
-+ * is present in the shared list.
-+ * @ef_list: [OUT] Returns list of eviction fences. These fences are removed
-+ * from BO's reservation object shared list.
-+ * @ef_count: [OUT] Number of fences in ef_list.
-+ *
-+ * NOTE: If called with ef_list, then amdgpu_amdkfd_add_eviction_fence must be
-+ * called to restore the eviction fences and to avoid a memory leak. This is
-+ * useful for shared BOs.
-+ * NOTE: Must be called with BO reserved i.e. bo->tbo.resv->lock held.
-+ */
-+static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo,
-+ struct amdgpu_amdkfd_fence *ef,
-+ struct amdgpu_amdkfd_fence ***ef_list,
-+ unsigned int *ef_count)
-+{
-+ struct reservation_object_list *fobj;
-+ struct reservation_object *resv;
-+ unsigned int i = 0, j = 0, k = 0, shared_count;
-+ unsigned int count = 0;
-+ struct amdgpu_amdkfd_fence **fence_list;
-+
-+ if (!ef && !ef_list)
-+ return -EINVAL;
-+
-+ if (ef_list) {
-+ *ef_list = NULL;
-+ *ef_count = 0;
-+ }
-+
-+ resv = bo->tbo.resv;
-+ fobj = reservation_object_get_list(resv);
-+
-+ if (!fobj)
-+ return 0;
-+
-+ preempt_disable();
-+ write_seqcount_begin(&resv->seq);
-+
-+ /* Go through all the shared fences in the reservation object. If
-+ * ef is specified and it exists in the list, remove it and reduce the
-+ * count. If ef is not specified, then get the count of eviction fences
-+ * present.
-+ */
-+ shared_count = fobj->shared_count;
-+ for (i = 0; i < shared_count; ++i) {
-+ struct dma_fence *f;
-+
-+ f = rcu_dereference_protected(fobj->shared[i],
-+ reservation_object_held(resv));
-+
-+ if (ef) {
-+ if (f->context == ef->base.context) {
-+ dma_fence_put(f);
-+ fobj->shared_count--;
-+ } else
-+ RCU_INIT_POINTER(fobj->shared[j++], f);
-+
-+ } else if (to_amdgpu_amdkfd_fence(f))
-+ count++;
-+ }
-+ write_seqcount_end(&resv->seq);
-+ preempt_enable();
-+
-+ if (ef || !count)
-+ return 0;
-+
-+ /* Alloc memory for count number of eviction fence pointers.
Fill the
-+ * ef_list array and ef_count
-+ */
-+
-+ fence_list = kcalloc(count, sizeof(struct amdgpu_amdkfd_fence *),
-+ GFP_KERNEL);
-+ if (!fence_list)
-+ return -ENOMEM;
-+
-+ preempt_disable();
-+ write_seqcount_begin(&resv->seq);
-+
-+ j = 0;
-+ for (i = 0; i < shared_count; ++i) {
-+ struct dma_fence *f;
-+ struct amdgpu_amdkfd_fence *efence;
-+
-+ f = rcu_dereference_protected(fobj->shared[i],
-+ reservation_object_held(resv));
-+
-+ efence = to_amdgpu_amdkfd_fence(f);
-+ if (efence) {
-+ fence_list[k++] = efence;
-+ fobj->shared_count--;
-+ } else
-+ RCU_INIT_POINTER(fobj->shared[j++], f);
-+ }
-+
-+ write_seqcount_end(&resv->seq);
-+ preempt_enable();
-+
-+ *ef_list = fence_list;
-+ *ef_count = k;
-+
-+ return 0;
-+}
-+
-+/* amdgpu_amdkfd_add_eviction_fence - Adds eviction fence(s) back into BO's
-+ * reservation object.
-+ *
-+ * @bo: [IN] Add eviction fences to this BO
-+ * @ef_list: [IN] List of eviction fences to be added
-+ * @ef_count: [IN] Number of fences in ef_list.
-+ *
-+ * NOTE: Must call amdgpu_amdkfd_remove_eviction_fence before calling this
-+ * function.
-+ */
-+static void amdgpu_amdkfd_add_eviction_fence(struct amdgpu_bo *bo,
-+ struct amdgpu_amdkfd_fence **ef_list,
-+ unsigned int ef_count)
-+{
-+ int i;
-+
-+ if (!ef_list || !ef_count)
-+ return;
-+
-+ for (i = 0; i < ef_count; i++) {
-+ amdgpu_bo_fence(bo, &ef_list[i]->base, true);
-+ /* Re-adding the fence takes an additional reference. Drop that
-+ * reference.
-+ */
-+ dma_fence_put(&ef_list[i]->base);
-+ }
-+
-+ kfree(ef_list);
-+}
-+
-+static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain,
-+ bool wait)
-+{
-+ int ret;
-+
-+ if (WARN(amdgpu_ttm_tt_get_usermm(bo->tbo.ttm),
-+ "Called with userptr BO"))
-+ return -EINVAL;
-+
-+ amdgpu_ttm_placement_from_domain(bo, domain);
-+
-+ ret = ttm_bo_validate(&bo->tbo, &bo->placement, false, false);
-+ if (ret)
-+ goto validate_fail;
-+ if (wait) {
-+ struct amdgpu_amdkfd_fence **ef_list;
-+ unsigned int ef_count;
-+
-+ ret = amdgpu_amdkfd_remove_eviction_fence(bo, NULL, &ef_list,
-+ &ef_count);
-+ if (ret)
-+ goto validate_fail;
-+
-+ ttm_bo_wait(&bo->tbo, false, false);
-+ amdgpu_amdkfd_add_eviction_fence(bo, ef_list, ef_count);
-+ }
-+
-+validate_fail:
-+ return ret;
-+}
-+
-+static int amdgpu_amdkfd_validate(void *param, struct amdgpu_bo *bo)
-+{
-+ struct amdgpu_vm_parser *p = param;
-+
-+ return amdgpu_amdkfd_bo_validate(bo, p->domain, p->wait);
-+}
-+
-+/* vm_validate_pt_pd_bos - Validate page table and directory BOs
-+ *
-+ * Also updates page directory entries so we don't need to do this
-+ * again later until the page directory is validated again (e.g. after
-+ * an eviction or allocating new page tables).
-+ */
-+static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm)
-+{
-+ struct amdgpu_bo *pd = vm->root.base.bo;
-+ struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev);
-+ struct amdgpu_vm_parser param;
-+ int ret;
-+
-+ param.domain = AMDGPU_GEM_DOMAIN_VRAM;
-+ param.wait = false;
-+
-+ ret = amdgpu_vm_validate_pt_bos(adev, vm, amdgpu_amdkfd_validate,
-+ &param);
-+ if (ret) {
-+ pr_err("amdgpu: failed to validate PT BOs\n");
-+ return ret;
-+ }
-+
-+ ret = amdgpu_amdkfd_validate(&param, pd);
-+ if (ret) {
-+ pr_err("amdgpu: failed to validate PD\n");
-+ return ret;
-+ }
-+
-+ ret = amdgpu_vm_update_directories(adev, vm);
-+ if (ret != 0)
-+ return ret;
-+
-+ return 0;
-+}
-+
-+/* add_bo_to_vm - Add a BO to a VM
-+ *
-+ * Everything that needs to be done only once when a BO is first added
-+ * to a VM.
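(Each such addition is tracked by its own kfd_bo_va_list entry on mem->bo_va_list.)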
It can later be mapped and unmapped many times without
-+ * repeating these steps.
-+ *
-+ * 1. Allocate and initialize BO VA entry data structure
-+ * 2. Add BO to the VM
-+ * 3. Determine ASIC-specific PTE flags
-+ * 4. Alloc page tables and directories if needed
-+ * 4a. Validate new page tables and directories and update directories
-+ */
-+static int add_bo_to_vm(struct amdgpu_device *adev, struct kgd_mem *mem,
-+ struct amdgpu_vm *avm, bool is_aql,
-+ struct kfd_bo_va_list **p_bo_va_entry)
-+{
-+ int ret;
-+ struct kfd_bo_va_list *bo_va_entry;
-+ struct amdkfd_vm *kvm = container_of(avm,
-+ struct amdkfd_vm, base);
-+ struct amdgpu_bo *pd = avm->root.base.bo;
-+ struct amdgpu_bo *bo = mem->bo;
-+ uint64_t va = mem->va;
-+ struct list_head *list_bo_va = &mem->bo_va_list;
-+ unsigned long bo_size = bo->tbo.mem.size;
-+
-+ if (!va) {
-+ pr_err("Invalid VA when adding BO to VM\n");
-+ return -EINVAL;
-+ }
-+
-+ if (is_aql)
-+ va += bo_size;
-+
-+ bo_va_entry = kzalloc(sizeof(*bo_va_entry), GFP_KERNEL);
-+ if (!bo_va_entry)
-+ return -ENOMEM;
-+
-+ pr_debug("\t add VA 0x%llx - 0x%llx to vm %p\n", va,
-+ va + bo_size, avm);
-+
-+ /* Add BO to VM internal data structures */
-+ bo_va_entry->bo_va = amdgpu_vm_bo_add(adev, avm, bo);
-+ if (bo_va_entry->bo_va == NULL) {
-+ ret = -EINVAL;
-+ pr_err("Failed to add BO object to VM. ret == %d\n",
-+ ret);
-+ goto err_vmadd;
-+ }
-+
-+ bo_va_entry->va = va;
-+ bo_va_entry->pte_flags = amdgpu_vm_get_pte_flags(adev,
-+ mem->mapping_flags);
-+ bo_va_entry->kgd_dev = (void *)adev;
-+ list_add(&bo_va_entry->bo_list, list_bo_va);
-+
-+ if (p_bo_va_entry)
-+ *p_bo_va_entry = bo_va_entry;
-+
-+ /* Allocate new page tables if needed and validate
-+ * them. Clearing the new page tables and validating them needs
-+ * to wait on move fences. We don't want that to trigger the
-+ * eviction fence, so remove it temporarily.
-+ */
-+ amdgpu_amdkfd_remove_eviction_fence(pd,
-+ kvm->process_info->eviction_fence,
-+ NULL, NULL);
-+
-+ ret = amdgpu_vm_alloc_pts(adev, avm, va, amdgpu_bo_size(bo));
-+ if (ret) {
-+ pr_err("Failed to allocate pts, err=%d\n", ret);
-+ goto err_alloc_pts;
-+ }
-+
-+ ret = vm_validate_pt_pd_bos(avm);
-+ if (ret != 0) {
-+ pr_err("validate_pt_pd_bos() failed\n");
-+ goto err_alloc_pts;
-+ }
-+
-+ /* Add the eviction fence back */
-+ amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true);
-+
-+ return 0;
-+
-+err_alloc_pts:
-+ amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true);
-+ amdgpu_vm_bo_rmv(adev, bo_va_entry->bo_va);
-+ list_del(&bo_va_entry->bo_list);
-+err_vmadd:
-+ kfree(bo_va_entry);
-+ return ret;
-+}
-+
-+static void remove_bo_from_vm(struct amdgpu_device *adev,
-+ struct kfd_bo_va_list *entry, unsigned long size)
-+{
-+ pr_debug("\t remove VA 0x%llx - 0x%llx in entry %p\n",
-+ entry->va,
-+ entry->va + size, entry);
-+ amdgpu_vm_bo_rmv(adev, entry->bo_va);
-+ list_del(&entry->bo_list);
-+ kfree(entry);
-+}
-+
-+static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem,
-+ struct amdkfd_process_info *process_info,
-+ bool userptr)
-+{
-+ struct ttm_validate_buffer *entry = &mem->validate_list;
-+ struct amdgpu_bo *bo = mem->bo;
-+
-+ INIT_LIST_HEAD(&entry->head);
-+ entry->shared = true;
-+ entry->bo = &bo->tbo;
-+ mutex_lock(&process_info->lock);
-+ if (userptr)
-+ list_add_tail(&entry->head, &process_info->userptr_valid_list);
-+ else
-+ list_add_tail(&entry->head, &process_info->kfd_bo_list);
-+ mutex_unlock(&process_info->lock);
-+}
-+
-+/* Initializes user pages.
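(This is called once per userptr allocation, from __alloc_memory_of_gpu().)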
It registers the MMU notifier and validates -+ * the userptr BO in the GTT domain. -+ * -+ * The BO must already be on the userptr_valid_list. Otherwise an -+ * eviction and restore may happen that leaves the new BO unmapped -+ * with the user mode queues running. -+ * -+ * Takes the process_info->lock to protect against concurrent restore -+ * workers. -+ * -+ * Returns 0 for success, negative errno for errors. -+ */ -+static int init_user_pages(struct kgd_mem *mem, struct mm_struct *mm, -+ uint64_t user_addr) -+{ -+ struct amdkfd_process_info *process_info = mem->process_info; -+ struct amdgpu_bo *bo = mem->bo; -+ int ret = 0; -+ -+ mutex_lock(&process_info->lock); -+ -+ ret = amdgpu_ttm_tt_set_userptr(bo->tbo.ttm, user_addr, 0); -+ if (ret) { -+ pr_err("%s: Failed to set userptr: %d\n", __func__, ret); -+ goto out; -+ } -+ -+ ret = amdgpu_mn_register(bo, user_addr); -+ if (ret) { -+ pr_err("%s: Failed to register MMU notifier: %d\n", -+ __func__, ret); -+ goto out; -+ } -+ -+ /* If no restore worker is running concurrently, user_pages -+ * should not be allocated -+ */ -+ WARN(mem->user_pages, "Leaking user_pages array"); -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) -+ mem->user_pages = drm_calloc_large(bo->tbo.ttm->num_pages, -+ sizeof(struct page *)); -+#else -+ mem->user_pages = kvmalloc_array(bo->tbo.ttm->num_pages, -+ sizeof(struct page *), -+ GFP_KERNEL | __GFP_ZERO); -+#endif -+ if (!mem->user_pages) { -+ pr_err("%s: Failed to allocate pages array\n", __func__); -+ ret = -ENOMEM; -+ goto unregister_out; -+ } -+ -+ ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, mem->user_pages); -+ if (ret) { -+ pr_err("%s: Failed to get user pages: %d\n", __func__, ret); -+ goto free_out; -+ } -+ -+ amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, mem->user_pages); -+ -+ ret = amdgpu_bo_reserve(bo, true); -+ if (ret) { -+ pr_err("%s: Failed to reserve BO\n", __func__); -+ goto release_out; -+ } -+ amdgpu_ttm_placement_from_domain(bo, mem->domain); -+ ret = ttm_bo_validate(&bo->tbo, &bo->placement, -+ true, false); -+ if (ret) -+ pr_err("%s: failed to validate BO\n", __func__); -+ amdgpu_bo_unreserve(bo); -+ -+release_out: -+ if (ret) -+ release_pages(mem->user_pages, bo->tbo.ttm->num_pages, 0); -+free_out: -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) -+ drm_free_large(mem->user_pages); -+#else -+ kvfree(mem->user_pages); -+#endif -+ mem->user_pages = NULL; -+unregister_out: -+ if (ret) -+ amdgpu_mn_unregister(bo); -+out: -+ mutex_unlock(&process_info->lock); -+ return ret; -+} -+ -+static int __map_bo_to_kernel(struct amdgpu_bo *bo, u32 domain, void **kptr) -+{ -+ int ret; -+ -+ ret = amdgpu_bo_reserve(bo, true); -+ if (ret) { -+ pr_err("Failed to reserve bo. ret %d\n", ret); -+ return ret; -+ } -+ -+ ret = amdgpu_bo_pin(bo, domain, NULL); -+ if (ret) { -+ pr_err("Failed to pin bo. ret %d\n", ret); -+ goto pin_failed; -+ } -+ -+ ret = amdgpu_bo_kmap(bo, kptr); -+ if (ret) { -+ pr_err("Failed to map bo to kernel. 
ret %d\n", ret); -+ goto kmap_failed; -+ } -+ -+ amdgpu_bo_unreserve(bo); -+ -+ return ret; -+ -+kmap_failed: -+ amdgpu_bo_unpin(bo); -+pin_failed: -+ amdgpu_bo_unreserve(bo); -+ -+ return ret; -+} -+ -+static int __alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va, -+ uint64_t size, void *vm, struct kgd_mem **mem, -+ uint64_t *offset, u32 domain, u64 flags, -+ struct sg_table *sg, bool aql_queue, -+ bool readonly, bool execute, bool coherent, bool no_sub, -+ bool userptr) -+{ -+ struct amdgpu_device *adev; -+ int ret; -+ struct amdgpu_bo *bo; -+ uint64_t user_addr = 0; -+ int byte_align; -+ u32 alloc_domain; -+ uint32_t mapping_flags; -+ struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm; -+ -+ if (aql_queue) -+ size = size >> 1; -+ if (userptr) { -+ if (!offset || !*offset) -+ return -EINVAL; -+ user_addr = *offset; -+ } -+ -+ adev = get_amdgpu_device(kgd); -+ byte_align = (adev->family == AMDGPU_FAMILY_VI && -+ adev->asic_type != CHIP_FIJI && -+ adev->asic_type != CHIP_POLARIS10 && -+ adev->asic_type != CHIP_POLARIS11) ? -+ VI_BO_SIZE_ALIGN : 1; -+ -+ *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); -+ if (*mem == NULL) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ INIT_LIST_HEAD(&(*mem)->bo_va_list); -+ mutex_init(&(*mem)->lock); -+ (*mem)->coherent = coherent; -+ (*mem)->no_substitute = no_sub; -+ (*mem)->aql_queue = aql_queue; -+ -+ mapping_flags = AMDGPU_VM_PAGE_READABLE; -+ if (!readonly) -+ mapping_flags |= AMDGPU_VM_PAGE_WRITEABLE; -+ if (execute) -+ mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE; -+ if (coherent) -+ mapping_flags |= AMDGPU_VM_MTYPE_UC; -+ else -+ mapping_flags |= AMDGPU_VM_MTYPE_NC; -+ -+ (*mem)->mapping_flags = mapping_flags; -+ -+ alloc_domain = userptr ? AMDGPU_GEM_DOMAIN_CPU : domain; -+ -+ amdgpu_sync_create(&(*mem)->sync); -+ -+ ret = amdgpu_amdkfd_reserve_system_mem_limit(adev, size, alloc_domain); -+ if (ret) { -+ pr_err("Insufficient system memory\n"); -+ goto err_bo_create; -+ } -+ -+ pr_debug("\t create BO VA 0x%llx size 0x%llx domain %s\n", -+ va, size, domain_string(alloc_domain)); -+ -+ /* Allocate buffer object. Userptr objects need to start out -+ * in the CPU domain, get moved to GTT when pinned. -+ */ -+ ret = amdgpu_bo_create(adev, size, byte_align, false, -+ alloc_domain, -+ flags, sg, NULL, 0, &bo); -+ if (ret != 0) { -+ pr_err("Failed to create BO on domain %s. ret %d\n", -+ domain_string(alloc_domain), ret); -+ unreserve_system_mem_limit(adev, size, alloc_domain); -+ goto err_bo_create; -+ } -+ bo->kfd_bo = *mem; -+ (*mem)->bo = bo; -+ if (userptr) -+ bo->flags |= AMDGPU_AMDKFD_USERPTR_BO; -+ -+ (*mem)->va = va; -+ (*mem)->domain = domain; -+ (*mem)->mapped_to_gpu_memory = 0; -+ (*mem)->process_info = kfd_vm->process_info; -+ add_kgd_mem_to_kfd_bo_list(*mem, kfd_vm->process_info, userptr); -+ -+ if (userptr) { -+ ret = init_user_pages(*mem, current->mm, user_addr); -+ if (ret) { -+ mutex_lock(&kfd_vm->process_info->lock); -+ list_del(&(*mem)->validate_list.head); -+ mutex_unlock(&kfd_vm->process_info->lock); -+ goto allocate_init_user_pages_failed; -+ } -+ } -+ -+ if (offset) -+ *offset = amdgpu_bo_mmap_offset(bo); -+ -+ return 0; -+ -+allocate_init_user_pages_failed: -+ amdgpu_bo_unref(&bo); -+err_bo_create: -+ kfree(*mem); -+err: -+ return ret; -+} -+ -+/* Reserving a BO and its page table BOs must happen atomically to -+ * avoid deadlocks. When updating userptrs we need to temporarily -+ * back-off the reservation and then reacquire it. Track all the -+ * reservation info in a context structure. 
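The ww_acquire ticket kept in that structure is what makes the back-off and reacquire safe.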
Buffers can be mapped to
-+ * multiple VMs simultaneously (buffers being restored on multiple
-+ * GPUs).
-+ */
-+struct bo_vm_reservation_context {
-+ struct amdgpu_bo_list_entry kfd_bo;
-+ unsigned int n_vms;
-+ struct amdgpu_bo_list_entry *vm_pd;
-+ struct ww_acquire_ctx ticket;
-+ struct list_head list, duplicates;
-+ struct amdgpu_sync *sync;
-+ bool reserved;
-+};
-+
-+/**
-+ * reserve_bo_and_vm - reserve a BO and a VM unconditionally.
-+ * @mem: KFD BO structure.
-+ * @vm: the VM to reserve.
-+ * @ctx: the struct that will be used in unreserve_bo_and_vms().
-+ */
-+static int reserve_bo_and_vm(struct kgd_mem *mem,
-+ struct amdgpu_vm *vm,
-+ struct bo_vm_reservation_context *ctx)
-+{
-+ struct amdgpu_bo *bo = mem->bo;
-+ int ret;
-+
-+ WARN_ON(!vm);
-+
-+ ctx->reserved = false;
-+ ctx->n_vms = 1;
-+ ctx->sync = &mem->sync;
-+
-+ INIT_LIST_HEAD(&ctx->list);
-+ INIT_LIST_HEAD(&ctx->duplicates);
-+
-+ ctx->vm_pd = kzalloc(sizeof(struct amdgpu_bo_list_entry)
-+ * ctx->n_vms, GFP_KERNEL);
-+ if (ctx->vm_pd == NULL)
-+ return -ENOMEM;
-+
-+ ctx->kfd_bo.robj = bo;
-+ ctx->kfd_bo.priority = 0;
-+ ctx->kfd_bo.tv.bo = &bo->tbo;
-+ ctx->kfd_bo.tv.shared = true;
-+ ctx->kfd_bo.user_pages = NULL;
-+ list_add(&ctx->kfd_bo.tv.head, &ctx->list);
-+
-+ amdgpu_vm_get_pd_bo(vm, &ctx->list, &ctx->vm_pd[0]);
-+
-+ ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list,
-+ false, &ctx->duplicates);
-+ if (!ret)
-+ ctx->reserved = true;
-+ else
-+ pr_err("Failed to reserve buffers in ttm\n");
-+
-+ if (ret) {
-+ kfree(ctx->vm_pd);
-+ ctx->vm_pd = NULL;
-+ }
-+
-+ return ret;
-+}
-+
-+enum VA_TYPE {
-+ VA_NOT_MAPPED = 0,
-+ VA_MAPPED,
-+ VA_DO_NOT_CARE,
-+};
-+
-+/**
-+ * reserve_bo_and_cond_vms - reserve a BO and some VMs that the BO has been
-+ * added to, conditionally based on map_type.
-+ * @mem: KFD BO structure.
-+ * @vm: the VM to reserve. If NULL, then all VMs associated with the BO
-+ * are used. Otherwise, a single VM associated with the BO.
-+ * @map_type: the mapping status that will be used to filter the VMs.
-+ * @ctx: the struct that will be used in unreserve_bo_and_vms().
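-+ *
-+ * Returns 0 on success, -ENOMEM if the vm_pd array cannot be
-+ * allocated, or the error code from ttm_eu_reserve_buffers().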
-+ */ -+static int reserve_bo_and_cond_vms(struct kgd_mem *mem, -+ struct amdgpu_vm *vm, enum VA_TYPE map_type, -+ struct bo_vm_reservation_context *ctx) -+{ -+ struct amdgpu_bo *bo = mem->bo; -+ struct kfd_bo_va_list *entry; -+ unsigned int i; -+ int ret; -+ -+ ctx->reserved = false; -+ ctx->n_vms = 0; -+ ctx->vm_pd = NULL; -+ ctx->sync = &mem->sync; -+ -+ INIT_LIST_HEAD(&ctx->list); -+ INIT_LIST_HEAD(&ctx->duplicates); -+ -+ list_for_each_entry(entry, &mem->bo_va_list, bo_list) { -+ if ((vm && vm != entry->bo_va->base.vm) || -+ (entry->is_mapped != map_type -+ && map_type != VA_DO_NOT_CARE)) -+ continue; -+ -+ ctx->n_vms++; -+ } -+ -+ if (ctx->n_vms != 0) { -+ ctx->vm_pd = kzalloc(sizeof(struct amdgpu_bo_list_entry) -+ * ctx->n_vms, GFP_KERNEL); -+ if (ctx->vm_pd == NULL) -+ return -ENOMEM; -+ } -+ -+ ctx->kfd_bo.robj = bo; -+ ctx->kfd_bo.priority = 0; -+ ctx->kfd_bo.tv.bo = &bo->tbo; -+ ctx->kfd_bo.tv.shared = true; -+ ctx->kfd_bo.user_pages = NULL; -+ list_add(&ctx->kfd_bo.tv.head, &ctx->list); -+ -+ i = 0; -+ list_for_each_entry(entry, &mem->bo_va_list, bo_list) { -+ if ((vm && vm != entry->bo_va->base.vm) || -+ (entry->is_mapped != map_type -+ && map_type != VA_DO_NOT_CARE)) -+ continue; -+ -+ amdgpu_vm_get_pd_bo(entry->bo_va->base.vm, &ctx->list, -+ &ctx->vm_pd[i]); -+ i++; -+ } -+ -+ ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list, -+ false, &ctx->duplicates); -+ if (!ret) -+ ctx->reserved = true; -+ else -+ pr_err("Failed to reserve buffers in ttm.\n"); -+ -+ if (ret) { -+ kfree(ctx->vm_pd); -+ ctx->vm_pd = NULL; -+ } -+ -+ return ret; -+} -+ -+static int unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx, -+ bool wait, bool intr) -+{ -+ int ret = 0; -+ -+ if (wait) -+ ret = amdgpu_sync_wait(ctx->sync, intr); -+ -+ if (ctx->reserved) -+ ttm_eu_backoff_reservation(&ctx->ticket, &ctx->list); -+ kfree(ctx->vm_pd); -+ -+ ctx->sync = NULL; -+ -+ ctx->reserved = false; -+ ctx->vm_pd = NULL; -+ -+ return ret; -+} -+ -+static int unmap_bo_from_gpuvm(struct amdgpu_device *adev, -+ struct kfd_bo_va_list *entry, -+ struct amdgpu_sync *sync) -+{ -+ struct amdgpu_bo_va *bo_va = entry->bo_va; -+ struct amdgpu_vm *vm = bo_va->base.vm; -+ struct amdkfd_vm *kvm = container_of(vm, struct amdkfd_vm, base); -+ struct amdgpu_bo *pd = vm->root.base.bo; -+ -+ /* Remove eviction fence from PD (and thereby from PTs too as they -+ * share the resv. object. Otherwise during PT update job (see -+ * amdgpu_vm_bo_update_mapping), eviction fence will get added to -+ * job->sync object -+ */ -+ amdgpu_amdkfd_remove_eviction_fence(pd, -+ kvm->process_info->eviction_fence, -+ NULL, NULL); -+ amdgpu_vm_bo_unmap(adev, bo_va, entry->va); -+ -+ amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update); -+ -+ /* Add the eviction fence back */ -+ amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true); -+ -+ amdgpu_sync_fence(adev, sync, bo_va->last_pt_update); -+ -+ /* Sync objects can't handle multiple GPUs (contexts) updating -+ * sync->last_vm_update. Fortunately we don't need it for -+ * KFD's purposes, so we can just drop that fence. 
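-+ * (update_gpuvm_pte() below does the same after its page table update.)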
-+ */ -+ if (sync->last_vm_update) { -+ dma_fence_put(sync->last_vm_update); -+ sync->last_vm_update = NULL; -+ } -+ -+ return 0; -+} -+ -+static int update_gpuvm_pte(struct amdgpu_device *adev, -+ struct kfd_bo_va_list *entry, -+ struct amdgpu_sync *sync) -+{ -+ int ret; -+ struct amdgpu_vm *vm; -+ struct amdgpu_bo_va *bo_va; -+ struct amdgpu_bo *bo; -+ -+ bo_va = entry->bo_va; -+ vm = bo_va->base.vm; -+ bo = bo_va->base.bo; -+ -+ /* Update the page tables */ -+ ret = amdgpu_vm_bo_update(adev, bo_va, false); -+ if (ret != 0) { -+ pr_err("amdgpu_vm_bo_update failed\n"); -+ return ret; -+ } -+ -+ amdgpu_sync_fence(adev, sync, bo_va->last_pt_update); -+ -+ /* Sync objects can't handle multiple GPUs (contexts) updating -+ * sync->last_vm_update. Fortunately we don't need it for -+ * KFD's purposes, so we can just drop that fence. -+ */ -+ if (sync->last_vm_update) { -+ dma_fence_put(sync->last_vm_update); -+ sync->last_vm_update = NULL; -+ } -+ -+ return 0; -+} -+ -+static int map_bo_to_gpuvm(struct amdgpu_device *adev, -+ struct kfd_bo_va_list *entry, struct amdgpu_sync *sync, -+ bool no_update_pte) -+{ -+ int ret; -+ -+ /* Set virtual address for the allocation */ -+ ret = amdgpu_vm_bo_map(adev, entry->bo_va, entry->va, 0, -+ amdgpu_bo_size(entry->bo_va->base.bo), entry->pte_flags); -+ if (ret != 0) { -+ pr_err("Failed to map VA 0x%llx in vm. ret %d\n", -+ entry->va, ret); -+ return ret; -+ } -+ -+ if (no_update_pte) -+ return 0; -+ -+ ret = update_gpuvm_pte(adev, entry, sync); -+ if (ret != 0) { -+ pr_err("update_gpuvm_pte() failed\n"); -+ goto update_gpuvm_pte_failed; -+ } -+ -+ return 0; -+ -+update_gpuvm_pte_failed: -+ unmap_bo_from_gpuvm(adev, entry, sync); -+ return ret; -+} -+ -+static struct sg_table *create_doorbell_sg(uint64_t addr, uint32_t size) -+{ -+ struct sg_table *sg = kmalloc(sizeof(*sg), GFP_KERNEL); -+ -+ if (!sg) -+ return NULL; -+ if (sg_alloc_table(sg, 1, GFP_KERNEL)) { -+ kfree(sg); -+ return NULL; -+ } -+ sg->sgl->dma_address = addr; -+ sg->sgl->length = size; -+#ifdef CONFIG_NEED_SG_DMA_LENGTH -+ sg->sgl->dma_length = size; -+#endif -+ return sg; -+} -+ -+int amdgpu_amdkfd_gpuvm_sync_memory( -+ struct kgd_dev *kgd, struct kgd_mem *mem, bool intr) -+{ -+ int ret = 0; -+ struct amdgpu_sync sync; -+ struct amdgpu_device *adev; -+ -+ adev = get_amdgpu_device(kgd); -+ amdgpu_sync_create(&sync); -+ -+ mutex_lock(&mem->lock); -+ amdgpu_sync_clone(adev, &mem->sync, &sync); -+ mutex_unlock(&mem->lock); -+ -+ ret = amdgpu_sync_wait(&sync, intr); -+ amdgpu_sync_free(&sync); -+ return ret; -+} -+ -+#define BOOL_TO_STR(b) (b == true) ? "true" : "false" -+ -+int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( -+ struct kgd_dev *kgd, uint64_t va, uint64_t size, -+ void *vm, struct kgd_mem **mem, -+ uint64_t *offset, uint32_t flags) -+{ -+ bool aql_queue, public, readonly, execute, coherent, no_sub, userptr; -+ u64 alloc_flag; -+ uint32_t domain; -+ uint64_t *temp_offset; -+ struct sg_table *sg = NULL; -+ -+ if (!(flags & ALLOC_MEM_FLAGS_NONPAGED)) { -+ pr_err("current hw doesn't support paged memory\n"); -+ return -EINVAL; -+ } -+ -+ domain = 0; -+ alloc_flag = 0; -+ temp_offset = NULL; -+ -+ aql_queue = (flags & ALLOC_MEM_FLAGS_AQL_QUEUE_MEM) ? true : false; -+ public = (flags & ALLOC_MEM_FLAGS_PUBLIC) ? true : false; -+ readonly = (flags & ALLOC_MEM_FLAGS_READONLY) ? true : false; -+ execute = (flags & ALLOC_MEM_FLAGS_EXECUTE_ACCESS) ? true : false; -+ coherent = (flags & ALLOC_MEM_FLAGS_COHERENT) ? true : false; -+ no_sub = (flags & ALLOC_MEM_FLAGS_NO_SUBSTITUTE) ? 
true : false; -+ userptr = (flags & ALLOC_MEM_FLAGS_USERPTR) ? true : false; -+ -+ /* -+ * Check on which domain to allocate BO -+ */ -+ if (flags & ALLOC_MEM_FLAGS_VRAM) { -+ domain = AMDGPU_GEM_DOMAIN_VRAM; -+ alloc_flag = AMDGPU_GEM_CREATE_NO_CPU_ACCESS; -+ if (public) { -+ alloc_flag = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; -+ temp_offset = offset; -+ } -+ alloc_flag |= AMDGPU_GEM_CREATE_VRAM_CLEARED; -+ } else if (flags & (ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_USERPTR)) { -+ domain = AMDGPU_GEM_DOMAIN_GTT; -+ alloc_flag = 0; -+ temp_offset = offset; -+ } else if (flags & ALLOC_MEM_FLAGS_DOORBELL) { -+ domain = AMDGPU_GEM_DOMAIN_GTT; -+ alloc_flag = 0; -+ temp_offset = offset; -+ if (size > UINT_MAX) -+ return -EINVAL; -+ sg = create_doorbell_sg(*offset, size); -+ if (!sg) -+ return -ENOMEM; -+ } -+ -+ if (offset && !userptr) -+ *offset = 0; -+ -+ pr_debug("Allocate VA 0x%llx - 0x%llx domain %s aql %s\n", -+ va, va + size, domain_string(domain), -+ BOOL_TO_STR(aql_queue)); -+ -+ pr_debug("\t alloc_flag 0x%llx public %s readonly %s execute %s coherent %s no_sub %s\n", -+ alloc_flag, BOOL_TO_STR(public), -+ BOOL_TO_STR(readonly), BOOL_TO_STR(execute), -+ BOOL_TO_STR(coherent), BOOL_TO_STR(no_sub)); -+ -+ return __alloc_memory_of_gpu(kgd, va, size, vm, mem, -+ temp_offset, domain, -+ alloc_flag, sg, -+ aql_queue, readonly, execute, -+ coherent, no_sub, userptr); -+} -+ -+int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( -+ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) -+{ -+ struct amdgpu_device *adev; -+ struct kfd_bo_va_list *entry, *tmp; -+ struct bo_vm_reservation_context ctx; -+ int ret = 0; -+ struct ttm_validate_buffer *bo_list_entry; -+ struct amdkfd_process_info *process_info; -+ unsigned long bo_size; -+ -+ adev = get_amdgpu_device(kgd); -+ process_info = ((struct amdkfd_vm *)vm)->process_info; -+ -+ bo_size = mem->bo->tbo.mem.size; -+ -+ mutex_lock(&mem->lock); -+ -+ if (mem->mapped_to_gpu_memory > 0) { -+ pr_err("BO VA 0x%llx size 0x%lx is already mapped to vm %p.\n", -+ mem->va, bo_size, vm); -+ mutex_unlock(&mem->lock); -+ return -EBUSY; -+ } -+ -+ mutex_unlock(&mem->lock); -+ /* lock is not needed after this, since mem is unused and will -+ * be freed anyway -+ */ -+ -+ /* No more MMU notifiers */ -+ amdgpu_mn_unregister(mem->bo); -+ -+ /* Make sure restore workers don't access the BO any more */ -+ bo_list_entry = &mem->validate_list; -+ mutex_lock(&process_info->lock); -+ list_del(&bo_list_entry->head); -+ mutex_unlock(&process_info->lock); -+ -+ /* Free user pages if necessary */ -+ if (mem->user_pages) { -+ pr_debug("%s: Freeing user_pages array\n", __func__); -+ if (mem->user_pages[0]) -+ release_pages(mem->user_pages, -+ mem->bo->tbo.ttm->num_pages, 0); -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) -+ drm_free_large(mem->user_pages); -+#else -+ kvfree(mem->user_pages); -+#endif -+ } -+ -+ ret = reserve_bo_and_cond_vms(mem, NULL, VA_DO_NOT_CARE, &ctx); -+ if (unlikely(ret != 0)) -+ return ret; -+ -+ /* The eviction fence should be removed by the last unmap. 
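-+ * Passing ef with a NULL ef_list below removes just that fence,
-+ * without collecting the remaining ones.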
-+ * TODO: Log an error condition if the bo still has the eviction fence
-+ * attached
-+ */
-+ amdgpu_amdkfd_remove_eviction_fence(mem->bo,
-+ process_info->eviction_fence,
-+ NULL, NULL);
-+ pr_debug("Release VA 0x%llx - 0x%llx\n", mem->va,
-+ mem->va + bo_size * (1 + mem->aql_queue));
-+
-+ /* Remove from VM internal data structures */
-+ list_for_each_entry_safe(entry, tmp, &mem->bo_va_list, bo_list) {
-+ remove_bo_from_vm((struct amdgpu_device *)entry->kgd_dev,
-+ entry, bo_size);
-+ }
-+
-+ ret = unreserve_bo_and_vms(&ctx, false, false);
-+
-+ /* Free the sync object */
-+ amdgpu_sync_free(&mem->sync);
-+
-+ /* If the SG is not NULL, it's one we created for a doorbell
-+ * BO. We need to free it.
-+ */
-+ if (mem->bo->tbo.sg) {
-+ sg_free_table(mem->bo->tbo.sg);
-+ kfree(mem->bo->tbo.sg);
-+ }
-+
-+ /* Free the BO */
-+ amdgpu_bo_unref(&mem->bo);
-+ kfree(mem);
-+
-+ return ret;
-+}
-+
-+int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
-+ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm)
-+{
-+ struct amdgpu_device *adev;
-+ int ret;
-+ struct amdgpu_bo *bo;
-+ uint32_t domain;
-+ struct kfd_bo_va_list *entry;
-+ struct bo_vm_reservation_context ctx;
-+ struct kfd_bo_va_list *bo_va_entry = NULL;
-+ struct kfd_bo_va_list *bo_va_entry_aql = NULL;
-+ struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm;
-+ unsigned long bo_size;
-+ bool is_invalid_userptr;
-+
-+ adev = get_amdgpu_device(kgd);
-+
-+ /* Make sure restore is not running concurrently. Since we
-+ * don't map invalid userptr BOs, we rely on the next restore
-+ * worker to do the mapping
-+ */
-+ mutex_lock(&mem->process_info->lock);
-+
-+ /* Lock mmap-sem. If we find an invalid userptr BO, we can be
-+ * sure that the MMU notifier is no longer running
-+ * concurrently and the queues are actually stopped
-+ */
-+ down_read(&current->mm->mmap_sem);
-+ is_invalid_userptr = atomic_read(&mem->invalid);
-+ up_read(&current->mm->mmap_sem);
-+
-+ mutex_lock(&mem->lock);
-+
-+ bo = mem->bo;
-+
-+ if (!bo) {
-+ pr_err("Invalid BO when mapping memory to GPU\n");
-+ return -EINVAL;
-+ }
-+
-+ domain = mem->domain;
-+ bo_size = bo->tbo.mem.size;
-+
-+ pr_debug("Map VA 0x%llx - 0x%llx to vm %p domain %s\n",
-+ mem->va,
-+ mem->va + bo_size * (1 + mem->aql_queue),
-+ vm, domain_string(domain));
-+
-+ ret = reserve_bo_and_vm(mem, vm, &ctx);
-+ if (unlikely(ret != 0))
-+ goto bo_reserve_failed;
-+
-+ /* Userptr can be marked as "not invalid", but not actually be
-+ * validated yet (still in the system domain). In that case
-+ * the queues are still stopped and we can leave mapping for
-+ * the next restore worker
-+ */
-+ if (bo->tbo.mem.mem_type == TTM_PL_SYSTEM)
-+ is_invalid_userptr = true;
-+
-+ if (check_if_add_bo_to_vm((struct amdgpu_vm *)vm, mem)) {
-+ ret = add_bo_to_vm(adev, mem, (struct amdgpu_vm *)vm, false,
-+ &bo_va_entry);
-+ if (ret != 0)
-+ goto add_bo_to_vm_failed;
-+ if (mem->aql_queue) {
-+ ret = add_bo_to_vm(adev, mem, (struct amdgpu_vm *)vm,
-+ true, &bo_va_entry_aql);
-+ if (ret != 0)
-+ goto add_bo_to_vm_failed_aql;
-+ }
-+ }
-+
-+ if (mem->mapped_to_gpu_memory == 0 &&
-+ !amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) {
-+ /* Validate BO only once. The eviction fence gets added to BO
-+ * the first time it is mapped. Validate will wait for all
-+ * background evictions to complete.
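-+ * Userptr BOs are skipped here; those are validated by
-+ * init_user_pages() and by the restore worker instead.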
-+ */ -+ ret = amdgpu_amdkfd_bo_validate(bo, domain, true); -+ if (ret) { -+ pr_debug("Validate failed\n"); -+ goto map_bo_to_gpuvm_failed; -+ } -+ } -+ -+ list_for_each_entry(entry, &mem->bo_va_list, bo_list) { -+ if (entry->bo_va->base.vm == vm && !entry->is_mapped) { -+ pr_debug("\t map VA 0x%llx - 0x%llx in entry %p\n", -+ entry->va, entry->va + bo_size, -+ entry); -+ -+ ret = map_bo_to_gpuvm(adev, entry, ctx.sync, -+ is_invalid_userptr); -+ if (ret != 0) { -+ pr_err("Failed to map radeon bo to gpuvm\n"); -+ goto map_bo_to_gpuvm_failed; -+ } -+ entry->is_mapped = true; -+ mem->mapped_to_gpu_memory++; -+ pr_debug("\t INC mapping count %d\n", -+ mem->mapped_to_gpu_memory); -+ } -+ } -+ -+ if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm) == NULL) -+ amdgpu_bo_fence(bo, -+ &kfd_vm->process_info->eviction_fence->base, -+ true); -+ ret = unreserve_bo_and_vms(&ctx, false, false); -+ -+ mutex_unlock(&mem->process_info->lock); -+ mutex_unlock(&mem->lock); -+ return ret; -+ -+map_bo_to_gpuvm_failed: -+ if (bo_va_entry_aql) -+ remove_bo_from_vm(adev, bo_va_entry_aql, bo_size); -+add_bo_to_vm_failed_aql: -+ if (bo_va_entry) -+ remove_bo_from_vm(adev, bo_va_entry, bo_size); -+add_bo_to_vm_failed: -+ unreserve_bo_and_vms(&ctx, false, false); -+bo_reserve_failed: -+ mutex_unlock(&mem->process_info->lock); -+ mutex_unlock(&mem->lock); -+ return ret; -+} -+ -+static u64 get_vm_pd_gpu_offset(void *vm) -+{ -+ struct amdgpu_vm *avm = (struct amdgpu_vm *) vm; -+ struct amdgpu_device *adev = -+ amdgpu_ttm_adev(avm->root.base.bo->tbo.bdev); -+ u64 offset; -+ -+ BUG_ON(avm == NULL); -+ -+ amdgpu_bo_reserve(avm->root.base.bo, false); -+ -+ offset = amdgpu_bo_gpu_offset(avm->root.base.bo); -+ -+ amdgpu_bo_unreserve(avm->root.base.bo); -+ -+ /* On some ASICs the FB doesn't start at 0. Adjust FB offset -+ * to an actual MC address. 
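-+ * (Only applied when the ASIC provides a get_vm_pde callback.)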
-+ */ -+ if (adev->gart.gart_funcs->get_vm_pde) -+ offset = amdgpu_gart_get_vm_pde(adev, offset); -+ -+ return offset; -+} -+ -+int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm, -+ void **process_info, -+ struct dma_fence **ef) -+{ -+ int ret; -+ struct amdkfd_vm *new_vm; -+ struct amdkfd_process_info *info; -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ -+ new_vm = kzalloc(sizeof(*new_vm), GFP_KERNEL); -+ if (new_vm == NULL) -+ return -ENOMEM; -+ -+ /* Initialize the VM context, allocate the page directory and zero it */ -+ ret = amdgpu_vm_init(adev, &new_vm->base, AMDGPU_VM_CONTEXT_COMPUTE); -+ if (ret != 0) { -+ pr_err("Failed init vm ret %d\n", ret); -+ /* Undo everything related to the new VM context */ -+ goto vm_init_fail; -+ } -+ new_vm->adev = adev; -+ -+ if (!*process_info) { -+ info = kzalloc(sizeof(*info), GFP_KERNEL); -+ if (!info) { -+ pr_err("Failed to create amdkfd_process_info\n"); -+ ret = -ENOMEM; -+ goto alloc_process_info_fail; -+ } -+ -+ mutex_init(&info->lock); -+ INIT_LIST_HEAD(&info->vm_list_head); -+ INIT_LIST_HEAD(&info->kfd_bo_list); -+ INIT_LIST_HEAD(&info->userptr_valid_list); -+ INIT_LIST_HEAD(&info->userptr_inval_list); -+ -+ info->eviction_fence = -+ amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1), -+ current->mm); -+ if (info->eviction_fence == NULL) { -+ pr_err("Failed to create eviction fence\n"); -+ ret = -ENOMEM; -+ goto create_evict_fence_fail; -+ } -+ -+ info->pid = get_task_pid(current->group_leader, -+ PIDTYPE_PID); -+ atomic_set(&info->evicted_bos, 0); -+ INIT_DELAYED_WORK(&info->work, -+ amdgpu_amdkfd_restore_userptr_worker); -+ -+ *process_info = info; -+ *ef = dma_fence_get(&info->eviction_fence->base); -+ } -+ -+ new_vm->process_info = *process_info; -+ -+ mutex_lock(&new_vm->process_info->lock); -+ list_add_tail(&new_vm->vm_list_node, -+ &(new_vm->process_info->vm_list_head)); -+ new_vm->process_info->n_vms++; -+ mutex_unlock(&new_vm->process_info->lock); -+ -+ *vm = (void *) new_vm; -+ -+ pr_debug("Created process vm %p\n", *vm); -+ -+ return ret; -+ -+create_evict_fence_fail: -+ kfree(info); -+alloc_process_info_fail: -+ amdgpu_vm_fini(adev, &new_vm->base); -+vm_init_fail: -+ kfree(new_vm); -+ return ret; -+} -+ -+void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm) -+{ -+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -+ struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *) vm; -+ struct amdgpu_vm *avm = &kfd_vm->base; -+ struct amdgpu_bo *pd; -+ struct amdkfd_process_info *process_info; -+ -+ if (WARN_ON(!kgd || !vm)) -+ return; -+ -+ pr_debug("Destroying process vm %p\n", vm); -+ /* Release eviction fence from PD */ -+ pd = avm->root.base.bo; -+ amdgpu_bo_reserve(pd, false); -+ amdgpu_bo_fence(pd, NULL, false); -+ amdgpu_bo_unreserve(pd); -+ -+ process_info = kfd_vm->process_info; -+ -+ mutex_lock(&process_info->lock); -+ process_info->n_vms--; -+ list_del(&kfd_vm->vm_list_node); -+ mutex_unlock(&process_info->lock); -+ -+ /* Release per-process resources */ -+ if (!process_info->n_vms) { -+ WARN_ON(!list_empty(&process_info->kfd_bo_list)); -+ WARN_ON(!list_empty(&process_info->userptr_valid_list)); -+ WARN_ON(!list_empty(&process_info->userptr_inval_list)); -+ -+ dma_fence_put(&process_info->eviction_fence->base); -+ cancel_delayed_work_sync(&process_info->work); -+ put_pid(process_info->pid); -+ kfree(process_info); -+ } -+ -+ /* Release the VM context */ -+ amdgpu_vm_fini(adev, avm); -+ kfree(vm); -+} -+ -+uint32_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm) -+{ -+ return
get_vm_pd_gpu_offset(vm) >> AMDGPU_GPU_PAGE_SHIFT; -+} -+ -+int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd, -+ struct kfd_vm_fault_info *mem) -+{ -+ struct amdgpu_device *adev; -+ -+ adev = (struct amdgpu_device *) kgd; -+ if (atomic_read(&adev->mc.vm_fault_info_updated) == 1) { -+ *mem = *adev->mc.vm_fault_info; -+ mb(); -+ atomic_set(&adev->mc.vm_fault_info_updated, 0); -+ } -+ return 0; -+} -+ -+static bool is_mem_on_local_device(struct kgd_dev *kgd, -+ struct list_head *bo_va_list, void *vm) -+{ -+ struct kfd_bo_va_list *entry; -+ -+ list_for_each_entry(entry, bo_va_list, bo_list) { -+ if (entry->kgd_dev == kgd && entry->bo_va->base.vm == vm) -+ return true; -+ } -+ -+ return false; -+} -+ -+int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( -+ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) -+{ -+ struct kfd_bo_va_list *entry; -+ struct amdgpu_device *adev; -+ unsigned int mapped_before; -+ int ret = 0; -+ struct bo_vm_reservation_context ctx; -+ struct amdkfd_process_info *process_info; -+ unsigned long bo_size; -+ -+ adev = (struct amdgpu_device *) kgd; -+ process_info = ((struct amdkfd_vm *)vm)->process_info; -+ -+ bo_size = mem->bo->tbo.mem.size; -+ -+ mutex_lock(&mem->lock); -+ -+ /* -+ * Make sure that this BO is mapped on KGD before unmapping it -+ */ -+ if (!is_mem_on_local_device(kgd, &mem->bo_va_list, vm)) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ if (mem->mapped_to_gpu_memory == 0) { -+ pr_debug("BO VA 0x%llx size 0x%lx is not mapped to vm %p\n", -+ mem->va, bo_size, vm); -+ ret = -EINVAL; -+ goto out; -+ } -+ mapped_before = mem->mapped_to_gpu_memory; -+ -+ ret = reserve_bo_and_cond_vms(mem, vm, VA_MAPPED, &ctx); -+ if (unlikely(ret != 0)) -+ goto out; -+ -+ pr_debug("Unmap VA 0x%llx - 0x%llx from vm %p\n", -+ mem->va, -+ mem->va + bo_size * (1 + mem->aql_queue), -+ vm); -+ -+ list_for_each_entry(entry, &mem->bo_va_list, bo_list) { -+ if (entry->bo_va->base.vm == vm && entry->is_mapped) { -+ pr_debug("\t unmap VA 0x%llx - 0x%llx from entry %p\n", -+ entry->va, -+ entry->va + bo_size, -+ entry); -+ -+ ret = unmap_bo_from_gpuvm(adev, entry, ctx.sync); -+ if (ret == 0) { -+ entry->is_mapped = false; -+ } else { -+ pr_err("failed to unmap VA 0x%llx\n", -+ mem->va); -+ goto unreserve_out; -+ } -+ -+ mem->mapped_to_gpu_memory--; -+ pr_debug("\t DEC mapping count %d\n", -+ mem->mapped_to_gpu_memory); -+ } -+ } -+ -+ /* If BO is unmapped from all VMs, unfence it. It can be evicted if -+ * required.
-+ */ -+ if (mem->mapped_to_gpu_memory == 0 && -+ !amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm)) -+ amdgpu_amdkfd_remove_eviction_fence(mem->bo, -+ process_info->eviction_fence, -+ NULL, NULL); -+ -+ if (mapped_before == mem->mapped_to_gpu_memory) { -+ pr_debug("BO VA 0x%llx size 0x%lx is not mapped to vm %p\n", -+ mem->va, bo_size, vm); -+ ret = -EINVAL; -+ } -+ -+unreserve_out: -+ unreserve_bo_and_vms(&ctx, false, false); -+out: -+ mutex_unlock(&mem->lock); -+ return ret; -+} -+ -+int amdgpu_amdkfd_gpuvm_mmap_bo(struct kgd_dev *kgd, struct vm_area_struct *vma) -+{ -+ struct amdgpu_device *adev; -+ -+ adev = get_amdgpu_device(kgd); -+ if (!adev) { -+ pr_err("Could not get amdgpu device in %s\n", __func__); -+ return -ENODEV; -+ } -+ -+ return amdgpu_bo_mmap(NULL, vma, &adev->mman.bdev); -+} -+ -+int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd, -+ struct kgd_mem *mem, void **kptr) -+{ -+ int ret; -+ struct amdgpu_bo *bo = mem->bo; -+ -+ if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) { -+ pr_err("userptr can't be mapped to kernel\n"); -+ return -EINVAL; -+ } -+ -+ /* delete kgd_mem from kfd_bo_list to avoid re-validating -+ * this BO in BO's restoring after eviction. -+ */ -+ mutex_lock(&mem->process_info->lock); -+ -+ list_del_init(&mem->validate_list.head); -+ -+ ret = __map_bo_to_kernel(bo, AMDGPU_GEM_DOMAIN_GTT, kptr); -+ if (!ret) -+ mem->kptr = *kptr; -+ -+ mutex_unlock(&mem->process_info->lock); -+ -+ return ret; -+} -+ -+static int pin_bo_wo_map(struct kgd_mem *mem) -+{ -+ struct amdgpu_bo *bo = mem->bo; -+ int ret = 0; -+ -+ ret = amdgpu_bo_reserve(bo, false); -+ if (unlikely(ret != 0)) -+ return ret; -+ -+ ret = amdgpu_bo_pin(bo, mem->domain, NULL); -+ amdgpu_bo_unreserve(bo); -+ -+ return ret; -+} -+ -+static void unpin_bo_wo_map(struct kgd_mem *mem) -+{ -+ struct amdgpu_bo *bo = mem->bo; -+ int ret = 0; -+ -+ ret = amdgpu_bo_reserve(bo, false); -+ if (unlikely(ret != 0)) -+ return; -+ -+ amdgpu_bo_unpin(bo); -+ amdgpu_bo_unreserve(bo); -+} -+ -+#define AMD_GPU_PAGE_SHIFT PAGE_SHIFT -+#define AMD_GPU_PAGE_SIZE (_AC(1, UL) << AMD_GPU_PAGE_SHIFT) -+ -+static int get_sg_table(struct amdgpu_device *adev, -+ struct kgd_mem *mem, uint64_t offset, -+ uint64_t size, struct sg_table **ret_sg) -+{ -+ struct amdgpu_bo *bo = mem->bo; -+ struct sg_table *sg = NULL; -+ unsigned long bus_addr; -+ unsigned int chunks; -+ unsigned int i; -+ struct scatterlist *s; -+ uint64_t offset_in_page; -+ unsigned int page_size; -+ int ret; -+ -+ sg = kmalloc(sizeof(*sg), GFP_KERNEL); -+ if (!sg) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM) -+ page_size = AMD_GPU_PAGE_SIZE; -+ else -+ page_size = PAGE_SIZE; -+ -+ -+ offset_in_page = offset & (page_size - 1); -+ chunks = (size + offset_in_page + page_size - 1) -+ / page_size; -+ -+ ret = sg_alloc_table(sg, chunks, GFP_KERNEL); -+ if (unlikely(ret)) -+ goto out; -+ -+ if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM) { -+ bus_addr = bo->tbo.offset + adev->mc.aper_base + offset; -+ -+ for_each_sg(sg->sgl, s, sg->orig_nents, i) { -+ uint64_t chunk_size, length; -+ -+ chunk_size = page_size - offset_in_page; -+ length = min(size, chunk_size); -+ -+ sg_set_page(s, NULL, length, offset_in_page); -+ s->dma_address = bus_addr; -+ s->dma_length = length; -+ -+ size -= length; -+ offset_in_page = 0; -+ bus_addr += length; -+ } -+ } else { -+ struct page **pages; -+ unsigned int cur_page; -+ -+ pages = bo->tbo.ttm->pages; -+ -+ cur_page = offset / page_size; -+ for_each_sg(sg->sgl, s, sg->orig_nents, i) { 
-+ uint64_t chunk_size, length; -+ -+ chunk_size = page_size - offset_in_page; -+ length = min(size, chunk_size); -+ -+ sg_set_page(s, pages[cur_page], length, offset_in_page); -+ s->dma_address = page_to_phys(pages[cur_page]); -+ s->dma_length = length; -+ -+ size -= length; -+ offset_in_page = 0; -+ cur_page++; -+ } -+ } -+ -+ *ret_sg = sg; -+ return 0; -+out: -+ kfree(sg); -+ *ret_sg = NULL; -+ return ret; -+} -+ -+int amdgpu_amdkfd_gpuvm_pin_get_sg_table(struct kgd_dev *kgd, -+ struct kgd_mem *mem, uint64_t offset, -+ uint64_t size, struct sg_table **ret_sg) -+{ -+ int ret; -+ struct amdgpu_device *adev; -+ -+ ret = pin_bo_wo_map(mem); -+ if (unlikely(ret != 0)) -+ return ret; -+ -+ adev = get_amdgpu_device(kgd); -+ -+ ret = get_sg_table(adev, mem, offset, size, ret_sg); -+ if (ret) -+ unpin_bo_wo_map(mem); -+ -+ return ret; -+} -+ -+void amdgpu_amdkfd_gpuvm_unpin_put_sg_table( -+ struct kgd_mem *mem, struct sg_table *sg) -+{ -+ sg_free_table(sg); -+ kfree(sg); -+ -+ unpin_bo_wo_map(mem); -+} -+ -+int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd, -+ struct dma_buf *dma_buf, -+ uint64_t va, void *vm, -+ struct kgd_mem **mem, uint64_t *size, -+ uint64_t *mmap_offset) -+{ -+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd; -+ struct drm_gem_object *obj; -+ struct amdgpu_bo *bo; -+ struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm; -+ -+ if (dma_buf->ops != &drm_gem_prime_dmabuf_ops) -+ /* Can't handle non-graphics buffers */ -+ return -EINVAL; -+ -+ obj = dma_buf->priv; -+ if (obj->dev->dev_private != adev) -+ /* Can't handle buffers from other devices */ -+ return -EINVAL; -+ -+ bo = gem_to_amdgpu_bo(obj); -+ if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | -+ AMDGPU_GEM_DOMAIN_GTT | -+ AMDGPU_GEM_DOMAIN_DGMA))) -+ /* Only VRAM and GTT BOs are supported */ -+ return -EINVAL; -+ -+ *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); -+ if (*mem == NULL) -+ return -ENOMEM; -+ -+ if (size) -+ *size = amdgpu_bo_size(bo); -+ -+ if (mmap_offset) -+ *mmap_offset = amdgpu_bo_mmap_offset(bo); -+ -+ INIT_LIST_HEAD(&(*mem)->bo_va_list); -+ mutex_init(&(*mem)->lock); -+ (*mem)->mapping_flags = -+ AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE | -+ AMDGPU_VM_PAGE_EXECUTABLE | AMDGPU_VM_MTYPE_NC; -+ -+ (*mem)->bo = amdgpu_bo_ref(bo); -+ (*mem)->va = va; -+ if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) -+ (*mem)->domain = AMDGPU_GEM_DOMAIN_VRAM; -+ else if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_GTT) -+ (*mem)->domain = AMDGPU_GEM_DOMAIN_GTT; -+ else -+ (*mem)->domain = AMDGPU_GEM_DOMAIN_DGMA; -+ (*mem)->mapped_to_gpu_memory = 0; -+ (*mem)->process_info = kfd_vm->process_info; -+ add_kgd_mem_to_kfd_bo_list(*mem, kfd_vm->process_info, false); -+ amdgpu_sync_create(&(*mem)->sync); -+ -+ return 0; -+} -+ -+int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_dev *kgd, void *vm, -+ struct kgd_mem *mem, -+ struct dma_buf **dmabuf) -+{ -+ struct amdgpu_device *adev = NULL; -+ struct amdgpu_bo *bo = NULL; -+ struct drm_gem_object *gobj = NULL; -+ -+ if (!dmabuf || !kgd || !vm || !mem) -+ return -EINVAL; -+ -+ adev = get_amdgpu_device(kgd); -+ bo = mem->bo; -+ -+ gobj = amdgpu_gem_prime_foreign_bo(adev, bo); -+ if (gobj == NULL) { -+ pr_err("Export BO failed. 
Unable to find/create GEM object\n"); -+ return -EINVAL; -+ } -+ -+ *dmabuf = amdgpu_gem_prime_export(adev->ddev, gobj, 0); -+ return 0; -+} -+ -+static int process_validate_vms(struct amdkfd_process_info *process_info) -+{ -+ struct amdkfd_vm *peer_vm; -+ int ret; -+ -+ list_for_each_entry(peer_vm, &process_info->vm_list_head, -+ vm_list_node) { -+ ret = vm_validate_pt_pd_bos(&peer_vm->base); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+/* Evict a userptr BO by stopping the queues if necessary -+ * -+ * Runs in MMU notifier, may be in RECLAIM_FS context. This means it -+ * cannot do any memory allocations, and cannot take any locks that -+ * are held elsewhere while allocating memory. Therefore this is as -+ * simple as possible, using atomic counters. -+ * -+ * It doesn't do anything to the BO itself. The real work happens in -+ * restore, where we get updated page addresses. This function only -+ * ensures that GPU access to the BO is stopped. -+ */ -+int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, -+ struct mm_struct *mm) -+{ -+ struct amdkfd_process_info *process_info = mem->process_info; -+ int invalid, evicted_bos; -+ int r = 0; -+ -+ invalid = atomic_inc_return(&mem->invalid); -+ evicted_bos = atomic_inc_return(&process_info->evicted_bos); -+ if (evicted_bos == 1) { -+ /* First eviction, stop the queues */ -+ r = kgd2kfd->quiesce_mm(NULL, mm); -+ if (r != 0) -+ pr_err("Failed to quiesce KFD\n"); -+ schedule_delayed_work(&process_info->work, 1); -+ } -+ -+ return r; -+} -+ -+/* Update invalid userptr BOs -+ * -+ * Moves invalidated (evicted) userptr BOs from userptr_valid_list to -+ * userptr_inval_list and updates user pages for all BOs that have -+ * been invalidated since their last update. -+ */ -+static int update_invalid_user_pages(struct amdkfd_process_info *process_info, -+ struct mm_struct *mm) -+{ -+ struct kgd_mem *mem, *tmp_mem; -+ struct amdgpu_bo *bo; -+ int invalid, ret; -+ -+ /* Move all invalidated BOs to the userptr_inval_list and -+ * release their user pages by migration to the CPU domain -+ */ -+ list_for_each_entry_safe(mem, tmp_mem, -+ &process_info->userptr_valid_list, -+ validate_list.head) { -+ if (!atomic_read(&mem->invalid)) -+ continue; /* BO is still valid */ -+ -+ bo = mem->bo; -+ -+ if (amdgpu_bo_reserve(bo, true)) -+ return -EAGAIN; -+ amdgpu_ttm_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU); -+ ret = ttm_bo_validate(&bo->tbo, &bo->placement, false, false); -+ amdgpu_bo_unreserve(bo); -+ if (ret) { -+ pr_err("%s: Failed to invalidate userptr BO\n", -+ __func__); -+ return -EAGAIN; -+ } -+ -+ list_move_tail(&mem->validate_list.head, -+ &process_info->userptr_inval_list); -+ } -+ -+ if (list_empty(&process_info->userptr_inval_list)) -+ return 0; /* All evicted userptr BOs were freed */ -+ -+ /* Go through userptr_inval_list and update any invalid user_pages */ -+ list_for_each_entry(mem, &process_info->userptr_inval_list, -+ validate_list.head) { -+ invalid = atomic_read(&mem->invalid); -+ if (!invalid) -+ /* BO hasn't been invalidated since the last -+ * revalidation attempt. Keep its BO list. 
-+ */ -+ continue; -+ -+ bo = mem->bo; -+ -+ if (!mem->user_pages) { -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) -+ mem->user_pages = -+ drm_calloc_large(bo->tbo.ttm->num_pages, -+ sizeof(struct page *)); -+#else -+ mem->user_pages = -+ kvmalloc_array(bo->tbo.ttm->num_pages, -+ sizeof(struct page *), -+ GFP_KERNEL | __GFP_ZERO); -+#endif -+ if (!mem->user_pages) { -+ pr_err("%s: Failed to allocate pages array\n", -+ __func__); -+ return -ENOMEM; -+ } -+ } else if (mem->user_pages[0]) { -+ release_pages(mem->user_pages, -+ bo->tbo.ttm->num_pages, 0); -+ } -+ -+ /* Get updated user pages */ -+ ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, -+ mem->user_pages); -+ if (ret) { -+ mem->user_pages[0] = NULL; -+ pr_info("%s: Failed to get user pages: %d\n", -+ __func__, ret); -+ /* Pretend it succeeded. It will fail later -+ * with a VM fault if the GPU tries to access -+ * it. Better than hanging indefinitely with -+ * stalled user mode queues. -+ */ -+ } -+ -+ /* Mark the BO as valid unless it was invalidated -+ * again concurrently -+ */ -+ if (atomic_cmpxchg(&mem->invalid, invalid, 0) != invalid) -+ return -EAGAIN; -+ } -+ return 0; -+} -+ -+/* Validate invalid userptr BOs -+ * -+ * Validates BOs on the userptr_inval_list, and moves them back to the -+ * userptr_valid_list. Also updates GPUVM page tables with new page -+ * addresses and waits for the page table updates to complete. -+ */ -+static int validate_invalid_user_pages(struct amdkfd_process_info *process_info) -+{ -+ struct amdgpu_bo_list_entry *pd_bo_list_entries; -+ struct list_head resv_list, duplicates; -+ struct ww_acquire_ctx ticket; -+ struct amdgpu_sync sync; -+ -+ struct amdkfd_vm *peer_vm; -+ struct kgd_mem *mem, *tmp_mem; -+ struct amdgpu_bo *bo; -+ int i, ret; -+ -+ pd_bo_list_entries = kcalloc(process_info->n_vms, -+ sizeof(struct amdgpu_bo_list_entry), -+ GFP_KERNEL); -+ if (!pd_bo_list_entries) { -+ pr_err("%s: Failed to allocate PD BO list entries\n", __func__); -+ return -ENOMEM; -+ } -+ -+ INIT_LIST_HEAD(&resv_list); -+ INIT_LIST_HEAD(&duplicates); -+ -+ /* Get all the page directory BOs that need to be reserved */ -+ i = 0; -+ list_for_each_entry(peer_vm, &process_info->vm_list_head, -+ vm_list_node) -+ amdgpu_vm_get_pd_bo(&peer_vm->base, &resv_list, -+ &pd_bo_list_entries[i++]); -+ /* Add the userptr_inval_list entries to resv_list */ -+ list_for_each_entry(mem, &process_info->userptr_inval_list, -+ validate_list.head) { -+ list_add_tail(&mem->resv_list.head, &resv_list); -+ mem->resv_list.bo = mem->validate_list.bo; -+ mem->resv_list.shared = mem->validate_list.shared; -+ } -+ -+ /* Reserve all BOs and page tables for validation */ -+ ret = ttm_eu_reserve_buffers(&ticket, &resv_list, false, &duplicates); -+ WARN(!list_empty(&duplicates), "Duplicates should be empty"); -+ if (ret) -+ goto out; -+ -+ amdgpu_sync_create(&sync); -+ -+ /* Avoid triggering eviction fences when unmapping invalid -+ * userptr BOs (waits for all fences, doesn't use -+ * FENCE_OWNER_VM) -+ */ -+ list_for_each_entry(peer_vm, &process_info->vm_list_head, -+ vm_list_node) -+ amdgpu_amdkfd_remove_eviction_fence(peer_vm->base.root.base.bo, -+ process_info->eviction_fence, -+ NULL, NULL); -+ -+ ret = process_validate_vms(process_info); -+ if (ret) -+ goto unreserve_out; -+ -+ /* Validate BOs and update GPUVM page tables */ -+ list_for_each_entry_safe(mem, tmp_mem, -+ &process_info->userptr_inval_list, -+ validate_list.head) { -+ struct kfd_bo_va_list *bo_va_entry; -+ -+ bo = mem->bo; -+ -+ /* Copy pages array and validate the BO if we 
got user pages */ -+ if (mem->user_pages[0]) { -+ amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, -+ mem->user_pages); -+ amdgpu_ttm_placement_from_domain(bo, mem->domain); -+ ret = ttm_bo_validate(&bo->tbo, &bo->placement, -+ false, false); -+ if (ret) { -+ pr_err("%s: failed to validate BO\n", __func__); -+ goto unreserve_out; -+ } -+ } -+ -+ /* Validate succeeded, now the BO owns the pages, free -+ * our copy of the pointer array. Put this BO back on -+ * the userptr_valid_list. If we need to revalidate -+ * it, we need to start from scratch. -+ */ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) -+ drm_free_large(mem->user_pages); -+#else -+ kvfree(mem->user_pages); -+#endif -+ mem->user_pages = NULL; -+ list_move_tail(&mem->validate_list.head, -+ &process_info->userptr_valid_list); -+ -+ /* Update mapping. If the BO was not validated -+ * (because we couldn't get user pages), this will -+ * clear the page table entries, which will result in -+ * VM faults if the GPU tries to access the invalid -+ * memory. -+ */ -+ list_for_each_entry(bo_va_entry, &mem->bo_va_list, bo_list) { -+ if (!bo_va_entry->is_mapped) -+ continue; -+ -+ ret = update_gpuvm_pte((struct amdgpu_device *) -+ bo_va_entry->kgd_dev, -+ bo_va_entry, &sync); -+ if (ret) { -+ pr_err("%s: update PTE failed\n", __func__); -+ /* make sure this gets validated again */ -+ atomic_inc(&mem->invalid); -+ goto unreserve_out; -+ } -+ } -+ } -+unreserve_out: -+ list_for_each_entry(peer_vm, &process_info->vm_list_head, -+ vm_list_node) -+ amdgpu_bo_fence(peer_vm->base.root.base.bo, -+ &process_info->eviction_fence->base, true); -+ ttm_eu_backoff_reservation(&ticket, &resv_list); -+ amdgpu_sync_wait(&sync, false); -+ amdgpu_sync_free(&sync); -+out: -+ kfree(pd_bo_list_entries); -+ -+ return ret; -+} -+ -+/* Worker callback to restore evicted userptr BOs -+ * -+ * Tries to update and validate all userptr BOs. If successful and no -+ * concurrent evictions happened, the queues are restarted. Otherwise, -+ * reschedule for another attempt later. -+ */ -+static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) -+{ -+ struct delayed_work *dwork = to_delayed_work(work); -+ struct amdkfd_process_info *process_info = -+ container_of(dwork, struct amdkfd_process_info, work); -+ struct task_struct *usertask; -+ struct mm_struct *mm; -+ int evicted_bos; -+ -+ evicted_bos = atomic_read(&process_info->evicted_bos); -+ if (!evicted_bos) -+ return; -+ -+ /* Reference task and mm in case of concurrent process termination */ -+ usertask = get_pid_task(process_info->pid, PIDTYPE_PID); -+ if (!usertask) -+ return; -+ mm = get_task_mm(usertask); -+ if (!mm) { -+ put_task_struct(usertask); -+ return; -+ } -+ -+ mutex_lock(&process_info->lock); -+ -+ if (update_invalid_user_pages(process_info, mm)) -+ goto unlock_out; -+ /* userptr_inval_list can be empty if all evicted userptr BOs -+ * have been freed. In that case there is nothing to validate -+ * and we can just restart the queues. -+ */ -+ if (!list_empty(&process_info->userptr_inval_list)) { -+ if (atomic_read(&process_info->evicted_bos) != evicted_bos) -+ goto unlock_out; /* Concurrent eviction, try again */ -+ -+ if (validate_invalid_user_pages(process_info)) -+ goto unlock_out; -+ } -+ /* Final check for concurrent eviction and atomic update. If -+ * another eviction happens after successful update, it will -+ * be a first eviction that calls quiesce_mm. The eviction -+ * reference counting inside KFD will handle this case.
-+ */ -+ if (atomic_cmpxchg(&process_info->evicted_bos, evicted_bos, 0) != -+ evicted_bos) -+ goto unlock_out; -+ evicted_bos = 0; -+ if (kgd2kfd->resume_mm(NULL, mm)) { -+ pr_err("%s: Failed to resume KFD\n", __func__); -+ /* No recovery from this failure. Probably the CP is -+ * hanging. No point trying again. -+ */ -+ } -+unlock_out: -+ mutex_unlock(&process_info->lock); -+ mmput(mm); -+ put_task_struct(usertask); -+ -+ /* If validation failed, reschedule another attempt */ -+ if (evicted_bos) -+ schedule_delayed_work(&process_info->work, 1); -+} -+ -+/** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given -+ * KFD process identified by process_info -+ * -+ * @process_info: amdkfd_process_info of the KFD process -+ * -+ * After memory eviction, restore thread calls this function. The function -+ * should be called when the process is still valid. BO restore involves: -+ * -+ * 1. Release old eviction fence and create new one -+ * 2. Get two copies of PD BO list from all the VMs. Keep one copy as pd_list. -+ * 3. Use the second PD list and kfd_bo_list to create a list (ctx.list) of -+ * BOs that need to be reserved. -+ * 4. Reserve all the BOs -+ * 5. Validate PD and PT BOs. -+ * 6. Validate all KFD BOs using kfd_bo_list, map them and add a new fence -+ * 7. Add fence to all PD and PT BOs. -+ * 8. Unreserve all BOs -+ */ -+ -+int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef) -+{ -+ struct amdgpu_bo_list_entry *pd_bo_list; -+ struct amdkfd_process_info *process_info = info; -+ struct amdkfd_vm *peer_vm; -+ struct kgd_mem *mem; -+ struct bo_vm_reservation_context ctx; -+ struct amdgpu_amdkfd_fence *new_fence; -+ int ret = 0, i; -+ struct list_head duplicate_save; -+ struct amdgpu_sync sync_obj; -+ -+ INIT_LIST_HEAD(&duplicate_save); -+ INIT_LIST_HEAD(&ctx.list); -+ INIT_LIST_HEAD(&ctx.duplicates); -+ -+ pd_bo_list = kcalloc(process_info->n_vms, -+ sizeof(struct amdgpu_bo_list_entry), -+ GFP_KERNEL); -+ if (pd_bo_list == NULL) -+ return -ENOMEM; -+ -+ i = 0; -+ mutex_lock(&process_info->lock); -+ list_for_each_entry(peer_vm, &process_info->vm_list_head, -+ vm_list_node) -+ amdgpu_vm_get_pd_bo(&peer_vm->base, &ctx.list, -+ &pd_bo_list[i++]); -+ -+ /* Reserve all BOs and page tables/directory. Add all BOs from -+ * kfd_bo_list to ctx.list -+ */ -+ list_for_each_entry(mem, &process_info->kfd_bo_list, -+ validate_list.head) { -+ -+ list_add_tail(&mem->resv_list.head, &ctx.list); -+ mem->resv_list.bo = mem->validate_list.bo; -+ mem->resv_list.shared = mem->validate_list.shared; -+ } -+ -+ ret = ttm_eu_reserve_buffers(&ctx.ticket, &ctx.list, -+ false, &duplicate_save); -+ if (ret) { -+ pr_debug("Memory eviction: TTM Reserve Failed. Try again\n"); -+ goto ttm_reserve_fail; -+ } -+ -+ amdgpu_sync_create(&sync_obj); -+ ctx.sync = &sync_obj; -+ -+ /* Validate PDs and PTs */ -+ ret = process_validate_vms(process_info); -+ if (ret) -+ goto validate_map_fail; -+ -+ /* Wait for PD/PTs validate to finish */ -+ /* FIXME: I think this isn't needed */ -+ list_for_each_entry(peer_vm, &process_info->vm_list_head, -+ vm_list_node) { -+ struct amdgpu_bo *bo = peer_vm->base.root.base.bo; -+ -+ ttm_bo_wait(&bo->tbo, false, false); -+ } -+ -+ /* Validate BOs and map them to GPUVM (update VM page tables).
*/ -+ list_for_each_entry(mem, &process_info->kfd_bo_list, -+ validate_list.head) { -+ -+ struct amdgpu_bo *bo = mem->bo; -+ uint32_t domain = mem->domain; -+ struct kfd_bo_va_list *bo_va_entry; -+ -+ ret = amdgpu_amdkfd_bo_validate(bo, domain, false); -+ if (ret) { -+ pr_debug("Memory eviction: Validate BOs failed. Try again\n"); -+ goto validate_map_fail; -+ } -+ -+ list_for_each_entry(bo_va_entry, &mem->bo_va_list, -+ bo_list) { -+ ret = update_gpuvm_pte((struct amdgpu_device *) -+ bo_va_entry->kgd_dev, -+ bo_va_entry, -+ ctx.sync); -+ if (ret) { -+ pr_debug("Memory eviction: update PTE failed. Try again\n"); -+ goto validate_map_fail; -+ } -+ } -+ } -+ -+ amdgpu_sync_wait(ctx.sync, false); -+ -+ /* Release old eviction fence and create new one, because fence only -+ * goes from unsignaled to signaled, fence cannot be reused. -+ * Use context and mm from the old fence. -+ */ -+ new_fence = amdgpu_amdkfd_fence_create( -+ process_info->eviction_fence->base.context, -+ process_info->eviction_fence->mm); -+ if (!new_fence) { -+ pr_err("Failed to create eviction fence\n"); -+ ret = -ENOMEM; -+ goto validate_map_fail; -+ } -+ dma_fence_put(&process_info->eviction_fence->base); -+ process_info->eviction_fence = new_fence; -+ *ef = dma_fence_get(&new_fence->base); -+ -+ /* Wait for validate to finish and attach new eviction fence */ -+ list_for_each_entry(mem, &process_info->kfd_bo_list, -+ validate_list.head) -+ ttm_bo_wait(&mem->bo->tbo, false, false); -+ list_for_each_entry(mem, &process_info->kfd_bo_list, -+ validate_list.head) -+ amdgpu_bo_fence(mem->bo, -+ &process_info->eviction_fence->base, true); -+ -+ /* Attach eviction fence to PD / PT BOs */ -+ list_for_each_entry(peer_vm, &process_info->vm_list_head, -+ vm_list_node) { -+ struct amdgpu_bo *bo = peer_vm->base.root.base.bo; -+ -+ amdgpu_bo_fence(bo, &process_info->eviction_fence->base, true); -+ } -+validate_map_fail: -+ ttm_eu_backoff_reservation(&ctx.ticket, &ctx.list); -+ amdgpu_sync_free(&sync_obj); -+ttm_reserve_fail: -+ mutex_unlock(&process_info->lock); -+evict_fence_fail: -+ kfree(pd_bo_list); -+ return ret; -+} -+ -+int amdgpu_amdkfd_copy_mem_to_mem(struct kgd_dev *kgd, struct kgd_mem *src_mem, -+ uint64_t src_offset, struct kgd_mem *dst_mem, -+ uint64_t dst_offset, uint64_t size, -+ struct dma_fence **f, uint64_t *actual_size) -+{ -+ struct amdgpu_device *adev = NULL; -+ struct ttm_mem_reg *src = NULL, *dst = NULL; -+ struct ttm_buffer_object *src_ttm_bo, *dst_ttm_bo; -+ struct drm_mm_node *src_mm, *dst_mm; -+ struct amdgpu_ring *ring; -+ struct ww_acquire_ctx ticket; -+ struct list_head list; -+ struct ttm_validate_buffer resv_list[2]; -+ uint64_t src_start, dst_start; -+ uint64_t src_left, dst_left, cur_copy_size, total_copy_size = 0; -+ struct dma_fence *fence = NULL; -+ int r; -+ -+ if (!kgd || !src_mem || !dst_mem) -+ return -EINVAL; -+ -+ if (actual_size) -+ *actual_size = 0; -+ -+ adev = get_amdgpu_device(kgd); -+ src_ttm_bo = &src_mem->bo->tbo; -+ dst_ttm_bo = &dst_mem->bo->tbo; -+ src = &src_ttm_bo->mem; -+ dst = &dst_ttm_bo->mem; -+ src_mm = (struct drm_mm_node *)src->mm_node; -+ dst_mm = (struct drm_mm_node *)dst->mm_node; -+ -+ ring = adev->mman.buffer_funcs_ring; -+ -+ INIT_LIST_HEAD(&list); -+ -+ resv_list[0].bo = src_ttm_bo; -+ resv_list[0].shared = true; -+ resv_list[1].bo = dst_ttm_bo; -+ resv_list[1].shared = true; -+ -+ list_add_tail(&resv_list[0].head, &list); -+ list_add_tail(&resv_list[1].head, &list); -+ -+ if (!ring->ready) { -+ pr_err("Trying to move memory with ring turned off.\n"); -+ return 
-EINVAL; -+ } -+ -+ r = ttm_eu_reserve_buffers(&ticket, &list, false, NULL); -+ if (r) { -+ pr_err("Copy buffer failed. Unable to reserve bo (%d)\n", r); -+ return r; -+ } -+ -+ switch (src->mem_type) { -+ case TTM_PL_TT: -+ r = amdgpu_ttm_bind(src_ttm_bo, src); -+ if (r) { -+ DRM_ERROR("Copy failed. Cannot bind to gart\n"); -+ goto copy_fail; -+ } -+ break; -+ case TTM_PL_VRAM: -+ /* VRAM could be scattered. Find the node in which the offset -+ * belongs to -+ */ -+ while (src_offset >= (src_mm->size << PAGE_SHIFT)) { -+ src_offset -= (src_mm->size << PAGE_SHIFT); -+ ++src_mm; -+ } -+ break; -+ default: -+ DRM_ERROR("Unknown placement %d\n", src->mem_type); -+ r = -EINVAL; -+ goto copy_fail; -+ } -+ src_start = src_mm->start << PAGE_SHIFT; -+ src_start += src_ttm_bo->bdev->man[src->mem_type].gpu_offset; -+ src_start += src_offset; -+ src_left = (src_mm->size << PAGE_SHIFT) - src_offset; -+ -+ switch (dst->mem_type) { -+ case TTM_PL_TT: -+ r = amdgpu_ttm_bind(dst_ttm_bo, dst); -+ if (r) { -+ DRM_ERROR("Copy failed. Cannot bind to gart\n"); -+ goto copy_fail; -+ } -+ break; -+ case TTM_PL_VRAM: -+ while (dst_offset >= (dst_mm->size << PAGE_SHIFT)) { -+ dst_offset -= (dst_mm->size << PAGE_SHIFT); -+ ++dst_mm; -+ } -+ break; -+ default: -+ DRM_ERROR("Unknown placement %d\n", dst->mem_type); -+ r = -EINVAL; -+ goto copy_fail; -+ } -+ dst_start = dst_mm->start << PAGE_SHIFT; -+ dst_start += dst_ttm_bo->bdev->man[dst->mem_type].gpu_offset; -+ dst_start += dst_offset; -+ dst_left = (dst_mm->size << PAGE_SHIFT) - dst_offset; -+ -+ do { -+ struct dma_fence *next; -+ -+ /* src_left/dst_left: amount of space left in the current node -+ * Copy minimum of (src_left, dst_left, amount of bytes left to -+ * copy) -+ */ -+ cur_copy_size = min3(src_left, dst_left, -+ (size - total_copy_size)); -+ -+ r = amdgpu_copy_buffer(ring, src_start, dst_start, -+ cur_copy_size, NULL, &next, false, false); -+ if (r) -+ break; -+ -+ /* Just keep the last fence */ -+ dma_fence_put(fence); -+ fence = next; -+ -+ total_copy_size += cur_copy_size; -+ /* Required amount of bytes copied. Done. */ -+ if (total_copy_size >= size) -+ break; -+ -+ /* If end of src or dst node is reached, move to next node */ -+ src_left -= cur_copy_size; -+ if (!src_left) { -+ ++src_mm; -+ src_start = src_mm->start << PAGE_SHIFT; -+ src_start += -+ src_ttm_bo->bdev->man[src->mem_type].gpu_offset; -+ src_left = src_mm->size << PAGE_SHIFT; -+ } else -+ src_start += cur_copy_size; -+ -+ dst_left -= cur_copy_size; -+ if (!dst_left) { -+ ++dst_mm; -+ dst_start = dst_mm->start << PAGE_SHIFT; -+ dst_start += -+ dst_ttm_bo->bdev->man[dst->mem_type].gpu_offset; -+ dst_left = dst_mm->size << PAGE_SHIFT; -+ } else -+ dst_start += cur_copy_size; -+ -+ } while (total_copy_size < size); -+ -+ /* Failure could occur after partial copy. 
So still fill in the amount -+ * copied and the fence. -+ */ -+ if (actual_size) -+ *actual_size = total_copy_size; -+ -+ if (fence) { -+ amdgpu_bo_fence(src_mem->bo, fence, true); -+ amdgpu_bo_fence(dst_mem->bo, fence, true); -+ } -+ -+ if (f) -+ *f = fence; -+ -+copy_fail: -+ ttm_eu_backoff_reservation(&ticket, &list); -+ return r; -+} -+ -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c -index 9c472c5..2be2e05 100644 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c -@@ -817,11 +817,7 @@ static struct drm_driver kms_driver = { - .driver_features = - DRIVER_USE_AGP | - DRIVER_HAVE_IRQ | DRIVER_IRQ_SHARED | DRIVER_GEM | --#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) - DRIVER_PRIME | DRIVER_RENDER | DRIVER_MODESET | DRIVER_SYNCOBJ, --#else -- DRIVER_PRIME | DRIVER_RENDER | DRIVER_MODESET, --#endif - .load = amdgpu_driver_load_kms, - .open = amdgpu_driver_open_kms, - .postclose = amdgpu_driver_postclose_kms, -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c -old mode 100644 -new mode 100755 -index 283dc1b..f421505 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c -@@ -36,6 +36,7 @@ - #include <drm/drm_cache.h> - #include "amdgpu.h" - #include "amdgpu_trace.h" -+#include "amdgpu_amdkfd.h" - - static void amdgpu_ttm_bo_destroy(struct ttm_buffer_object *tbo) - { -@@ -46,6 +47,8 @@ static void amdgpu_ttm_bo_destroy(struct ttm_buffer_object *tbo) - - if (bo->tbo.mem.mem_type == AMDGPU_PL_DGMA_IMPORT) - kfree(tbo->mem.bus.addr); -+ if (bo->kfd_bo) -+ amdgpu_amdkfd_unreserve_system_memory_limit(bo); - amdgpu_bo_kunmap(bo); - - if (bo->gem_base.import_attach) -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h -old mode 100644 -new mode 100755 -index 8a91658..f73dba5 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h -@@ -89,6 +89,7 @@ struct amdgpu_bo { - - struct ttm_bo_kmap_obj dma_buf_vmap; - struct amdgpu_mn *mn; -+ struct kgd_mem *kfd_bo; - - union { - struct list_head mn_list; -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h -old mode 100644 -new mode 100755 -index 322d2529..af8e544 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h -@@ -36,6 +36,7 @@ - /* some special values for the owner field */ - #define AMDGPU_FENCE_OWNER_UNDEFINED ((void*)0ul) - #define AMDGPU_FENCE_OWNER_VM ((void*)1ul) -+#define AMDGPU_FENCE_OWNER_KFD ((void *)2ul) - - #define AMDGPU_FENCE_FLAG_64BIT (1 << 0) - #define AMDGPU_FENCE_FLAG_INT (1 << 1) -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c -old mode 100644 -new mode 100755 -index c586f44..7ee8247 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c -@@ -31,6 +31,7 @@ - #include <drm/drmP.h> - #include "amdgpu.h" - #include "amdgpu_trace.h" -+#include "amdgpu_amdkfd.h" - - struct amdgpu_sync_entry { - struct hlist_node node; -@@ -84,11 +85,20 @@ static bool amdgpu_sync_same_dev(struct amdgpu_device *adev, - */ - static void *amdgpu_sync_get_owner(struct dma_fence *f) - { -- struct amd_sched_fence *s_fence = to_amd_sched_fence(f); -+ struct amd_sched_fence *s_fence; -+ struct amdgpu_amdkfd_fence *kfd_fence; -+ -+ if (f == NULL) -+ return AMDGPU_FENCE_OWNER_UNDEFINED; - -+ s_fence =
to_amd_sched_fence(f); - if (s_fence) - return s_fence->owner; - -+ kfd_fence = to_amdgpu_amdkfd_fence(f); -+ if (kfd_fence) -+ return AMDGPU_FENCE_OWNER_KFD; -+ - return AMDGPU_FENCE_OWNER_UNDEFINED; - } - -@@ -171,7 +181,8 @@ int amdgpu_sync_fence(struct amdgpu_device *adev, struct amdgpu_sync *sync, - * @resv: reservation object with embedded fence - * @shared: true if we should only sync to the exclusive fence - * -- * Sync to the fence -+ * Sync to the fence except if it is KFD eviction fence and owner is -+ * AMDGPU_FENCE_OWNER_VM. - */ - int amdgpu_sync_resv(struct amdgpu_device *adev, - struct amdgpu_sync *sync, -@@ -198,11 +209,15 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, - for (i = 0; i < flist->shared_count; ++i) { - f = rcu_dereference_protected(flist->shared[i], - reservation_object_held(resv)); -+ fence_owner = amdgpu_sync_get_owner(f); -+ if (fence_owner == AMDGPU_FENCE_OWNER_KFD && -+ owner != AMDGPU_FENCE_OWNER_UNDEFINED) -+ continue; -+ - if (amdgpu_sync_same_dev(adev, f)) { - /* VM updates are only interesting - * for other VM updates and moves. - */ -- fence_owner = amdgpu_sync_get_owner(f); - if ((owner != AMDGPU_FENCE_OWNER_UNDEFINED) && - (fence_owner != AMDGPU_FENCE_OWNER_UNDEFINED) && - ((owner == AMDGPU_FENCE_OWNER_VM) != -@@ -297,6 +312,31 @@ struct dma_fence *amdgpu_sync_get_fence(struct amdgpu_sync *sync) - return NULL; - } - -+int amdgpu_sync_clone(struct amdgpu_device *adev, -+ struct amdgpu_sync *source, -+ struct amdgpu_sync *clone) -+{ -+ struct amdgpu_sync_entry *e; -+ struct hlist_node *tmp; -+ struct dma_fence *f; -+ int i, r; -+ -+ hash_for_each_safe(source->fences, i, tmp, e, node) { -+ -+ f = e->fence; -+ if (!dma_fence_is_signaled(f)) { -+ r = amdgpu_sync_fence(adev, clone, f); -+ if (r) -+ return r; -+ } else { -+ hash_del(&e->node); -+ dma_fence_put(f); -+ kmem_cache_free(amdgpu_sync_slab, e); -+ } -+ } -+ return 0; -+} -+ - int amdgpu_sync_wait(struct amdgpu_sync *sync, bool intr) - { - struct amdgpu_sync_entry *e; -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h -old mode 100644 -new mode 100755 -index dc76879..8e29bc7 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h -@@ -49,6 +49,8 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, - struct dma_fence *amdgpu_sync_peek_fence(struct amdgpu_sync *sync, - struct amdgpu_ring *ring); - struct dma_fence *amdgpu_sync_get_fence(struct amdgpu_sync *sync); -+int amdgpu_sync_clone(struct amdgpu_device *adev, struct amdgpu_sync *source, -+ struct amdgpu_sync *clone); - int amdgpu_sync_wait(struct amdgpu_sync *sync, bool intr); - void amdgpu_sync_free(struct amdgpu_sync *sync); - int amdgpu_sync_init(void); -diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h -old mode 100644 -new mode 100755 -index 9f34fab..f22f7a8 ---- a/drivers/gpu/drm/amd/amdgpu/soc15d.h -+++ b/drivers/gpu/drm/amd/amdgpu/soc15d.h -@@ -272,6 +272,7 @@ - # define PACKET3_INVALIDATE_TLBS_DST_SEL(x) ((x) << 0) - # define PACKET3_INVALIDATE_TLBS_ALL_HUB(x) ((x) << 4) - # define PACKET3_INVALIDATE_TLBS_PASID(x) ((x) << 5) -+# define PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(x) ((x) << 29) - #define PACKET3_SET_RESOURCES 0xA0 - /* 1. header - * 2. 
CONTROL -diff --git a/drivers/gpu/drm/amd/amdgpu/vid.h b/drivers/gpu/drm/amd/amdgpu/vid.h -old mode 100644 -new mode 100755 -index 323e21c..d09592a ---- a/drivers/gpu/drm/amd/amdgpu/vid.h -+++ b/drivers/gpu/drm/amd/amdgpu/vid.h -@@ -27,6 +27,8 @@ - #define SDMA1_REGISTER_OFFSET 0x200 /* not a register */ - #define SDMA_MAX_INSTANCE 2 - -+#define KFD_VI_SDMA_QUEUE_OFFSET 0x80 /* not a register */ -+ - /* crtc instance offsets */ - #define CRTC0_REGISTER_OFFSET (0x1b9c - 0x1b9c) - #define CRTC1_REGISTER_OFFSET (0x1d9c - 0x1b9c) -diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile -old mode 100644 -new mode 100755 -index f55a0f8..dba08ec ---- a/drivers/gpu/drm/amd/amdkfd/Makefile -+++ b/drivers/gpu/drm/amd/amdkfd/Makefile -@@ -26,5 +26,3 @@ amdkfd-$(CONFIG_DEBUG_FS) += kfd_debugfs.o - - obj-$(CONFIG_HSA_AMD) += amdkfd.o - --AMDKFD_FULL_PATH = $(src) --include $(AMDKFD_FULL_PATH)/backport/Makefile -diff --git a/drivers/gpu/drm/amd/amdkfd/backport/backport.h b/drivers/gpu/drm/amd/amdkfd/backport/backport.h -index 8b13b98..e1f8c1d 100644 ---- a/drivers/gpu/drm/amd/amdkfd/backport/backport.h -+++ b/drivers/gpu/drm/amd/amdkfd/backport/backport.h -@@ -2,12 +2,5 @@ - #define AMDKFD_BACKPORT_H - - #include <linux/version.h> --#if defined(BUILD_AS_DKMS) --#include <kcl/kcl_amd_asic_type.h> --#endif --#include <kcl/kcl_compat.h> --#include <kcl/kcl_pci.h> --#include <kcl/kcl_mn.h> --#include <kcl/kcl_fence.h> - - #endif -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c -index b2795af..207a05e 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c -@@ -25,9 +25,7 @@ - #include <linux/err.h> - #include <linux/fs.h> - #include <linux/sched.h> --#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) - #include <linux/sched/mm.h> --#endif - #include <linux/slab.h> - #include <linux/uaccess.h> - #include <linux/compat.h> -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c -index 5f597a6..4e94081 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c -@@ -811,11 +811,7 @@ static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size, - */ - pgdat = NODE_DATA(numa_node_id); - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) -- mem_in_bytes += pgdat->node_zones[zone_type].present_pages; --#else - mem_in_bytes += pgdat->node_zones[zone_type].managed_pages; --#endif - mem_in_bytes <<= PAGE_SHIFT; - - sub_type_hdr->length_low = lower_32_bits(mem_in_bytes); -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c -index c6b447d..6b3a1fa 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c -@@ -326,11 +326,6 @@ static void kfd_gtt_sa_fini(struct kfd_dev *kfd); - - static int kfd_resume(struct kfd_dev *kfd); - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) --void kfd_init_processes_srcu(void); --void kfd_cleanup_processes_srcu(void); --#endif -- - static const struct kfd_device_info *lookup_device_info(unsigned short did) - { - size_t i; -@@ -633,10 +628,6 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, - - kfd_ib_mem_init(kfd); - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) -- kfd_init_processes_srcu(); --#endif -- - if (kfd_resume(kfd)) { - dev_err(kfd_device, "Error resuming kfd\n"); - goto kfd_resume_error; -@@ -678,9 +669,6 @@ void 
kgd2kfd_device_exit(struct kfd_dev *kfd) - { - if (kfd->init_complete) { - kgd2kfd_suspend(kfd); --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) -- kfd_cleanup_processes_srcu(); --#endif - kfd_cwsr_fini(kfd); - device_queue_manager_uninit(kfd->dqm); - kfd_interrupt_exit(kfd); -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c -index 8debe6e..7eacf42 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c -@@ -24,10 +24,8 @@ - #include <linux/slab.h> - #include <linux/types.h> - #include <linux/uaccess.h> --#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) - #include <linux/sched/mm.h> - #include <linux/sched/signal.h> --#endif - #include <linux/mman.h> - #include <linux/memory.h> - #include "kfd_priv.h" -@@ -269,13 +267,7 @@ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id) - { - struct kfd_event *ev; - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- struct hlist_node *node; -- -- hash_for_each_possible(p->events, ev, node, events, id) --#else - hash_for_each_possible(p->events, ev, events, id) --#endif - if (ev->event_id == id) - return ev; - -@@ -420,13 +412,7 @@ static void destroy_events(struct kfd_process *p) - struct hlist_node *tmp; - unsigned int hash_bkt; - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- struct hlist_node *node; -- -- hash_for_each_safe(p->events, hash_bkt, node, tmp, ev, events) --#else - hash_for_each_safe(p->events, hash_bkt, tmp, ev, events) --#endif - destroy_event(p, ev); - } - -@@ -972,16 +958,9 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, - int bkt; - bool send_signal = true; - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- struct hlist_node *node; -- ev_data = (struct kfd_hsa_memory_exception_data *) event_data; -- -- hash_for_each(p->events, bkt, node, ev, events) --#else - ev_data = (struct kfd_hsa_memory_exception_data *) event_data; - - hash_for_each(p->events, bkt, ev, events) --#endif - if (ev->type == type) { - send_signal = false; - dev_dbg(kfd_device, -@@ -1114,9 +1093,6 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, - int bkt; - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); - struct kfd_hsa_memory_exception_data memory_exception_data; --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- struct hlist_node *node; --#endif - - if (!p) - return; /* Presumably process exited. 
*/ -@@ -1136,11 +1112,7 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, - } - mutex_lock(&p->event_mutex); - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- hash_for_each(p->events, bkt, node, ev, events) { --#else - hash_for_each(p->events, bkt, ev, events) { --#endif - if (ev->type == KFD_EVENT_TYPE_MEMORY) { - ev->memory_exception_data = memory_exception_data; - set_event(ev); -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c -index 4f4392a..47dcf4a 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c -@@ -61,11 +61,7 @@ int kfd_interrupt_init(struct kfd_dev *kfd) - return r; - } - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) -- kfd->ih_wq = create_rt_workqueue("KFD IH"); --#else - kfd->ih_wq = alloc_workqueue("KFD IH", WQ_HIGHPRI, 1); --#endif - spin_lock_init(&kfd->interrupt_lock); - - INIT_WORK(&kfd->interrupt_work, interrupt_wq); -@@ -115,15 +111,9 @@ bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry) - count = kfifo_in(&kfd->ih_fifo, ih_ring_entry, - kfd->device_info->ih_ring_entry_size); - if (count != kfd->device_info->ih_ring_entry_size) { --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) -- dev_err(kfd_chardev(), -- "Interrupt ring overflow, dropping interrupt %d\n", -- count); --#else - dev_err_ratelimited(kfd_chardev(), - "Interrupt ring overflow, dropping interrupt %d\n", - count); --#endif - return false; - } - -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c -index c6be3ba..e67eb9f 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c -@@ -192,21 +192,13 @@ int kfd_ipc_import_handle(struct kfd_dev *dev, struct kfd_process *p, - { - int r; - struct kfd_ipc_obj *entry, *found = NULL; --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- struct hlist_node *tmp_node; --#endif - - mutex_lock(&kfd_ipc_handles.lock); - /* Convert the user provided handle to hash key and search only in that - * bucket - */ --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- hlist_for_each_entry(entry, tmp_node, -- &kfd_ipc_handles.handles[HANDLE_TO_KEY(share_handle)], node) { --#else - hlist_for_each_entry(entry, - &kfd_ipc_handles.handles[HANDLE_TO_KEY(share_handle)], node) { --#endif - if (!memcmp(entry->share_handle, share_handle, - sizeof(entry->share_handle))) { - found = entry; -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c -index 64bf653..5724d33 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c -@@ -465,19 +465,15 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, - - static int debugfs_show_mqd(struct seq_file *m, void *data) - { --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct cik_mqd), false); --#endif - return 0; - } - - static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) - { --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct cik_sdma_rlc_registers), false); --#endif - return 0; - } - -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c -index 0713cac..6c302d2 100644 ---- 
a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c -@@ -455,19 +455,15 @@ static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, - - static int debugfs_show_mqd(struct seq_file *m, void *data) - { --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct v9_mqd), false); --#endif - return 0; - } - - static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) - { --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct v9_sdma_mqd), false); --#endif - return 0; - } - -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c -index a5ba6f7..5c26e5a 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c -@@ -468,19 +468,15 @@ static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, - - static int debugfs_show_mqd(struct seq_file *m, void *data) - { --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct vi_mqd), false); --#endif - return 0; - } - - static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) - { --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct vi_sdma_mqd), false); --#endif - return 0; - } - -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c -index 9fcb6fb..7cca7b4 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c -@@ -410,10 +410,8 @@ int pm_debugfs_runlist(struct seq_file *m, void *data) - return 0; - } - --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - pm->ib_buffer_obj->cpu_ptr, pm->ib_size_bytes, false); --#endif - - return 0; - } -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h -old mode 100644 -new mode 100755 -index ebe311e..88fdfc9 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h -@@ -36,11 +36,7 @@ - #include <linux/interval_tree.h> - #include <linux/seq_file.h> - #include <linux/kref.h> --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) --#include <linux/kfifo-new.h> --#else - #include <linux/kfifo.h> --#endif - #include <kgd_kfd_interface.h> - - #include <drm/amd_rdma.h> -@@ -727,7 +723,7 @@ struct kfd_process { - size_t signal_event_count; - bool signal_event_limit_reached; - -- struct rb_root bo_interval_tree; -+ struct rb_root_cached bo_interval_tree; - - /* Information used for memory eviction */ - void *process_info; -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c -index b458995..c798fa3 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c -@@ -23,10 +23,8 @@ - #include <linux/mutex.h> - #include <linux/log2.h> - #include <linux/sched.h> --#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) - #include <linux/sched/mm.h> - #include <linux/sched/task.h> --#endif - #include <linux/slab.h> - #if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - #include <linux/amd-iommu.h> -@@ 
-50,20 +48,7 @@ struct mm_struct; - static DEFINE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE); - static DEFINE_MUTEX(kfd_processes_mutex); - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) --static struct srcu_struct kfd_processes_srcu; --void kfd_init_processes_srcu(void) --{ -- init_srcu_struct(&kfd_processes_srcu); --} -- --void kfd_cleanup_processes_srcu(void) --{ -- cleanup_srcu_struct(&kfd_processes_srcu); --} --#else - DEFINE_STATIC_SRCU(kfd_processes_srcu); --#endif - - static struct workqueue_struct *kfd_process_wq; - -@@ -81,11 +66,7 @@ static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep); - void kfd_process_create_wq(void) - { - if (!kfd_process_wq) --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) -- kfd_process_wq = create_workqueue("kfd_process_wq"); --#else - kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0); --#endif - } - - void kfd_process_destroy_wq(void) -@@ -273,15 +254,8 @@ static struct kfd_process *find_process_by_mm(const struct mm_struct *mm) - { - struct kfd_process *process; - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- struct hlist_node *node; -- -- hash_for_each_possible_rcu(kfd_processes_table, process, node, -- kfd_processes, (uintptr_t)mm) --#else - hash_for_each_possible_rcu(kfd_processes_table, process, - kfd_processes, (uintptr_t)mm) --#endif - if (process->mm == mm) - return process; - -@@ -586,7 +560,7 @@ static struct kfd_process *create_process(const struct task_struct *thread, - if (!process) - goto err_alloc_process; - -- process->bo_interval_tree = RB_ROOT; -+ process->bo_interval_tree = RB_ROOT_CACHED; - - process->pasid = kfd_pasid_alloc(); - if (process->pasid == 0) -@@ -1026,13 +1000,7 @@ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) - - int idx = srcu_read_lock(&kfd_processes_srcu); - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- struct hlist_node *node; -- -- hash_for_each_rcu(kfd_processes_table, temp, node, p, kfd_processes) { --#else - hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { --#endif - if (p->pasid == pasid) { - kref_get(&p->ref); - ret_p = p; -@@ -1051,13 +1019,7 @@ void kfd_suspend_all_processes(void) - unsigned int temp; - int idx = srcu_read_lock(&kfd_processes_srcu); - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- struct hlist_node *node; -- -- hash_for_each_rcu(kfd_processes_table, temp, node, p, kfd_processes) { --#else - hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { --#endif - if (cancel_delayed_work_sync(&p->eviction_work.dwork)) - dma_fence_put(p->eviction_work.quiesce_fence); - cancel_delayed_work_sync(&p->restore_work); -@@ -1077,13 +1039,7 @@ int kfd_resume_all_processes(void) - unsigned int temp; - int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu); - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- struct hlist_node *node; -- -- hash_for_each_rcu(kfd_processes_table, temp, node, p, kfd_processes) { --#else - hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { --#endif - if (!schedule_delayed_work(&p->restore_work, 0)) { - pr_err("Restore process %d failed during resume\n", - p->pasid); -@@ -1171,13 +1127,7 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data) - - int idx = srcu_read_lock(&kfd_processes_srcu); - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- struct hlist_node *node; -- -- hash_for_each_rcu(kfd_processes_table, temp, node, p, kfd_processes) { --#else - hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { --#endif - 
seq_printf(m, "Process %d PASID %d:\n", - p->lead_thread->tgid, p->pasid); - -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c -index ffd8e0f..d08e3de 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c -@@ -122,9 +122,7 @@ static void kfd_release_topology_device(struct kfd_topology_device *dev) - struct kfd_mem_properties *mem; - struct kfd_cache_properties *cache; - struct kfd_iolink_properties *iolink; --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - struct kfd_perf_properties *perf; --#endif - - list_del(&dev->list); - -@@ -149,14 +147,12 @@ static void kfd_release_topology_device(struct kfd_topology_device *dev) - kfree(iolink); - } - --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - while (dev->perf_props.next != &dev->perf_props) { - perf = container_of(dev->perf_props.next, - struct kfd_perf_properties, list); - list_del(&perf->list); - kfree(perf); - } --#endif - - kfree(dev); - } -@@ -192,9 +188,7 @@ struct kfd_topology_device *kfd_create_topology_device( - INIT_LIST_HEAD(&dev->mem_props); - INIT_LIST_HEAD(&dev->cache_props); - INIT_LIST_HEAD(&dev->io_link_props); --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - INIT_LIST_HEAD(&dev->perf_props); --#endif - - list_add_tail(&dev->list, device_list); - -@@ -374,7 +368,6 @@ static struct kobj_type cache_type = { - .sysfs_ops = &cache_ops, - }; - --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - /****** Sysfs of Performance Counters ******/ - - struct kfd_perf_attr { -@@ -407,7 +400,6 @@ static struct kfd_perf_attr perf_attr_iommu[] = { - KFD_PERF_DESC(counter_ids, 0), - }; - /****************************************/ --#endif - - static ssize_t node_show(struct kobject *kobj, struct attribute *attr, - char *buffer) -@@ -546,9 +538,7 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) - struct kfd_iolink_properties *iolink; - struct kfd_cache_properties *cache; - struct kfd_mem_properties *mem; --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - struct kfd_perf_properties *perf; --#endif - - if (dev->kobj_iolink) { - list_for_each_entry(iolink, &dev->io_link_props, list) -@@ -590,7 +580,6 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) - dev->kobj_mem = NULL; - } - --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - if (dev->kobj_perf) { - list_for_each_entry(perf, &dev->perf_props, list) { - kfree(perf->attr_group); -@@ -600,7 +589,6 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) - kobject_put(dev->kobj_perf); - dev->kobj_perf = NULL; - } --#endif - - if (dev->kobj_node) { - sysfs_remove_file(dev->kobj_node, &dev->attr_gpuid); -@@ -618,11 +606,9 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, - struct kfd_iolink_properties *iolink; - struct kfd_cache_properties *cache; - struct kfd_mem_properties *mem; --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - struct kfd_perf_properties *perf; - uint32_t num_attrs; - struct attribute **attrs; --#endif - int ret; - uint32_t i; - -@@ -653,11 +639,9 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, - if (!dev->kobj_iolink) - return -ENOMEM; - --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - dev->kobj_perf = kobject_create_and_add("perf", dev->kobj_node); - if (!dev->kobj_perf) - return -ENOMEM; --#endif - - /* - * Creating sysfs files for node properties -@@ -749,7 +733,6 @@ static int kfd_build_sysfs_node_entry(struct 
kfd_topology_device *dev, - i++; - } - --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - /* All hardware blocks have the same number of attributes. */ - num_attrs = sizeof(perf_attr_iommu)/sizeof(struct kfd_perf_attr); - list_for_each_entry(perf, &dev->perf_props, list) { -@@ -775,7 +758,6 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, - if (ret < 0) - return ret; - } --#endif - - return 0; - } -@@ -942,7 +924,6 @@ static void find_system_memory(const struct dmi_header *dm, - } - } - --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - /* - * Performance counters information is not part of CRAT but we would like to - * put them in the sysfs under topology directory for Thunk to get the data. -@@ -966,7 +947,6 @@ static int kfd_add_perf_to_topology(struct kfd_topology_device *kdev) - - return 0; - } --#endif - - /* kfd_add_non_crat_information - Add information that is not currently - * defined in CRAT but is necessary for KFD topology -@@ -1074,11 +1054,9 @@ int kfd_topology_init(void) - } - } - --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - kdev = list_first_entry(&temp_topology_device_list, - struct kfd_topology_device, list); - kfd_add_perf_to_topology(kdev); --#endif - - down_write(&topology_lock); - kfd_topology_update_device_list(&temp_topology_device_list, -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h -index b59b32c..f22d420 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h -@@ -141,14 +141,12 @@ struct kfd_iolink_properties { - struct attribute attr; - }; - --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - struct kfd_perf_properties { - struct list_head list; - char block_name[16]; - uint32_t max_concurrent; - struct attribute_group *attr_group; - }; --#endif - - struct kfd_topology_device { - struct list_head list; -@@ -160,17 +158,13 @@ struct kfd_topology_device { - struct list_head cache_props; - uint32_t io_link_count; - struct list_head io_link_props; --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - struct list_head perf_props; --#endif - struct kfd_dev *gpu; - struct kobject *kobj_node; - struct kobject *kobj_mem; - struct kobject *kobj_cache; - struct kobject *kobj_iolink; --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - struct kobject *kobj_perf; --#endif - struct attribute attr_gpuid; - struct attribute attr_name; - struct attribute attr_props; -diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c -index 2780641..977b21b 100644 ---- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c -+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c -@@ -707,11 +707,7 @@ static int dm_display_resume(struct drm_device *ddev) - - err: - DRM_ERROR("Restoring old state failed with %i\n", ret); --#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) -- drm_atomic_state_free(state); --#else - drm_atomic_state_put(state); --#endif - - return ret; - } -diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h -old mode 100644 -new mode 100755 -index 36f3766..b6cf2d5 ---- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h -+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h -@@ -30,6 +30,7 @@ - - #include <linux/types.h> - #include <linux/bitmap.h> -+#include <linux/dma-buf.h> - - struct pci_dev; - -@@ -40,6 +41,46 @@ struct kfd_dev; - struct kgd_dev; - - struct kgd_mem; -+struct kfd_process_device; -+struct amdgpu_bo; -+ 
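[Editor's sketch] One note on the amdgpu_dm hunk above: struct drm_atomic_state became reference-counted in kernel 4.10, so the error path now drops its reference with drm_atomic_state_put() rather than calling the removed drm_atomic_state_free(). A hedged sketch of the resulting lifetime rule (hypothetical helper; state construction and locking elided):

    #include <drm/drmP.h>
    #include <drm/drm_atomic.h>

    static void example_commit_and_release(struct drm_atomic_state *state)
    {
            int ret = drm_atomic_commit(state);

            if (ret)
                    DRM_ERROR("Restoring old state failed with %i\n", ret);

            /* The caller still owns one reference; this put may free it. */
            drm_atomic_state_put(state);
    }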
-+enum kfd_preempt_type { -+ KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN = 0, -+ KFD_PREEMPT_TYPE_WAVEFRONT_RESET, -+}; -+ -+struct kfd_vm_fault_info { -+ uint64_t page_addr; -+ uint32_t vmid; -+ uint32_t mc_id; -+ uint32_t status; -+ bool prot_valid; -+ bool prot_read; -+ bool prot_write; -+ bool prot_exec; -+}; -+ -+struct kfd_cu_info { -+ uint32_t num_shader_engines; -+ uint32_t num_shader_arrays_per_engine; -+ uint32_t num_cu_per_sh; -+ uint32_t cu_active_number; -+ uint32_t cu_ao_mask; -+ uint32_t simd_per_cu; -+ uint32_t max_waves_per_simd; -+ uint32_t wave_front_size; -+ uint32_t max_scratch_slots_per_cu; -+ uint32_t lds_size; -+ uint32_t cu_bitmap[4][4]; -+}; -+ -+/* For getting GPU local memory information from KGD */ -+struct kfd_local_mem_info { -+ uint64_t local_mem_size_private; -+ uint64_t local_mem_size_public; -+ uint32_t vram_width; -+ uint32_t mem_clk_max; -+}; - - enum kgd_memory_pool { - KGD_POOL_SYSTEM_CACHEABLE = 1, -@@ -72,6 +113,21 @@ struct kgd2kfd_shared_resources { - /* Bit n == 1 means Queue n is available for KFD */ - DECLARE_BITMAP(queue_bitmap, KGD_MAX_QUEUES); - -+ /* Doorbell assignments (SOC15 and later chips only). Only -+ * specific doorbells are routed to each SDMA engine. Others -+ * are routed to IH and VCN. They are not usable by the CP. -+ * -+ * Any doorbell number D that satisfies the following condition -+ * is reserved: (D & reserved_doorbell_mask) == reserved_doorbell_val -+ * -+ * KFD currently uses 1024 (= 0x3ff) doorbells per process. If -+ * doorbells 0x0f0-0x0f7 and 0x2f-0x2f7 are reserved, that means -+ * mask would be set to 0x1f8 and val set to 0x0f0. -+ */ -+ unsigned int sdma_doorbell[2][2]; -+ unsigned int reserved_doorbell_mask; -+ unsigned int reserved_doorbell_val; -+ - /* Base address of doorbell aperture. */ - phys_addr_t doorbell_physical_address; - -@@ -80,8 +136,41 @@ struct kgd2kfd_shared_resources { - - /* Number of bytes at start of aperture reserved for KGD. */ - size_t doorbell_start_offset; -+ -+ /* GPUVM address space size in bytes */ -+ uint64_t gpuvm_size; - }; - -+struct tile_config { -+ uint32_t *tile_config_ptr; -+ uint32_t *macro_tile_config_ptr; -+ uint32_t num_tile_configs; -+ uint32_t num_macro_tile_configs; -+ -+ uint32_t gb_addr_config; -+ uint32_t num_banks; -+ uint32_t num_ranks; -+}; -+ -+/* -+ * Allocation flag domains currently only VRAM and GTT domain supported -+ */ -+#define ALLOC_MEM_FLAGS_VRAM (1 << 0) -+#define ALLOC_MEM_FLAGS_GTT (1 << 1) -+#define ALLOC_MEM_FLAGS_USERPTR (1 << 2) -+#define ALLOC_MEM_FLAGS_DOORBELL (1 << 3) -+ -+/* -+ * Allocation flags attributes/access options. -+ */ -+#define ALLOC_MEM_FLAGS_NONPAGED (1 << 31) -+#define ALLOC_MEM_FLAGS_READONLY (1 << 30) -+#define ALLOC_MEM_FLAGS_PUBLIC (1 << 29) -+#define ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28) -+#define ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27) -+#define ALLOC_MEM_FLAGS_EXECUTE_ACCESS (1 << 26) -+#define ALLOC_MEM_FLAGS_COHERENT (1 << 25) -+ - /** - * struct kfd2kgd_calls - * -@@ -90,7 +179,7 @@ struct kgd2kfd_shared_resources { - * - * @free_gtt_mem: Frees a buffer that was allocated on the gart aperture - * -- * @get_vmem_size: Retrieves (physical) size of VRAM -+ * @get_local_mem_info: Retrieves information about GPU local memory - * - * @get_gpu_clock_counter: Retrieves GPU clock counter - * -@@ -112,6 +201,12 @@ struct kgd2kfd_shared_resources { - * @hqd_sdma_load: Loads the SDMA mqd structure to a H/W SDMA hqd slot. - * used only for no HWS mode. - * -+ * @hqd_dump: Dumps CPC HQD registers to an array of address-value pairs. 
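[Editor's sketch] The reserved-doorbell scheme documented above reduces to a single mask test. With the quoted example values (mask 0x1f8, val 0x0f0), the reserved doorbells within a 1024-doorbell process are 0x0f0-0x0f7 and 0x2f0-0x2f7 (the second range above reads like a typo for 0x2f0-0x2f7), since bit 9 falls outside the mask. A small illustrative predicate:

    #include <linux/types.h>

    /* Doorbell D is reserved iff (D & reserved_doorbell_mask) ==
     * reserved_doorbell_val, exactly as documented above.
     */
    static inline bool doorbell_is_reserved(unsigned int d,
                                            unsigned int mask,
                                            unsigned int val)
    {
            return (d & mask) == val;
    }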
-+ * Array is allocated with kmalloc, needs to be freed with kfree by caller.
-+ *
-+ * @hqd_sdma_dump: Dumps SDMA HQD registers to an array of address-value pairs.
-+ * Array is allocated with kmalloc, needs to be freed with kfree by caller.
-+ *
- * @hqd_is_occupied: Checks if a hqd slot is occupied.
- *
- * @hqd_destroy: Destructs and preempts the queue assigned to that hqd slot.
-@@ -121,8 +216,34 @@ struct kgd2kfd_shared_resources {
- * @hqd_sdma_destroy: Destructs and preempts the SDMA queue assigned to that
- * SDMA hqd slot.
- *
-+ * @map_memory_to_gpu: Allocates and pins BO, PD and all related PTs
-+ *
-+ * @unmap_memory_to_gpu: Releases and unpins BO, PD and all related PTs
-+ *
- * @get_fw_version: Returns FW versions from the header
- *
-+ * @set_num_of_requests: Sets number of Peripheral Page Request (PPR) sent to
-+ * IOMMU when address translation failed
-+ *
-+ * @get_cu_info: Retrieves activated cu info
-+ *
-+ * @get_dmabuf_info: Returns information about a dmabuf if it was
-+ * created by the GPU driver
-+ *
-+ * @import_dmabuf: Imports a DMA buffer, creating a new kgd_mem object
-+ * Supports only DMA buffers created by GPU driver on the same GPU
-+ *
-+ * @export_dmabuf: Exports a KFD BO for sharing with other processes
-+ *
-+ * @submit_ib: Submits an IB to the engine specified by inserting the IB into
-+ * the corresponding ring (ring type).
-+ *
-+ * @restore_process_bos: Restores all BOs that belong to the process
-+ *
-+ * @copy_mem_to_mem: Copies size bytes from source BO to destination BO
-+ *
-+ * @get_vram_usage: Returns current VRAM usage
-+ *
- * This structure contains function pointers to services that the kgd driver
- * provides to the amdkfd driver.
- *
-@@ -134,11 +255,23 @@ struct kfd2kgd_calls {
-
- void (*free_gtt_mem)(struct kgd_dev *kgd, void *mem_obj);
-
-- uint64_t (*get_vmem_size)(struct kgd_dev *kgd);
-+ void(*get_local_mem_info)(struct kgd_dev *kgd,
-+ struct kfd_local_mem_info *mem_info);
- uint64_t (*get_gpu_clock_counter)(struct kgd_dev *kgd);
-
- uint32_t (*get_max_engine_clock_in_mhz)(struct kgd_dev *kgd);
-
-+ int (*create_process_vm)(struct kgd_dev *kgd, void **vm,
-+ void **process_info, struct dma_fence **ef);
-+ void (*destroy_process_vm)(struct kgd_dev *kgd, void *vm);
-+
-+ int (*create_process_gpumem)(struct kgd_dev *kgd, uint64_t va, size_t size, void *vm, struct kgd_mem **mem);
-+ void (*destroy_process_gpumem)(struct kgd_dev *kgd, struct kgd_mem *mem);
-+
-+ uint32_t (*get_process_page_dir)(void *vm);
-+
-+ int (*open_graphic_handle)(struct kgd_dev *kgd, uint64_t va, void *vm, int fd, uint32_t handle, struct kgd_mem **mem);
-+
- /* Register access functions */
- void (*program_sh_mem_settings)(struct kgd_dev *kgd, uint32_t vmid,
- uint32_t sh_mem_config, uint32_t sh_mem_ape1_base,
-@@ -151,16 +284,28 @@ struct kfd2kgd_calls {
- uint32_t hpd_size, uint64_t hpd_gpu_addr);
-
- int (*init_interrupts)(struct kgd_dev *kgd, uint32_t pipe_id);
-+
-
- int (*hqd_load)(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
-- uint32_t queue_id, uint32_t __user *wptr);
-+ uint32_t queue_id, uint32_t __user *wptr,
-+ uint32_t wptr_shift, uint32_t wptr_mask,
-+ struct mm_struct *mm);
-+
-+ int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd,
-+ uint32_t __user *wptr, struct mm_struct *mm);
-+
-+ int (*hqd_dump)(struct kgd_dev *kgd,
-+ uint32_t pipe_id, uint32_t queue_id,
-+ uint32_t (**dump)[2], uint32_t *n_regs);
-
-- int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd);
-+ int (*hqd_sdma_dump)(struct kgd_dev *kgd,
-+ uint32_t engine_id, uint32_t
queue_id, -+ uint32_t (**dump)[2], uint32_t *n_regs); - - bool (*hqd_is_occupied)(struct kgd_dev *kgd, uint64_t queue_address, - uint32_t pipe_id, uint32_t queue_id); - -- int (*hqd_destroy)(struct kgd_dev *kgd, uint32_t reset_type, -+ int (*hqd_destroy)(struct kgd_dev *kgd, void *mqd, uint32_t reset_type, - unsigned int timeout, uint32_t pipe_id, - uint32_t queue_id); - -@@ -168,7 +313,7 @@ struct kfd2kgd_calls { - - int (*hqd_sdma_destroy)(struct kgd_dev *kgd, void *mqd, - unsigned int timeout); -- -+ - int (*address_watch_disable)(struct kgd_dev *kgd); - int (*address_watch_execute)(struct kgd_dev *kgd, - unsigned int watch_point_id, -@@ -187,11 +332,72 @@ struct kfd2kgd_calls { - uint16_t (*get_atc_vmid_pasid_mapping_pasid)( - struct kgd_dev *kgd, - uint8_t vmid); -+ uint32_t (*read_vmid_from_vmfault_reg)(struct kgd_dev *kgd); - void (*write_vmid_invalidate_request)(struct kgd_dev *kgd, - uint8_t vmid); - -+ int (*invalidate_tlbs)(struct kgd_dev *kgd, uint16_t pasid); -+ -+ int (*sync_memory)(struct kgd_dev *kgd, struct kgd_mem *mem, bool intr); -+ -+ int (*alloc_memory_of_gpu)(struct kgd_dev *kgd, uint64_t va, -+ uint64_t size, void *vm, -+ struct kgd_mem **mem, uint64_t *offset, -+ uint32_t flags); -+ int (*free_memory_of_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem, -+ void *vm); -+ int (*map_memory_to_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem, -+ void *vm); -+ int (*unmap_memory_to_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem, -+ void *vm); -+ - uint16_t (*get_fw_version)(struct kgd_dev *kgd, - enum kgd_engine_type type); -+ -+ void (*set_num_of_requests)(struct kgd_dev *kgd, -+ uint8_t num_of_requests); -+ int (*alloc_memory_of_scratch)(struct kgd_dev *kgd, -+ uint64_t va, uint32_t vmid); -+ int (*write_config_static_mem)(struct kgd_dev *kgd, bool swizzle_enable, -+ uint8_t element_size, uint8_t index_stride, uint8_t mtype); -+ void (*get_cu_info)(struct kgd_dev *kgd, -+ struct kfd_cu_info *cu_info); -+ int (*mmap_bo)(struct kgd_dev *kgd, struct vm_area_struct *vma); -+ int (*map_gtt_bo_to_kernel)(struct kgd_dev *kgd, -+ struct kgd_mem *mem, void **kptr); -+ void (*set_vm_context_page_table_base)(struct kgd_dev *kgd, uint32_t vmid, -+ uint32_t page_table_base); -+ -+ int (*pin_get_sg_table_bo)(struct kgd_dev *kgd, -+ struct kgd_mem *mem, uint64_t offset, -+ uint64_t size, struct sg_table **ret_sg); -+ void (*unpin_put_sg_table_bo)(struct kgd_mem *mem, -+ struct sg_table *sg); -+ -+ int (*get_dmabuf_info)(struct kgd_dev *kgd, int dma_buf_fd, -+ struct kgd_dev **dma_buf_kgd, uint64_t *bo_size, -+ void *metadata_buffer, size_t buffer_size, -+ uint32_t *metadata_size, uint32_t *flags); -+ int (*import_dmabuf)(struct kgd_dev *kgd, struct dma_buf *dmabuf, -+ uint64_t va, void *vm, struct kgd_mem **mem, -+ uint64_t *size, uint64_t *mmap_offset); -+ int (*export_dmabuf)(struct kgd_dev *kgd, void *vm, struct kgd_mem *mem, -+ struct dma_buf **dmabuf); -+ -+ int (*get_vm_fault_info)(struct kgd_dev *kgd, -+ struct kfd_vm_fault_info *info); -+ int (*submit_ib)(struct kgd_dev *kgd, enum kgd_engine_type engine, -+ uint32_t vmid, uint64_t gpu_addr, -+ uint32_t *ib_cmd, uint32_t ib_len); -+ int (*get_tile_config)(struct kgd_dev *kgd, -+ struct tile_config *config); -+ -+ int (*restore_process_bos)(void *process_info, struct dma_fence **ef); -+ int (*copy_mem_to_mem)(struct kgd_dev *kgd, struct kgd_mem *src_mem, -+ uint64_t src_offset, struct kgd_mem *dst_mem, -+ uint64_t dest_offset, uint64_t size, -+ struct dma_fence **f, uint64_t *actual_size); -+ uint64_t (*get_vram_usage)(struct kgd_dev 
*kgd); - }; - - /** -@@ -210,6 +416,13 @@ struct kfd2kgd_calls { - * - * @resume: Notifies amdkfd about a resume action done to a kgd device - * -+ * @quiesce_mm: Quiesce all user queue access to specified MM address space -+ * -+ * @resume_mm: Resume user queue access to specified MM address space -+ * -+ * @schedule_evict_and_restore_process: Schedules work queue that will prepare -+ * for safe eviction of KFD BOs that belong to the specified process. -+ * - * This structure contains function callback pointers so the kgd driver - * will notify to the amdkfd about certain status changes. - * -@@ -224,9 +437,13 @@ struct kgd2kfd_calls { - void (*interrupt)(struct kfd_dev *kfd, const void *ih_ring_entry); - void (*suspend)(struct kfd_dev *kfd); - int (*resume)(struct kfd_dev *kfd); -+ int (*quiesce_mm)(struct kfd_dev *kfd, struct mm_struct *mm); -+ int (*resume_mm)(struct kfd_dev *kfd, struct mm_struct *mm); -+ int (*schedule_evict_and_restore_process)(struct mm_struct *mm, -+ struct dma_fence *fence); - }; - - int kgd2kfd_init(unsigned interface_version, - const struct kgd2kfd_calls **g2f); - --#endif /* KGD_KFD_INTERFACE_H_INCLUDED */ -+#endif /* KGD_KFD_INTERFACE_H_INCLUDED */ -diff --git a/drivers/gpu/drm/amd/include/v9_structs.h b/drivers/gpu/drm/amd/include/v9_structs.h -old mode 100644 -new mode 100755 -index 2fb25ab..ceaf493 ---- a/drivers/gpu/drm/amd/include/v9_structs.h -+++ b/drivers/gpu/drm/amd/include/v9_structs.h -@@ -29,10 +29,10 @@ struct v9_sdma_mqd { - uint32_t sdmax_rlcx_rb_base; - uint32_t sdmax_rlcx_rb_base_hi; - uint32_t sdmax_rlcx_rb_rptr; -+ uint32_t sdmax_rlcx_rb_rptr_hi; - uint32_t sdmax_rlcx_rb_wptr; -+ uint32_t sdmax_rlcx_rb_wptr_hi; - uint32_t sdmax_rlcx_rb_wptr_poll_cntl; -- uint32_t sdmax_rlcx_rb_wptr_poll_addr_hi; -- uint32_t sdmax_rlcx_rb_wptr_poll_addr_lo; - uint32_t sdmax_rlcx_rb_rptr_addr_hi; - uint32_t sdmax_rlcx_rb_rptr_addr_lo; - uint32_t sdmax_rlcx_ib_cntl; -@@ -44,29 +44,29 @@ struct v9_sdma_mqd { - uint32_t sdmax_rlcx_skip_cntl; - uint32_t sdmax_rlcx_context_status; - uint32_t sdmax_rlcx_doorbell; -- uint32_t sdmax_rlcx_virtual_addr; -- uint32_t sdmax_rlcx_ape1_cntl; -+ uint32_t sdmax_rlcx_status; - uint32_t sdmax_rlcx_doorbell_log; -- uint32_t reserved_22; -- uint32_t reserved_23; -- uint32_t reserved_24; -- uint32_t reserved_25; -- uint32_t reserved_26; -- uint32_t reserved_27; -- uint32_t reserved_28; -- uint32_t reserved_29; -- uint32_t reserved_30; -- uint32_t reserved_31; -- uint32_t reserved_32; -- uint32_t reserved_33; -- uint32_t reserved_34; -- uint32_t reserved_35; -- uint32_t reserved_36; -- uint32_t reserved_37; -- uint32_t reserved_38; -- uint32_t reserved_39; -- uint32_t reserved_40; -- uint32_t reserved_41; -+ uint32_t sdmax_rlcx_watermark; -+ uint32_t sdmax_rlcx_doorbell_offset; -+ uint32_t sdmax_rlcx_csa_addr_lo; -+ uint32_t sdmax_rlcx_csa_addr_hi; -+ uint32_t sdmax_rlcx_ib_sub_remain; -+ uint32_t sdmax_rlcx_preempt; -+ uint32_t sdmax_rlcx_dummy_reg; -+ uint32_t sdmax_rlcx_rb_wptr_poll_addr_hi; -+ uint32_t sdmax_rlcx_rb_wptr_poll_addr_lo; -+ uint32_t sdmax_rlcx_rb_aql_cntl; -+ uint32_t sdmax_rlcx_minor_ptr_update; -+ uint32_t sdmax_rlcx_midcmd_data0; -+ uint32_t sdmax_rlcx_midcmd_data1; -+ uint32_t sdmax_rlcx_midcmd_data2; -+ uint32_t sdmax_rlcx_midcmd_data3; -+ uint32_t sdmax_rlcx_midcmd_data4; -+ uint32_t sdmax_rlcx_midcmd_data5; -+ uint32_t sdmax_rlcx_midcmd_data6; -+ uint32_t sdmax_rlcx_midcmd_data7; -+ uint32_t sdmax_rlcx_midcmd_data8; -+ uint32_t sdmax_rlcx_midcmd_cntl; - uint32_t reserved_42; - uint32_t reserved_43; - 
uint32_t reserved_44; -diff --git a/drivers/gpu/drm/amd/include/vi_structs.h b/drivers/gpu/drm/amd/include/vi_structs.h -old mode 100644 -new mode 100755 -index 2023482..717fbae ---- a/drivers/gpu/drm/amd/include/vi_structs.h -+++ b/drivers/gpu/drm/amd/include/vi_structs.h -@@ -153,6 +153,8 @@ struct vi_sdma_mqd { - uint32_t reserved_125; - uint32_t reserved_126; - uint32_t reserved_127; -+ uint32_t sdma_engine_id; -+ uint32_t sdma_queue_id; - }; - - struct vi_mqd { -diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c -old mode 100644 -new mode 100755 -index 2292462..82d97f3 ---- a/drivers/pci/pci.c -+++ b/drivers/pci/pci.c -@@ -2983,6 +2983,87 @@ bool pci_acs_path_enabled(struct pci_dev *start, - } - - /** -+ * pci_enable_atomic_ops_to_root - enable AtomicOp requests to root port -+ * @dev: the PCI device -+ * -+ * Return 0 if the device is capable of generating AtomicOp requests, -+ * all upstream bridges support AtomicOp routing, egress blocking is disabled -+ * on all upstream ports, and the root port supports 32-bit, 64-bit and/or -+ * 128-bit AtomicOp completion, or negative otherwise. -+ */ -+int pci_enable_atomic_ops_to_root(struct pci_dev *dev) -+{ -+ struct pci_bus *bus = dev->bus; -+ -+ if (!pci_is_pcie(dev)) -+ return -EINVAL; -+ -+ switch (pci_pcie_type(dev)) { -+ /* -+ * PCIe 3.0, 6.15 specifies that endpoints and root ports are permitted -+ * to implement AtomicOp requester capabilities. -+ */ -+ case PCI_EXP_TYPE_ENDPOINT: -+ case PCI_EXP_TYPE_LEG_END: -+ case PCI_EXP_TYPE_RC_END: -+ break; -+ default: -+ return -EINVAL; -+ } -+ -+ while (bus->parent) { -+ struct pci_dev *bridge = bus->self; -+ u32 cap; -+ -+ pcie_capability_read_dword(bridge, PCI_EXP_DEVCAP2, &cap); -+ -+ switch (pci_pcie_type(bridge)) { -+ /* -+ * Upstream, downstream and root ports may implement AtomicOp -+ * routing capabilities. AtomicOp routing via a root port is -+ * not considered. -+ */ -+ case PCI_EXP_TYPE_UPSTREAM: -+ case PCI_EXP_TYPE_DOWNSTREAM: -+ if (!(cap & PCI_EXP_DEVCAP2_ATOMIC_ROUTE)) -+ return -EINVAL; -+ break; -+ -+ /* -+ * Root ports are permitted to implement AtomicOp completion -+ * capabilities. -+ */ -+ case PCI_EXP_TYPE_ROOT_PORT: -+ if (!(cap & (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | -+ PCI_EXP_DEVCAP2_ATOMIC_COMP64 | -+ PCI_EXP_DEVCAP2_ATOMIC_COMP128))) -+ return -EINVAL; -+ break; -+ } -+ -+ /* -+ * Upstream ports may block AtomicOps on egress. -+ */ -+ if (pci_pcie_type(bridge) == PCI_EXP_TYPE_UPSTREAM) { -+ u32 ctl2; -+ -+ pcie_capability_read_dword(bridge, PCI_EXP_DEVCTL2, -+ &ctl2); -+ if (ctl2 & PCI_EXP_DEVCTL2_ATOMIC_BLOCK) -+ return -EINVAL; -+ } -+ -+ bus = bus->parent; -+ } -+ -+ pcie_capability_set_word(dev, PCI_EXP_DEVCTL2, -+ PCI_EXP_DEVCTL2_ATOMIC_REQ); -+ -+ return 0; -+} -+EXPORT_SYMBOL(pci_enable_atomic_ops_to_root); -+ -+/** - * pci_swizzle_interrupt_pin - swizzle INTx for device behind bridge - * @dev: the PCI device - * @pin: the INTx pin (1=INTA, 2=INTB, 3=INTC, 4=INTD) -diff --git a/include/drm/amd_rdma.h b/include/drm/amd_rdma.h -new file mode 100644 -index 0000000..b0cab3c ---- /dev/null -+++ b/include/drm/amd_rdma.h -@@ -0,0 +1,70 @@ -+/* -+ * Copyright 2015 Advanced Micro Devices, Inc. 
-+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. -+ */ -+ -+/* @file This file defined kernel interfaces to communicate with amdkfd */ -+ -+#ifndef AMD_RDMA_H_ -+#define AMD_RDMA_H_ -+ -+ -+/** -+ * Structure describing information needed to P2P access from another device -+ * to specific location of GPU memory -+ */ -+struct amd_p2p_info { -+ uint64_t va; /**< Specify user virt. address -+ * which this page table -+ * described -+ */ -+ uint64_t size; /**< Specify total size of -+ * allocation -+ */ -+ struct pid *pid; /**< Specify process pid to which -+ * virtual address belongs -+ */ -+ struct sg_table *pages; /**< Specify DMA/Bus addresses */ -+ void *priv; /**< Pointer set by AMD kernel -+ * driver -+ */ -+}; -+ -+/** -+ * Structure providing function pointers to support rdma/p2p requirements. 
-+ * to specific location of GPU memory -+ */ -+struct amd_rdma_interface { -+ int (*get_pages)(uint64_t address, uint64_t length, struct pid *pid, -+ struct amd_p2p_info **amd_p2p_data, -+ void (*free_callback)(void *client_priv), -+ void *client_priv); -+ int (*put_pages)(struct amd_p2p_info **amd_p2p_data); -+ int (*is_gpu_address)(uint64_t address, struct pid *pid); -+ int (*get_page_size)(uint64_t address, uint64_t length, struct pid *pid, -+ unsigned long *page_size); -+}; -+ -+ -+int amdkfd_query_rdma_interface(const struct amd_rdma_interface **rdma); -+ -+ -+#endif /* AMD_RDMA_H_ */ -+ -diff --git a/include/linux/pci.h b/include/linux/pci.h -old mode 100644 -new mode 100755 -index b1abbcc..3df545d ---- a/include/linux/pci.h -+++ b/include/linux/pci.h -@@ -2072,6 +2072,7 @@ void pci_request_acs(void); - bool pci_acs_enabled(struct pci_dev *pdev, u16 acs_flags); - bool pci_acs_path_enabled(struct pci_dev *start, - struct pci_dev *end, u16 acs_flags); -+int pci_enable_atomic_ops_to_root(struct pci_dev *dev); - - #define PCI_VPD_LRDT 0x80 /* Large Resource Data Type */ - #define PCI_VPD_LRDT_ID(x) ((x) | PCI_VPD_LRDT) -diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h -index 5bb2b45..de5367c 100644 ---- a/include/uapi/linux/kfd_ioctl.h -+++ b/include/uapi/linux/kfd_ioctl.h -@@ -23,15 +23,15 @@ - #ifndef KFD_IOCTL_H_INCLUDED - #define KFD_IOCTL_H_INCLUDED - --#include <drm/drm.h> -+#include <linux/types.h> - #include <linux/ioctl.h> - - #define KFD_IOCTL_MAJOR_VERSION 1 --#define KFD_IOCTL_MINOR_VERSION 1 -+#define KFD_IOCTL_MINOR_VERSION 2 - - struct kfd_ioctl_get_version_args { -- __u32 major_version; /* from KFD */ -- __u32 minor_version; /* from KFD */ -+ uint32_t major_version; /* from KFD */ -+ uint32_t minor_version; /* from KFD */ - }; - - /* For kfd_ioctl_create_queue_args.queue_type. 
*/ -@@ -43,36 +43,51 @@ struct kfd_ioctl_get_version_args { - #define KFD_MAX_QUEUE_PRIORITY 15 - - struct kfd_ioctl_create_queue_args { -- __u64 ring_base_address; /* to KFD */ -- __u64 write_pointer_address; /* from KFD */ -- __u64 read_pointer_address; /* from KFD */ -- __u64 doorbell_offset; /* from KFD */ -- -- __u32 ring_size; /* to KFD */ -- __u32 gpu_id; /* to KFD */ -- __u32 queue_type; /* to KFD */ -- __u32 queue_percentage; /* to KFD */ -- __u32 queue_priority; /* to KFD */ -- __u32 queue_id; /* from KFD */ -- -- __u64 eop_buffer_address; /* to KFD */ -- __u64 eop_buffer_size; /* to KFD */ -- __u64 ctx_save_restore_address; /* to KFD */ -- __u64 ctx_save_restore_size; /* to KFD */ -+ uint64_t ring_base_address; /* to KFD */ -+ uint64_t write_pointer_address; /* from KFD */ -+ uint64_t read_pointer_address; /* from KFD */ -+ uint64_t doorbell_offset; /* from KFD */ -+ -+ uint32_t ring_size; /* to KFD */ -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t queue_type; /* to KFD */ -+ uint32_t queue_percentage; /* to KFD */ -+ uint32_t queue_priority; /* to KFD */ -+ uint32_t queue_id; /* from KFD */ -+ -+ uint64_t eop_buffer_address; /* to KFD */ -+ uint64_t eop_buffer_size; /* to KFD */ -+ uint64_t ctx_save_restore_address; /* to KFD */ -+ uint32_t ctx_save_restore_size; /* to KFD */ -+ uint32_t ctl_stack_size; /* to KFD */ - }; - - struct kfd_ioctl_destroy_queue_args { -- __u32 queue_id; /* to KFD */ -- __u32 pad; -+ uint32_t queue_id; /* to KFD */ -+ uint32_t pad; - }; - - struct kfd_ioctl_update_queue_args { -- __u64 ring_base_address; /* to KFD */ -+ uint64_t ring_base_address; /* to KFD */ -+ -+ uint32_t queue_id; /* to KFD */ -+ uint32_t ring_size; /* to KFD */ -+ uint32_t queue_percentage; /* to KFD */ -+ uint32_t queue_priority; /* to KFD */ -+}; - -- __u32 queue_id; /* to KFD */ -- __u32 ring_size; /* to KFD */ -- __u32 queue_percentage; /* to KFD */ -- __u32 queue_priority; /* to KFD */ -+struct kfd_ioctl_set_cu_mask_args { -+ uint32_t queue_id; /* to KFD */ -+ uint32_t num_cu_mask; /* to KFD */ -+ uint64_t cu_mask_ptr; /* to KFD */ -+}; -+ -+struct kfd_ioctl_get_queue_wave_state_args { -+ uint64_t ctl_stack_address; /* to KFD */ -+ uint32_t ctl_stack_used_size; /* from KFD */ -+ uint32_t save_area_used_size; /* from KFD */ -+ uint32_t queue_id; /* to KFD */ -+ uint32_t pad; - }; - - /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */ -@@ -80,13 +95,20 @@ struct kfd_ioctl_update_queue_args { - #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1 - - struct kfd_ioctl_set_memory_policy_args { -- __u64 alternate_aperture_base; /* to KFD */ -- __u64 alternate_aperture_size; /* to KFD */ -+ uint64_t alternate_aperture_base; /* to KFD */ -+ uint64_t alternate_aperture_size; /* to KFD */ -+ -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t default_policy; /* to KFD */ -+ uint32_t alternate_policy; /* to KFD */ -+ uint32_t pad; -+}; - -- __u32 gpu_id; /* to KFD */ -- __u32 default_policy; /* to KFD */ -- __u32 alternate_policy; /* to KFD */ -- __u32 pad; -+struct kfd_ioctl_set_trap_handler_args { -+ uint64_t tba_addr; -+ uint64_t tma_addr; -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t pad; - }; - - /* -@@ -97,35 +119,52 @@ struct kfd_ioctl_set_memory_policy_args { - */ - - struct kfd_ioctl_get_clock_counters_args { -- __u64 gpu_clock_counter; /* from KFD */ -- __u64 cpu_clock_counter; /* from KFD */ -- __u64 system_clock_counter; /* from KFD */ -- __u64 system_clock_freq; /* from KFD */ -+ uint64_t gpu_clock_counter; /* from KFD */ -+ uint64_t cpu_clock_counter; /* from KFD 
*/ -+ uint64_t system_clock_counter; /* from KFD */ -+ uint64_t system_clock_freq; /* from KFD */ - -- __u32 gpu_id; /* to KFD */ -- __u32 pad; -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t pad; - }; - - #define NUM_OF_SUPPORTED_GPUS 7 - - struct kfd_process_device_apertures { -- __u64 lds_base; /* from KFD */ -- __u64 lds_limit; /* from KFD */ -- __u64 scratch_base; /* from KFD */ -- __u64 scratch_limit; /* from KFD */ -- __u64 gpuvm_base; /* from KFD */ -- __u64 gpuvm_limit; /* from KFD */ -- __u32 gpu_id; /* from KFD */ -- __u32 pad; -+ uint64_t lds_base; /* from KFD */ -+ uint64_t lds_limit; /* from KFD */ -+ uint64_t scratch_base; /* from KFD */ -+ uint64_t scratch_limit; /* from KFD */ -+ uint64_t gpuvm_base; /* from KFD */ -+ uint64_t gpuvm_limit; /* from KFD */ -+ uint32_t gpu_id; /* from KFD */ -+ uint32_t pad; - }; - -+/* This IOCTL and the limited NUM_OF_SUPPORTED_GPUS is deprecated. Use -+ * kfd_ioctl_get_process_apertures_new instead, which supports -+ * arbitrary numbers of GPUs. -+ */ - struct kfd_ioctl_get_process_apertures_args { - struct kfd_process_device_apertures - process_apertures[NUM_OF_SUPPORTED_GPUS];/* from KFD */ - - /* from KFD, should be in the range [1 - NUM_OF_SUPPORTED_GPUS] */ -- __u32 num_of_nodes; -- __u32 pad; -+ uint32_t num_of_nodes; -+ uint32_t pad; -+}; -+ -+struct kfd_ioctl_get_process_apertures_new_args { -+ /* User allocated. Pointer to struct kfd_process_device_apertures -+ * filled in by Kernel -+ */ -+ uint64_t kfd_process_device_apertures_ptr; -+ /* to KFD - indicates amount of memory present in -+ * kfd_process_device_apertures_ptr -+ * from KFD - Number of entries filled by KFD. -+ */ -+ uint32_t num_of_nodes; -+ uint32_t pad; - }; - - #define MAX_ALLOWED_NUM_POINTS 100 -@@ -133,103 +172,245 @@ struct kfd_ioctl_get_process_apertures_args { - #define MAX_ALLOWED_WAC_BUFF_SIZE 128 - - struct kfd_ioctl_dbg_register_args { -- __u32 gpu_id; /* to KFD */ -- __u32 pad; -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t pad; - }; - - struct kfd_ioctl_dbg_unregister_args { -- __u32 gpu_id; /* to KFD */ -- __u32 pad; -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t pad; - }; - - struct kfd_ioctl_dbg_address_watch_args { -- __u64 content_ptr; /* a pointer to the actual content */ -- __u32 gpu_id; /* to KFD */ -- __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ -+ uint64_t content_ptr; /* a pointer to the actual content */ -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ - }; - - struct kfd_ioctl_dbg_wave_control_args { -- __u64 content_ptr; /* a pointer to the actual content */ -- __u32 gpu_id; /* to KFD */ -- __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ -+ uint64_t content_ptr; /* a pointer to the actual content */ -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ - }; - - /* Matching HSA_EVENTTYPE */ --#define KFD_IOC_EVENT_SIGNAL 0 --#define KFD_IOC_EVENT_NODECHANGE 1 --#define KFD_IOC_EVENT_DEVICESTATECHANGE 2 --#define KFD_IOC_EVENT_HW_EXCEPTION 3 --#define KFD_IOC_EVENT_SYSTEM_EVENT 4 --#define KFD_IOC_EVENT_DEBUG_EVENT 5 --#define KFD_IOC_EVENT_PROFILE_EVENT 6 --#define KFD_IOC_EVENT_QUEUE_EVENT 7 --#define KFD_IOC_EVENT_MEMORY 8 -- --#define KFD_IOC_WAIT_RESULT_COMPLETE 0 --#define KFD_IOC_WAIT_RESULT_TIMEOUT 1 --#define KFD_IOC_WAIT_RESULT_FAIL 2 -- --#define KFD_SIGNAL_EVENT_LIMIT 256 -+#define KFD_IOC_EVENT_SIGNAL 0 -+#define KFD_IOC_EVENT_NODECHANGE 1 -+#define KFD_IOC_EVENT_DEVICESTATECHANGE 2 -+#define 
KFD_IOC_EVENT_HW_EXCEPTION 3 -+#define KFD_IOC_EVENT_SYSTEM_EVENT 4 -+#define KFD_IOC_EVENT_DEBUG_EVENT 5 -+#define KFD_IOC_EVENT_PROFILE_EVENT 6 -+#define KFD_IOC_EVENT_QUEUE_EVENT 7 -+#define KFD_IOC_EVENT_MEMORY 8 -+ -+#define KFD_IOC_WAIT_RESULT_COMPLETE 0 -+#define KFD_IOC_WAIT_RESULT_TIMEOUT 1 -+#define KFD_IOC_WAIT_RESULT_FAIL 2 -+ -+#define KFD_SIGNAL_EVENT_LIMIT 4096 - - struct kfd_ioctl_create_event_args { -- __u64 event_page_offset; /* from KFD */ -- __u32 event_trigger_data; /* from KFD - signal events only */ -- __u32 event_type; /* to KFD */ -- __u32 auto_reset; /* to KFD */ -- __u32 node_id; /* to KFD - only valid for certain -+ uint64_t event_page_offset; /* from KFD */ -+ uint32_t event_trigger_data; /* from KFD - signal events only */ -+ uint32_t event_type; /* to KFD */ -+ uint32_t auto_reset; /* to KFD */ -+ uint32_t node_id; /* to KFD - only valid for certain - event types */ -- __u32 event_id; /* from KFD */ -- __u32 event_slot_index; /* from KFD */ -+ uint32_t event_id; /* from KFD */ -+ uint32_t event_slot_index; /* from KFD */ - }; - - struct kfd_ioctl_destroy_event_args { -- __u32 event_id; /* to KFD */ -- __u32 pad; -+ uint32_t event_id; /* to KFD */ -+ uint32_t pad; - }; - - struct kfd_ioctl_set_event_args { -- __u32 event_id; /* to KFD */ -- __u32 pad; -+ uint32_t event_id; /* to KFD */ -+ uint32_t pad; - }; - - struct kfd_ioctl_reset_event_args { -- __u32 event_id; /* to KFD */ -- __u32 pad; -+ uint32_t event_id; /* to KFD */ -+ uint32_t pad; - }; - - struct kfd_memory_exception_failure { -- __u32 NotPresent; /* Page not present or supervisor privilege */ -- __u32 ReadOnly; /* Write access to a read-only page */ -- __u32 NoExecute; /* Execute access to a page marked NX */ -- __u32 pad; -+ uint32_t NotPresent; /* Page not present or supervisor privilege */ -+ uint32_t ReadOnly; /* Write access to a read-only page */ -+ uint32_t NoExecute; /* Execute access to a page marked NX */ -+ uint32_t imprecise; /* Can't determine the exact fault address */ - }; - --/* memory exception data*/ -+/* memory exception data */ - struct kfd_hsa_memory_exception_data { - struct kfd_memory_exception_failure failure; -- __u64 va; -- __u32 gpu_id; -- __u32 pad; -+ uint64_t va; -+ uint32_t gpu_id; -+ uint32_t pad; - }; - --/* Event data*/ -+/* Event data */ - struct kfd_event_data { - union { - struct kfd_hsa_memory_exception_data memory_exception_data; - }; /* From KFD */ -- __u64 kfd_event_data_ext; /* pointer to an extension structure -- for future exception types */ -- __u32 event_id; /* to KFD */ -- __u32 pad; -+ uint64_t kfd_event_data_ext; /* pointer to an extension structure -+ for future exception types */ -+ uint32_t event_id; /* to KFD */ -+ uint32_t pad; - }; - - struct kfd_ioctl_wait_events_args { -- __u64 events_ptr; /* pointed to struct -+ uint64_t events_ptr; /* pointed to struct - kfd_event_data array, to KFD */ -- __u32 num_events; /* to KFD */ -- __u32 wait_for_all; /* to KFD */ -- __u32 timeout; /* to KFD */ -- __u32 wait_result; /* from KFD */ -+ uint32_t num_events; /* to KFD */ -+ uint32_t wait_for_all; /* to KFD */ -+ uint32_t timeout; /* to KFD */ -+ uint32_t wait_result; /* from KFD */ -+}; -+ -+struct kfd_ioctl_alloc_memory_of_scratch_args { -+ uint64_t va_addr; /* to KFD */ -+ uint64_t size; /* to KFD */ -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t pad; -+}; -+ -+/* Allocation flags: memory types */ -+#define KFD_IOC_ALLOC_MEM_FLAGS_VRAM (1 << 0) -+#define KFD_IOC_ALLOC_MEM_FLAGS_GTT (1 << 1) -+#define KFD_IOC_ALLOC_MEM_FLAGS_USERPTR (1 << 2) -+#define 
KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL (1 << 3) -+/* Allocation flags: attributes/access options */ -+#define KFD_IOC_ALLOC_MEM_FLAGS_NONPAGED (1 << 31) -+#define KFD_IOC_ALLOC_MEM_FLAGS_READONLY (1 << 30) -+#define KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC (1 << 29) -+#define KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28) -+#define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27) -+#define KFD_IOC_ALLOC_MEM_FLAGS_EXECUTE_ACCESS (1 << 26) -+#define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 25) -+ -+struct kfd_ioctl_alloc_memory_of_gpu_args { -+ uint64_t va_addr; /* to KFD */ -+ uint64_t size; /* to KFD */ -+ uint64_t handle; /* from KFD */ -+ uint64_t mmap_offset; /* to KFD (userptr), from KFD (mmap offset) */ -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t flags; -+}; -+ -+struct kfd_ioctl_free_memory_of_gpu_args { -+ uint64_t handle; /* to KFD */ -+}; -+ -+struct kfd_ioctl_map_memory_to_gpu_args { -+ uint64_t handle; /* to KFD */ -+ uint64_t device_ids_array_ptr; /* to KFD */ -+ uint32_t device_ids_array_size; /* to KFD */ -+ uint32_t pad; -+}; -+ -+struct kfd_ioctl_unmap_memory_from_gpu_args { -+ uint64_t handle; /* to KFD */ -+ uint64_t device_ids_array_ptr; /* to KFD */ -+ uint32_t device_ids_array_size; /* to KFD */ -+ uint32_t pad; -+}; -+ -+struct kfd_ioctl_set_process_dgpu_aperture_args { -+ uint64_t dgpu_base; -+ uint64_t dgpu_limit; -+ uint32_t gpu_id; -+ uint32_t pad; -+}; -+ -+struct kfd_ioctl_get_dmabuf_info_args { -+ uint64_t size; /* from KFD */ -+ uint64_t metadata_ptr; /* to KFD */ -+ uint32_t metadata_size; /* to KFD (space allocated by user) -+ * from KFD (actual metadata size) */ -+ uint32_t gpu_id; /* from KFD */ -+ uint32_t flags; /* from KFD (KFD_IOC_ALLOC_MEM_FLAGS) */ -+ uint32_t dmabuf_fd; /* to KFD */ -+}; -+ -+struct kfd_ioctl_import_dmabuf_args { -+ uint64_t va_addr; /* to KFD */ -+ uint64_t handle; /* from KFD */ -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t dmabuf_fd; /* to KFD */ -+}; -+ -+struct kfd_ioctl_ipc_export_handle_args { -+ uint64_t handle; /* to KFD */ -+ uint32_t share_handle[4]; /* from KFD */ -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t pad; -+}; -+ -+struct kfd_ioctl_ipc_import_handle_args { -+ uint64_t handle; /* from KFD */ -+ uint64_t va_addr; /* to KFD */ -+ uint64_t mmap_offset; /* from KFD */ -+ uint32_t share_handle[4]; /* to KFD */ -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t pad; -+}; -+ -+struct kfd_ioctl_get_tile_config_args { -+ /* to KFD: pointer to tile array */ -+ uint64_t tile_config_ptr; -+ /* to KFD: pointer to macro tile array */ -+ uint64_t macro_tile_config_ptr; -+ /* to KFD: array size allocated by user mode -+ * from KFD: array size filled by kernel -+ */ -+ uint32_t num_tile_configs; -+ /* to KFD: array size allocated by user mode -+ * from KFD: array size filled by kernel -+ */ -+ uint32_t num_macro_tile_configs; -+ -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t gb_addr_config; /* from KFD */ -+ uint32_t num_banks; /* from KFD */ -+ uint32_t num_ranks; /* from KFD */ -+ /* struct size can be extended later if needed -+ * without breaking ABI compatibility -+ */ -+}; -+ -+struct kfd_memory_range { -+ uint64_t va_addr; -+ uint64_t size; -+}; -+ -+/* flags definitions -+ * BIT0: 0: read operation, 1: write operation. 
-+ * This also identifies if the src or dst array belongs to remote process -+ */ -+#define KFD_CROSS_MEMORY_RW_BIT (1 << 0) -+#define KFD_SET_CROSS_MEMORY_READ(flags) (flags &= ~KFD_CROSS_MEMORY_RW_BIT) -+#define KFD_SET_CROSS_MEMORY_WRITE(flags) (flags |= KFD_CROSS_MEMORY_RW_BIT) -+#define KFD_IS_CROSS_MEMORY_WRITE(flags) (flags & KFD_CROSS_MEMORY_RW_BIT) -+ -+struct kfd_ioctl_cross_memory_copy_args { -+ /* to KFD: Process ID of the remote process */ -+ uint32_t pid; -+ /* to KFD: See above definition */ -+ uint32_t flags; -+ /* to KFD: Source GPU VM range */ -+ uint64_t src_mem_range_array; -+ /* to KFD: Size of above array */ -+ uint64_t src_mem_array_size; -+ /* to KFD: Destination GPU VM range */ -+ uint64_t dst_mem_range_array; -+ /* to KFD: Size of above array */ -+ uint64_t dst_mem_array_size; -+ /* from KFD: Total amount of bytes copied */ -+ uint64_t bytes_copied; - }; - - -@@ -287,7 +468,56 @@ struct kfd_ioctl_wait_events_args { - #define AMDKFD_IOC_DBG_WAVE_CONTROL \ - AMDKFD_IOW(0x10, struct kfd_ioctl_dbg_wave_control_args) - -+#define AMDKFD_IOC_ALLOC_MEMORY_OF_GPU \ -+ AMDKFD_IOWR(0x11, struct kfd_ioctl_alloc_memory_of_gpu_args) -+ -+#define AMDKFD_IOC_FREE_MEMORY_OF_GPU \ -+ AMDKFD_IOWR(0x12, struct kfd_ioctl_free_memory_of_gpu_args) -+ -+#define AMDKFD_IOC_MAP_MEMORY_TO_GPU \ -+ AMDKFD_IOWR(0x13, struct kfd_ioctl_map_memory_to_gpu_args) -+ -+#define AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU \ -+ AMDKFD_IOWR(0x14, struct kfd_ioctl_unmap_memory_from_gpu_args) -+ -+#define AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH \ -+ AMDKFD_IOWR(0x15, struct kfd_ioctl_alloc_memory_of_scratch_args) -+ -+#define AMDKFD_IOC_SET_CU_MASK \ -+ AMDKFD_IOW(0x16, struct kfd_ioctl_set_cu_mask_args) -+ -+#define AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE \ -+ AMDKFD_IOW(0x17, \ -+ struct kfd_ioctl_set_process_dgpu_aperture_args) -+ -+#define AMDKFD_IOC_SET_TRAP_HANDLER \ -+ AMDKFD_IOW(0x18, struct kfd_ioctl_set_trap_handler_args) -+ -+#define AMDKFD_IOC_GET_PROCESS_APERTURES_NEW \ -+ AMDKFD_IOWR(0x19, struct kfd_ioctl_get_process_apertures_new_args) -+ -+#define AMDKFD_IOC_GET_DMABUF_INFO \ -+ AMDKFD_IOWR(0x1A, struct kfd_ioctl_get_dmabuf_info_args) -+ -+#define AMDKFD_IOC_IMPORT_DMABUF \ -+ AMDKFD_IOWR(0x1B, struct kfd_ioctl_import_dmabuf_args) -+ -+#define AMDKFD_IOC_GET_TILE_CONFIG \ -+ AMDKFD_IOWR(0x1C, struct kfd_ioctl_get_tile_config_args) -+ -+#define AMDKFD_IOC_IPC_IMPORT_HANDLE \ -+ AMDKFD_IOWR(0x1D, struct kfd_ioctl_ipc_import_handle_args) -+ -+#define AMDKFD_IOC_IPC_EXPORT_HANDLE \ -+ AMDKFD_IOWR(0x1E, struct kfd_ioctl_ipc_export_handle_args) -+ -+#define AMDKFD_IOC_CROSS_MEMORY_COPY \ -+ AMDKFD_IOWR(0x1F, struct kfd_ioctl_cross_memory_copy_args) -+ -+#define AMDKFD_IOC_GET_QUEUE_WAVE_STATE \ -+ AMDKFD_IOWR(0x20, struct kfd_ioctl_get_queue_wave_state_args) -+ - #define AMDKFD_COMMAND_START 0x01 --#define AMDKFD_COMMAND_END 0x11 -+#define AMDKFD_COMMAND_END 0x21 - - #endif -diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h -old mode 100644 -new mode 100755 -index 87c2c84..1256851 ---- a/include/uapi/linux/pci_regs.h -+++ b/include/uapi/linux/pci_regs.h -@@ -624,7 +624,9 @@ - #define PCI_EXP_DEVCAP2 36 /* Device Capabilities 2 */ - #define PCI_EXP_DEVCAP2_ARI 0x00000020 /* Alternative Routing-ID */ - #define PCI_EXP_DEVCAP2_ATOMIC_ROUTE 0x00000040 /* Atomic Op routing */ -+#define PCI_EXP_DEVCAP2_ATOMIC_COMP32 0x00000080 /* 32b AtomicOp completion */ - #define PCI_EXP_DEVCAP2_ATOMIC_COMP64 0x00000100 /* Atomic 64-bit compare */ -+#define PCI_EXP_DEVCAP2_ATOMIC_COMP128 0x00000200 /* 128b 
AtomicOp completion*/
- #define PCI_EXP_DEVCAP2_LTR 0x00000800 /* Latency tolerance reporting */
- #define PCI_EXP_DEVCAP2_OBFF_MASK 0x000c0000 /* OBFF support mechanism */
- #define PCI_EXP_DEVCAP2_OBFF_MSG 0x00040000 /* New message signaling */
-@@ -634,6 +636,7 @@
- #define PCI_EXP_DEVCTL2_ARI 0x0020 /* Alternative Routing-ID */
- #define PCI_EXP_DEVCTL2_ATOMIC_REQ 0x0040 /* Set Atomic requests */
- #define PCI_EXP_DEVCTL2_ATOMIC_EGRESS_BLOCK 0x0080 /* Block atomic egress */
-+#define PCI_EXP_DEVCTL2_ATOMIC_BLOCK 0x0080 /* Block AtomicOp on egress */
- #define PCI_EXP_DEVCTL2_IDO_REQ_EN 0x0100 /* Allow IDO for requests */
- #define PCI_EXP_DEVCTL2_IDO_CMP_EN 0x0200 /* Allow IDO for completions */
- #define PCI_EXP_DEVCTL2_LTR_EN 0x0400 /* Enable LTR mechanism */
-diff --git a/kernel/fork.c b/kernel/fork.c
-index a19ee25..70d8d5b 100644
---- a/kernel/fork.c
-+++ b/kernel/fork.c
-@@ -1082,6 +1082,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
-
- return mm;
- }
-+EXPORT_SYMBOL_GPL(mm_access);
-
- static void complete_vfork_done(struct task_struct *tsk)
- {
---
-2.7.4
-
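[Editor's sketch] The final hunk exports mm_access() with EXPORT_SYMBOL_GPL so a module (here, the KFD cross-memory-copy path) can attach to another process's address space under the usual ptrace permission checks. A hedged usage sketch with an illustrative helper name:

    #include <linux/err.h>
    #include <linux/ptrace.h>
    #include <linux/sched/mm.h>

    /* mm_access() returns NULL when the task has no mm, or an ERR_PTR()
     * when the caller fails the ptrace permission check.
     */
    static struct mm_struct *example_get_peer_mm(struct task_struct *task)
    {
            struct mm_struct *mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);

            if (IS_ERR_OR_NULL(mm))
                    return NULL;

            return mm;      /* release with mmput(mm) when done */
    }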
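[Editor's sketch] Likewise, the pci_enable_atomic_ops_to_root() helper added earlier in this patch walks every bridge up to the root port, checking the PCI_EXP_DEVCAP2/DEVCTL2 bits defined above, before setting PCI_EXP_DEVCTL2_ATOMIC_REQ on the endpoint. A hedged caller sketch using this patch's single-argument signature (later upstream versions take an additional capability mask):

    #include <linux/pci.h>

    /* Refuse to rely on PCIe AtomicOps when the path to the root port
     * cannot route or complete them; KFD uses this check to require
     * AtomicOps for dGPUs.
     */
    static int example_require_pcie_atomics(struct pci_dev *pdev)
    {
            int ret = pci_enable_atomic_ops_to_root(pdev);

            if (ret < 0)
                    dev_dbg(&pdev->dev, "PCIe AtomicOps unavailable: %d\n", ret);

            return ret;
    }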