path: root/meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1353-compilation-fix-for-amdkfd-porting.patch
Diffstat (limited to 'meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1353-compilation-fix-for-amdkfd-porting.patch')
-rw-r--r--  meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1353-compilation-fix-for-amdkfd-porting.patch  8695
1 file changed, 8695 insertions, 0 deletions
diff --git a/meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1353-compilation-fix-for-amdkfd-porting.patch b/meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1353-compilation-fix-for-amdkfd-porting.patch
new file mode 100644
index 00000000..a27db153
--- /dev/null
+++ b/meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1353-compilation-fix-for-amdkfd-porting.patch
@@ -0,0 +1,8695 @@
+From 817ccd6f0987f83ddbf989602f0fbf320157f0a9 Mon Sep 17 00:00:00 2001
+From: Chaudhary Amit Kumar <chaudharyamit.kumar@amd.com>
+Date: Thu, 18 Oct 2018 12:42:04 +0530
+Subject: [PATCH 1353/4131] compilation fix for amdkfd porting
+
+Signed-off-by: Sanjay R Mehta <sanju.mehta@amd.com>
+Signed-off-by: Chaudhary Amit Kumar <chaudharyamit.kumar@amd.com>
+---
+ drivers/gpu/drm/amd/amdgpu/Makefile | 8 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 +
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 346 ++-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 185 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c | 196 ++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 537 ++++-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 590 ++++-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h | 62 +
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 1227 ++++++++++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 2578 +++++++++++++++++++++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 -
+ drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 3 +
+ drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 1 +
+ drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 +
+ drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c | 46 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h | 2 +
+ drivers/gpu/drm/amd/amdgpu/soc15d.h | 1 +
+ drivers/gpu/drm/amd/amdgpu/vid.h | 2 +
+ drivers/gpu/drm/amd/amdkfd/Makefile | 2 -
+ drivers/gpu/drm/amd/amdkfd/backport/backport.h | 7 -
+ drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 -
+ drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 4 -
+ drivers/gpu/drm/amd/amdkfd/kfd_device.c | 12 -
+ drivers/gpu/drm/amd/amdkfd/kfd_events.c | 28 -
+ drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 10 -
+ drivers/gpu/drm/amd/amdkfd/kfd_ipc.c | 8 -
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 4 -
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 4 -
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 4 -
+ drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 2 -
+ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 6 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_process.c | 52 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 22 -
+ drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 6 -
+ drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 4 -
+ drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 231 +-
+ drivers/gpu/drm/amd/include/v9_structs.h | 48 +-
+ drivers/gpu/drm/amd/include/vi_structs.h | 2 +
+ drivers/pci/pci.c | 81 +
+ include/drm/amd_rdma.h | 70 +
+ include/linux/pci.h | 1 +
+ include/uapi/linux/kfd_ioctl.h | 442 +++-
+ include/uapi/linux/pci_regs.h | 3 +
+ kernel/fork.c | 1 +
+ 44 files changed, 6315 insertions(+), 537 deletions(-)
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/Makefile
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu.h
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+ create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
+ create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h
+ create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+ create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/soc15d.h
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/vid.h
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdkfd/Makefile
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+ mode change 100644 => 100755 drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+ mode change 100644 => 100755 drivers/gpu/drm/amd/include/v9_structs.h
+ mode change 100644 => 100755 drivers/gpu/drm/amd/include/vi_structs.h
+ mode change 100644 => 100755 drivers/pci/pci.c
+ create mode 100644 include/drm/amd_rdma.h
+ mode change 100644 => 100755 include/linux/pci.h
+ mode change 100644 => 100755 include/uapi/linux/pci_regs.h
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
+old mode 100644
+new mode 100755
+index 57b8d5f..6b373d0
+--- a/drivers/gpu/drm/amd/amdgpu/Makefile
++++ b/drivers/gpu/drm/amd/amdgpu/Makefile
+@@ -32,12 +32,11 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
+ amdgpu_prime.o amdgpu_vm.o amdgpu_ib.o amdgpu_pll.o \
+ amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
+ amdgpu_gtt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o amdgpu_atomfirmware.o \
+- amdgpu_queue_mgr.o amdgpu_vf_error.o amdgpu_sem.o
++ amdgpu_queue_mgr.o amdgpu_vf_error.o amdgpu_sem.o amdgpu_amdkfd_fence.o
+
+ # add asic specific block
+ amdgpu-$(CONFIG_DRM_AMDGPU_CIK)+= cik.o cik_ih.o kv_smc.o kv_dpm.o \
+ ci_smc.o ci_dpm.o dce_v8_0.o gfx_v7_0.o cik_sdma.o uvd_v4_2.o vce_v2_0.o \
+- amdgpu_amdkfd_gfx_v7.o
+
+ amdgpu-$(CONFIG_DRM_AMDGPU_SI)+= si.o gmc_v6_0.o gfx_v6_0.o si_ih.o si_dma.o dce_v6_0.o si_dpm.o si_smc.o
+
+@@ -109,7 +108,10 @@ amdgpu-y += \
+ # add amdkfd interfaces
+ amdgpu-y += \
+ amdgpu_amdkfd.o \
+- amdgpu_amdkfd_gfx_v8.o
++ amdgpu_amdkfd_gfx_v7.o \
++ amdgpu_amdkfd_gfx_v8.o \
++ amdgpu_amdkfd_gfx_v9.o \
++ amdgpu_amdkfd_gpuvm.o
+
+ # add cgs
+ amdgpu-y += amdgpu_cgs.o
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+old mode 100644
+new mode 100755
+index fe23de8..bcf95e7
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+@@ -184,6 +184,7 @@ struct amdgpu_cs_parser;
+ struct amdgpu_job;
+ struct amdgpu_irq_src;
+ struct amdgpu_fpriv;
++struct kfd_vm_fault_info;
+ struct amdgpu_bo_va_mapping;
+
+ enum amdgpu_cp_irq {
+@@ -403,6 +404,7 @@ struct amdgpu_gem_object {
+ struct amdgpu_bo *bo;
+ };
+
++struct kgd_mem;
+ #define gem_to_amdgpu_bo(gobj) container_of((gobj), struct amdgpu_gem_object, base)->bo
+
+ void amdgpu_gem_object_free(struct drm_gem_object *obj);
+@@ -543,6 +545,9 @@ struct amdgpu_mc {
+ u64 private_aperture_end;
+ /* protects concurrent invalidation */
+ spinlock_t invalidate_lock;
++
++ struct kfd_vm_fault_info *vm_fault_info;
++ atomic_t vm_fault_info_updated;
+ };
+
+ /*
+@@ -961,6 +966,7 @@ struct amdgpu_gfx_config {
+ };
+
+ struct amdgpu_cu_info {
++ uint32_t simd_per_cu;
+ uint32_t max_waves_per_simd;
+ uint32_t wave_front_size;
+ uint32_t max_scratch_slots_per_cu;
+@@ -1649,6 +1655,7 @@ struct amdgpu_device {
+ /* record hw reset is performed */
+ bool has_hw_reset;
+ u8 reset_magic[AMDGPU_RESET_MAGIC_NUM];
++ spinlock_t tlb_invalidation_lock;
+
+ /* record last mm index being written through WREG32*/
+ unsigned long last_mm_index;
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+old mode 100644
+new mode 100755
+index 7ec1915..ec8141f
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+@@ -20,23 +20,29 @@
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
++#undef pr_fmt
++#define pr_fmt(fmt) "kfd2kgd: " fmt
++
+ #include "amdgpu_amdkfd.h"
+-#include "amd_shared.h"
++#include <linux/dma-buf.h>
+ #include <drm/drmP.h>
+ #include "amdgpu.h"
+ #include "amdgpu_gfx.h"
+ #include <linux/module.h>
+
+-const struct kfd2kgd_calls *kfd2kgd;
++#define AMDKFD_SKIP_UNCOMPILED_CODE 1
++
+ const struct kgd2kfd_calls *kgd2kfd;
+-bool (*kgd2kfd_init_p)(unsigned, const struct kgd2kfd_calls**);
++bool (*kgd2kfd_init_p)(unsigned int, const struct kgd2kfd_calls**);
++
++unsigned int global_compute_vmid_bitmap = 0xFF00;
+
+ int amdgpu_amdkfd_init(void)
+ {
+ int ret;
+
+ #if defined(CONFIG_HSA_AMD_MODULE)
+- int (*kgd2kfd_init_p)(unsigned, const struct kgd2kfd_calls**);
++ int (*kgd2kfd_init_p)(unsigned int, const struct kgd2kfd_calls**);
+
+ kgd2kfd_init_p = symbol_request(kgd2kfd_init);
+
+@@ -57,56 +63,68 @@ int amdgpu_amdkfd_init(void)
+ #else
+ ret = -ENOENT;
+ #endif
+-
++ amdgpu_amdkfd_gpuvm_init_mem_limits();
+ return ret;
+ }
+
+-bool amdgpu_amdkfd_load_interface(struct amdgpu_device *adev)
++void amdgpu_amdkfd_fini(void)
+ {
++ if (kgd2kfd) {
++ kgd2kfd->exit();
++ symbol_put(kgd2kfd_init);
++ }
++}
++
++void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev)
++{
++ const struct kfd2kgd_calls *kfd2kgd;
++
++ if (!kgd2kfd)
++ return;
++
+ switch (adev->asic_type) {
+ #ifdef CONFIG_DRM_AMDGPU_CIK
+ case CHIP_KAVERI:
++ case CHIP_HAWAII:
+ kfd2kgd = amdgpu_amdkfd_gfx_7_get_functions();
+ break;
+ #endif
+ case CHIP_CARRIZO:
++ case CHIP_TONGA:
++ case CHIP_FIJI:
++ case CHIP_POLARIS10:
++ case CHIP_POLARIS11:
+ kfd2kgd = amdgpu_amdkfd_gfx_8_0_get_functions();
+ break;
++ case CHIP_VEGA10:
++ case CHIP_RAVEN:
++ kfd2kgd = amdgpu_amdkfd_gfx_9_0_get_functions();
++ break;
+ default:
+- return false;
+- }
+-
+- return true;
+-}
+-
+-void amdgpu_amdkfd_fini(void)
+-{
+- if (kgd2kfd) {
+- kgd2kfd->exit();
+- symbol_put(kgd2kfd_init);
++ dev_info(adev->dev, "kfd not supported on this ASIC\n");
++ return;
+ }
+-}
+
+-void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev)
+-{
+- if (kgd2kfd)
+- adev->kfd = kgd2kfd->probe((struct kgd_dev *)adev,
+- adev->pdev, kfd2kgd);
++ adev->kfd = kgd2kfd->probe((struct kgd_dev *)adev,
++ adev->pdev, kfd2kgd);
+ }
+
+ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
+ {
+ int i;
+ int last_valid_bit;
++
+ if (adev->kfd) {
+ struct kgd2kfd_shared_resources gpu_resources = {
+- .compute_vmid_bitmap = 0xFF00,
++ .compute_vmid_bitmap = global_compute_vmid_bitmap,
+ .num_pipe_per_mec = adev->gfx.mec.num_pipe_per_mec,
+- .num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe
++ .num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe,
++ .gpuvm_size = (uint64_t)amdgpu_vm_size << 30
+ };
+
+ /* this is going to have a few of the MSBs set that we need to
+- * clear */
++ * clear
++ */
+ bitmap_complement(gpu_resources.queue_bitmap,
+ adev->gfx.mec.queue_bitmap,
+ KGD_MAX_QUEUES);
+@@ -120,7 +138,8 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
+ gpu_resources.queue_bitmap);
+
+ /* According to linux/bitmap.h we shouldn't use bitmap_clear if
+- * nbits is not compile time constant */
++ * nbits is not compile time constant
++ */
+ last_valid_bit = 1 /* only first MEC can have compute queues */
+ * adev->gfx.mec.num_pipe_per_mec
+ * adev->gfx.mec.num_queue_per_pipe;
+@@ -131,6 +150,28 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
+ &gpu_resources.doorbell_physical_address,
+ &gpu_resources.doorbell_aperture_size,
+ &gpu_resources.doorbell_start_offset);
++ if (adev->asic_type >= CHIP_VEGA10) {
++ /* On SOC15 the BIF is involved in routing
++ * doorbells using the low 12 bits of the
++ * address. Communicate the assignments to
++ * KFD. KFD uses two doorbell pages per
++ * process in case of 64-bit doorbells so we
++ * can use each doorbell assignment twice.
++ */
++ gpu_resources.sdma_doorbell[0][0] =
++ AMDGPU_DOORBELL64_sDMA_ENGINE0;
++ gpu_resources.sdma_doorbell[0][1] =
++ AMDGPU_DOORBELL64_sDMA_ENGINE0 + 0x200;
++ gpu_resources.sdma_doorbell[1][0] =
++ AMDGPU_DOORBELL64_sDMA_ENGINE1;
++ gpu_resources.sdma_doorbell[1][1] =
++ AMDGPU_DOORBELL64_sDMA_ENGINE1 + 0x200;
++ /* Doorbells 0x0f0-0ff and 0x2f0-2ff are reserved for
++ * SDMA, IH and VCN. So don't use them for the CP.
++ */
++ gpu_resources.reserved_doorbell_mask = 0x1f0;
++ gpu_resources.reserved_doorbell_val = 0x0f0;
++ }
+
+ kgd2kfd->device_init(adev->kfd, &gpu_resources);
+ }
+@@ -167,24 +208,81 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev)
+ return r;
+ }
+
++int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
++ uint32_t vmid, uint64_t gpu_addr,
++ uint32_t *ib_cmd, uint32_t ib_len)
++{
++ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
++ struct amdgpu_job *job;
++ struct amdgpu_ib *ib;
++ struct amdgpu_ring *ring;
++ struct dma_fence *f = NULL;
++ int ret;
++
++ switch (engine) {
++ case KGD_ENGINE_MEC1:
++ ring = &adev->gfx.compute_ring[0];
++ break;
++ case KGD_ENGINE_SDMA1:
++ ring = &adev->sdma.instance[0].ring;
++ break;
++ case KGD_ENGINE_SDMA2:
++ ring = &adev->sdma.instance[1].ring;
++ break;
++ default:
++ pr_err("Invalid engine in IB submission: %d\n", engine);
++ ret = -EINVAL;
++ goto err;
++ }
++
++ ret = amdgpu_job_alloc(adev, 1, &job, NULL);
++ if (ret)
++ goto err;
++
++ ib = &job->ibs[0];
++ memset(ib, 0, sizeof(struct amdgpu_ib));
++
++ ib->gpu_addr = gpu_addr;
++ ib->ptr = ib_cmd;
++ ib->length_dw = ib_len;
++ /* This works for NO_HWS. TODO: need to handle without knowing VMID */
++ job->vm_id = vmid;
++
++ ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);
++ if (ret) {
++ DRM_ERROR("amdgpu: failed to schedule IB.\n");
++ goto err_ib_sched;
++ }
++
++ ret = dma_fence_wait(f, false);
++
++err_ib_sched:
++ dma_fence_put(f);
++ amdgpu_job_free(job);
++err:
++ return ret;
++}
++
++u32 pool_to_domain(enum kgd_memory_pool p)
++{
++ switch (p) {
++ case KGD_POOL_FRAMEBUFFER: return AMDGPU_GEM_DOMAIN_VRAM;
++ default: return AMDGPU_GEM_DOMAIN_GTT;
++ }
++}
++
+ int alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
+ void **mem_obj, uint64_t *gpu_addr,
+ void **cpu_ptr)
+ {
+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+- struct kgd_mem **mem = (struct kgd_mem **) mem_obj;
++ struct amdgpu_bo *bo = NULL;
+ int r;
+-
+- BUG_ON(kgd == NULL);
+- BUG_ON(gpu_addr == NULL);
+- BUG_ON(cpu_ptr == NULL);
+-
+- *mem = kmalloc(sizeof(struct kgd_mem), GFP_KERNEL);
+- if ((*mem) == NULL)
+- return -ENOMEM;
++ uint64_t gpu_addr_tmp = 0;
++ void *cpu_ptr_tmp = NULL;
+
+ r = amdgpu_bo_create(adev, size, PAGE_SIZE, true, AMDGPU_GEM_DOMAIN_GTT,
+- AMDGPU_GEM_CREATE_CPU_GTT_USWC, NULL, NULL, &(*mem)->bo);
++ AMDGPU_GEM_CREATE_CPU_GTT_USWC, NULL, NULL, 0, &bo);
+ if (r) {
+ dev_err(adev->dev,
+ "failed to allocate BO for amdkfd (%d)\n", r);
+@@ -192,64 +290,87 @@ int alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
+ }
+
+ /* map the buffer */
+- r = amdgpu_bo_reserve((*mem)->bo, true);
++ r = amdgpu_bo_reserve(bo, true);
+ if (r) {
+ dev_err(adev->dev, "(%d) failed to reserve bo for amdkfd\n", r);
+ goto allocate_mem_reserve_bo_failed;
+ }
+
+- r = amdgpu_bo_pin((*mem)->bo, AMDGPU_GEM_DOMAIN_GTT,
+- &(*mem)->gpu_addr);
++ r = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT,
++ &gpu_addr_tmp);
+ if (r) {
+ dev_err(adev->dev, "(%d) failed to pin bo for amdkfd\n", r);
+ goto allocate_mem_pin_bo_failed;
+ }
+- *gpu_addr = (*mem)->gpu_addr;
+
+- r = amdgpu_bo_kmap((*mem)->bo, &(*mem)->cpu_ptr);
++ r = amdgpu_bo_kmap(bo, &cpu_ptr_tmp);
+ if (r) {
+ dev_err(adev->dev,
+ "(%d) failed to map bo to kernel for amdkfd\n", r);
+ goto allocate_mem_kmap_bo_failed;
+ }
+- *cpu_ptr = (*mem)->cpu_ptr;
+
+- amdgpu_bo_unreserve((*mem)->bo);
++ *mem_obj = bo;
++ *gpu_addr = gpu_addr_tmp;
++ *cpu_ptr = cpu_ptr_tmp;
++
++ amdgpu_bo_unreserve(bo);
+
+ return 0;
+
+ allocate_mem_kmap_bo_failed:
+- amdgpu_bo_unpin((*mem)->bo);
++ amdgpu_bo_unpin(bo);
+ allocate_mem_pin_bo_failed:
+- amdgpu_bo_unreserve((*mem)->bo);
++ amdgpu_bo_unreserve(bo);
+ allocate_mem_reserve_bo_failed:
+- amdgpu_bo_unref(&(*mem)->bo);
++ amdgpu_bo_unref(&bo);
+
+ return r;
+ }
+
+ void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj)
+ {
+- struct kgd_mem *mem = (struct kgd_mem *) mem_obj;
++ struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj;
+
+- BUG_ON(mem == NULL);
+-
+- amdgpu_bo_reserve(mem->bo, true);
+- amdgpu_bo_kunmap(mem->bo);
+- amdgpu_bo_unpin(mem->bo);
+- amdgpu_bo_unreserve(mem->bo);
+- amdgpu_bo_unref(&(mem->bo));
+- kfree(mem);
++ amdgpu_bo_reserve(bo, true);
++ amdgpu_bo_kunmap(bo);
++ amdgpu_bo_unpin(bo);
++ amdgpu_bo_unreserve(bo);
++ amdgpu_bo_unref(&(bo));
+ }
+
+-uint64_t get_vmem_size(struct kgd_dev *kgd)
++void get_local_mem_info(struct kgd_dev *kgd,
++ struct kfd_local_mem_info *mem_info)
+ {
+- struct amdgpu_device *adev =
+- (struct amdgpu_device *)kgd;
++ uint64_t address_mask;
++ resource_size_t aper_limit;
++ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+
+- BUG_ON(kgd == NULL);
++ address_mask = adev->dev->dma_mask ? ~*adev->dev->dma_mask :
++ ~((1ULL << 32) - 1);
++ aper_limit = adev->mc.aper_base + adev->mc.aper_size;
++
++ memset(mem_info, 0, sizeof(*mem_info));
++ if (!(adev->mc.aper_base & address_mask ||
++ aper_limit & address_mask)) {
++ mem_info->local_mem_size_public = adev->mc.visible_vram_size;
++ mem_info->local_mem_size_private = adev->mc.real_vram_size -
++ adev->mc.visible_vram_size;
++ } else {
++ mem_info->local_mem_size_public = 0;
++ mem_info->local_mem_size_private = adev->mc.real_vram_size;
++ }
++ mem_info->vram_width = adev->mc.vram_width;
+
+- return adev->mc.real_vram_size;
++ pr_debug("Address base: 0x%llx limit 0x%llx public 0x%llx private 0x%llx\n",
++ adev->mc.aper_base, aper_limit,
++ mem_info->local_mem_size_public,
++ mem_info->local_mem_size_private);
++
++ if (amdgpu_sriov_vf(adev))
++ mem_info->mem_clk_max = adev->clock.default_mclk / 100;
++ else
++ mem_info->mem_clk_max = amdgpu_dpm_get_mclk(adev, false) / 100;
+ }
+
+ uint64_t get_gpu_clock_counter(struct kgd_dev *kgd)
+@@ -271,3 +392,106 @@ uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd)
+
+ return amdgpu_dpm_get_sclk(adev, false) / 100;
+ }
++
++void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info)
++{
++ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
++ struct amdgpu_cu_info acu_info = adev->gfx.cu_info;
++
++ memset(cu_info, 0, sizeof(*cu_info));
++ if (sizeof(cu_info->cu_bitmap) != sizeof(acu_info.bitmap))
++ return;
++
++ cu_info->cu_active_number = acu_info.number;
++ cu_info->cu_ao_mask = acu_info.ao_cu_mask;
++ memcpy(&cu_info->cu_bitmap[0], &acu_info.bitmap[0],
++ sizeof(acu_info.bitmap));
++ cu_info->num_shader_engines = adev->gfx.config.max_shader_engines;
++ cu_info->num_shader_arrays_per_engine = adev->gfx.config.max_sh_per_se;
++ cu_info->num_cu_per_sh = adev->gfx.config.max_cu_per_sh;
++ cu_info->simd_per_cu = acu_info.simd_per_cu;
++ cu_info->max_waves_per_simd = acu_info.max_waves_per_simd;
++ cu_info->wave_front_size = acu_info.wave_front_size;
++ cu_info->max_scratch_slots_per_cu = acu_info.max_scratch_slots_per_cu;
++ cu_info->lds_size = acu_info.lds_size;
++}
++
++int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd,
++ struct kgd_dev **dma_buf_kgd,
++ uint64_t *bo_size, void *metadata_buffer,
++ size_t buffer_size, uint32_t *metadata_size,
++ uint32_t *flags)
++{
++ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
++ struct dma_buf *dma_buf;
++ struct drm_gem_object *obj;
++ struct amdgpu_bo *bo;
++ uint64_t metadata_flags;
++ int r = -EINVAL;
++
++ dma_buf = dma_buf_get(dma_buf_fd);
++ if (IS_ERR(dma_buf))
++ return PTR_ERR(dma_buf);
++
++ if (dma_buf->ops != &drm_gem_prime_dmabuf_ops)
++ /* Can't handle non-graphics buffers */
++ goto out_put;
++
++ obj = dma_buf->priv;
++ if (obj->dev->driver != adev->ddev->driver)
++ /* Can't handle buffers from different drivers */
++ goto out_put;
++
++ adev = obj->dev->dev_private;
++ bo = gem_to_amdgpu_bo(obj);
++ if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM |
++ AMDGPU_GEM_DOMAIN_GTT |
++ AMDGPU_GEM_DOMAIN_DGMA)))
++ /* Only VRAM, GTT and DGMA BOs are supported */
++ goto out_put;
++
++ r = 0;
++ if (dma_buf_kgd)
++ *dma_buf_kgd = (struct kgd_dev *)adev;
++ if (bo_size)
++ *bo_size = amdgpu_bo_size(bo);
++ if (metadata_size)
++ *metadata_size = bo->metadata_size;
++ if (metadata_buffer)
++ r = amdgpu_bo_get_metadata(bo, metadata_buffer, buffer_size,
++ metadata_size, &metadata_flags);
++ if (flags) {
++ /* If the preferred domain is DGMA, set flags to VRAM because
++ * KFD doesn't support allocating DGMA memory
++ */
++ *flags = (bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM |
++ AMDGPU_GEM_DOMAIN_DGMA)) ?
++ ALLOC_MEM_FLAGS_VRAM : ALLOC_MEM_FLAGS_GTT;
++
++ if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)
++ *flags |= ALLOC_MEM_FLAGS_PUBLIC;
++ }
++
++out_put:
++ dma_buf_put(dma_buf);
++ return r;
++}
++
++uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd)
++{
++ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
++ uint64_t usage =
++ amdgpu_vram_mgr_usage(&adev->mman.bdev.man[TTM_PL_VRAM]);
++ return usage;
++}
++
++bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev,
++ u32 vmid)
++{
++ if (adev->kfd) {
++ if ((1 << vmid) & global_compute_vmid_bitmap)
++ return true;
++ }
++
++ return false;
++}
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+old mode 100644
+new mode 100755
+index 6d3a10b..b259ba7
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+@@ -27,20 +27,109 @@
+
+ #include <linux/types.h>
+ #include <linux/mm.h>
++#include <linux/workqueue.h>
++#include <linux/mmu_context.h>
+ #include <kgd_kfd_interface.h>
++#include "amdgpu.h"
++
++extern const struct kgd2kfd_calls *kgd2kfd;
+
+ struct amdgpu_device;
+
++struct kfd_bo_va_list {
++ struct list_head bo_list;
++ struct amdgpu_bo_va *bo_va;
++ void *kgd_dev;
++ bool is_mapped;
++ bool map_fail;
++ uint64_t va;
++ uint64_t pte_flags;
++};
++
+ struct kgd_mem {
++ struct mutex lock;
+ struct amdgpu_bo *bo;
+- uint64_t gpu_addr;
+- void *cpu_ptr;
++ struct list_head bo_va_list;
++ /* protected by amdkfd_process_info.lock */
++ struct ttm_validate_buffer validate_list;
++ struct ttm_validate_buffer resv_list;
++ uint32_t domain;
++ unsigned int mapped_to_gpu_memory;
++ void *kptr;
++ uint64_t va;
++
++ uint32_t mapping_flags;
++
++ atomic_t invalid;
++ struct amdkfd_process_info *process_info;
++ struct page **user_pages;
++
++ struct amdgpu_sync sync;
++
++ /* flags bitfield */
++ bool coherent : 1;
++ bool no_substitute : 1;
++ bool aql_queue : 1;
++};
++
++/* KFD Memory Eviction */
++struct amdgpu_amdkfd_fence {
++ struct dma_fence base;
++ void *mm;
++ spinlock_t lock;
++ char timeline_name[TASK_COMM_LEN];
++};
++
++struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context,
++ void *mm);
++bool amd_kfd_fence_check_mm(struct dma_fence *f, void *mm);
++struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f);
++
++struct amdkfd_process_info {
++ /* List head of all VMs that belong to a KFD process */
++ struct list_head vm_list_head;
++ /* List head for all KFD BOs that belong to a KFD process. */
++ struct list_head kfd_bo_list;
++ /* List of userptr BOs that are valid or invalid */
++ struct list_head userptr_valid_list;
++ struct list_head userptr_inval_list;
++ /* Lock to protect kfd_bo_list */
++ struct mutex lock;
++
++ /* Number of VMs */
++ unsigned int n_vms;
++ /* Eviction Fence */
++ struct amdgpu_amdkfd_fence *eviction_fence;
++
++ /* MMU-notifier related fields */
++ atomic_t evicted_bos;
++ struct delayed_work work;
++ struct pid *pid;
++};
++
++/* struct amdkfd_vm -
++ * For Memory Eviction KGD requires a mechanism to keep track of all KFD BOs
++ * belonging to a KFD process. All the VMs belonging to the same process point
++ * to the same amdkfd_process_info.
++ */
++struct amdkfd_vm {
++ /* Keep base as the first parameter for pointer compatibility between
++ * amdkfd_vm and amdgpu_vm.
++ */
++ struct amdgpu_vm base;
++
++ /* List node in amdkfd_process_info.vm_list_head*/
++ struct list_head vm_list_node;
++
++ struct amdgpu_device *adev;
++ /* Points to the KFD process VM info*/
++ struct amdkfd_process_info *process_info;
+ };
+
++
+ int amdgpu_amdkfd_init(void);
+ void amdgpu_amdkfd_fini(void);
+
+-bool amdgpu_amdkfd_load_interface(struct amdgpu_device *adev);
+
+ void amdgpu_amdkfd_suspend(struct amdgpu_device *adev);
+ int amdgpu_amdkfd_resume(struct amdgpu_device *adev);
+@@ -50,17 +139,105 @@ void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev);
+ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev);
+ void amdgpu_amdkfd_device_fini(struct amdgpu_device *adev);
+
++int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, struct mm_struct *mm);
++int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
++ uint32_t vmid, uint64_t gpu_addr,
++ uint32_t *ib_cmd, uint32_t ib_len);
++int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
++ struct dma_fence **ef);
+ struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void);
+ struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void);
++struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void);
++int amdgpu_amdkfd_copy_mem_to_mem(struct kgd_dev *kgd, struct kgd_mem *src_mem,
++ uint64_t src_offset, struct kgd_mem *dst_mem,
++ uint64_t dest_offset, uint64_t size, struct dma_fence **f,
++ uint64_t *actual_size);
++
++bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev,
++ u32 vmid);
+
+ /* Shared API */
++int map_bo(struct amdgpu_device *rdev, uint64_t va, void *vm,
++ struct amdgpu_bo *bo, struct amdgpu_bo_va **bo_va);
+ int alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
+ void **mem_obj, uint64_t *gpu_addr,
+ void **cpu_ptr);
+ void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj);
+-uint64_t get_vmem_size(struct kgd_dev *kgd);
++void get_local_mem_info(struct kgd_dev *kgd,
++ struct kfd_local_mem_info *mem_info);
+ uint64_t get_gpu_clock_counter(struct kgd_dev *kgd);
+
+ uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd);
++void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info);
++int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd,
++ struct kgd_dev **dmabuf_kgd,
++ uint64_t *bo_size, void *metadata_buffer,
++ size_t buffer_size, uint32_t *metadata_size,
++ uint32_t *flags);
++uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd);
++
++#define read_user_wptr(mmptr, wptr, dst) \
++ ({ \
++ bool valid = false; \
++ if ((mmptr) && (wptr)) { \
++ if ((mmptr) == current->mm) { \
++ valid = !get_user((dst), (wptr)); \
++ } else if (current->mm == NULL) { \
++ use_mm(mmptr); \
++ valid = !get_user((dst), (wptr)); \
++ unuse_mm(mmptr); \
++ } \
++ } \
++ valid; \
++ })
++
++/* GPUVM API */
++int amdgpu_amdkfd_gpuvm_sync_memory(
++ struct kgd_dev *kgd, struct kgd_mem *mem, bool intr);
++int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
++ struct kgd_dev *kgd, uint64_t va, uint64_t size,
++ void *vm, struct kgd_mem **mem,
++ uint64_t *offset, uint32_t flags);
++int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
++ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm);
++int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
++ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm);
++int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
++ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm);
+
++int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm,
++ void **process_info,
++ struct dma_fence **ef);
++void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm);
++
++uint32_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm);
++
++int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd,
++ struct kfd_vm_fault_info *info);
++
++int amdgpu_amdkfd_gpuvm_mmap_bo(
++ struct kgd_dev *kgd, struct vm_area_struct *vma);
++
++int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd,
++ struct kgd_mem *mem, void **kptr);
++
++int amdgpu_amdkfd_gpuvm_pin_get_sg_table(struct kgd_dev *kgd,
++ struct kgd_mem *mem, uint64_t offset,
++ uint64_t size, struct sg_table **ret_sg);
++void amdgpu_amdkfd_gpuvm_unpin_put_sg_table(
++ struct kgd_mem *mem, struct sg_table *sg);
++int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd,
++ struct dma_buf *dmabuf,
++ uint64_t va, void *vm,
++ struct kgd_mem **mem, uint64_t *size,
++ uint64_t *mmap_offset);
++int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_dev *kgd, void *vm,
++ struct kgd_mem *mem,
++ struct dma_buf **dmabuf);
++int amdgpu_amdkfd_gpuvm_evict_mem(struct kgd_mem *mem, struct mm_struct *mm);
++int amdgpu_amdkfd_gpuvm_restore_mem(struct kgd_mem *mem, struct mm_struct *mm);
++
++void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
++void amdgpu_amdkfd_unreserve_system_memory_limit(struct amdgpu_bo *bo);
+ #endif /* AMDGPU_AMDKFD_H_INCLUDED */
++
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
+new file mode 100644
+index 0000000..3961937
+--- /dev/null
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
+@@ -0,0 +1,196 @@
++/*
++ * Copyright 2016 Advanced Micro Devices, Inc.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
++ * OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#include <linux/spinlock.h>
++#include <linux/atomic.h>
++#include <linux/stacktrace.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include "amdgpu_amdkfd.h"
++
++const struct dma_fence_ops amd_kfd_fence_ops;
++static atomic_t fence_seq = ATOMIC_INIT(0);
++
++static int amd_kfd_fence_signal(struct dma_fence *f);
++
++/* Eviction Fence
++ * Fence helper functions to deal with KFD memory eviction.
++ * Big Idea - Since KFD submissions are done by user queues, a BO cannot be
++ * evicted unless all the user queues for that process are evicted.
++ *
++ * All the BOs in a process share an eviction fence. When process X wants
++ * to map VRAM memory but TTM can't find enough space, TTM will attempt to
++ * evict BOs from its LRU list. TTM checks if the BO is valuable to evict
++ * by calling ttm_bo_driver->eviction_valuable().
++ *
++ * ttm_bo_driver->eviction_valuable() - will return false if the BO belongs
++ * to process X. Otherwise, it will return true to indicate BO can be
++ * evicted by TTM.
++ *
++ * If ttm_bo_driver->eviction_valuable returns true, then TTM will continue
++ * the eviction process for that BO by calling ttm_bo_evict --> amdgpu_bo_move
++ * --> amdgpu_copy_buffer(). This sets up a job in the GPU scheduler.
++ *
++ * GPU Scheduler (amd_sched_main) - sets up a cb (fence_add_callback) to
++ * notify when the BO is free to move. fence_add_callback --> enable_signaling
++ * --> amdgpu_amdkfd_fence.enable_signaling
++ *
++ * amdgpu_amdkfd_fence.enable_signaling - Start a work item that will quiesce
++ * user queues and signal fence. The work item will also start another delayed
++ * work item to restore BOs
++ */
++
++struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context,
++ void *mm)
++{
++ struct amdgpu_amdkfd_fence *fence = NULL;
++
++ fence = kzalloc(sizeof(*fence), GFP_KERNEL);
++ if (fence == NULL)
++ return NULL;
++
++ /* mm_struct mm is used as void pointer to identify the parent
++ * KFD process. Don't dereference it. The fence and any threads using
++ * mm are guaranteed to be released before process termination.
++ */
++ fence->mm = mm;
++ get_task_comm(fence->timeline_name, current);
++ spin_lock_init(&fence->lock);
++
++ dma_fence_init(&fence->base, &amd_kfd_fence_ops, &fence->lock,
++ context, atomic_inc_return(&fence_seq));
++
++ return fence;
++}
++
++struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f)
++{
++ struct amdgpu_amdkfd_fence *fence;
++
++ if (!f)
++ return NULL;
++
++ fence = container_of(f, struct amdgpu_amdkfd_fence, base);
++ if (fence && f->ops == &amd_kfd_fence_ops)
++ return fence;
++
++ return NULL;
++}
++
++static const char *amd_kfd_fence_get_driver_name(struct dma_fence *f)
++{
++ return "amdgpu_amdkfd_fence";
++}
++
++static const char *amd_kfd_fence_get_timeline_name(struct dma_fence *f)
++{
++ struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
++
++ return fence->timeline_name;
++}
++
++/**
++ * amd_kfd_fence_enable_signaling - This gets called when TTM wants to evict
++ * a KFD BO and schedules a job to move the BO.
++ * If the fence is already signaled, return true.
++ * If the fence is not signaled, schedule an evict-KFD-process work item.
++ */
++static bool amd_kfd_fence_enable_signaling(struct dma_fence *f)
++{
++ struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
++
++ if (!fence)
++ return false;
++
++ if (dma_fence_is_signaled(f))
++ return true;
++
++ if (!kgd2kfd->schedule_evict_and_restore_process(
++ (struct mm_struct *)fence->mm, f))
++ return true;
++
++ return false;
++}
++
++static int amd_kfd_fence_signal(struct dma_fence *f)
++{
++ unsigned long flags;
++ int ret;
++
++ spin_lock_irqsave(f->lock, flags);
++ /* Set enabled bit so the callback will be called */
++ set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &f->flags);
++ ret = dma_fence_signal_locked(f);
++ spin_unlock_irqrestore(f->lock, flags);
++
++ return ret;
++}
++
++/**
++ * amd_kfd_fence_release - callback that fence can be freed
++ *
++ * @fence: fence
++ *
++ * This function is called when the reference count becomes zero.
++ * It just RCU schedules freeing up the fence.
++*/
++static void amd_kfd_fence_release(struct dma_fence *f)
++{
++ struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
++ /* Unconditionally signal the fence. The process is getting
++ * terminated.
++ */
++ if (WARN_ON(!fence))
++ return; /* Not an amdgpu_amdkfd_fence */
++
++ amd_kfd_fence_signal(f);
++ kfree_rcu(f, rcu);
++}
++
++/**
++ * amd_kfd_fence_check_mm - Check whether @mm is the same as that of fence @f.
++ * Returns true if they match, false otherwise.
++ *
++ * @f: [IN] fence
++ * @mm: [IN] mm that needs to be verified
++*/
++bool amd_kfd_fence_check_mm(struct dma_fence *f, void *mm)
++{
++ struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
++
++ if (!fence)
++ return false;
++ else if (fence->mm == mm)
++ return true;
++
++ return false;
++}
++
++const struct dma_fence_ops amd_kfd_fence_ops = {
++ .get_driver_name = amd_kfd_fence_get_driver_name,
++ .get_timeline_name = amd_kfd_fence_get_timeline_name,
++ .enable_signaling = amd_kfd_fence_enable_signaling,
++ .signaled = NULL,
++ .wait = dma_fence_default_wait,
++ .release = amd_kfd_fence_release,
++};
++
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
+old mode 100644
+new mode 100755
+index 5748504..6964ece
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
+@@ -20,6 +20,9 @@
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
++#undef pr_fmt
++#define pr_fmt(fmt) "kfd2kgd: " fmt
++
+ #include <linux/fdtable.h>
+ #include <linux/uaccess.h>
+ #include <linux/firmware.h>
+@@ -39,6 +42,14 @@
+ #include "gmc/gmc_7_1_sh_mask.h"
+ #include "cik_structs.h"
+
++#define AMDKFD_SKIP_UNCOMPILED_CODE 1
++
++enum hqd_dequeue_request_type {
++ NO_ACTION = 0,
++ DRAIN_PIPE,
++ RESET_WAVES
++};
++
+ enum {
+ MAX_TRAPID = 8, /* 3 bits in the bitfield. */
+ MAX_WATCH_ADDRESSES = 4
+@@ -55,8 +66,8 @@ enum {
+ enum {
+ ADDRESS_WATCH_REG_CNTL_ATC_BIT = 0x10000000UL,
+ ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK = 0x00FFFFFF,
+- ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENSION = 0x03000000,
+- /* extend the mask to 26 bits to match the low address field */
++ ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENTION = 0x03000000,
++ /* extend the mask to 26 bits in order to match the low address field */
+ ADDRESS_WATCH_REG_ADDLOW_SHIFT = 6,
+ ADDRESS_WATCH_REG_ADDHIGH_MASK = 0xFFFF
+ };
+@@ -81,30 +92,42 @@ union TCP_WATCH_CNTL_BITS {
+ float f32All;
+ };
+
++static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm,
++ int fd, uint32_t handle, struct kgd_mem **mem);
++
++static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type);
++
+ /*
+ * Register access functions
+ */
+
+ static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
+- uint32_t sh_mem_config, uint32_t sh_mem_ape1_base,
+- uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases);
+-
++ uint32_t sh_mem_config, uint32_t sh_mem_ape1_base,
++ uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases);
+ static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
+- unsigned int vmid);
+-
++ unsigned int vmid);
+ static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id,
+- uint32_t hpd_size, uint64_t hpd_gpu_addr);
++ uint32_t hpd_size, uint64_t hpd_gpu_addr);
+ static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id);
+ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
+- uint32_t queue_id, uint32_t __user *wptr);
+-static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd);
++ uint32_t queue_id, uint32_t __user *wptr,
++ uint32_t wptr_shift, uint32_t wptr_mask,
++ struct mm_struct *mm);
++static int kgd_hqd_dump(struct kgd_dev *kgd,
++ uint32_t pipe_id, uint32_t queue_id,
++ uint32_t (**dump)[2], uint32_t *n_regs);
++static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
++ uint32_t __user *wptr, struct mm_struct *mm);
++static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
++ uint32_t engine_id, uint32_t queue_id,
++ uint32_t (**dump)[2], uint32_t *n_regs);
+ static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
+- uint32_t pipe_id, uint32_t queue_id);
+-
+-static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type,
++ uint32_t pipe_id, uint32_t queue_id);
++static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd);
++static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
++ enum kfd_preempt_type reset_type,
+ unsigned int utimeout, uint32_t pipe_id,
+ uint32_t queue_id);
+-static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd);
+ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
+ unsigned int utimeout);
+ static int kgd_address_watch_disable(struct kgd_dev *kgd);
+@@ -124,21 +147,60 @@ static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, uint8_t vmid);
+ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd,
+ uint8_t vmid);
+ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid);
++static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid);
++static void set_num_of_requests(struct kgd_dev *dev, uint8_t num_of_req);
++static int alloc_memory_of_scratch(struct kgd_dev *kgd,
++ uint64_t va, uint32_t vmid);
++static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable,
++ uint8_t element_size, uint8_t index_stride, uint8_t mtype);
++static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
++ uint32_t page_table_base);
++static uint32_t read_vmid_from_vmfault_reg(struct kgd_dev *kgd);
++
++/* Because of REG_GET_FIELD() being used, we put this function in the
++ * asic specific file.
++ */
++static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd,
++ struct tile_config *config)
++{
++ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+
+-static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type);
++ config->gb_addr_config = adev->gfx.config.gb_addr_config;
++ config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg,
++ MC_ARB_RAMCFG, NOOFBANK);
++ config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg,
++ MC_ARB_RAMCFG, NOOFRANKS);
++
++ config->tile_config_ptr = adev->gfx.config.tile_mode_array;
++ config->num_tile_configs =
++ ARRAY_SIZE(adev->gfx.config.tile_mode_array);
++ config->macro_tile_config_ptr =
++ adev->gfx.config.macrotile_mode_array;
++ config->num_macro_tile_configs =
++ ARRAY_SIZE(adev->gfx.config.macrotile_mode_array);
++
++
++ return 0;
++}
+
+ static const struct kfd2kgd_calls kfd2kgd = {
+ .init_gtt_mem_allocation = alloc_gtt_mem,
+ .free_gtt_mem = free_gtt_mem,
+- .get_vmem_size = get_vmem_size,
++ .get_local_mem_info = get_local_mem_info,
+ .get_gpu_clock_counter = get_gpu_clock_counter,
+ .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz,
++ .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm,
++ .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm,
++ .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir,
++ .open_graphic_handle = open_graphic_handle,
+ .program_sh_mem_settings = kgd_program_sh_mem_settings,
+ .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
+ .init_pipeline = kgd_init_pipeline,
+ .init_interrupts = kgd_init_interrupts,
+ .hqd_load = kgd_hqd_load,
+ .hqd_sdma_load = kgd_hqd_sdma_load,
++ .hqd_dump = kgd_hqd_dump,
++ .hqd_sdma_dump = kgd_hqd_sdma_dump,
+ .hqd_is_occupied = kgd_hqd_is_occupied,
+ .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
+ .hqd_destroy = kgd_hqd_destroy,
+@@ -147,17 +209,50 @@ static const struct kfd2kgd_calls kfd2kgd = {
+ .address_watch_execute = kgd_address_watch_execute,
+ .wave_control_execute = kgd_wave_control_execute,
+ .address_watch_get_offset = kgd_address_watch_get_offset,
+- .get_atc_vmid_pasid_mapping_pasid = get_atc_vmid_pasid_mapping_pasid,
+- .get_atc_vmid_pasid_mapping_valid = get_atc_vmid_pasid_mapping_valid,
++ .get_atc_vmid_pasid_mapping_pasid =
++ get_atc_vmid_pasid_mapping_pasid,
++ .get_atc_vmid_pasid_mapping_valid =
++ get_atc_vmid_pasid_mapping_valid,
++ .read_vmid_from_vmfault_reg = read_vmid_from_vmfault_reg,
+ .write_vmid_invalidate_request = write_vmid_invalidate_request,
+- .get_fw_version = get_fw_version
++ .invalidate_tlbs = invalidate_tlbs,
++ .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory,
++ .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu,
++ .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu,
++ .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu,
++ .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu,
++ .get_fw_version = get_fw_version,
++ .set_num_of_requests = set_num_of_requests,
++ .get_cu_info = get_cu_info,
++ .alloc_memory_of_scratch = alloc_memory_of_scratch,
++ .write_config_static_mem = write_config_static_mem,
++ .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo,
++ .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel,
++ .set_vm_context_page_table_base = set_vm_context_page_table_base,
++ .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table,
++ .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table,
++ .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info,
++ .import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf,
++ .export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf,
++ .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info,
++ .submit_ib = amdgpu_amdkfd_submit_ib,
++ .get_tile_config = amdgpu_amdkfd_get_tile_config,
++ .restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos,
++ .copy_mem_to_mem = amdgpu_amdkfd_copy_mem_to_mem,
++ .get_vram_usage = amdgpu_amdkfd_get_vram_usage
+ };
+
+-struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void)
++struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions()
+ {
+ return (struct kfd2kgd_calls *)&kfd2kgd;
+ }
+
++static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm,
++ int fd, uint32_t handle, struct kgd_mem **mem)
++{
++ return 0;
++}
++
+ static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd)
+ {
+ return (struct amdgpu_device *)kgd;
+@@ -186,7 +281,7 @@ static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id,
+ {
+ struct amdgpu_device *adev = get_amdgpu_device(kgd);
+
+- uint32_t mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
++ uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
+ uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
+
+ lock_srbm(kgd, mec, pipe, queue_id, 0);
+@@ -222,12 +317,12 @@ static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
+
+ /*
+ * We have to assume that there is no outstanding mapping.
+- * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because
+- * a mapping is in progress or because a mapping finished and the
+- * SW cleared it. So the protocol is to always wait & clear.
++ * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because a
++ * mapping is in progress or because a mapping finished and the SW
++ * cleared it. So the protocol is to always wait & clear.
+ */
+- uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid |
+- ATC_VMID0_PASID_MAPPING__VALID_MASK;
++ uint32_t pasid_mapping = (pasid == 0) ? 0 :
++ (uint32_t)pasid | ATC_VMID0_PASID_MAPPING__VALID_MASK;
+
+ WREG32(mmATC_VMID0_PASID_MAPPING + vmid, pasid_mapping);
+
+@@ -273,8 +368,7 @@ static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m)
+
+ retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET +
+ m->sdma_queue_id * KFD_CIK_SDMA_QUEUE_OFFSET;
+-
+- pr_debug("kfd: sdma base address: 0x%x\n", retval);
++ pr_debug("sdma base address: 0x%x\n", retval);
+
+ return retval;
+ }
+@@ -290,26 +384,91 @@ static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd)
+ }
+
+ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
+- uint32_t queue_id, uint32_t __user *wptr)
++ uint32_t queue_id, uint32_t __user *wptr,
++ uint32_t wptr_shift, uint32_t wptr_mask,
++ struct mm_struct *mm)
+ {
+ struct amdgpu_device *adev = get_amdgpu_device(kgd);
+- uint32_t wptr_shadow, is_wptr_shadow_valid;
+ struct cik_mqd *m;
++ uint32_t *mqd_hqd;
++ uint32_t reg, wptr_val, data;
++ bool valid_wptr = false;
+
+ m = get_mqd(mqd);
+
+- is_wptr_shadow_valid = !get_user(wptr_shadow, wptr);
+- if (is_wptr_shadow_valid)
+- m->cp_hqd_pq_wptr = wptr_shadow;
++ acquire_queue(kgd, pipe_id, queue_id);
++
++ /* HQD registers extend from CP_MQD_BASE_ADDR to CP_MQD_CONTROL. */
++ mqd_hqd = &m->cp_mqd_base_addr_lo;
++
++ for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_MQD_CONTROL; reg++)
++ WREG32(reg, mqd_hqd[reg - mmCP_MQD_BASE_ADDR]);
++
++ /* Copy userspace write pointer value to register.
++ * Activate doorbell logic to monitor subsequent changes.
++ */
++ data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control,
++ CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
++ WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, data);
++
++ /* read_user_ptr may take the mm->mmap_sem.
++ * release srbm_mutex to avoid circular dependency between
++ * srbm_mutex->mm_sem->reservation_ww_class_mutex->srbm_mutex.
++ */
++ release_queue(kgd);
++ valid_wptr = read_user_wptr(mm, wptr, wptr_val);
+
+ acquire_queue(kgd, pipe_id, queue_id);
+- gfx_v7_0_mqd_commit(adev, m);
++ if (valid_wptr)
++ WREG32(mmCP_HQD_PQ_WPTR, (wptr_val << wptr_shift) & wptr_mask);
++
++ data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);
++ WREG32(mmCP_HQD_ACTIVE, data);
++
++
+ release_queue(kgd);
+
+ return 0;
+ }
+
+-static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd)
++static int kgd_hqd_dump(struct kgd_dev *kgd,
++ uint32_t pipe_id, uint32_t queue_id,
++ uint32_t (**dump)[2], uint32_t *n_regs)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ uint32_t i = 0, reg;
++#define HQD_N_REGS (35+4)
++#define DUMP_REG(addr) do { \
++ if (WARN_ON_ONCE(i >= HQD_N_REGS)) \
++ break; \
++ (*dump)[i][0] = (addr) << 2; \
++ (*dump)[i++][1] = RREG32(addr); \
++ } while (0)
++
++ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
++ if (*dump == NULL)
++ return -ENOMEM;
++
++ acquire_queue(kgd, pipe_id, queue_id);
++
++ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE0);
++ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE1);
++ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE2);
++ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE3);
++
++ for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_MQD_CONTROL; reg++)
++ DUMP_REG(reg);
++
++ release_queue(kgd);
++
++ WARN_ON_ONCE(i != HQD_N_REGS);
++ *n_regs = i;
++
++ return 0;
++}
++
++static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
++ uint32_t __user *wptr, struct mm_struct *mm)
+ {
+ struct amdgpu_device *adev = get_amdgpu_device(kgd);
+ struct cik_sdma_rlc_registers *m;
+@@ -320,17 +479,17 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd)
+ m = get_sdma_mqd(mqd);
+ sdma_base_addr = get_sdma_base_addr(m);
+
+- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
+- m->sdma_rlc_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
++ m->sdma_rlc_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));
+
+- end_jiffies = msecs_to_jiffies(2000) + jiffies;
+ while (true) {
+- data = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
+- if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
+- break;
+- if (time_after(jiffies, end_jiffies))
+- return -ETIME;
+- usleep_range(500, 1000);
++ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
++ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
++ break;
++ if (timeout == 0)
++ return -ETIME;
++ msleep(10);
++ timeout -= 10;
+ }
+ if (m->sdma_engine_id) {
+ data = RREG32(mmSDMA1_GFX_CONTEXT_CNTL);
+@@ -344,25 +503,59 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd)
+ WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data);
+ }
+
+- WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL,
+- m->sdma_rlc_doorbell);
+- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0);
+- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0);
+- WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR,
+- m->sdma_rlc_virtual_addr);
+- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdma_rlc_rb_base);
++ data = REG_SET_FIELD(m->sdma_rlc_doorbell, SDMA0_RLC0_DOORBELL,
++ ENABLE, 1);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdma_rlc_rb_rptr);
++ if (read_user_wptr(mm, wptr, data))
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, data);
++ else
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
++ m->sdma_rlc_rb_rptr);
++
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR,
++ m->sdma_rlc_virtual_addr);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdma_rlc_rb_base);
++
+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI,
+ m->sdma_rlc_rb_base_hi);
+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO,
+ m->sdma_rlc_rb_rptr_addr_lo);
+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
+ m->sdma_rlc_rb_rptr_addr_hi);
+- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
+- m->sdma_rlc_rb_cntl);
+-
++ data = REG_SET_FIELD(m->sdma_rlc_rb_cntl, SDMA0_RLC0_RB_CNTL,
++ RB_ENABLE, 1);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data);
+ return 0;
+ }
+
++static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
++ uint32_t engine_id, uint32_t queue_id,
++ uint32_t (**dump)[2], uint32_t *n_regs)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ uint32_t sdma_offset = engine_id * SDMA1_REGISTER_OFFSET +
++ queue_id * KFD_CIK_SDMA_QUEUE_OFFSET;
++ uint32_t i = 0, reg;
++#undef HQD_N_REGS
++#define HQD_N_REGS (19+4)
++
++ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
++ if (*dump == NULL)
++ return -ENOMEM;
++
++ for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
++ DUMP_REG(sdma_offset + reg);
++ for (reg = mmSDMA0_RLC0_VIRTUAL_ADDR; reg <= mmSDMA0_RLC0_WATERMARK;
++ reg++)
++ DUMP_REG(sdma_offset + reg);
++
++ WARN_ON_ONCE(i != HQD_N_REGS);
++ *n_regs = i;
++
++ return 0;
++}
++
+ static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
+ uint32_t pipe_id, uint32_t queue_id)
+ {
+@@ -403,30 +596,99 @@ static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd)
+ return false;
+ }
+
+-static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type,
++static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
++ enum kfd_preempt_type reset_type,
+ unsigned int utimeout, uint32_t pipe_id,
+ uint32_t queue_id)
+ {
+ struct amdgpu_device *adev = get_amdgpu_device(kgd);
+ uint32_t temp;
+- int timeout = utimeout;
++ enum hqd_dequeue_request_type type;
++ unsigned long flags, end_jiffies;
++ int retry;
+
+ acquire_queue(kgd, pipe_id, queue_id);
+ WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, 0);
+
+- WREG32(mmCP_HQD_DEQUEUE_REQUEST, reset_type);
++ switch (reset_type) {
++ case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:
++ type = DRAIN_PIPE;
++ break;
++ case KFD_PREEMPT_TYPE_WAVEFRONT_RESET:
++ type = RESET_WAVES;
++ break;
++ default:
++ type = DRAIN_PIPE;
++ break;
++ }
++
++ /* Workaround: If IQ timer is active and the wait time is close to or
++ * equal to 0, dequeueing is not safe. Wait until either the wait time
++ * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is
++ * cleared before continuing. Also, ensure wait times are set to at
++ * least 0x3.
++ */
++ local_irq_save(flags);
++ preempt_disable();
++ retry = 5000; /* wait for 500 usecs at maximum */
++ while (true) {
++ temp = RREG32(mmCP_HQD_IQ_TIMER);
++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) {
++ pr_debug("HW is processing IQ\n");
++ goto loop;
++ }
++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) {
++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE)
++ == 3) /* SEM-rearm is safe */
++ break;
++ /* Wait time 3 is safe for CP, but our MMIO read/write
++ * time is close to 1 microsecond, so check for 10 to
++ * leave more buffer room
++ */
++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME)
++ >= 10)
++ break;
++ pr_debug("IQ timer is active\n");
++ } else
++ break;
++loop:
++ if (!retry) {
++ pr_err("CP HQD IQ timer status time out\n");
++ break;
++ }
++ ndelay(100);
++ --retry;
++ }
++ retry = 1000;
++ while (true) {
++ temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST);
++ if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK))
++ break;
++ pr_debug("Dequeue request is pending\n");
+
++ if (!retry) {
++ pr_err("CP HQD dequeue request time out\n");
++ break;
++ }
++ ndelay(100);
++ --retry;
++ }
++ local_irq_restore(flags);
++ preempt_enable();
++
++ WREG32(mmCP_HQD_DEQUEUE_REQUEST, type);
++
++ end_jiffies = (utimeout * HZ / 1000) + jiffies;
+ while (true) {
+ temp = RREG32(mmCP_HQD_ACTIVE);
+- if (temp & CP_HQD_ACTIVE__ACTIVE_MASK)
++ if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
+ break;
+- if (timeout <= 0) {
+- pr_err("kfd: cp queue preemption time out.\n");
++ if (time_after(jiffies, end_jiffies)) {
++ pr_err("cp queue preemption time out\n");
+ release_queue(kgd);
+ return -ETIME;
+ }
+- msleep(20);
+- timeout -= 20;
++ usleep_range(500, 1000);
+ }
+
+ release_queue(kgd);
+@@ -440,7 +702,7 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
+ struct cik_sdma_rlc_registers *m;
+ uint32_t sdma_base_addr;
+ uint32_t temp;
+- int timeout = utimeout;
++ unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;
+
+ m = get_sdma_mqd(mqd);
+ sdma_base_addr = get_sdma_base_addr(m);
+@@ -451,12 +713,11 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
+
+ while (true) {
+ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
+- if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT)
++ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
+ break;
+- if (timeout <= 0)
++ if (time_after(jiffies, end_jiffies))
+ return -ETIME;
+- msleep(20);
+- timeout -= 20;
++ usleep_range(500, 1000);
+ }
+
+ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0);
+@@ -464,6 +725,8 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
+ RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) |
+ SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK);
+
++ m->sdma_rlc_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR);
++
+ return 0;
+ }
+
+@@ -481,8 +744,9 @@ static int kgd_address_watch_disable(struct kgd_dev *kgd)
+
+ /* Turning off this address until we set all the registers */
+ for (i = 0; i < MAX_WATCH_ADDRESSES; i++)
+- WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX +
+- ADDRESS_WATCH_REG_CNTL], cntl.u32All);
++ WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX
++ + ADDRESS_WATCH_REG_CNTL],
++ cntl.u32All);
+
+ return 0;
+ }
+@@ -500,20 +764,24 @@ static int kgd_address_watch_execute(struct kgd_dev *kgd,
+
+ /* Turning off this watch point until we set all the registers */
+ cntl.bitfields.valid = 0;
+- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX +
+- ADDRESS_WATCH_REG_CNTL], cntl.u32All);
++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX
++ + ADDRESS_WATCH_REG_CNTL],
++ cntl.u32All);
+
+- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX +
+- ADDRESS_WATCH_REG_ADDR_HI], addr_hi);
++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX
++ + ADDRESS_WATCH_REG_ADDR_HI],
++ addr_hi);
+
+- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX +
+- ADDRESS_WATCH_REG_ADDR_LO], addr_lo);
++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX
++ + ADDRESS_WATCH_REG_ADDR_LO],
++ addr_lo);
+
+ /* Enable the watch point */
+ cntl.bitfields.valid = 1;
+
+- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX +
+- ADDRESS_WATCH_REG_CNTL], cntl.u32All);
++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX
++ + ADDRESS_WATCH_REG_CNTL],
++ cntl.u32All);
+
+ return 0;
+ }
+@@ -567,7 +835,7 @@ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd,
+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
+
+ reg = RREG32(mmATC_VMID0_PASID_MAPPING + vmid);
+- return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK;
++ return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK;
+ }
+
+ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid)
+@@ -577,52 +845,90 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid)
+ WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid);
+ }
+
++static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid)
++{
++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
++ int vmid;
++
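++	/* Flush the KFD VMID that is currently mapped to this PASID */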
++ for (vmid = 0; vmid < 16; vmid++) {
++ if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid))
++ continue;
++ if (RREG32(mmATC_VMID0_PASID_MAPPING + vmid) &
++ ATC_VMID0_PASID_MAPPING__VALID_MASK) {
++ if ((RREG32(mmATC_VMID0_PASID_MAPPING + vmid) &
++ ATC_VMID0_PASID_MAPPING__PASID_MASK) == pasid) {
++ WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid);
++ break;
++ }
++ }
++ }
++
++ return 0;
++}
++
++static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable,
++ uint8_t element_size, uint8_t index_stride, uint8_t mtype)
++{
++ uint32_t reg;
++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
++
++ reg = swizzle_enable << SH_STATIC_MEM_CONFIG__SWIZZLE_ENABLE__SHIFT |
++ element_size << SH_STATIC_MEM_CONFIG__ELEMENT_SIZE__SHIFT |
++ index_stride << SH_STATIC_MEM_CONFIG__INDEX_STRIDE__SHIFT |
++ mtype << SH_STATIC_MEM_CONFIG__PRIVATE_MTYPE__SHIFT;
++
++ WREG32(mmSH_STATIC_MEM_CONFIG, reg);
++ return 0;
++}
++static int alloc_memory_of_scratch(struct kgd_dev *kgd,
++ uint64_t va, uint32_t vmid)
++{
++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
++
++ lock_srbm(kgd, 0, 0, 0, vmid);
++ WREG32(mmSH_HIDDEN_PRIVATE_BASE_VMID, va);
++ unlock_srbm(kgd);
++
++ return 0;
++}
++
++
+ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type)
+ {
+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
+ const union amdgpu_firmware_header *hdr;
+
+- BUG_ON(kgd == NULL);
+-
+ switch (type) {
+ case KGD_ENGINE_PFP:
+- hdr = (const union amdgpu_firmware_header *)
+- adev->gfx.pfp_fw->data;
++ hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data;
+ break;
+
+ case KGD_ENGINE_ME:
+- hdr = (const union amdgpu_firmware_header *)
+- adev->gfx.me_fw->data;
++ hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data;
+ break;
+
+ case KGD_ENGINE_CE:
+- hdr = (const union amdgpu_firmware_header *)
+- adev->gfx.ce_fw->data;
++ hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data;
+ break;
+
+ case KGD_ENGINE_MEC1:
+- hdr = (const union amdgpu_firmware_header *)
+- adev->gfx.mec_fw->data;
++ hdr = (const union amdgpu_firmware_header *)adev->gfx.mec_fw->data;
+ break;
+
+ case KGD_ENGINE_MEC2:
+- hdr = (const union amdgpu_firmware_header *)
+- adev->gfx.mec2_fw->data;
++ hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data;
+ break;
+
+ case KGD_ENGINE_RLC:
+- hdr = (const union amdgpu_firmware_header *)
+- adev->gfx.rlc_fw->data;
++ hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data;
+ break;
+
+ case KGD_ENGINE_SDMA1:
+- hdr = (const union amdgpu_firmware_header *)
+- adev->sdma.instance[0].fw->data;
++ hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data;
+ break;
+
+ case KGD_ENGINE_SDMA2:
+- hdr = (const union amdgpu_firmware_header *)
+- adev->sdma.instance[1].fw->data;
++ hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data;
+ break;
+
+ default:
+@@ -636,3 +942,42 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type)
+ return hdr->common.ucode_version;
+ }
+
++static void set_num_of_requests(struct kgd_dev *dev, uint8_t num_of_req)
++{
++ uint32_t value;
++ struct amdgpu_device *adev = get_amdgpu_device(dev);
++
++ value = RREG32(mmATC_ATS_DEBUG);
++ value &= ~ATC_ATS_DEBUG__NUM_REQUESTS_AT_ERR_MASK;
++ value |= (num_of_req << ATC_ATS_DEBUG__NUM_REQUESTS_AT_ERR__SHIFT);
++
++ WREG32(mmATC_ATS_DEBUG, value);
++}
++
++static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
++ uint32_t page_table_base)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ /* TODO: Don't use hardcoded VMIDs */
++ if (vmid < 8 || vmid > 15) {
++ pr_err("trying to set page table base for wrong VMID\n");
++ return;
++ }
++ WREG32(mmVM_CONTEXT8_PAGE_TABLE_BASE_ADDR + vmid - 8, page_table_base);
++}
++
++/**
++ * read_vmid_from_vmfault_reg - read VMID from the VM fault status register
++ *
++ * @kgd: kgd device pointer
++ *
++ * Returns the VMID field of VM_CONTEXT1_PROTECTION_FAULT_STATUS (CIK).
++ */
++static uint32_t read_vmid_from_vmfault_reg(struct kgd_dev *kgd)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++
++ uint32_t status = RREG32(mmVM_CONTEXT1_PROTECTION_FAULT_STATUS);
++
++ return REG_GET_FIELD(status, VM_CONTEXT1_PROTECTION_FAULT_STATUS, VMID);
++}
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
+old mode 100644
+new mode 100755
+index c5044d5..2ff10e9
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
+@@ -20,6 +20,9 @@
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
++#undef pr_fmt
++#define pr_fmt(fmt) "kfd2kgd: " fmt
++
+ #include <linux/module.h>
+ #include <linux/fdtable.h>
+ #include <linux/uaccess.h>
+@@ -28,7 +31,7 @@
+ #include "amdgpu.h"
+ #include "amdgpu_amdkfd.h"
+ #include "amdgpu_ucode.h"
+-#include "gfx_v8_0.h"
++#include "amdgpu_amdkfd_gfx_v8.h"
+ #include "gca/gfx_8_0_sh_mask.h"
+ #include "gca/gfx_8_0_d.h"
+ #include "gca/gfx_8_0_enum.h"
+@@ -39,7 +42,31 @@
+ #include "vi_structs.h"
+ #include "vid.h"
+
+-struct cik_sdma_rlc_registers;
++enum hqd_dequeue_request_type {
++ NO_ACTION = 0,
++ DRAIN_PIPE,
++ RESET_WAVES,
++ SAVE_WAVES
++};
++
++static const uint32_t watchRegs[MAX_WATCH_ADDRESSES * ADDRESS_WATCH_REG_MAX] = {
++ mmTCP_WATCH0_ADDR_H, mmTCP_WATCH0_ADDR_L, mmTCP_WATCH0_CNTL,
++ mmTCP_WATCH1_ADDR_H, mmTCP_WATCH1_ADDR_L, mmTCP_WATCH1_CNTL,
++ mmTCP_WATCH2_ADDR_H, mmTCP_WATCH2_ADDR_L, mmTCP_WATCH2_CNTL,
++ mmTCP_WATCH3_ADDR_H, mmTCP_WATCH3_ADDR_L, mmTCP_WATCH3_CNTL
++};
++
++
++struct vi_sdma_mqd;
++
++static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size,
++ void *vm, struct kgd_mem **mem);
++static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem);
++
++static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm,
++ int fd, uint32_t handle, struct kgd_mem **mem);
++
++static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type);
+
+ /*
+ * Register access functions
+@@ -55,17 +82,26 @@ static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id,
+ uint32_t hpd_size, uint64_t hpd_gpu_addr);
+ static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id);
+ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
+- uint32_t queue_id, uint32_t __user *wptr);
+-static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd);
++ uint32_t queue_id, uint32_t __user *wptr,
++ uint32_t wptr_shift, uint32_t wptr_mask,
++ struct mm_struct *mm);
++static int kgd_hqd_dump(struct kgd_dev *kgd,
++ uint32_t pipe_id, uint32_t queue_id,
++ uint32_t (**dump)[2], uint32_t *n_regs);
++static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
++ uint32_t __user *wptr, struct mm_struct *mm);
++static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
++ uint32_t engine_id, uint32_t queue_id,
++ uint32_t (**dump)[2], uint32_t *n_regs);
+ static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
+ uint32_t pipe_id, uint32_t queue_id);
+ static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd);
+-static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type,
++static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
++ enum kfd_preempt_type reset_type,
+ unsigned int utimeout, uint32_t pipe_id,
+ uint32_t queue_id);
+ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
+ unsigned int utimeout);
+-static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid);
+ static int kgd_address_watch_disable(struct kgd_dev *kgd);
+ static int kgd_address_watch_execute(struct kgd_dev *kgd,
+ unsigned int watch_point_id,
+@@ -84,20 +120,61 @@ static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd,
+ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd,
+ uint8_t vmid);
+ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid);
+-static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type);
++static void set_num_of_requests(struct kgd_dev *kgd,
++ uint8_t num_of_requests);
++static int alloc_memory_of_scratch(struct kgd_dev *kgd,
++ uint64_t va, uint32_t vmid);
++static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable,
++ uint8_t element_size, uint8_t index_stride, uint8_t mtype);
++static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
++ uint32_t page_table_base);
++static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid);
++
++/* Because of REG_GET_FIELD() being used, we put this function in the
++ * asic specific file.
++ */
++static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd,
++ struct tile_config *config)
++{
++ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
++
++ config->gb_addr_config = adev->gfx.config.gb_addr_config;
++ config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg,
++ MC_ARB_RAMCFG, NOOFBANK);
++ config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg,
++ MC_ARB_RAMCFG, NOOFRANKS);
++
++ config->tile_config_ptr = adev->gfx.config.tile_mode_array;
++ config->num_tile_configs =
++ ARRAY_SIZE(adev->gfx.config.tile_mode_array);
++ config->macro_tile_config_ptr =
++ adev->gfx.config.macrotile_mode_array;
++ config->num_macro_tile_configs =
++ ARRAY_SIZE(adev->gfx.config.macrotile_mode_array);
++
++ return 0;
++}
+
+ static const struct kfd2kgd_calls kfd2kgd = {
+ .init_gtt_mem_allocation = alloc_gtt_mem,
+ .free_gtt_mem = free_gtt_mem,
+- .get_vmem_size = get_vmem_size,
++ .get_local_mem_info = get_local_mem_info,
+ .get_gpu_clock_counter = get_gpu_clock_counter,
+ .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz,
++ .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm,
++ .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm,
++ .create_process_gpumem = create_process_gpumem,
++ .destroy_process_gpumem = destroy_process_gpumem,
++ .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir,
++ .open_graphic_handle = open_graphic_handle,
+ .program_sh_mem_settings = kgd_program_sh_mem_settings,
+ .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
+ .init_pipeline = kgd_init_pipeline,
+ .init_interrupts = kgd_init_interrupts,
+ .hqd_load = kgd_hqd_load,
+ .hqd_sdma_load = kgd_hqd_sdma_load,
++ .hqd_dump = kgd_hqd_dump,
++ .hqd_sdma_dump = kgd_hqd_sdma_dump,
+ .hqd_is_occupied = kgd_hqd_is_occupied,
+ .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
+ .hqd_destroy = kgd_hqd_destroy,
+@@ -111,14 +188,56 @@ static const struct kfd2kgd_calls kfd2kgd = {
+ .get_atc_vmid_pasid_mapping_valid =
+ get_atc_vmid_pasid_mapping_valid,
+ .write_vmid_invalidate_request = write_vmid_invalidate_request,
+- .get_fw_version = get_fw_version
++ .invalidate_tlbs = invalidate_tlbs,
++ .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory,
++ .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu,
++ .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu,
++ .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu,
++ .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu,
++ .get_fw_version = get_fw_version,
++ .set_num_of_requests = set_num_of_requests,
++ .get_cu_info = get_cu_info,
++ .alloc_memory_of_scratch = alloc_memory_of_scratch,
++ .write_config_static_mem = write_config_static_mem,
++ .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo,
++ .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel,
++ .set_vm_context_page_table_base = set_vm_context_page_table_base,
++ .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table,
++ .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table,
++ .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info,
++ .import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf,
++ .export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf,
++ .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info,
++ .submit_ib = amdgpu_amdkfd_submit_ib,
++ .get_tile_config = amdgpu_amdkfd_get_tile_config,
++ .restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos,
++ .copy_mem_to_mem = amdgpu_amdkfd_copy_mem_to_mem,
++ .get_vram_usage = amdgpu_amdkfd_get_vram_usage
+ };
+
+-struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void)
++struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions()
+ {
+ return (struct kfd2kgd_calls *)&kfd2kgd;
+ }
+
++static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size,
++ void *vm, struct kgd_mem **mem)
++{
++ return 0;
++}
++
++/* Destroys the GPU allocation and frees the kgd_mem structure */
++static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem)
++{
++
++}
++
++static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm,
++ int fd, uint32_t handle, struct kgd_mem **mem)
++{
++ return 0;
++}
++
+ static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd)
+ {
+ return (struct amdgpu_device *)kgd;
+@@ -147,7 +266,7 @@ static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id,
+ {
+ struct amdgpu_device *adev = get_amdgpu_device(kgd);
+
+- uint32_t mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
++ uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
+ uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
+
+ lock_srbm(kgd, mec, pipe, queue_id, 0);
+@@ -216,21 +335,28 @@ static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id)
+ uint32_t mec;
+ uint32_t pipe;
+
+- mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
++ mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
+ pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
+
+ lock_srbm(kgd, mec, pipe, 0, 0);
+
+- WREG32(mmCPC_INT_CNTL, CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK);
++ WREG32(mmCPC_INT_CNTL, CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK |
++ CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK);
+
+ unlock_srbm(kgd);
+
+ return 0;
+ }
+
+-static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m)
++static inline uint32_t get_sdma_base_addr(struct vi_sdma_mqd *m)
+ {
+- return 0;
++ uint32_t retval;
++
++ retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET +
++ m->sdma_queue_id * KFD_VI_SDMA_QUEUE_OFFSET;
++ pr_debug("sdma base address: 0x%x\n", retval);
++
++ return retval;
+ }
+
+ static inline struct vi_mqd *get_mqd(void *mqd)
+@@ -238,9 +364,9 @@ static inline struct vi_mqd *get_mqd(void *mqd)
+ return (struct vi_mqd *)mqd;
+ }
+
+-static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd)
++static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd)
+ {
+- return (struct cik_sdma_rlc_registers *)mqd;
++ return (struct vi_sdma_mqd *)mqd;
+ }
+
+ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
+@@ -252,16 +378,18 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
+ struct vi_mqd *m;
+ uint32_t *mqd_hqd;
+ uint32_t reg, wptr_val, data;
++ bool valid_wptr = false;
+
+ m = get_mqd(mqd);
+
+ acquire_queue(kgd, pipe_id, queue_id);
+- /*HIQ is set during driver init period with vmid set to 0. For SRIOV
+- * world switching support let the RLC know about the HIQ.
+- *
+- * Workaround: This causes reboots on CZ. Disable this on CZ, which
+- * doesn't support SRIOV anyway.
+- */
++
++ /* HIQ is set during driver init period with vmid set to 0. For SRIOV
++ * world switching support let the RLC know about the HIQ.
++ *
++ * Workaround: This causes reboots on CZ. Disable this on CZ, which
++ * doesn't support SRIOV anyway.
++ */
+ if (m->cp_hqd_vmid == 0 &&
+ adev->asic_type != CHIP_CARRIZO) {
+ uint32_t value, mec, pipe;
+@@ -304,7 +432,14 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
+ CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
+ WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, data);
+
+- if (read_user_wptr(mm, wptr, wptr_val))
++	/* read_user_wptr may take the mm->mmap_sem.
++	 * Release srbm_mutex to avoid a circular dependency between
++	 * srbm_mutex->mmap_sem->reservation_ww_class_mutex->srbm_mutex.
++	 */
++ release_queue(kgd);
++ valid_wptr = read_user_wptr(mm, wptr, wptr_val);
++ acquire_queue(kgd, pipe_id, queue_id);
++ if (valid_wptr)
+ WREG32(mmCP_HQD_PQ_WPTR, (wptr_val << wptr_shift) & wptr_mask);
+
+ data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);
+@@ -315,8 +450,138 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
+ return 0;
+ }
+
+-static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd)
++static int kgd_hqd_dump(struct kgd_dev *kgd,
++ uint32_t pipe_id, uint32_t queue_id,
++ uint32_t (**dump)[2], uint32_t *n_regs)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ uint32_t i = 0, reg;
++#define HQD_N_REGS (54+4)
++#define DUMP_REG(addr) do { \
++ if (WARN_ON_ONCE(i >= HQD_N_REGS)) \
++ break; \
++ (*dump)[i][0] = (addr) << 2; \
++ (*dump)[i++][1] = RREG32(addr); \
++ } while (0)
++
++ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
++ if (*dump == NULL)
++ return -ENOMEM;
++
++ acquire_queue(kgd, pipe_id, queue_id);
++
++ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE0);
++ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE1);
++ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE2);
++ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE3);
++
++ for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_HQD_EOP_DONES; reg++)
++ DUMP_REG(reg);
++
++ release_queue(kgd);
++
++ WARN_ON_ONCE(i != HQD_N_REGS);
++ *n_regs = i;
++
++ return 0;
++}
++
++static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
++ uint32_t __user *wptr, struct mm_struct *mm)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ struct vi_sdma_mqd *m;
++ uint32_t sdma_base_addr;
++ uint32_t temp, timeout = 2000;
++ uint32_t data;
++
++ m = get_sdma_mqd(mqd);
++ sdma_base_addr = get_sdma_base_addr(m);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
++ m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));
++
++ while (true) {
++ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
++ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
++ break;
++ if (timeout == 0)
++ return -ETIME;
++ msleep(10);
++ timeout -= 10;
++ }
++ if (m->sdma_engine_id) {
++ data = RREG32(mmSDMA1_GFX_CONTEXT_CNTL);
++ data = REG_SET_FIELD(data, SDMA1_GFX_CONTEXT_CNTL,
++ RESUME_CTX, 0);
++ WREG32(mmSDMA1_GFX_CONTEXT_CNTL, data);
++ } else {
++ data = RREG32(mmSDMA0_GFX_CONTEXT_CNTL);
++ data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL,
++ RESUME_CTX, 0);
++ WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data);
++ }
++
++ data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL,
++ ENABLE, 1);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr);
++
++ if (read_user_wptr(mm, wptr, data))
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, data);
++ else
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
++ m->sdmax_rlcx_rb_rptr);
++
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR,
++ m->sdmax_rlcx_virtual_addr);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI,
++ m->sdmax_rlcx_rb_base_hi);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO,
++ m->sdmax_rlcx_rb_rptr_addr_lo);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
++ m->sdmax_rlcx_rb_rptr_addr_hi);
++
++ data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL,
++ RB_ENABLE, 1);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data);
++
++ return 0;
++}
++
++static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
++ uint32_t engine_id, uint32_t queue_id,
++ uint32_t (**dump)[2], uint32_t *n_regs)
+ {
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ uint32_t sdma_offset = engine_id * SDMA1_REGISTER_OFFSET +
++ queue_id * KFD_VI_SDMA_QUEUE_OFFSET;
++ uint32_t i = 0, reg;
++#undef HQD_N_REGS
++#define HQD_N_REGS (19+4+2+3+7)
++
++ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
++ if (*dump == NULL)
++ return -ENOMEM;
++
++ for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
++ DUMP_REG(sdma_offset + reg);
++ for (reg = mmSDMA0_RLC0_VIRTUAL_ADDR; reg <= mmSDMA0_RLC0_WATERMARK;
++ reg++)
++ DUMP_REG(sdma_offset + reg);
++ for (reg = mmSDMA0_RLC0_CSA_ADDR_LO; reg <= mmSDMA0_RLC0_CSA_ADDR_HI;
++ reg++)
++ DUMP_REG(sdma_offset + reg);
++ for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; reg <= mmSDMA0_RLC0_DUMMY_REG;
++ reg++)
++ DUMP_REG(sdma_offset + reg);
++ for (reg = mmSDMA0_RLC0_MIDCMD_DATA0; reg <= mmSDMA0_RLC0_MIDCMD_CNTL;
++ reg++)
++ DUMP_REG(sdma_offset + reg);
++
++ WARN_ON_ONCE(i != HQD_N_REGS);
++ *n_regs = i;
++
+ return 0;
+ }
+
+@@ -345,7 +610,7 @@ static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
+ static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd)
+ {
+ struct amdgpu_device *adev = get_amdgpu_device(kgd);
+- struct cik_sdma_rlc_registers *m;
++ struct vi_sdma_mqd *m;
+ uint32_t sdma_base_addr;
+ uint32_t sdma_rlc_rb_cntl;
+
+@@ -360,29 +625,102 @@ static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd)
+ return false;
+ }
+
+-static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type,
++static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
++ enum kfd_preempt_type reset_type,
+ unsigned int utimeout, uint32_t pipe_id,
+ uint32_t queue_id)
+ {
+ struct amdgpu_device *adev = get_amdgpu_device(kgd);
+ uint32_t temp;
+- int timeout = utimeout;
++ enum hqd_dequeue_request_type type;
++ unsigned long flags, end_jiffies;
++ int retry;
++ struct vi_mqd *m = get_mqd(mqd);
+
+ acquire_queue(kgd, pipe_id, queue_id);
+
+- WREG32(mmCP_HQD_DEQUEUE_REQUEST, reset_type);
++ if (m->cp_hqd_vmid == 0)
++ WREG32_FIELD(RLC_CP_SCHEDULERS, scheduler1, 0);
+
++ switch (reset_type) {
++ case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:
++ type = DRAIN_PIPE;
++ break;
++ case KFD_PREEMPT_TYPE_WAVEFRONT_RESET:
++ type = RESET_WAVES;
++ break;
++ default:
++ type = DRAIN_PIPE;
++ break;
++ }
++
++ /* Workaround: If IQ timer is active and the wait time is close to or
++ * equal to 0, dequeueing is not safe. Wait until either the wait time
++ * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is
++ * cleared before continuing. Also, ensure wait times are set to at
++ * least 0x3.
++ */
++ local_irq_save(flags);
++ preempt_disable();
++ retry = 5000; /* wait for 500 usecs at maximum */
++ while (true) {
++ temp = RREG32(mmCP_HQD_IQ_TIMER);
++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) {
++ pr_debug("HW is processing IQ\n");
++ goto loop;
++ }
++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) {
++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE)
++ == 3) /* SEM-rearm is safe */
++ break;
++ /* Wait time 3 is safe for CP, but our MMIO read/write
++ * time is close to 1 microsecond, so check for 10 to
++ * leave more buffer room
++ */
++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME)
++ >= 10)
++ break;
++ pr_debug("IQ timer is active\n");
++ } else
++ break;
++loop:
++ if (!retry) {
++ pr_err("CP HQD IQ timer status time out\n");
++ break;
++ }
++ ndelay(100);
++ --retry;
++ }
++ retry = 1000;
++ while (true) {
++ temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST);
++ if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK))
++ break;
++ pr_debug("Dequeue request is pending\n");
++
++ if (!retry) {
++ pr_err("CP HQD dequeue request time out\n");
++ break;
++ }
++ ndelay(100);
++ --retry;
++ }
++ local_irq_restore(flags);
++ preempt_enable();
++
++ WREG32(mmCP_HQD_DEQUEUE_REQUEST, type);
++
++ end_jiffies = (utimeout * HZ / 1000) + jiffies;
+ while (true) {
+ temp = RREG32(mmCP_HQD_ACTIVE);
+- if (temp & CP_HQD_ACTIVE__ACTIVE_MASK)
++ if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
+ break;
+- if (timeout <= 0) {
+- pr_err("kfd: cp queue preemption time out.\n");
++ if (time_after(jiffies, end_jiffies)) {
++ pr_err("cp queue preemption time out.\n");
+ release_queue(kgd);
+ return -ETIME;
+ }
+- msleep(20);
+- timeout -= 20;
++ usleep_range(500, 1000);
+ }
+
+ release_queue(kgd);
+@@ -393,10 +731,10 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
+ unsigned int utimeout)
+ {
+ struct amdgpu_device *adev = get_amdgpu_device(kgd);
+- struct cik_sdma_rlc_registers *m;
++ struct vi_sdma_mqd *m;
+ uint32_t sdma_base_addr;
+ uint32_t temp;
+- int timeout = utimeout;
++ unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;
+
+ m = get_sdma_mqd(mqd);
+ sdma_base_addr = get_sdma_base_addr(m);
+@@ -407,18 +745,19 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
+
+ while (true) {
+ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
+- if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT)
++ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
+ break;
+- if (timeout <= 0)
++ if (time_after(jiffies, end_jiffies))
+ return -ETIME;
+- msleep(20);
+- timeout -= 20;
++ usleep_range(500, 1000);
+ }
+
+ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0);
+- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0);
+- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0);
+- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, 0);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
++ RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) |
++ SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK);
++
++ m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR);
+
+ return 0;
+ }
+@@ -440,7 +779,7 @@ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd,
+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
+
+ reg = RREG32(mmATC_VMID0_PASID_MAPPING + vmid);
+- return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK;
++ return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK;
+ }
+
+ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid)
+@@ -450,8 +789,83 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid)
+ WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid);
+ }
+
++/*
++ * FIXME: The Polaris test failed with this packet while FIJI works fine.
++ * The CP spec does not officially support invalidation with a specified
++ * PASID in the packet, so disable it for V8.
++ *
++ */
++#ifdef V8_SUPPORT_IT_OFFICIAL
++static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid)
++{
++ signed long r;
++ struct dma_fence *f;
++ struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
++
++ mutex_lock(&adev->gfx.kiq.ring_mutex);
++ amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs package*/
++ amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0));
++ amdgpu_ring_write(ring,
++ PACKET3_INVALIDATE_TLBS_DST_SEL(1) |
++ PACKET3_INVALIDATE_TLBS_PASID(pasid));
++ amdgpu_fence_emit(ring, &f);
++ amdgpu_ring_commit(ring);
++ mutex_unlock(&adev->gfx.kiq.ring_mutex);
++
++ r = dma_fence_wait(f, false);
++ if (r)
++ DRM_ERROR("wait for kiq fence error: %ld.\n", r);
++ dma_fence_put(f);
++
++ return r;
++}
++#endif
++static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid)
++{
++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
++ int vmid;
++
++#ifdef V8_SUPPORT_IT_OFFICIAL
++ struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
++
++ if (ring->ready)
++ return invalidate_tlbs_with_kiq(adev, pasid);
++#endif
++
++ for (vmid = 0; vmid < 16; vmid++) {
++ if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid))
++ continue;
++ if (RREG32(mmATC_VMID0_PASID_MAPPING + vmid) &
++ ATC_VMID0_PASID_MAPPING__VALID_MASK) {
++ if ((RREG32(mmATC_VMID0_PASID_MAPPING + vmid) &
++ ATC_VMID0_PASID_MAPPING__PASID_MASK) == pasid) {
++ WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid);
++ break;
++ }
++ }
++ }
++
++ return 0;
++}
++
+ static int kgd_address_watch_disable(struct kgd_dev *kgd)
+ {
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ union TCP_WATCH_CNTL_BITS cntl;
++ unsigned int i;
++
++ cntl.u32All = 0;
++
++ cntl.bitfields.valid = 0;
++ cntl.bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK;
++ cntl.bitfields.atc = 1;
++
++ /* Turning off this address until we set all the registers */
++ for (i = 0; i < MAX_WATCH_ADDRESSES; i++)
++ WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX
++ + ADDRESS_WATCH_REG_CNTL],
++ cntl.u32All);
++
+ return 0;
+ }
+
+@@ -461,6 +875,32 @@ static int kgd_address_watch_execute(struct kgd_dev *kgd,
+ uint32_t addr_hi,
+ uint32_t addr_lo)
+ {
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ union TCP_WATCH_CNTL_BITS cntl;
++
++ cntl.u32All = cntl_val;
++
++ /* Turning off this watch point until we set all the registers */
++ cntl.bitfields.valid = 0;
++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX
++ + ADDRESS_WATCH_REG_CNTL],
++ cntl.u32All);
++
++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX
++ + ADDRESS_WATCH_REG_ADDR_HI],
++ addr_hi);
++
++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX
++ + ADDRESS_WATCH_REG_ADDR_LO],
++ addr_lo);
++
++ /* Enable the watch point */
++ cntl.bitfields.valid = 1;
++
++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX
++ + ADDRESS_WATCH_REG_CNTL],
++ cntl.u32All);
++
+ return 0;
+ }
+
+@@ -493,6 +933,32 @@ static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd,
+ unsigned int watch_point_id,
+ unsigned int reg_offset)
+ {
++ return watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + reg_offset];
++}
++
++static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable,
++ uint8_t element_size, uint8_t index_stride, uint8_t mtype)
++{
++ uint32_t reg;
++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
++
++ reg = swizzle_enable << SH_STATIC_MEM_CONFIG__SWIZZLE_ENABLE__SHIFT |
++ element_size << SH_STATIC_MEM_CONFIG__ELEMENT_SIZE__SHIFT |
++ index_stride << SH_STATIC_MEM_CONFIG__INDEX_STRIDE__SHIFT |
++ mtype << SH_STATIC_MEM_CONFIG__PRIVATE_MTYPE__SHIFT;
++
++ WREG32(mmSH_STATIC_MEM_CONFIG, reg);
++ return 0;
++}
++static int alloc_memory_of_scratch(struct kgd_dev *kgd,
++ uint64_t va, uint32_t vmid)
++{
++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
++
++ lock_srbm(kgd, 0, 0, 0, vmid);
++ WREG32(mmSH_HIDDEN_PRIVATE_BASE_VMID, va);
++ unlock_srbm(kgd);
++
+ return 0;
+ }
+
+@@ -501,47 +967,45 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type)
+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
+ const union amdgpu_firmware_header *hdr;
+
+- BUG_ON(kgd == NULL);
+-
+ switch (type) {
+ case KGD_ENGINE_PFP:
+ hdr = (const union amdgpu_firmware_header *)
+- adev->gfx.pfp_fw->data;
++ adev->gfx.pfp_fw->data;
+ break;
+
+ case KGD_ENGINE_ME:
+ hdr = (const union amdgpu_firmware_header *)
+- adev->gfx.me_fw->data;
++ adev->gfx.me_fw->data;
+ break;
+
+ case KGD_ENGINE_CE:
+ hdr = (const union amdgpu_firmware_header *)
+- adev->gfx.ce_fw->data;
++ adev->gfx.ce_fw->data;
+ break;
+
+ case KGD_ENGINE_MEC1:
+ hdr = (const union amdgpu_firmware_header *)
+- adev->gfx.mec_fw->data;
++ adev->gfx.mec_fw->data;
+ break;
+
+ case KGD_ENGINE_MEC2:
+ hdr = (const union amdgpu_firmware_header *)
+- adev->gfx.mec2_fw->data;
++ adev->gfx.mec2_fw->data;
+ break;
+
+ case KGD_ENGINE_RLC:
+ hdr = (const union amdgpu_firmware_header *)
+- adev->gfx.rlc_fw->data;
++ adev->gfx.rlc_fw->data;
+ break;
+
+ case KGD_ENGINE_SDMA1:
+ hdr = (const union amdgpu_firmware_header *)
+- adev->sdma.instance[0].fw->data;
++ adev->sdma.instance[0].fw->data;
+ break;
+
+ case KGD_ENGINE_SDMA2:
+ hdr = (const union amdgpu_firmware_header *)
+- adev->sdma.instance[1].fw->data;
++ adev->sdma.instance[1].fw->data;
+ break;
+
+ default:
+@@ -554,3 +1018,21 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type)
+ /* Only 12 bit in use*/
+ return hdr->common.ucode_version;
+ }
++
++static void set_num_of_requests(struct kgd_dev *kgd,
++ uint8_t num_of_requests)
++{
++ pr_debug("This is a stub\n");
++}
++
++static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
++ uint32_t page_table_base)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ /* TODO: Don't use hardcoded VMIDs */
++ if (vmid < 8 || vmid > 15) {
++ pr_err("trying to set page table base for wrong VMID\n");
++ return;
++ }
++ WREG32(mmVM_CONTEXT8_PAGE_TABLE_BASE_ADDR + vmid - 8, page_table_base);
++}
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h
+new file mode 100644
+index 0000000..3c94919
+--- /dev/null
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h
+@@ -0,0 +1,62 @@
++/*
++ * Copyright 2015 Advanced Micro Devices, Inc.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
++ * OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#ifndef AMDGPU_AMDKFD_GFX_V8_H_INCLUDED
++#define AMDGPU_AMDKFD_GFX_V8_H_INCLUDED
++
++#include <linux/types.h>
++
++enum {
++ MAX_TRAPID = 8, /* 3 bits in the bitfield. */
++ MAX_WATCH_ADDRESSES = 4
++};
++
++enum {
++ ADDRESS_WATCH_REG_ADDR_HI = 0,
++ ADDRESS_WATCH_REG_ADDR_LO,
++ ADDRESS_WATCH_REG_CNTL,
++ ADDRESS_WATCH_REG_MAX
++};
++
++/* not defined in the VI reg file */
++enum {
++ ADDRESS_WATCH_REG_CNTL_ATC_BIT = 0x10000000UL,
++ ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK = 0x00FFFFFF,
++ ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENSION = 0x03000000,
++ /* extend the mask to 26 bits in order to match the low address field */
++ ADDRESS_WATCH_REG_ADDLOW_SHIFT = 6,
++ ADDRESS_WATCH_REG_ADDHIGH_MASK = 0xFFFF
++};
++
++union TCP_WATCH_CNTL_BITS {
++ struct {
++ uint32_t mask:24;
++ uint32_t vmid:4;
++ uint32_t atc:1;
++ uint32_t mode:2;
++ uint32_t valid:1;
++ } bitfields, bits;
++ uint32_t u32All;
++ signed int i32All;
++ float f32All;
++};
++#endif
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+new file mode 100644
+index 0000000..edbae19
+--- /dev/null
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+@@ -0,0 +1,1227 @@
++/*
++ * Copyright 2014 Advanced Micro Devices, Inc.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
++ * OTHER DEALINGS IN THE SOFTWARE.
++ */
++#undef pr_fmt
++#define pr_fmt(fmt) "kfd2kgd: " fmt
++
++#include <linux/module.h>
++#include <linux/fdtable.h>
++#include <linux/uaccess.h>
++#include <linux/firmware.h>
++#include <drm/drmP.h>
++#include "amdgpu.h"
++#include "amdgpu_amdkfd.h"
++#include "amdgpu_ucode.h"
++#include "amdgpu_amdkfd_gfx_v8.h"
++#include "vega10/soc15ip.h"
++#include "vega10/GC/gc_9_0_offset.h"
++#include "vega10/GC/gc_9_0_sh_mask.h"
++#include "vega10/vega10_enum.h"
++#include "vega10/SDMA0/sdma0_4_0_offset.h"
++#include "vega10/SDMA0/sdma0_4_0_sh_mask.h"
++#include "vega10/SDMA1/sdma1_4_0_offset.h"
++#include "vega10/SDMA1/sdma1_4_0_sh_mask.h"
++#include "vega10/ATHUB/athub_1_0_offset.h"
++#include "vega10/ATHUB/athub_1_0_sh_mask.h"
++#include "vega10/OSSSYS/osssys_4_0_offset.h"
++#include "vega10/OSSSYS/osssys_4_0_sh_mask.h"
++#include "soc15_common.h"
++#include "v9_structs.h"
++#include "soc15.h"
++#include "soc15d.h"
++
++/* HACK: MMHUB and GC both have VM-related register with the same
++ * names but different offsets. Define the MMHUB register we need here
++ * with a prefix. A proper solution would be to move the functions
++ * programming these registers into gfx_v9_0.c and mmhub_v1_0.c
++ * respectively.
++ */
++#define mmMMHUB_VM_INVALIDATE_ENG16_REQ 0x06f3
++#define mmMMHUB_VM_INVALIDATE_ENG16_REQ_BASE_IDX 0
++
++#define mmMMHUB_VM_INVALIDATE_ENG16_ACK 0x0705
++#define mmMMHUB_VM_INVALIDATE_ENG16_ACK_BASE_IDX 0
++
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32 0x072b
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32_BASE_IDX 0
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32 0x072c
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32_BASE_IDX 0
++
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32 0x074b
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32_BASE_IDX 0
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32 0x074c
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32_BASE_IDX 0
++
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32 0x076b
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32_BASE_IDX 0
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32 0x076c
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32_BASE_IDX 0
++
++#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32 0x0727
++#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32_BASE_IDX 0
++#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32 0x0728
++#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32_BASE_IDX 0
++
++enum hqd_dequeue_request_type {
++ NO_ACTION = 0,
++ DRAIN_PIPE,
++ RESET_WAVES,
++ SAVE_WAVES
++};
++
++static const uint32_t watchRegs[MAX_WATCH_ADDRESSES * ADDRESS_WATCH_REG_MAX] = {
++ mmTCP_WATCH0_ADDR_H, mmTCP_WATCH0_ADDR_L, mmTCP_WATCH0_CNTL,
++ mmTCP_WATCH1_ADDR_H, mmTCP_WATCH1_ADDR_L, mmTCP_WATCH1_CNTL,
++ mmTCP_WATCH2_ADDR_H, mmTCP_WATCH2_ADDR_L, mmTCP_WATCH2_CNTL,
++ mmTCP_WATCH3_ADDR_H, mmTCP_WATCH3_ADDR_L, mmTCP_WATCH3_CNTL
++};
++
++
++static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size,
++ void *vm, struct kgd_mem **mem);
++static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem);
++
++static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm,
++ int fd, uint32_t handle, struct kgd_mem **mem);
++
++static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type);
++
++/*
++ * Register access functions
++ */
++
++static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
++ uint32_t sh_mem_config,
++ uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit,
++ uint32_t sh_mem_bases);
++static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
++ unsigned int vmid);
++static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id,
++ uint32_t hpd_size, uint64_t hpd_gpu_addr);
++static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id);
++static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
++ uint32_t queue_id, uint32_t __user *wptr,
++ uint32_t wptr_shift, uint32_t wptr_mask,
++ struct mm_struct *mm);
++static int kgd_hqd_dump(struct kgd_dev *kgd,
++ uint32_t pipe_id, uint32_t queue_id,
++ uint32_t (**dump)[2], uint32_t *n_regs);
++static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
++ uint32_t __user *wptr, struct mm_struct *mm);
++static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
++ uint32_t engine_id, uint32_t queue_id,
++ uint32_t (**dump)[2], uint32_t *n_regs);
++static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
++ uint32_t pipe_id, uint32_t queue_id);
++static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd);
++static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
++ enum kfd_preempt_type reset_type,
++ unsigned int utimeout, uint32_t pipe_id,
++ uint32_t queue_id);
++static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
++ unsigned int utimeout);
++static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid);
++static uint32_t get_watch_base_addr(void);
++static int kgd_address_watch_disable(struct kgd_dev *kgd);
++static int kgd_address_watch_execute(struct kgd_dev *kgd,
++ unsigned int watch_point_id,
++ uint32_t cntl_val,
++ uint32_t addr_hi,
++ uint32_t addr_lo);
++static int kgd_wave_control_execute(struct kgd_dev *kgd,
++ uint32_t gfx_index_val,
++ uint32_t sq_cmd);
++static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd,
++ unsigned int watch_point_id,
++ unsigned int reg_offset);
++
++static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd,
++ uint8_t vmid);
++static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd,
++ uint8_t vmid);
++static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid);
++static void set_num_of_requests(struct kgd_dev *kgd,
++ uint8_t num_of_requests);
++static int alloc_memory_of_scratch(struct kgd_dev *kgd,
++ uint64_t va, uint32_t vmid);
++static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable,
++ uint8_t element_size, uint8_t index_stride, uint8_t mtype);
++static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
++ uint32_t page_table_base);
++static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid);
++
++/* Because of REG_GET_FIELD() being used, we put this function in the
++ * asic specific file.
++ */
++static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd,
++ struct tile_config *config)
++{
++ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
++
++ config->gb_addr_config = adev->gfx.config.gb_addr_config;
++#if 0
++/* TODO - confirm REG_GET_FIELD x2, should be OK as is... but
++ * MC_ARB_RAMCFG register doesn't exist on Vega10 - initial amdgpu
++ * changes commented out related code, doing the same here for now but
++ * need to sync with Ken et al
++ */
++ config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg,
++ MC_ARB_RAMCFG, NOOFBANK);
++ config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg,
++ MC_ARB_RAMCFG, NOOFRANKS);
++#endif
++
++ config->tile_config_ptr = adev->gfx.config.tile_mode_array;
++ config->num_tile_configs =
++ ARRAY_SIZE(adev->gfx.config.tile_mode_array);
++ config->macro_tile_config_ptr =
++ adev->gfx.config.macrotile_mode_array;
++ config->num_macro_tile_configs =
++ ARRAY_SIZE(adev->gfx.config.macrotile_mode_array);
++
++ return 0;
++}
++
++static const struct kfd2kgd_calls kfd2kgd = {
++ .init_gtt_mem_allocation = alloc_gtt_mem,
++ .free_gtt_mem = free_gtt_mem,
++ .get_local_mem_info = get_local_mem_info,
++ .get_gpu_clock_counter = get_gpu_clock_counter,
++ .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz,
++ .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm,
++ .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm,
++ .create_process_gpumem = create_process_gpumem,
++ .destroy_process_gpumem = destroy_process_gpumem,
++ .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir,
++ .open_graphic_handle = open_graphic_handle,
++ .program_sh_mem_settings = kgd_program_sh_mem_settings,
++ .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
++ .init_pipeline = kgd_init_pipeline,
++ .init_interrupts = kgd_init_interrupts,
++ .hqd_load = kgd_hqd_load,
++ .hqd_sdma_load = kgd_hqd_sdma_load,
++ .hqd_dump = kgd_hqd_dump,
++ .hqd_sdma_dump = kgd_hqd_sdma_dump,
++ .hqd_is_occupied = kgd_hqd_is_occupied,
++ .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
++ .hqd_destroy = kgd_hqd_destroy,
++ .hqd_sdma_destroy = kgd_hqd_sdma_destroy,
++ .address_watch_disable = kgd_address_watch_disable,
++ .address_watch_execute = kgd_address_watch_execute,
++ .wave_control_execute = kgd_wave_control_execute,
++ .address_watch_get_offset = kgd_address_watch_get_offset,
++ .get_atc_vmid_pasid_mapping_pasid =
++ get_atc_vmid_pasid_mapping_pasid,
++ .get_atc_vmid_pasid_mapping_valid =
++ get_atc_vmid_pasid_mapping_valid,
++ .write_vmid_invalidate_request = write_vmid_invalidate_request,
++ .invalidate_tlbs = invalidate_tlbs,
++ .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory,
++ .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu,
++ .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu,
++ .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu,
++ .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu,
++ .get_fw_version = get_fw_version,
++ .set_num_of_requests = set_num_of_requests,
++ .get_cu_info = get_cu_info,
++ .alloc_memory_of_scratch = alloc_memory_of_scratch,
++ .write_config_static_mem = write_config_static_mem,
++ .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo,
++ .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel,
++ .set_vm_context_page_table_base = set_vm_context_page_table_base,
++ .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table,
++ .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table,
++ .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info,
++ .import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf,
++ .export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf,
++ .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info,
++ .submit_ib = amdgpu_amdkfd_submit_ib,
++ .get_tile_config = amdgpu_amdkfd_get_tile_config,
++ .restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos,
++ .copy_mem_to_mem = amdgpu_amdkfd_copy_mem_to_mem,
++ .get_vram_usage = amdgpu_amdkfd_get_vram_usage
++};
++
++struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void)
++{
++ return (struct kfd2kgd_calls *)&kfd2kgd;
++}
++
++static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size,
++ void *vm, struct kgd_mem **mem)
++{
++ return 0;
++}
++
++/* Destroys the GPU allocation and frees the kgd_mem structure */
++static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem)
++{
++
++}
++
++static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm,
++ int fd, uint32_t handle, struct kgd_mem **mem)
++{
++ return 0;
++}
++
++static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd)
++{
++ return (struct amdgpu_device *)kgd;
++}
++
++static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe,
++ uint32_t queue, uint32_t vmid)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++
++ mutex_lock(&adev->srbm_mutex);
++ soc15_grbm_select(adev, mec, pipe, queue, vmid);
++}
++
++static void unlock_srbm(struct kgd_dev *kgd)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++
++ soc15_grbm_select(adev, 0, 0, 0, 0);
++ mutex_unlock(&adev->srbm_mutex);
++}
++
++static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id,
++ uint32_t queue_id)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++
++ uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
++ uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
++
++ lock_srbm(kgd, mec, pipe, queue_id, 0);
++}
++
++static uint32_t get_queue_mask(struct amdgpu_device *adev,
++ uint32_t pipe_id, uint32_t queue_id)
++{
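++	/* Each queue owns one bit in CP_PQ_WPTR_POLL_CNTL1; compute the
++	 * bit that enables WPTR polling for this queue.
++	 */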
++ unsigned int bit = (pipe_id * adev->gfx.mec.num_pipe_per_mec +
++ queue_id) & 31;
++
++ return ((uint32_t)1) << bit;
++}
++
++static void release_queue(struct kgd_dev *kgd)
++{
++ unlock_srbm(kgd);
++}
++
++static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
++ uint32_t sh_mem_config,
++ uint32_t sh_mem_ape1_base,
++ uint32_t sh_mem_ape1_limit,
++ uint32_t sh_mem_bases)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++
++ lock_srbm(kgd, 0, 0, 0, vmid);
++
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config);
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases);
++ /* APE1 no longer exists on GFX9 */
++
++ unlock_srbm(kgd);
++}
++
++static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
++ unsigned int vmid)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++
++ /*
++ * We have to assume that there is no outstanding mapping.
++ * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because
++ * a mapping is in progress or because a mapping finished
++ * and the SW cleared it.
++ * So the protocol is to always wait & clear.
++ */
++ uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid |
++ ATC_VMID0_PASID_MAPPING__VALID_MASK;
++
++ /*
++ * need to do this twice, once for gfx and once for mmhub
++ * for ATC add 16 to VMID for mmhub, for IH different registers.
++ * ATC_VMID0..15 registers are separate from ATC_VMID16..31.
++ */
++
++ WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid,
++ pasid_mapping);
++
++ while (!(RREG32(SOC15_REG_OFFSET(
++ ATHUB, 0,
++ mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) &
++ (1U << vmid)))
++ cpu_relax();
++
++ WREG32(SOC15_REG_OFFSET(ATHUB, 0,
++ mmATC_VMID_PASID_MAPPING_UPDATE_STATUS),
++ 1U << vmid);
++
++ /* Mapping vmid to pasid also for IH block */
++ WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid,
++ pasid_mapping);
++
++ WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid,
++ pasid_mapping);
++
++ while (!(RREG32(SOC15_REG_OFFSET(
++ ATHUB, 0,
++ mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) &
++ (1U << (vmid + 16))))
++ cpu_relax();
++
++ WREG32(SOC15_REG_OFFSET(ATHUB, 0,
++ mmATC_VMID_PASID_MAPPING_UPDATE_STATUS),
++ 1U << (vmid + 16));
++
++ /* Mapping vmid to pasid also for IH block */
++ WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid,
++ pasid_mapping);
++ return 0;
++}
++
++static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id,
++ uint32_t hpd_size, uint64_t hpd_gpu_addr)
++{
++ /* amdgpu owns the per-pipe state */
++ return 0;
++}
++
++/* TODO - RING0 form of field is obsolete, seems to date back to SI
++ * but still works
++ */
++
++static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ uint32_t mec;
++ uint32_t pipe;
++
++ mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
++ pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
++
++ lock_srbm(kgd, mec, pipe, 0, 0);
++
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCPC_INT_CNTL),
++ CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK |
++ CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK);
++
++ unlock_srbm(kgd);
++
++ return 0;
++}
++
++static uint32_t get_sdma_base_addr(unsigned int engine_id,
++ unsigned int queue_id)
++{
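++	/* Register offset of the given SDMA engine's RLC queue block */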
++ static const uint32_t base[2] = {
++ SOC15_REG_OFFSET(SDMA0, 0,
++ mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL,
++ SOC15_REG_OFFSET(SDMA1, 0,
++ mmSDMA1_RLC0_RB_CNTL) - mmSDMA1_RLC0_RB_CNTL
++ };
++ uint32_t retval;
++
++ retval = base[engine_id] + queue_id * (mmSDMA0_RLC1_RB_CNTL -
++ mmSDMA0_RLC0_RB_CNTL);
++
++ pr_debug("sdma base address: 0x%x\n", retval);
++
++ return retval;
++}
++
++static uint32_t get_watch_base_addr(void)
++{
++ uint32_t retval = SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) -
++ mmTCP_WATCH0_ADDR_H;
++
++ pr_debug("kfd: reg watch base address: 0x%x\n", retval);
++
++ return retval;
++}
++
++static inline struct v9_mqd *get_mqd(void *mqd)
++{
++ return (struct v9_mqd *)mqd;
++}
++
++static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd)
++{
++ return (struct v9_sdma_mqd *)mqd;
++}
++
++static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
++ uint32_t queue_id, uint32_t __user *wptr,
++ uint32_t wptr_shift, uint32_t wptr_mask,
++ struct mm_struct *mm)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ struct v9_mqd *m;
++ uint32_t *mqd_hqd;
++ uint32_t reg, hqd_base, data;
++
++ m = get_mqd(mqd);
++
++ acquire_queue(kgd, pipe_id, queue_id);
++
++ /* HIQ is set during driver init period with vmid set to 0*/
++ if (m->cp_hqd_vmid == 0) {
++ uint32_t value, mec, pipe;
++
++ mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
++ pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
++
++ pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
++ mec, pipe, queue_id);
++ value = RREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS));
++ value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1,
++ ((mec << 5) | (pipe << 3) | queue_id | 0x80));
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS), value);
++ }
++
++ /* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */
++ mqd_hqd = &m->cp_mqd_base_addr_lo;
++ hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);
++
++ for (reg = hqd_base;
++ reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
++ WREG32(reg, mqd_hqd[reg - hqd_base]);
++
++
++ /* Activate doorbell logic before triggering WPTR poll. */
++ data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control,
++ CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data);
++
++ if (wptr) {
++ /* Don't read wptr with get_user because the user
++ * context may not be accessible (if this function
++ * runs in a work queue). Instead trigger a one-shot
++ * polling read from memory in the CP. This assumes
++ * that wptr is GPU-accessible in the queue's VMID via
++ * ATC or SVM. WPTR==RPTR before starting the poll so
++ * the CP starts fetching new commands from the right
++ * place.
++ *
++ * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit
++ * tricky. Assume that the queue didn't overflow. The
++ * number of valid bits in the 32-bit RPTR depends on
++ * the queue size. The remaining bits are taken from
++ * the saved 64-bit WPTR. If the WPTR wrapped, add the
++ * queue size.
++ */
++ uint32_t queue_size =
++ 2 << REG_GET_FIELD(m->cp_hqd_pq_control,
++ CP_HQD_PQ_CONTROL, QUEUE_SIZE);
++ uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1);
++
++ if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr)
++ guessed_wptr += queue_size;
++ guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1);
++ guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32;
++
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO),
++ lower_32_bits(guessed_wptr));
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI),
++ upper_32_bits(guessed_wptr));
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR),
++ lower_32_bits((uint64_t)wptr));
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI),
++ upper_32_bits((uint64_t)wptr));
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1),
++ get_queue_mask(adev, pipe_id, queue_id));
++ }
++
++ /* Start the EOP fetcher */
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR),
++ REG_SET_FIELD(m->cp_hqd_eop_rptr,
++ CP_HQD_EOP_RPTR, INIT_FETCHER, 1));
++
++ data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data);
++
++ release_queue(kgd);
++
++ return 0;
++}
++
++static int kgd_hqd_dump(struct kgd_dev *kgd,
++ uint32_t pipe_id, uint32_t queue_id,
++ uint32_t (**dump)[2], uint32_t *n_regs)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ uint32_t i = 0, reg;
++#define HQD_N_REGS 56
++#define DUMP_REG(addr) do { \
++ if (WARN_ON_ONCE(i >= HQD_N_REGS)) \
++ break; \
++ (*dump)[i][0] = (addr) << 2; \
++ (*dump)[i++][1] = RREG32(addr); \
++ } while (0)
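++ /* Each dumped entry is a (byte address, value) pair; the dword
++ * register offset is shifted left by 2 to form the byte address.
++ */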
++
++ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
++ if (*dump == NULL)
++ return -ENOMEM;
++
++ acquire_queue(kgd, pipe_id, queue_id);
++
++ for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);
++ reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
++ DUMP_REG(reg);
++
++ release_queue(kgd);
++
++ WARN_ON_ONCE(i != HQD_N_REGS);
++ *n_regs = i;
++
++ return 0;
++}
++
++static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
++ uint32_t __user *wptr, struct mm_struct *mm)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ struct v9_sdma_mqd *m;
++ uint32_t sdma_base_addr, sdmax_gfx_context_cntl;
++ uint32_t temp, timeout = 2000;
++ uint32_t data;
++ uint64_t data64;
++ uint64_t __user *wptr64 = (uint64_t __user *)wptr;
++
++ m = get_sdma_mqd(mqd);
++ sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id,
++ m->sdma_queue_id);
++ sdmax_gfx_context_cntl = m->sdma_engine_id ?
++ SOC15_REG_OFFSET(SDMA1, 0, mmSDMA1_GFX_CONTEXT_CNTL) :
++ SOC15_REG_OFFSET(SDMA0, 0, mmSDMA0_GFX_CONTEXT_CNTL);
++
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
++ m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));
++
++ while (true) {
++ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
++ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
++ break;
++ if (timeout == 0)
++ return -ETIME;
++ msleep(10);
++ timeout -= 10;
++ }
++ data = RREG32(sdmax_gfx_context_cntl);
++ data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL,
++ RESUME_CTX, 0);
++ WREG32(sdmax_gfx_context_cntl, data);
++
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL_OFFSET,
++ m->sdmax_rlcx_doorbell_offset);
++
++ data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL,
++ ENABLE, 1);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI,
++ m->sdmax_rlcx_rb_rptr_hi);
++
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1);
++ if (read_user_wptr(mm, wptr64, data64)) {
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
++ lower_32_bits(data64));
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI,
++ upper_32_bits(data64));
++ } else {
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
++ m->sdmax_rlcx_rb_rptr);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI,
++ m->sdmax_rlcx_rb_rptr_hi);
++ }
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0);
++
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI,
++ m->sdmax_rlcx_rb_base_hi);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO,
++ m->sdmax_rlcx_rb_rptr_addr_lo);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
++ m->sdmax_rlcx_rb_rptr_addr_hi);
++
++ data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL,
++ RB_ENABLE, 1);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data);
++
++ return 0;
++}
++
++static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
++ uint32_t engine_id, uint32_t queue_id,
++ uint32_t (**dump)[2], uint32_t *n_regs)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ uint32_t sdma_base_addr = get_sdma_base_addr(engine_id, queue_id);
++ uint32_t i = 0, reg;
++#undef HQD_N_REGS
++#define HQD_N_REGS (19+6+7+10)
++
++ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
++ if (*dump == NULL)
++ return -ENOMEM;
++
++ for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
++ DUMP_REG(sdma_base_addr + reg);
++ for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++)
++ DUMP_REG(sdma_base_addr + reg);
++ for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN;
++ reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++)
++ DUMP_REG(sdma_base_addr + reg);
++ for (reg = mmSDMA0_RLC0_MIDCMD_DATA0;
++ reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++)
++ DUMP_REG(sdma_base_addr + reg);
++
++ WARN_ON_ONCE(i != HQD_N_REGS);
++ *n_regs = i;
++
++ return 0;
++}
++
++static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
++ uint32_t pipe_id, uint32_t queue_id)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ uint32_t act;
++ bool retval = false;
++ uint32_t low, high;
++
++ acquire_queue(kgd, pipe_id, queue_id);
++ act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE));
++ if (act) {
++ low = lower_32_bits(queue_address >> 8);
++ high = upper_32_bits(queue_address >> 8);
++
++ if (low == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE)) &&
++ high == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE_HI)))
++ retval = true;
++ }
++ release_queue(kgd);
++ return retval;
++}
++
++static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ struct v9_sdma_mqd *m;
++ uint32_t sdma_base_addr;
++ uint32_t sdma_rlc_rb_cntl;
++
++ m = get_sdma_mqd(mqd);
++ sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id,
++ m->sdma_queue_id);
++
++ sdma_rlc_rb_cntl = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL);
++
++ if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)
++ return true;
++
++ return false;
++}
++
++static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
++ enum kfd_preempt_type reset_type,
++ unsigned int utimeout, uint32_t pipe_id,
++ uint32_t queue_id)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ enum hqd_dequeue_request_type type;
++ unsigned long end_jiffies;
++ uint32_t temp;
++ struct v9_mqd *m = get_mqd(mqd);
++
++#if 0
++ unsigned long flags;
++ int retry;
++#endif
++
++ acquire_queue(kgd, pipe_id, queue_id);
++
++ if (m->cp_hqd_vmid == 0)
++ WREG32_FIELD15(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0);
++
++ switch (reset_type) {
++ case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:
++ type = DRAIN_PIPE;
++ break;
++ case KFD_PREEMPT_TYPE_WAVEFRONT_RESET:
++ type = RESET_WAVES;
++ break;
++ default:
++ type = DRAIN_PIPE;
++ break;
++ }
++
++#if 0 /* Is this still needed? */
++ /* Workaround: if the IQ timer is active and the wait time is close
++ * to or equal to 0, dequeueing is not safe. Wait until either the
++ * wait time is larger or the timer is cleared. Also ensure that
++ * IQ_REQ_PEND is cleared before continuing and that wait times are
++ * set to at least 0x3.
++ */
++ local_irq_save(flags);
++ preempt_disable();
++ retry = 5000; /* wait for 500 usecs at maximum */
++ while (true) {
++ temp = RREG32(mmCP_HQD_IQ_TIMER);
++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) {
++ pr_debug("HW is processing IQ\n");
++ goto loop;
++ }
++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) {
++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE)
++ == 3) /* SEM-rearm is safe */
++ break;
++ /* Wait time 3 is safe for CP, but our MMIO read/write
++ * time is close to 1 microsecond, so check for 10 to
++ * leave more buffer room
++ */
++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME)
++ >= 10)
++ break;
++ pr_debug("IQ timer is active\n");
++ } else
++ break;
++loop:
++ if (!retry) {
++ pr_err("CP HQD IQ timer status time out\n");
++ break;
++ }
++ ndelay(100);
++ --retry;
++ }
++ retry = 1000;
++ while (true) {
++ temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST);
++ if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK))
++ break;
++ pr_debug("Dequeue request is pending\n");
++
++ if (!retry) {
++ pr_err("CP HQD dequeue request time out\n");
++ break;
++ }
++ ndelay(100);
++ --retry;
++ }
++ local_irq_restore(flags);
++ preempt_enable();
++#endif
++
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type);
++
++ end_jiffies = (utimeout * HZ / 1000) + jiffies;
++ while (true) {
++ temp = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE));
++ if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
++ break;
++ if (time_after(jiffies, end_jiffies)) {
++ pr_err("cp queue preemption time out.\n");
++ release_queue(kgd);
++ return -ETIME;
++ }
++ usleep_range(500, 1000);
++ }
++
++ release_queue(kgd);
++ return 0;
++}
++
++static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
++ unsigned int utimeout)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ struct v9_sdma_mqd *m;
++ uint32_t sdma_base_addr;
++ uint32_t temp;
++ unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;
++
++ m = get_sdma_mqd(mqd);
++ sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id,
++ m->sdma_queue_id);
++
++ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL);
++ temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK;
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, temp);
++
++ while (true) {
++ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
++ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
++ break;
++ if (time_after(jiffies, end_jiffies))
++ return -ETIME;
++ usleep_range(500, 1000);
++ }
++
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
++ RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) |
++ SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK);
++
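++ /* Save the current read pointer back into the MQD so that a later
++ * kgd_hqd_sdma_load can resume the queue from where it stopped.
++ */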
++ m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR);
++ m->sdmax_rlcx_rb_rptr_hi =
++ RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI);
++
++ return 0;
++}
++
++static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd,
++ uint8_t vmid)
++{
++ uint32_t reg;
++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
++
++ reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING)
++ + vmid);
++ return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK;
++}
++
++static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd,
++ uint8_t vmid)
++{
++ uint32_t reg;
++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
++
++ reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING)
++ + vmid);
++ return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK;
++}
++
++static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid)
++{
++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
++ uint32_t req = (1 << vmid) |
++ (1 << VM_INVALIDATE_ENG16_REQ__FLUSH_TYPE__SHIFT) | /* light */
++ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PTES_MASK |
++ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE0_MASK |
++ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE1_MASK |
++ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE2_MASK |
++ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L1_PTES_MASK;
++
++ spin_lock(&adev->tlb_invalidation_lock);
++
++ /* Use light weight invalidation.
++ *
++ * TODO 1: agree on the right set of invalidation registers for
++ * KFD use. Use the last one for now. Invalidate both GC and
++ * MMHUB.
++ *
++ * TODO 2: support range-based invalidation; requires a kfd2kgd
++ * interface change.
++ */
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_LO32),
++ 0xffffffff);
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_HI32),
++ 0x0000001f);
++
++ WREG32(SOC15_REG_OFFSET(MMHUB, 0,
++ mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32),
++ 0xffffffff);
++ WREG32(SOC15_REG_OFFSET(MMHUB, 0,
++ mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32),
++ 0x0000001f);
++
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_REQ), req);
++
++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_INVALIDATE_ENG16_REQ),
++ req);
++
++ while (!(RREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ACK)) &
++ (1 << vmid)))
++ cpu_relax();
++
++ while (!(RREG32(SOC15_REG_OFFSET(MMHUB, 0,
++ mmMMHUB_VM_INVALIDATE_ENG16_ACK)) &
++ (1 << vmid)))
++ cpu_relax();
++
++ spin_unlock(&adev->tlb_invalidation_lock);
++
++}
++
++static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid)
++{
++ signed long r;
++ struct dma_fence *f;
++ struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
++
++ mutex_lock(&adev->gfx.kiq.ring_mutex);
++ amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs package*/
++ amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0));
++ amdgpu_ring_write(ring,
++ PACKET3_INVALIDATE_TLBS_DST_SEL(1) |
++ PACKET3_INVALIDATE_TLBS_ALL_HUB(1) |
++ PACKET3_INVALIDATE_TLBS_PASID(pasid) |
++ PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(2));
++ amdgpu_fence_emit(ring, &f);
++ amdgpu_ring_commit(ring);
++ mutex_unlock(&adev->gfx.kiq.ring_mutex);
++
++ r = dma_fence_wait(f, false);
++ if (r)
++ DRM_ERROR("wait for kiq fence error: %ld.\n", r);
++ dma_fence_put(f);
++
++ return r;
++}
++
++static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid)
++{
++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
++ int vmid;
++ struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
++
++ if (ring->ready)
++ return invalidate_tlbs_with_kiq(adev, pasid);
++
++ for (vmid = 0; vmid < 16; vmid++) {
++ if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid))
++ continue;
++ if (get_atc_vmid_pasid_mapping_valid(kgd, vmid)) {
++ if (get_atc_vmid_pasid_mapping_pasid(kgd, vmid)
++ == pasid) {
++ write_vmid_invalidate_request(kgd, vmid);
++ break;
++ }
++ }
++ }
++
++ return 0;
++}
++
++static int kgd_address_watch_disable(struct kgd_dev *kgd)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ union TCP_WATCH_CNTL_BITS cntl;
++ unsigned int i;
++ uint32_t watch_base_addr;
++
++ cntl.u32All = 0;
++
++ cntl.bitfields.valid = 0;
++ cntl.bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK;
++ cntl.bitfields.atc = 1;
++
++ watch_base_addr = get_watch_base_addr();
++ /* Turning off this address until we set all the registers */
++ for (i = 0; i < MAX_WATCH_ADDRESSES; i++)
++ WREG32(watch_base_addr +
++ watchRegs[i * ADDRESS_WATCH_REG_MAX +
++ ADDRESS_WATCH_REG_CNTL],
++ cntl.u32All);
++
++ return 0;
++}
++
++static int kgd_address_watch_execute(struct kgd_dev *kgd,
++ unsigned int watch_point_id,
++ uint32_t cntl_val,
++ uint32_t addr_hi,
++ uint32_t addr_lo)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ union TCP_WATCH_CNTL_BITS cntl;
++ uint32_t watch_base_addr;
++
++ watch_base_addr = get_watch_base_addr();
++ cntl.u32All = cntl_val;
++
++ /* Turning off this watch point until we set all the registers */
++ cntl.bitfields.valid = 0;
++ WREG32(watch_base_addr + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL],
++ cntl.u32All);
++
++ WREG32(watch_base_addr + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_HI],
++ addr_hi);
++
++ WREG32(watch_base_addr + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_LO],
++ addr_lo);
++
++ /* Enable the watch point */
++ cntl.bitfields.valid = 1;
++
++ WREG32(watch_base_addr +
++ watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX +
++ ADDRESS_WATCH_REG_CNTL],
++ cntl.u32All);
++
++ return 0;
++}
++
++static int kgd_wave_control_execute(struct kgd_dev *kgd,
++ uint32_t gfx_index_val,
++ uint32_t sq_cmd)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ uint32_t data = 0;
++
++ mutex_lock(&adev->grbm_idx_mutex);
++
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), gfx_index_val);
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CMD), sq_cmd);
++
++ data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
++ INSTANCE_BROADCAST_WRITES, 1);
++ data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
++ SH_BROADCAST_WRITES, 1);
++ data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
++ SE_BROADCAST_WRITES, 1);
++
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data);
++ mutex_unlock(&adev->grbm_idx_mutex);
++
++ return 0;
++}
++
++static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd,
++ unsigned int watch_point_id,
++ unsigned int reg_offset)
++{
++ return get_watch_base_addr() +
++ watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + reg_offset];
++}
++
++static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable,
++ uint8_t element_size, uint8_t index_stride, uint8_t mtype)
++{
++ /* No longer needed on GFXv9. These values are now hard-coded,
++ * except for the MTYPE which comes from the page table.
++ */
++
++ return 0;
++}
++
++static int alloc_memory_of_scratch(struct kgd_dev *kgd,
++ uint64_t va, uint32_t vmid)
++{
++ /* No longer needed on GFXv9. The scratch base address is
++ * passed to the shader by the CP. It's the user mode driver's
++ * responsibility.
++ */
++
++ return 0;
++}
++
++/* FIXME: Does this need to be ASIC-specific code? */
++static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type)
++{
++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
++ const union amdgpu_firmware_header *hdr;
++
++ switch (type) {
++ case KGD_ENGINE_PFP:
++ hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data;
++ break;
++
++ case KGD_ENGINE_ME:
++ hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data;
++ break;
++
++ case KGD_ENGINE_CE:
++ hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data;
++ break;
++
++ case KGD_ENGINE_MEC1:
++ hdr = (const union amdgpu_firmware_header *)adev->gfx.mec_fw->data;
++ break;
++
++ case KGD_ENGINE_MEC2:
++ hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data;
++ break;
++
++ case KGD_ENGINE_RLC:
++ hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data;
++ break;
++
++ case KGD_ENGINE_SDMA1:
++ hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data;
++ break;
++
++ case KGD_ENGINE_SDMA2:
++ hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data;
++ break;
++
++ default:
++ return 0;
++ }
++
++ if (hdr == NULL)
++ return 0;
++
++ /* Only 12 bits are in use */
++ return hdr->common.ucode_version;
++}
++
++static void set_num_of_requests(struct kgd_dev *kgd,
++ uint8_t num_of_requests)
++{
++ pr_debug("This is a stub\n");
++}
++
++static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
++ uint32_t page_table_base)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ uint64_t base = (uint64_t)page_table_base << PAGE_SHIFT |
++ AMDGPU_PTE_VALID;
++
++ /* TODO: Don't use hardcoded VMIDs */
++ if (vmid < 8 || vmid > 15) {
++ pr_err("trying to set page table base for wrong VMID %u\n",
++ vmid);
++ return;
++ }
++
++ /* TODO: take advantage of per-process address space size. For
++ * now, all processes share the same address space size, like
++ * on GFX8 and older.
++ */
++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0);
++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0);
++
++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2),
++ lower_32_bits(adev->vm_manager.max_pfn - 1));
++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2),
++ upper_32_bits(adev->vm_manager.max_pfn - 1));
++
++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base));
++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base));
++
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0);
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0);
++
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2),
++ lower_32_bits(adev->vm_manager.max_pfn - 1));
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2),
++ upper_32_bits(adev->vm_manager.max_pfn - 1));
++
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base));
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base));
++}
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+new file mode 100644
+index 0000000..7df892d
+--- /dev/null
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+@@ -0,0 +1,2578 @@
++/*
++ * Copyright 2014 Advanced Micro Devices, Inc.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
++ * OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#undef pr_fmt
++#define pr_fmt(fmt) "kfd2kgd: " fmt
++
++#include <linux/module.h>
++#include <linux/fdtable.h>
++#include <linux/uaccess.h>
++#include <linux/firmware.h>
++#include <linux/list.h>
++#include <linux/sched/mm.h>
++#include <drm/drmP.h>
++#include <linux/dma-buf.h>
++#include <linux/pagemap.h>
++#include "amdgpu_amdkfd.h"
++#include "amdgpu_ucode.h"
++#include "gca/gfx_8_0_sh_mask.h"
++#include "gca/gfx_8_0_d.h"
++#include "gca/gfx_8_0_enum.h"
++#include "oss/oss_3_0_sh_mask.h"
++#include "oss/oss_3_0_d.h"
++#include "gmc/gmc_8_1_sh_mask.h"
++#include "gmc/gmc_8_1_d.h"
++
++/* Special VM and GART address alignment needed for VI pre-Fiji due to
++ * a HW bug.
++ */
++#define VI_BO_SIZE_ALIGN (0x8000)
++
++/* BO flag to indicate a KFD userptr BO */
++#define AMDGPU_AMDKFD_USERPTR_BO (1ULL << 63)
++
++/* Impose limit on how much memory KFD can use */
++struct kfd_mem_usage_limit {
++ uint64_t max_system_mem_limit;
++ uint64_t max_userptr_mem_limit;
++ int64_t system_mem_used;
++ int64_t userptr_mem_used;
++ spinlock_t mem_limit_lock;
++};
++
++static struct kfd_mem_usage_limit kfd_mem_limit;
++
++/* Struct used for amdgpu_amdkfd_bo_validate */
++struct amdgpu_vm_parser {
++ uint32_t domain;
++ bool wait;
++};
++
++static const char * const domain_bit_to_string[] = {
++ "CPU",
++ "GTT",
++ "VRAM",
++ "GDS",
++ "GWS",
++ "OA"
++};
++
++#define domain_string(domain) domain_bit_to_string[ffs(domain)-1]
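++/* e.g. AMDGPU_GEM_DOMAIN_VRAM (0x4): ffs(0x4) - 1 = 2 -> "VRAM" */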
++
++static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work);
++
++
++static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd)
++{
++ return (struct amdgpu_device *)kgd;
++}
++
++static bool check_if_add_bo_to_vm(struct amdgpu_vm *avm,
++ struct kgd_mem *mem)
++{
++ struct kfd_bo_va_list *entry;
++
++ list_for_each_entry(entry, &mem->bo_va_list, bo_list)
++ if (entry->bo_va->base.vm == avm)
++ return false;
++
++ return true;
++}
++
++/* Set memory usage limits. Currently, the limits are
++ * System (kernel) memory - 15/16 of system RAM
++ * Userptr memory - 15/16 of system RAM
++ */
++void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
++{
++ struct sysinfo si;
++ uint64_t mem;
++
++ si_meminfo(&si);
++ mem = si.totalram - si.totalhigh;
++ mem *= si.mem_unit;
++
++ spin_lock_init(&kfd_mem_limit.mem_limit_lock);
++ kfd_mem_limit.max_system_mem_limit = mem - (mem >> 4); /* 15/16 */
++ kfd_mem_limit.max_userptr_mem_limit = mem - (mem >> 4); /* 15/16 */
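++ /* For example, with 16 GiB of usable low memory each limit works
++ * out to 16 GiB - (16 GiB >> 4) = 15 GiB.
++ */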
++ pr_debug("Kernel memory limit %lluM, userptr limit %lluM\n",
++ (kfd_mem_limit.max_system_mem_limit >> 20),
++ (kfd_mem_limit.max_userptr_mem_limit >> 20));
++}
++
++static int amdgpu_amdkfd_reserve_system_mem_limit(struct amdgpu_device *adev,
++ uint64_t size, u32 domain)
++{
++ size_t acc_size;
++ int ret = 0;
++
++ acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size,
++ sizeof(struct amdgpu_bo));
++
++ spin_lock(&kfd_mem_limit.mem_limit_lock);
++ if (domain == AMDGPU_GEM_DOMAIN_GTT) {
++ if (kfd_mem_limit.system_mem_used + (acc_size + size) >
++ kfd_mem_limit.max_system_mem_limit) {
++ ret = -ENOMEM;
++ goto err_no_mem;
++ }
++ kfd_mem_limit.system_mem_used += (acc_size + size);
++ } else if (domain == AMDGPU_GEM_DOMAIN_CPU) {
++ if ((kfd_mem_limit.system_mem_used + acc_size >
++ kfd_mem_limit.max_system_mem_limit) ||
++ (kfd_mem_limit.userptr_mem_used + (size + acc_size) >
++ kfd_mem_limit.max_userptr_mem_limit)) {
++ ret = -ENOMEM;
++ goto err_no_mem;
++ }
++ kfd_mem_limit.system_mem_used += acc_size;
++ kfd_mem_limit.userptr_mem_used += size;
++ }
++err_no_mem:
++ spin_unlock(&kfd_mem_limit.mem_limit_lock);
++ return ret;
++}
++
++static void unreserve_system_mem_limit(struct amdgpu_device *adev,
++ uint64_t size, u32 domain)
++{
++ size_t acc_size;
++
++ acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size,
++ sizeof(struct amdgpu_bo));
++
++ spin_lock(&kfd_mem_limit.mem_limit_lock);
++ if (domain == AMDGPU_GEM_DOMAIN_GTT) {
++ kfd_mem_limit.system_mem_used -= (acc_size + size);
++ } else if (domain == AMDGPU_GEM_DOMAIN_CPU) {
++ kfd_mem_limit.system_mem_used -= acc_size;
++ kfd_mem_limit.userptr_mem_used -= size;
++ }
++ WARN_ONCE(kfd_mem_limit.system_mem_used < 0,
++ "kfd system memory accounting unbalanced");
++ WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0,
++ "kfd userptr memory accounting unbalanced");
++
++ spin_unlock(&kfd_mem_limit.mem_limit_lock);
++}
++
++void amdgpu_amdkfd_unreserve_system_memory_limit(struct amdgpu_bo *bo)
++{
++ spin_lock(&kfd_mem_limit.mem_limit_lock);
++
++ if (bo->flags & AMDGPU_AMDKFD_USERPTR_BO) {
++ kfd_mem_limit.system_mem_used -= bo->tbo.acc_size;
++ kfd_mem_limit.userptr_mem_used -= amdgpu_bo_size(bo);
++ } else if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT) {
++ kfd_mem_limit.system_mem_used -=
++ (bo->tbo.acc_size + amdgpu_bo_size(bo));
++ }
++ WARN_ONCE(kfd_mem_limit.system_mem_used < 0,
++ "kfd system memory accounting unbalanced");
++ WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0,
++ "kfd userptr memory accounting unbalanced");
++
++ spin_unlock(&kfd_mem_limit.mem_limit_lock);
++}
++
++
++/* amdgpu_amdkfd_remove_eviction_fence - Removes eviction fence(s) from BO's
++ * reservation object.
++ *
++ * @bo: [IN] Remove eviction fence(s) from this BO
++ * @ef: [IN] If ef is specified, then this eviction fence is removed if it
++ * is present in the shared list.
++ * @ef_list: [OUT] Returns list of eviction fences. These fences are removed
++ * from BO's reservation object shared list.
++ * @ef_count: [OUT] Number of fences in ef_list.
++ *
++ * NOTE: If called with ef_list, then amdgpu_amdkfd_add_eviction_fence must be
++ * called to restore the eviction fences and to avoid a memory leak. This is
++ * useful for shared BOs.
++ * NOTE: Must be called with BO reserved i.e. bo->tbo.resv->lock held.
++ */
++static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo,
++ struct amdgpu_amdkfd_fence *ef,
++ struct amdgpu_amdkfd_fence ***ef_list,
++ unsigned int *ef_count)
++{
++ struct reservation_object_list *fobj;
++ struct reservation_object *resv;
++ unsigned int i = 0, j = 0, k = 0, shared_count;
++ unsigned int count = 0;
++ struct amdgpu_amdkfd_fence **fence_list;
++
++ if (!ef && !ef_list)
++ return -EINVAL;
++
++ if (ef_list) {
++ *ef_list = NULL;
++ *ef_count = 0;
++ }
++
++ resv = bo->tbo.resv;
++ fobj = reservation_object_get_list(resv);
++
++ if (!fobj)
++ return 0;
++
++ preempt_disable();
++ write_seqcount_begin(&resv->seq);
++
++ /* Go through all the shared fences in the reservation object. If
++ * ef is specified and it exists in the list, remove it and reduce the
++ * count. If ef is not specified, then get the count of eviction fences
++ * present.
++ */
++ shared_count = fobj->shared_count;
++ for (i = 0; i < shared_count; ++i) {
++ struct dma_fence *f;
++
++ f = rcu_dereference_protected(fobj->shared[i],
++ reservation_object_held(resv));
++
++ if (ef) {
++ if (f->context == ef->base.context) {
++ dma_fence_put(f);
++ fobj->shared_count--;
++ } else
++ RCU_INIT_POINTER(fobj->shared[j++], f);
++
++ } else if (to_amdgpu_amdkfd_fence(f))
++ count++;
++ }
++ write_seqcount_end(&resv->seq);
++ preempt_enable();
++
++ if (ef || !count)
++ return 0;
++
++ /* Alloc memory for count number of eviction fence pointers. Fill the
++ * ef_list array and ef_count
++ */
++
++ fence_list = kcalloc(count, sizeof(struct amdgpu_amdkfd_fence *),
++ GFP_KERNEL);
++ if (!fence_list)
++ return -ENOMEM;
++
++ preempt_disable();
++ write_seqcount_begin(&resv->seq);
++
++ j = 0;
++ for (i = 0; i < shared_count; ++i) {
++ struct dma_fence *f;
++ struct amdgpu_amdkfd_fence *efence;
++
++ f = rcu_dereference_protected(fobj->shared[i],
++ reservation_object_held(resv));
++
++ efence = to_amdgpu_amdkfd_fence(f);
++ if (efence) {
++ fence_list[k++] = efence;
++ fobj->shared_count--;
++ } else
++ RCU_INIT_POINTER(fobj->shared[j++], f);
++ }
++
++ write_seqcount_end(&resv->seq);
++ preempt_enable();
++
++ *ef_list = fence_list;
++ *ef_count = k;
++
++ return 0;
++}
++
++/* amdgpu_amdkfd_add_eviction_fence - Adds eviction fence(s) back into BO's
++ * reservation object.
++ *
++ * @bo: [IN] Add eviction fences to this BO
++ * @ef_list: [IN] List of eviction fences to be added
++ * @ef_count: [IN] Number of fences in ef_list.
++ *
++ * NOTE: Must call amdgpu_amdkfd_remove_eviction_fence before calling this
++ * function.
++ */
++static void amdgpu_amdkfd_add_eviction_fence(struct amdgpu_bo *bo,
++ struct amdgpu_amdkfd_fence **ef_list,
++ unsigned int ef_count)
++{
++ int i;
++
++ if (!ef_list || !ef_count)
++ return;
++
++ for (i = 0; i < ef_count; i++) {
++ amdgpu_bo_fence(bo, &ef_list[i]->base, true);
++ /* Re-adding the fence takes an additional reference. Drop that
++ * reference.
++ */
++ dma_fence_put(&ef_list[i]->base);
++ }
++
++ kfree(ef_list);
++}
++
++static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain,
++ bool wait)
++{
++ int ret;
++
++ if (WARN(amdgpu_ttm_tt_get_usermm(bo->tbo.ttm),
++ "Called with userptr BO"))
++ return -EINVAL;
++
++ amdgpu_ttm_placement_from_domain(bo, domain);
++
++ ret = ttm_bo_validate(&bo->tbo, &bo->placement, false, false);
++ if (ret)
++ goto validate_fail;
++ if (wait) {
++ struct amdgpu_amdkfd_fence **ef_list;
++ unsigned int ef_count;
++
++ ret = amdgpu_amdkfd_remove_eviction_fence(bo, NULL, &ef_list,
++ &ef_count);
++ if (ret)
++ goto validate_fail;
++
++ ttm_bo_wait(&bo->tbo, false, false);
++ amdgpu_amdkfd_add_eviction_fence(bo, ef_list, ef_count);
++ }
++
++validate_fail:
++ return ret;
++}
++
++static int amdgpu_amdkfd_validate(void *param, struct amdgpu_bo *bo)
++{
++ struct amdgpu_vm_parser *p = param;
++
++ return amdgpu_amdkfd_bo_validate(bo, p->domain, p->wait);
++}
++
++/* vm_validate_pt_pd_bos - Validate page table and directory BOs
++ *
++ * Also updates page directory entries so we don't need to do this
++ * again later until the page directory is validated again (e.g. after
++ * an eviction or allocating new page tables).
++ */
++static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm)
++{
++ struct amdgpu_bo *pd = vm->root.base.bo;
++ struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev);
++ struct amdgpu_vm_parser param;
++ int ret;
++
++ param.domain = AMDGPU_GEM_DOMAIN_VRAM;
++ param.wait = false;
++
++ ret = amdgpu_vm_validate_pt_bos(adev, vm, amdgpu_amdkfd_validate,
++ &param);
++ if (ret) {
++ pr_err("amdgpu: failed to validate PT BOs\n");
++ return ret;
++ }
++
++ ret = amdgpu_amdkfd_validate(&param, pd);
++ if (ret) {
++ pr_err("amdgpu: failed to validate PD\n");
++ return ret;
++ }
++
++ ret = amdgpu_vm_update_directories(adev, vm);
++ if (ret != 0)
++ return ret;
++
++ return 0;
++}
++
++/* add_bo_to_vm - Add a BO to a VM
++ *
++ * Everything that needs to be done only once when a BO is first added
++ * to a VM. It can later be mapped and unmapped many times without
++ * repeating these steps.
++ *
++ * 1. Allocate and initialize BO VA entry data structure
++ * 2. Add BO to the VM
++ * 3. Determine ASIC-specific PTE flags
++ * 4. Alloc page tables and directories if needed
++ * 4a. Validate new page tables and directories and update directories
++ */
++static int add_bo_to_vm(struct amdgpu_device *adev, struct kgd_mem *mem,
++ struct amdgpu_vm *avm, bool is_aql,
++ struct kfd_bo_va_list **p_bo_va_entry)
++{
++ int ret;
++ struct kfd_bo_va_list *bo_va_entry;
++ struct amdkfd_vm *kvm = container_of(avm,
++ struct amdkfd_vm, base);
++ struct amdgpu_bo *pd = avm->root.base.bo;
++ struct amdgpu_bo *bo = mem->bo;
++ uint64_t va = mem->va;
++ struct list_head *list_bo_va = &mem->bo_va_list;
++ unsigned long bo_size = bo->tbo.mem.size;
++
++ if (!va) {
++ pr_err("Invalid VA when adding BO to VM\n");
++ return -EINVAL;
++ }
++
++ if (is_aql)
++ va += bo_size;
++
++ bo_va_entry = kzalloc(sizeof(*bo_va_entry), GFP_KERNEL);
++ if (!bo_va_entry)
++ return -ENOMEM;
++
++ pr_debug("\t add VA 0x%llx - 0x%llx to vm %p\n", va,
++ va + bo_size, avm);
++
++ /* Add BO to VM internal data structures*/
++ bo_va_entry->bo_va = amdgpu_vm_bo_add(adev, avm, bo);
++ if (bo_va_entry->bo_va == NULL) {
++ ret = -EINVAL;
++ pr_err("Failed to add BO object to VM. ret == %d\n",
++ ret);
++ goto err_vmadd;
++ }
++
++ bo_va_entry->va = va;
++ bo_va_entry->pte_flags = amdgpu_vm_get_pte_flags(adev,
++ mem->mapping_flags);
++ bo_va_entry->kgd_dev = (void *)adev;
++ list_add(&bo_va_entry->bo_list, list_bo_va);
++
++ if (p_bo_va_entry)
++ *p_bo_va_entry = bo_va_entry;
++
++ /* Allocate new page tables if needed and validate
++ * them. Clearing and validating the new page tables must wait
++ * on move fences. We don't want that to trigger the eviction
++ * fence, so remove it temporarily.
++ */
++ amdgpu_amdkfd_remove_eviction_fence(pd,
++ kvm->process_info->eviction_fence,
++ NULL, NULL);
++
++ ret = amdgpu_vm_alloc_pts(adev, avm, va, amdgpu_bo_size(bo));
++ if (ret) {
++ pr_err("Failed to allocate pts, err=%d\n", ret);
++ goto err_alloc_pts;
++ }
++
++ ret = vm_validate_pt_pd_bos(avm);
++ if (ret != 0) {
++ pr_err("validate_pt_pd_bos() failed\n");
++ goto err_alloc_pts;
++ }
++
++ /* Add the eviction fence back */
++ amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true);
++
++ return 0;
++
++err_alloc_pts:
++ amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true);
++ amdgpu_vm_bo_rmv(adev, bo_va_entry->bo_va);
++ list_del(&bo_va_entry->bo_list);
++err_vmadd:
++ kfree(bo_va_entry);
++ return ret;
++}
++
++static void remove_bo_from_vm(struct amdgpu_device *adev,
++ struct kfd_bo_va_list *entry, unsigned long size)
++{
++ pr_debug("\t remove VA 0x%llx - 0x%llx in entry %p\n",
++ entry->va,
++ entry->va + size, entry);
++ amdgpu_vm_bo_rmv(adev, entry->bo_va);
++ list_del(&entry->bo_list);
++ kfree(entry);
++}
++
++static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem,
++ struct amdkfd_process_info *process_info,
++ bool userptr)
++{
++ struct ttm_validate_buffer *entry = &mem->validate_list;
++ struct amdgpu_bo *bo = mem->bo;
++
++ INIT_LIST_HEAD(&entry->head);
++ entry->shared = true;
++ entry->bo = &bo->tbo;
++ mutex_lock(&process_info->lock);
++ if (userptr)
++ list_add_tail(&entry->head, &process_info->userptr_valid_list);
++ else
++ list_add_tail(&entry->head, &process_info->kfd_bo_list);
++ mutex_unlock(&process_info->lock);
++}
++
++/* Initializes user pages. It registers the MMU notifier and validates
++ * the userptr BO in the GTT domain.
++ *
++ * The BO must already be on the userptr_valid_list. Otherwise an
++ * eviction and restore may happen that leaves the new BO unmapped
++ * with the user mode queues running.
++ *
++ * Takes the process_info->lock to protect against concurrent restore
++ * workers.
++ *
++ * Returns 0 for success, negative errno for errors.
++ */
++static int init_user_pages(struct kgd_mem *mem, struct mm_struct *mm,
++ uint64_t user_addr)
++{
++ struct amdkfd_process_info *process_info = mem->process_info;
++ struct amdgpu_bo *bo = mem->bo;
++ int ret = 0;
++
++ mutex_lock(&process_info->lock);
++
++ ret = amdgpu_ttm_tt_set_userptr(bo->tbo.ttm, user_addr, 0);
++ if (ret) {
++ pr_err("%s: Failed to set userptr: %d\n", __func__, ret);
++ goto out;
++ }
++
++ ret = amdgpu_mn_register(bo, user_addr);
++ if (ret) {
++ pr_err("%s: Failed to register MMU notifier: %d\n",
++ __func__, ret);
++ goto out;
++ }
++
++ /* If no restore worker is running concurrently, user_pages
++ * should not be allocated
++ */
++ WARN(mem->user_pages, "Leaking user_pages array");
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0)
++ mem->user_pages = drm_calloc_large(bo->tbo.ttm->num_pages,
++ sizeof(struct page *));
++#else
++ mem->user_pages = kvmalloc_array(bo->tbo.ttm->num_pages,
++ sizeof(struct page *),
++ GFP_KERNEL | __GFP_ZERO);
++#endif
++ if (!mem->user_pages) {
++ pr_err("%s: Failed to allocate pages array\n", __func__);
++ ret = -ENOMEM;
++ goto unregister_out;
++ }
++
++ ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, mem->user_pages);
++ if (ret) {
++ pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
++ goto free_out;
++ }
++
++ amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, mem->user_pages);
++
++ ret = amdgpu_bo_reserve(bo, true);
++ if (ret) {
++ pr_err("%s: Failed to reserve BO\n", __func__);
++ goto release_out;
++ }
++ amdgpu_ttm_placement_from_domain(bo, mem->domain);
++ ret = ttm_bo_validate(&bo->tbo, &bo->placement,
++ true, false);
++ if (ret)
++ pr_err("%s: failed to validate BO\n", __func__);
++ amdgpu_bo_unreserve(bo);
++
++release_out:
++ if (ret)
++ release_pages(mem->user_pages, bo->tbo.ttm->num_pages, 0);
++free_out:
++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0)
++ drm_free_large(mem->user_pages);
++#else
++ kvfree(mem->user_pages);
++#endif
++ mem->user_pages = NULL;
++unregister_out:
++ if (ret)
++ amdgpu_mn_unregister(bo);
++out:
++ mutex_unlock(&process_info->lock);
++ return ret;
++}
++
++static int __map_bo_to_kernel(struct amdgpu_bo *bo, u32 domain, void **kptr)
++{
++ int ret;
++
++ ret = amdgpu_bo_reserve(bo, true);
++ if (ret) {
++ pr_err("Failed to reserve bo. ret %d\n", ret);
++ return ret;
++ }
++
++ ret = amdgpu_bo_pin(bo, domain, NULL);
++ if (ret) {
++ pr_err("Failed to pin bo. ret %d\n", ret);
++ goto pin_failed;
++ }
++
++ ret = amdgpu_bo_kmap(bo, kptr);
++ if (ret) {
++ pr_err("Failed to map bo to kernel. ret %d\n", ret);
++ goto kmap_failed;
++ }
++
++ amdgpu_bo_unreserve(bo);
++
++ return ret;
++
++kmap_failed:
++ amdgpu_bo_unpin(bo);
++pin_failed:
++ amdgpu_bo_unreserve(bo);
++
++ return ret;
++}
++
++static int __alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va,
++ uint64_t size, void *vm, struct kgd_mem **mem,
++ uint64_t *offset, u32 domain, u64 flags,
++ struct sg_table *sg, bool aql_queue,
++ bool readonly, bool execute, bool coherent, bool no_sub,
++ bool userptr)
++{
++ struct amdgpu_device *adev;
++ int ret;
++ struct amdgpu_bo *bo;
++ uint64_t user_addr = 0;
++ int byte_align;
++ u32 alloc_domain;
++ uint32_t mapping_flags;
++ struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm;
++
++ if (aql_queue)
++ size = size >> 1;
++ if (userptr) {
++ if (!offset || !*offset)
++ return -EINVAL;
++ user_addr = *offset;
++ }
++
++ adev = get_amdgpu_device(kgd);
++ byte_align = (adev->family == AMDGPU_FAMILY_VI &&
++ adev->asic_type != CHIP_FIJI &&
++ adev->asic_type != CHIP_POLARIS10 &&
++ adev->asic_type != CHIP_POLARIS11) ?
++ VI_BO_SIZE_ALIGN : 1;
++
++ *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL);
++ if (*mem == NULL) {
++ ret = -ENOMEM;
++ goto err;
++ }
++ INIT_LIST_HEAD(&(*mem)->bo_va_list);
++ mutex_init(&(*mem)->lock);
++ (*mem)->coherent = coherent;
++ (*mem)->no_substitute = no_sub;
++ (*mem)->aql_queue = aql_queue;
++
++ mapping_flags = AMDGPU_VM_PAGE_READABLE;
++ if (!readonly)
++ mapping_flags |= AMDGPU_VM_PAGE_WRITEABLE;
++ if (execute)
++ mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE;
++ if (coherent)
++ mapping_flags |= AMDGPU_VM_MTYPE_UC;
++ else
++ mapping_flags |= AMDGPU_VM_MTYPE_NC;
++
++ (*mem)->mapping_flags = mapping_flags;
++
++ alloc_domain = userptr ? AMDGPU_GEM_DOMAIN_CPU : domain;
++
++ amdgpu_sync_create(&(*mem)->sync);
++
++ ret = amdgpu_amdkfd_reserve_system_mem_limit(adev, size, alloc_domain);
++ if (ret) {
++ pr_err("Insufficient system memory\n");
++ goto err_bo_create;
++ }
++
++ pr_debug("\t create BO VA 0x%llx size 0x%llx domain %s\n",
++ va, size, domain_string(alloc_domain));
++
++ /* Allocate buffer object. Userptr objects need to start out
++ * in the CPU domain, get moved to GTT when pinned.
++ */
++ ret = amdgpu_bo_create(adev, size, byte_align, false,
++ alloc_domain,
++ flags, sg, NULL, 0, &bo);
++ if (ret != 0) {
++ pr_err("Failed to create BO on domain %s. ret %d\n",
++ domain_string(alloc_domain), ret);
++ unreserve_system_mem_limit(adev, size, alloc_domain);
++ goto err_bo_create;
++ }
++ bo->kfd_bo = *mem;
++ (*mem)->bo = bo;
++ if (userptr)
++ bo->flags |= AMDGPU_AMDKFD_USERPTR_BO;
++
++ (*mem)->va = va;
++ (*mem)->domain = domain;
++ (*mem)->mapped_to_gpu_memory = 0;
++ (*mem)->process_info = kfd_vm->process_info;
++ add_kgd_mem_to_kfd_bo_list(*mem, kfd_vm->process_info, userptr);
++
++ if (userptr) {
++ ret = init_user_pages(*mem, current->mm, user_addr);
++ if (ret) {
++ mutex_lock(&kfd_vm->process_info->lock);
++ list_del(&(*mem)->validate_list.head);
++ mutex_unlock(&kfd_vm->process_info->lock);
++ goto allocate_init_user_pages_failed;
++ }
++ }
++
++ if (offset)
++ *offset = amdgpu_bo_mmap_offset(bo);
++
++ return 0;
++
++allocate_init_user_pages_failed:
++ amdgpu_bo_unref(&bo);
++err_bo_create:
++ kfree(*mem);
++err:
++ return ret;
++}
++
++/* Reserving a BO and its page table BOs must happen atomically to
++ * avoid deadlocks. When updating userptrs we need to temporarily
++ * back-off the reservation and then reacquire it. Track all the
++ * reservation info in a context structure. Buffers can be mapped to
++ * multiple VMs simultaneously (buffers being restored on multiple
++ * GPUs).
++ */
++struct bo_vm_reservation_context {
++ struct amdgpu_bo_list_entry kfd_bo;
++ unsigned int n_vms;
++ struct amdgpu_bo_list_entry *vm_pd;
++ struct ww_acquire_ctx ticket;
++ struct list_head list, duplicates;
++ struct amdgpu_sync *sync;
++ bool reserved;
++};
++
++/**
++ * reserve_bo_and_vm - reserve a BO and a VM unconditionally.
++ * @mem: KFD BO structure.
++ * @vm: the VM to reserve.
++ * @ctx: the struct that will be used in unreserve_bo_and_vms().
++ */
++static int reserve_bo_and_vm(struct kgd_mem *mem,
++ struct amdgpu_vm *vm,
++ struct bo_vm_reservation_context *ctx)
++{
++ struct amdgpu_bo *bo = mem->bo;
++ int ret;
++
++ WARN_ON(!vm);
++
++ ctx->reserved = false;
++ ctx->n_vms = 1;
++ ctx->sync = &mem->sync;
++
++ INIT_LIST_HEAD(&ctx->list);
++ INIT_LIST_HEAD(&ctx->duplicates);
++
++ ctx->vm_pd = kzalloc(sizeof(struct amdgpu_bo_list_entry)
++ * ctx->n_vms, GFP_KERNEL);
++ if (ctx->vm_pd == NULL)
++ return -ENOMEM;
++
++ ctx->kfd_bo.robj = bo;
++ ctx->kfd_bo.priority = 0;
++ ctx->kfd_bo.tv.bo = &bo->tbo;
++ ctx->kfd_bo.tv.shared = true;
++ ctx->kfd_bo.user_pages = NULL;
++ list_add(&ctx->kfd_bo.tv.head, &ctx->list);
++
++ amdgpu_vm_get_pd_bo(vm, &ctx->list, &ctx->vm_pd[0]);
++
++ ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list,
++ false, &ctx->duplicates);
++ if (!ret)
++ ctx->reserved = true;
++ else
++ pr_err("Failed to reserve buffers in ttm\n");
++
++ if (ret) {
++ kfree(ctx->vm_pd);
++ ctx->vm_pd = NULL;
++ }
++
++ return ret;
++}
++
++enum VA_TYPE {
++ VA_NOT_MAPPED = 0,
++ VA_MAPPED,
++ VA_DO_NOT_CARE,
++};
++
++/**
++ * reserve_bo_and_cond_vms - reserve a BO and some VMs that the BO has been
++ * added to, conditionally based on map_type.
++ * @mem: KFD BO structure.
++ * @vm: the VM to reserve. If NULL, then all VMs associated with the BO
++ * are used. Otherwise, only the single VM associated with the BO is used.
++ * @map_type: the mapping status that will be used to filter the VMs.
++ * @ctx: the struct that will be used in unreserve_bo_and_vms().
++ */
++static int reserve_bo_and_cond_vms(struct kgd_mem *mem,
++ struct amdgpu_vm *vm, enum VA_TYPE map_type,
++ struct bo_vm_reservation_context *ctx)
++{
++ struct amdgpu_bo *bo = mem->bo;
++ struct kfd_bo_va_list *entry;
++ unsigned int i;
++ int ret;
++
++ ctx->reserved = false;
++ ctx->n_vms = 0;
++ ctx->vm_pd = NULL;
++ ctx->sync = &mem->sync;
++
++ INIT_LIST_HEAD(&ctx->list);
++ INIT_LIST_HEAD(&ctx->duplicates);
++
++ list_for_each_entry(entry, &mem->bo_va_list, bo_list) {
++ if ((vm && vm != entry->bo_va->base.vm) ||
++ (entry->is_mapped != map_type
++ && map_type != VA_DO_NOT_CARE))
++ continue;
++
++ ctx->n_vms++;
++ }
++
++ if (ctx->n_vms != 0) {
++ ctx->vm_pd = kzalloc(sizeof(struct amdgpu_bo_list_entry)
++ * ctx->n_vms, GFP_KERNEL);
++ if (ctx->vm_pd == NULL)
++ return -ENOMEM;
++ }
++
++ ctx->kfd_bo.robj = bo;
++ ctx->kfd_bo.priority = 0;
++ ctx->kfd_bo.tv.bo = &bo->tbo;
++ ctx->kfd_bo.tv.shared = true;
++ ctx->kfd_bo.user_pages = NULL;
++ list_add(&ctx->kfd_bo.tv.head, &ctx->list);
++
++ i = 0;
++ list_for_each_entry(entry, &mem->bo_va_list, bo_list) {
++ if ((vm && vm != entry->bo_va->base.vm) ||
++ (entry->is_mapped != map_type
++ && map_type != VA_DO_NOT_CARE))
++ continue;
++
++ amdgpu_vm_get_pd_bo(entry->bo_va->base.vm, &ctx->list,
++ &ctx->vm_pd[i]);
++ i++;
++ }
++
++ ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list,
++ false, &ctx->duplicates);
++ if (!ret)
++ ctx->reserved = true;
++ else
++ pr_err("Failed to reserve buffers in ttm.\n");
++
++ if (ret) {
++ kfree(ctx->vm_pd);
++ ctx->vm_pd = NULL;
++ }
++
++ return ret;
++}
++
++static int unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx,
++ bool wait, bool intr)
++{
++ int ret = 0;
++
++ if (wait)
++ ret = amdgpu_sync_wait(ctx->sync, intr);
++
++ if (ctx->reserved)
++ ttm_eu_backoff_reservation(&ctx->ticket, &ctx->list);
++ kfree(ctx->vm_pd);
++
++ ctx->sync = NULL;
++
++ ctx->reserved = false;
++ ctx->vm_pd = NULL;
++
++ return ret;
++}
++
++static int unmap_bo_from_gpuvm(struct amdgpu_device *adev,
++ struct kfd_bo_va_list *entry,
++ struct amdgpu_sync *sync)
++{
++ struct amdgpu_bo_va *bo_va = entry->bo_va;
++ struct amdgpu_vm *vm = bo_va->base.vm;
++ struct amdkfd_vm *kvm = container_of(vm, struct amdkfd_vm, base);
++ struct amdgpu_bo *pd = vm->root.base.bo;
++
++ /* Remove the eviction fence from the PD (and thereby from the PTs
++ * too, as they share the resv. object). Otherwise, during the PT
++ * update job (see amdgpu_vm_bo_update_mapping) the eviction fence
++ * would get added to the job->sync object.
++ */
++ amdgpu_amdkfd_remove_eviction_fence(pd,
++ kvm->process_info->eviction_fence,
++ NULL, NULL);
++ amdgpu_vm_bo_unmap(adev, bo_va, entry->va);
++
++ amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update);
++
++ /* Add the eviction fence back */
++ amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true);
++
++ amdgpu_sync_fence(adev, sync, bo_va->last_pt_update);
++
++ /* Sync objects can't handle multiple GPUs (contexts) updating
++ * sync->last_vm_update. Fortunately we don't need it for
++ * KFD's purposes, so we can just drop that fence.
++ */
++ if (sync->last_vm_update) {
++ dma_fence_put(sync->last_vm_update);
++ sync->last_vm_update = NULL;
++ }
++
++ return 0;
++}
++
++static int update_gpuvm_pte(struct amdgpu_device *adev,
++ struct kfd_bo_va_list *entry,
++ struct amdgpu_sync *sync)
++{
++ int ret;
++ struct amdgpu_vm *vm;
++ struct amdgpu_bo_va *bo_va;
++ struct amdgpu_bo *bo;
++
++ bo_va = entry->bo_va;
++ vm = bo_va->base.vm;
++ bo = bo_va->base.bo;
++
++ /* Update the page tables */
++ ret = amdgpu_vm_bo_update(adev, bo_va, false);
++ if (ret != 0) {
++ pr_err("amdgpu_vm_bo_update failed\n");
++ return ret;
++ }
++
++ amdgpu_sync_fence(adev, sync, bo_va->last_pt_update);
++
++ /* Sync objects can't handle multiple GPUs (contexts) updating
++ * sync->last_vm_update. Fortunately we don't need it for
++ * KFD's purposes, so we can just drop that fence.
++ */
++ if (sync->last_vm_update) {
++ dma_fence_put(sync->last_vm_update);
++ sync->last_vm_update = NULL;
++ }
++
++ return 0;
++}
++
++static int map_bo_to_gpuvm(struct amdgpu_device *adev,
++ struct kfd_bo_va_list *entry, struct amdgpu_sync *sync,
++ bool no_update_pte)
++{
++ int ret;
++
++ /* Set virtual address for the allocation */
++ ret = amdgpu_vm_bo_map(adev, entry->bo_va, entry->va, 0,
++ amdgpu_bo_size(entry->bo_va->base.bo), entry->pte_flags);
++ if (ret != 0) {
++ pr_err("Failed to map VA 0x%llx in vm. ret %d\n",
++ entry->va, ret);
++ return ret;
++ }
++
++ if (no_update_pte)
++ return 0;
++
++ ret = update_gpuvm_pte(adev, entry, sync);
++ if (ret != 0) {
++ pr_err("update_gpuvm_pte() failed\n");
++ goto update_gpuvm_pte_failed;
++ }
++
++ return 0;
++
++update_gpuvm_pte_failed:
++ unmap_bo_from_gpuvm(adev, entry, sync);
++ return ret;
++}
++
++static struct sg_table *create_doorbell_sg(uint64_t addr, uint32_t size)
++{
++ struct sg_table *sg = kmalloc(sizeof(*sg), GFP_KERNEL);
++
++ if (!sg)
++ return NULL;
++ if (sg_alloc_table(sg, 1, GFP_KERNEL)) {
++ kfree(sg);
++ return NULL;
++ }
++ sg->sgl->dma_address = addr;
++ sg->sgl->length = size;
++#ifdef CONFIG_NEED_SG_DMA_LENGTH
++ sg->sgl->dma_length = size;
++#endif
++ return sg;
++}
++
++int amdgpu_amdkfd_gpuvm_sync_memory(
++ struct kgd_dev *kgd, struct kgd_mem *mem, bool intr)
++{
++ int ret = 0;
++ struct amdgpu_sync sync;
++ struct amdgpu_device *adev;
++
++ adev = get_amdgpu_device(kgd);
++ amdgpu_sync_create(&sync);
++
++ mutex_lock(&mem->lock);
++ amdgpu_sync_clone(adev, &mem->sync, &sync);
++ mutex_unlock(&mem->lock);
++
++ ret = amdgpu_sync_wait(&sync, intr);
++ amdgpu_sync_free(&sync);
++ return ret;
++}
++
++#define BOOL_TO_STR(b) ((b) ? "true" : "false")
++
++int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
++ struct kgd_dev *kgd, uint64_t va, uint64_t size,
++ void *vm, struct kgd_mem **mem,
++ uint64_t *offset, uint32_t flags)
++{
++ bool aql_queue, public, readonly, execute, coherent, no_sub, userptr;
++ u64 alloc_flag;
++ uint32_t domain;
++ uint64_t *temp_offset;
++ struct sg_table *sg = NULL;
++
++ if (!(flags & ALLOC_MEM_FLAGS_NONPAGED)) {
++ pr_err("current hw doesn't support paged memory\n");
++ return -EINVAL;
++ }
++
++ domain = 0;
++ alloc_flag = 0;
++ temp_offset = NULL;
++
++ aql_queue = (flags & ALLOC_MEM_FLAGS_AQL_QUEUE_MEM) ? true : false;
++ public = (flags & ALLOC_MEM_FLAGS_PUBLIC) ? true : false;
++ readonly = (flags & ALLOC_MEM_FLAGS_READONLY) ? true : false;
++ execute = (flags & ALLOC_MEM_FLAGS_EXECUTE_ACCESS) ? true : false;
++ coherent = (flags & ALLOC_MEM_FLAGS_COHERENT) ? true : false;
++ no_sub = (flags & ALLOC_MEM_FLAGS_NO_SUBSTITUTE) ? true : false;
++ userptr = (flags & ALLOC_MEM_FLAGS_USERPTR) ? true : false;
++
++ /*
++ * Check which domain to allocate the BO in
++ */
++ if (flags & ALLOC_MEM_FLAGS_VRAM) {
++ domain = AMDGPU_GEM_DOMAIN_VRAM;
++ alloc_flag = AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
++ if (public) {
++ alloc_flag = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
++ temp_offset = offset;
++ }
++ alloc_flag |= AMDGPU_GEM_CREATE_VRAM_CLEARED;
++ } else if (flags & (ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_USERPTR)) {
++ domain = AMDGPU_GEM_DOMAIN_GTT;
++ alloc_flag = 0;
++ temp_offset = offset;
++ } else if (flags & ALLOC_MEM_FLAGS_DOORBELL) {
++ domain = AMDGPU_GEM_DOMAIN_GTT;
++ alloc_flag = 0;
++ temp_offset = offset;
++ if (size > UINT_MAX)
++ return -EINVAL;
++ sg = create_doorbell_sg(*offset, size);
++ if (!sg)
++ return -ENOMEM;
++ }
++
++ if (offset && !userptr)
++ *offset = 0;
++
++ pr_debug("Allocate VA 0x%llx - 0x%llx domain %s aql %s\n",
++ va, va + size, domain_string(domain),
++ BOOL_TO_STR(aql_queue));
++
++ pr_debug("\t alloc_flag 0x%llx public %s readonly %s execute %s coherent %s no_sub %s\n",
++ alloc_flag, BOOL_TO_STR(public),
++ BOOL_TO_STR(readonly), BOOL_TO_STR(execute),
++ BOOL_TO_STR(coherent), BOOL_TO_STR(no_sub));
++
++ return __alloc_memory_of_gpu(kgd, va, size, vm, mem,
++ temp_offset, domain,
++ alloc_flag, sg,
++ aql_queue, readonly, execute,
++ coherent, no_sub, userptr);
++}
++
++int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
++ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm)
++{
++ struct amdgpu_device *adev;
++ struct kfd_bo_va_list *entry, *tmp;
++ struct bo_vm_reservation_context ctx;
++ int ret = 0;
++ struct ttm_validate_buffer *bo_list_entry;
++ struct amdkfd_process_info *process_info;
++ unsigned long bo_size;
++
++ adev = get_amdgpu_device(kgd);
++ process_info = ((struct amdkfd_vm *)vm)->process_info;
++
++ bo_size = mem->bo->tbo.mem.size;
++
++ mutex_lock(&mem->lock);
++
++ if (mem->mapped_to_gpu_memory > 0) {
++ pr_err("BO VA 0x%llx size 0x%lx is already mapped to vm %p.\n",
++ mem->va, bo_size, vm);
++ mutex_unlock(&mem->lock);
++ return -EBUSY;
++ }
++
++ mutex_unlock(&mem->lock);
++ /* lock is not needed after this, since mem is unused and will
++ * be freed anyway
++ */
++
++ /* No more MMU notifiers */
++ amdgpu_mn_unregister(mem->bo);
++
++ /* Make sure restore workers don't access the BO any more */
++ bo_list_entry = &mem->validate_list;
++ mutex_lock(&process_info->lock);
++ list_del(&bo_list_entry->head);
++ mutex_unlock(&process_info->lock);
++
++ /* Free user pages if necessary */
++ if (mem->user_pages) {
++ pr_debug("%s: Freeing user_pages array\n", __func__);
++ if (mem->user_pages[0])
++ release_pages(mem->user_pages,
++ mem->bo->tbo.ttm->num_pages, 0);
++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0)
++ drm_free_large(mem->user_pages);
++#else
++ kvfree(mem->user_pages);
++#endif
++ }
++
++ ret = reserve_bo_and_cond_vms(mem, NULL, VA_DO_NOT_CARE, &ctx);
++ if (unlikely(ret != 0))
++ return ret;
++
++ /* The eviction fence should be removed by the last unmap.
++ * TODO: Log an error condition if the bo still has the eviction fence
++ * attached
++ */
++ amdgpu_amdkfd_remove_eviction_fence(mem->bo,
++ process_info->eviction_fence,
++ NULL, NULL);
++ pr_debug("Release VA 0x%llx - 0x%llx\n", mem->va,
++ mem->va + bo_size * (1 + mem->aql_queue));
++
++ /* Remove from VM internal data structures */
++ list_for_each_entry_safe(entry, tmp, &mem->bo_va_list, bo_list) {
++ remove_bo_from_vm((struct amdgpu_device *)entry->kgd_dev,
++ entry, bo_size);
++ }
++
++ ret = unreserve_bo_and_vms(&ctx, false, false);
++
++ /* Free the sync object */
++ amdgpu_sync_free(&mem->sync);
++
++ /* If the SG is not NULL, it's one we created for a doorbell
++ * BO. We need to free it.
++ */
++ if (mem->bo->tbo.sg) {
++ sg_free_table(mem->bo->tbo.sg);
++ kfree(mem->bo->tbo.sg);
++ }
++
++ /* Free the BO*/
++ amdgpu_bo_unref(&mem->bo);
++ kfree(mem);
++
++ return ret;
++}
++
++int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
++ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm)
++{
++ struct amdgpu_device *adev;
++ int ret;
++ struct amdgpu_bo *bo;
++ uint32_t domain;
++ struct kfd_bo_va_list *entry;
++ struct bo_vm_reservation_context ctx;
++ struct kfd_bo_va_list *bo_va_entry = NULL;
++ struct kfd_bo_va_list *bo_va_entry_aql = NULL;
++ struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm;
++ unsigned long bo_size;
++ bool is_invalid_userptr;
++
++ adev = get_amdgpu_device(kgd);
++
++ /* Make sure restore is not running concurrently. Since we
++ * don't map invalid userptr BOs, we rely on the next restore
++ * worker to do the mapping
++ */
++ mutex_lock(&mem->process_info->lock);
++
++ /* Lock mmap-sem. If we find an invalid userptr BO, we can be
++ * sure that the MMU notifier is no longer running
++ * concurrently and the queues are actually stopped
++ */
++ down_read(&current->mm->mmap_sem);
++ is_invalid_userptr = atomic_read(&mem->invalid);
++ up_read(&current->mm->mmap_sem);
++
++ mutex_lock(&mem->lock);
++
++ bo = mem->bo;
++
++ if (!bo) {
++ pr_err("Invalid BO when mapping memory to GPU\n");
++ return -EINVAL;
++ }
++
++ domain = mem->domain;
++ bo_size = bo->tbo.mem.size;
++
++ pr_debug("Map VA 0x%llx - 0x%llx to vm %p domain %s\n",
++ mem->va,
++ mem->va + bo_size * (1 + mem->aql_queue),
++ vm, domain_string(domain));
++
++ ret = reserve_bo_and_vm(mem, vm, &ctx);
++ if (unlikely(ret != 0))
++ goto bo_reserve_failed;
++
++ /* Userptr can be marked as "not invalid", but not actually be
++ * validated yet (still in the system domain). In that case
++ * the queues are still stopped and we can leave mapping for
++ * the next restore worker
++ */
++ if (bo->tbo.mem.mem_type == TTM_PL_SYSTEM)
++ is_invalid_userptr = true;
++
++ if (check_if_add_bo_to_vm((struct amdgpu_vm *)vm, mem)) {
++ ret = add_bo_to_vm(adev, mem, (struct amdgpu_vm *)vm, false,
++ &bo_va_entry);
++ if (ret != 0)
++ goto add_bo_to_vm_failed;
++ if (mem->aql_queue) {
++ ret = add_bo_to_vm(adev, mem, (struct amdgpu_vm *)vm,
++ true, &bo_va_entry_aql);
++ if (ret != 0)
++ goto add_bo_to_vm_failed_aql;
++ }
++ }
++
++ if (mem->mapped_to_gpu_memory == 0 &&
++ !amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) {
++ /* Validate BO only once. The eviction fence gets added to BO
++ * the first time it is mapped. Validate will wait for all
++ * background evictions to complete.
++ */
++ ret = amdgpu_amdkfd_bo_validate(bo, domain, true);
++ if (ret) {
++ pr_debug("Validate failed\n");
++ goto map_bo_to_gpuvm_failed;
++ }
++ }
++
++ list_for_each_entry(entry, &mem->bo_va_list, bo_list) {
++ if (entry->bo_va->base.vm == vm && !entry->is_mapped) {
++ pr_debug("\t map VA 0x%llx - 0x%llx in entry %p\n",
++ entry->va, entry->va + bo_size,
++ entry);
++
++ ret = map_bo_to_gpuvm(adev, entry, ctx.sync,
++ is_invalid_userptr);
++ if (ret != 0) {
++ pr_err("Failed to map radeon bo to gpuvm\n");
++ goto map_bo_to_gpuvm_failed;
++ }
++ entry->is_mapped = true;
++ mem->mapped_to_gpu_memory++;
++ pr_debug("\t INC mapping count %d\n",
++ mem->mapped_to_gpu_memory);
++ }
++ }
++
++ if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm) == NULL)
++ amdgpu_bo_fence(bo,
++ &kfd_vm->process_info->eviction_fence->base,
++ true);
++ ret = unreserve_bo_and_vms(&ctx, false, false);
++
++ mutex_unlock(&mem->process_info->lock);
++ mutex_unlock(&mem->lock);
++ return ret;
++
++map_bo_to_gpuvm_failed:
++ if (bo_va_entry_aql)
++ remove_bo_from_vm(adev, bo_va_entry_aql, bo_size);
++add_bo_to_vm_failed_aql:
++ if (bo_va_entry)
++ remove_bo_from_vm(adev, bo_va_entry, bo_size);
++add_bo_to_vm_failed:
++ unreserve_bo_and_vms(&ctx, false, false);
++bo_reserve_failed:
++ mutex_unlock(&mem->process_info->lock);
++ mutex_unlock(&mem->lock);
++ return ret;
++}
++
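++/* Return the GPU (MC) address of the VM's page directory BO */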
++static u64 get_vm_pd_gpu_offset(void *vm)
++{
++ struct amdgpu_vm *avm = (struct amdgpu_vm *) vm;
++ struct amdgpu_device *adev =
++ amdgpu_ttm_adev(avm->root.base.bo->tbo.bdev);
++ u64 offset;
++
++ BUG_ON(avm == NULL);
++
++ amdgpu_bo_reserve(avm->root.base.bo, false);
++
++ offset = amdgpu_bo_gpu_offset(avm->root.base.bo);
++
++ amdgpu_bo_unreserve(avm->root.base.bo);
++
++ /* On some ASICs the FB doesn't start at 0. Adjust FB offset
++ * to an actual MC address.
++ */
++ if (adev->gart.gart_funcs->get_vm_pde)
++ offset = amdgpu_gart_get_vm_pde(adev, offset);
++
++ return offset;
++}
++
++int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm,
++ void **process_info,
++ struct dma_fence **ef)
++{
++ int ret;
++ struct amdkfd_vm *new_vm;
++ struct amdkfd_process_info *info;
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++
++ new_vm = kzalloc(sizeof(*new_vm), GFP_KERNEL);
++ if (new_vm == NULL)
++ return -ENOMEM;
++
++ /* Initialize the VM context, allocate the page directory and zero it */
++ ret = amdgpu_vm_init(adev, &new_vm->base, AMDGPU_VM_CONTEXT_COMPUTE);
++ if (ret != 0) {
++ pr_err("Failed init vm ret %d\n", ret);
++ /* Undo everything related to the new VM context */
++ goto vm_init_fail;
++ }
++ new_vm->adev = adev;
++
++ if (!*process_info) {
++ info = kzalloc(sizeof(*info), GFP_KERNEL);
++ if (!info) {
++			pr_err("Failed to create amdkfd_process_info\n");
++ ret = -ENOMEM;
++ goto alloc_process_info_fail;
++ }
++
++ mutex_init(&info->lock);
++ INIT_LIST_HEAD(&info->vm_list_head);
++ INIT_LIST_HEAD(&info->kfd_bo_list);
++ INIT_LIST_HEAD(&info->userptr_valid_list);
++ INIT_LIST_HEAD(&info->userptr_inval_list);
++
++ info->eviction_fence =
++ amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1),
++ current->mm);
++ if (info->eviction_fence == NULL) {
++			pr_err("Failed to create eviction fence\n");
++			ret = -ENOMEM;
++			goto create_evict_fence_fail;
++ }
++
++ info->pid = get_task_pid(current->group_leader,
++ PIDTYPE_PID);
++ atomic_set(&info->evicted_bos, 0);
++ INIT_DELAYED_WORK(&info->work,
++ amdgpu_amdkfd_restore_userptr_worker);
++
++ *process_info = info;
++ *ef = dma_fence_get(&info->eviction_fence->base);
++ }
++
++ new_vm->process_info = *process_info;
++
++ mutex_lock(&new_vm->process_info->lock);
++ list_add_tail(&new_vm->vm_list_node,
++ &(new_vm->process_info->vm_list_head));
++ new_vm->process_info->n_vms++;
++ mutex_unlock(&new_vm->process_info->lock);
++
++ *vm = (void *) new_vm;
++
++ pr_debug("Created process vm %p\n", *vm);
++
++ return ret;
++
++create_evict_fence_fail:
++ kfree(info);
++alloc_process_info_fail:
++ amdgpu_vm_fini(adev, &new_vm->base);
++vm_init_fail:
++ kfree(new_vm);
++ return ret;
++
++}
++
++void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm)
++{
++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
++ struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *) vm;
++ struct amdgpu_vm *avm = &kfd_vm->base;
++ struct amdgpu_bo *pd;
++ struct amdkfd_process_info *process_info;
++
++ if (WARN_ON(!kgd || !vm))
++ return;
++
++ pr_debug("Destroying process vm %p\n", vm);
++ /* Release eviction fence from PD */
++ pd = avm->root.base.bo;
++ amdgpu_bo_reserve(pd, false);
++ amdgpu_bo_fence(pd, NULL, false);
++ amdgpu_bo_unreserve(pd);
++
++ process_info = kfd_vm->process_info;
++
++ mutex_lock(&process_info->lock);
++ process_info->n_vms--;
++ list_del(&kfd_vm->vm_list_node);
++ mutex_unlock(&process_info->lock);
++
++ /* Release per-process resources */
++ if (!process_info->n_vms) {
++ WARN_ON(!list_empty(&process_info->kfd_bo_list));
++ WARN_ON(!list_empty(&process_info->userptr_valid_list));
++ WARN_ON(!list_empty(&process_info->userptr_inval_list));
++
++ dma_fence_put(&process_info->eviction_fence->base);
++ cancel_delayed_work_sync(&process_info->work);
++ put_pid(process_info->pid);
++ kfree(process_info);
++ }
++
++ /* Release the VM context */
++ amdgpu_vm_fini(adev, avm);
++ kfree(vm);
++}
++
++uint32_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm)
++{
++ return get_vm_pd_gpu_offset(vm) >> AMDGPU_GPU_PAGE_SHIFT;
++}
++
++int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd,
++ struct kfd_vm_fault_info *mem)
++{
++ struct amdgpu_device *adev;
++
++ adev = (struct amdgpu_device *) kgd;
++ if (atomic_read(&adev->mc.vm_fault_info_updated) == 1) {
++ *mem = *adev->mc.vm_fault_info;
++ mb();
++ atomic_set(&adev->mc.vm_fault_info_updated, 0);
++ }
++ return 0;
++}
++
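++/* Check whether the BO has a VA entry for the given device and VM */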
++static bool is_mem_on_local_device(struct kgd_dev *kgd,
++ struct list_head *bo_va_list, void *vm)
++{
++ struct kfd_bo_va_list *entry;
++
++ list_for_each_entry(entry, bo_va_list, bo_list) {
++ if (entry->kgd_dev == kgd && entry->bo_va->base.vm == vm)
++ return true;
++ }
++
++ return false;
++}
++
++int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
++ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm)
++{
++ struct kfd_bo_va_list *entry;
++ struct amdgpu_device *adev;
++ unsigned int mapped_before;
++ int ret = 0;
++ struct bo_vm_reservation_context ctx;
++ struct amdkfd_process_info *process_info;
++ unsigned long bo_size;
++
++ adev = (struct amdgpu_device *) kgd;
++ process_info = ((struct amdkfd_vm *)vm)->process_info;
++
++ bo_size = mem->bo->tbo.mem.size;
++
++ mutex_lock(&mem->lock);
++
++ /*
++	 * Make sure that this BO is mapped on KGD before unmapping it
++ */
++ if (!is_mem_on_local_device(kgd, &mem->bo_va_list, vm)) {
++ ret = -EINVAL;
++ goto out;
++ }
++
++ if (mem->mapped_to_gpu_memory == 0) {
++ pr_debug("BO VA 0x%llx size 0x%lx is not mapped to vm %p\n",
++ mem->va, bo_size, vm);
++ ret = -EINVAL;
++ goto out;
++ }
++ mapped_before = mem->mapped_to_gpu_memory;
++
++ ret = reserve_bo_and_cond_vms(mem, vm, VA_MAPPED, &ctx);
++ if (unlikely(ret != 0))
++ goto out;
++
++ pr_debug("Unmap VA 0x%llx - 0x%llx from vm %p\n",
++ mem->va,
++ mem->va + bo_size * (1 + mem->aql_queue),
++ vm);
++
++ list_for_each_entry(entry, &mem->bo_va_list, bo_list) {
++ if (entry->bo_va->base.vm == vm && entry->is_mapped) {
++ pr_debug("\t unmap VA 0x%llx - 0x%llx from entry %p\n",
++ entry->va,
++ entry->va + bo_size,
++ entry);
++
++ ret = unmap_bo_from_gpuvm(adev, entry, ctx.sync);
++ if (ret == 0) {
++ entry->is_mapped = false;
++ } else {
++ pr_err("failed to unmap VA 0x%llx\n",
++ mem->va);
++ goto unreserve_out;
++ }
++
++ mem->mapped_to_gpu_memory--;
++ pr_debug("\t DEC mapping count %d\n",
++ mem->mapped_to_gpu_memory);
++ }
++ }
++
++ /* If BO is unmapped from all VMs, unfence it. It can be evicted if
++ * required.
++ */
++ if (mem->mapped_to_gpu_memory == 0 &&
++ !amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm))
++ amdgpu_amdkfd_remove_eviction_fence(mem->bo,
++ process_info->eviction_fence,
++ NULL, NULL);
++
++ if (mapped_before == mem->mapped_to_gpu_memory) {
++ pr_debug("BO VA 0x%llx size 0x%lx is not mapped to vm %p\n",
++ mem->va, bo_size, vm);
++ ret = -EINVAL;
++ }
++
++unreserve_out:
++ unreserve_bo_and_vms(&ctx, false, false);
++out:
++ mutex_unlock(&mem->lock);
++ return ret;
++}
++
++int amdgpu_amdkfd_gpuvm_mmap_bo(struct kgd_dev *kgd, struct vm_area_struct *vma)
++{
++ struct amdgpu_device *adev;
++
++ adev = get_amdgpu_device(kgd);
++ if (!adev) {
++ pr_err("Could not get amdgpu device in %s\n", __func__);
++ return -ENODEV;
++ }
++
++ return amdgpu_bo_mmap(NULL, vma, &adev->mman.bdev);
++}
++
++int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd,
++ struct kgd_mem *mem, void **kptr)
++{
++ int ret;
++ struct amdgpu_bo *bo = mem->bo;
++
++ if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) {
++ pr_err("userptr can't be mapped to kernel\n");
++ return -EINVAL;
++ }
++
++	/* Remove kgd_mem from the kfd_bo_list so this BO is not re-validated
++	 * during restore after an eviction.
++ */
++ mutex_lock(&mem->process_info->lock);
++
++ list_del_init(&mem->validate_list.head);
++
++ ret = __map_bo_to_kernel(bo, AMDGPU_GEM_DOMAIN_GTT, kptr);
++ if (!ret)
++ mem->kptr = *kptr;
++
++ mutex_unlock(&mem->process_info->lock);
++
++ return ret;
++}
++
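++/* Pin the BO in its allocated domain without creating a GPUVM mapping */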
++static int pin_bo_wo_map(struct kgd_mem *mem)
++{
++ struct amdgpu_bo *bo = mem->bo;
++ int ret = 0;
++
++ ret = amdgpu_bo_reserve(bo, false);
++ if (unlikely(ret != 0))
++ return ret;
++
++ ret = amdgpu_bo_pin(bo, mem->domain, NULL);
++ amdgpu_bo_unreserve(bo);
++
++ return ret;
++}
++
++static void unpin_bo_wo_map(struct kgd_mem *mem)
++{
++ struct amdgpu_bo *bo = mem->bo;
++ int ret = 0;
++
++ ret = amdgpu_bo_reserve(bo, false);
++ if (unlikely(ret != 0))
++ return;
++
++ amdgpu_bo_unpin(bo);
++ amdgpu_bo_unreserve(bo);
++}
++
++#define AMD_GPU_PAGE_SHIFT PAGE_SHIFT
++#define AMD_GPU_PAGE_SIZE (_AC(1, UL) << AMD_GPU_PAGE_SHIFT)
++
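++/* Build an sg_table describing (offset, size) within the BO: bus addresses
++ * are used for VRAM, page pointers for system/GTT memory
++ */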
++static int get_sg_table(struct amdgpu_device *adev,
++ struct kgd_mem *mem, uint64_t offset,
++ uint64_t size, struct sg_table **ret_sg)
++{
++ struct amdgpu_bo *bo = mem->bo;
++ struct sg_table *sg = NULL;
++ unsigned long bus_addr;
++ unsigned int chunks;
++ unsigned int i;
++ struct scatterlist *s;
++ uint64_t offset_in_page;
++ unsigned int page_size;
++ int ret;
++
++ sg = kmalloc(sizeof(*sg), GFP_KERNEL);
++ if (!sg) {
++ ret = -ENOMEM;
++ goto out;
++ }
++
++ if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM)
++ page_size = AMD_GPU_PAGE_SIZE;
++ else
++ page_size = PAGE_SIZE;
++
++ offset_in_page = offset & (page_size - 1);
++ chunks = (size + offset_in_page + page_size - 1)
++ / page_size;
++
++ ret = sg_alloc_table(sg, chunks, GFP_KERNEL);
++ if (unlikely(ret))
++ goto out;
++
++ if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM) {
++ bus_addr = bo->tbo.offset + adev->mc.aper_base + offset;
++
++ for_each_sg(sg->sgl, s, sg->orig_nents, i) {
++ uint64_t chunk_size, length;
++
++ chunk_size = page_size - offset_in_page;
++ length = min(size, chunk_size);
++
++ sg_set_page(s, NULL, length, offset_in_page);
++ s->dma_address = bus_addr;
++ s->dma_length = length;
++
++ size -= length;
++ offset_in_page = 0;
++ bus_addr += length;
++ }
++ } else {
++ struct page **pages;
++ unsigned int cur_page;
++
++ pages = bo->tbo.ttm->pages;
++
++ cur_page = offset / page_size;
++ for_each_sg(sg->sgl, s, sg->orig_nents, i) {
++ uint64_t chunk_size, length;
++
++ chunk_size = page_size - offset_in_page;
++ length = min(size, chunk_size);
++
++ sg_set_page(s, pages[cur_page], length, offset_in_page);
++ s->dma_address = page_to_phys(pages[cur_page]);
++ s->dma_length = length;
++
++ size -= length;
++ offset_in_page = 0;
++ cur_page++;
++ }
++ }
++
++ *ret_sg = sg;
++ return 0;
++out:
++ kfree(sg);
++ *ret_sg = NULL;
++ return ret;
++}
++
++int amdgpu_amdkfd_gpuvm_pin_get_sg_table(struct kgd_dev *kgd,
++ struct kgd_mem *mem, uint64_t offset,
++ uint64_t size, struct sg_table **ret_sg)
++{
++ int ret;
++ struct amdgpu_device *adev;
++
++ ret = pin_bo_wo_map(mem);
++ if (unlikely(ret != 0))
++ return ret;
++
++ adev = get_amdgpu_device(kgd);
++
++ ret = get_sg_table(adev, mem, offset, size, ret_sg);
++ if (ret)
++ unpin_bo_wo_map(mem);
++
++ return ret;
++}
++
++void amdgpu_amdkfd_gpuvm_unpin_put_sg_table(
++ struct kgd_mem *mem, struct sg_table *sg)
++{
++ sg_free_table(sg);
++ kfree(sg);
++
++ unpin_bo_wo_map(mem);
++}
++
++int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd,
++ struct dma_buf *dma_buf,
++ uint64_t va, void *vm,
++ struct kgd_mem **mem, uint64_t *size,
++ uint64_t *mmap_offset)
++{
++ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
++ struct drm_gem_object *obj;
++ struct amdgpu_bo *bo;
++ struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm;
++
++ if (dma_buf->ops != &drm_gem_prime_dmabuf_ops)
++ /* Can't handle non-graphics buffers */
++ return -EINVAL;
++
++ obj = dma_buf->priv;
++ if (obj->dev->dev_private != adev)
++ /* Can't handle buffers from other devices */
++ return -EINVAL;
++
++ bo = gem_to_amdgpu_bo(obj);
++ if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM |
++ AMDGPU_GEM_DOMAIN_GTT |
++ AMDGPU_GEM_DOMAIN_DGMA)))
++		/* Only VRAM, GTT and DGMA BOs are supported */
++ return -EINVAL;
++
++ *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL);
++ if (*mem == NULL)
++ return -ENOMEM;
++
++ if (size)
++ *size = amdgpu_bo_size(bo);
++
++ if (mmap_offset)
++ *mmap_offset = amdgpu_bo_mmap_offset(bo);
++
++ INIT_LIST_HEAD(&(*mem)->bo_va_list);
++ mutex_init(&(*mem)->lock);
++ (*mem)->mapping_flags =
++ AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE |
++ AMDGPU_VM_PAGE_EXECUTABLE | AMDGPU_VM_MTYPE_NC;
++
++ (*mem)->bo = amdgpu_bo_ref(bo);
++ (*mem)->va = va;
++ if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM)
++ (*mem)->domain = AMDGPU_GEM_DOMAIN_VRAM;
++ else if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_GTT)
++ (*mem)->domain = AMDGPU_GEM_DOMAIN_GTT;
++ else
++ (*mem)->domain = AMDGPU_GEM_DOMAIN_DGMA;
++ (*mem)->mapped_to_gpu_memory = 0;
++ (*mem)->process_info = kfd_vm->process_info;
++ add_kgd_mem_to_kfd_bo_list(*mem, kfd_vm->process_info, false);
++ amdgpu_sync_create(&(*mem)->sync);
++
++ return 0;
++}
++
++int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_dev *kgd, void *vm,
++ struct kgd_mem *mem,
++ struct dma_buf **dmabuf)
++{
++ struct amdgpu_device *adev = NULL;
++ struct amdgpu_bo *bo = NULL;
++ struct drm_gem_object *gobj = NULL;
++
++ if (!dmabuf || !kgd || !vm || !mem)
++ return -EINVAL;
++
++ adev = get_amdgpu_device(kgd);
++ bo = mem->bo;
++
++ gobj = amdgpu_gem_prime_foreign_bo(adev, bo);
++ if (gobj == NULL) {
++ pr_err("Export BO failed. Unable to find/create GEM object\n");
++ return -EINVAL;
++ }
++
++ *dmabuf = amdgpu_gem_prime_export(adev->ddev, gobj, 0);
++ return 0;
++}
++
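++/* Validate the page directory and page table BOs of every VM in the process */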
++static int process_validate_vms(struct amdkfd_process_info *process_info)
++{
++ struct amdkfd_vm *peer_vm;
++ int ret;
++
++ list_for_each_entry(peer_vm, &process_info->vm_list_head,
++ vm_list_node) {
++ ret = vm_validate_pt_pd_bos(&peer_vm->base);
++ if (ret)
++ return ret;
++ }
++
++ return 0;
++}
++
++/* Evict a userptr BO by stopping the queues if necessary
++ *
++ * Runs in MMU notifier, may be in RECLAIM_FS context. This means it
++ * cannot do any memory allocations, and cannot take any locks that
++ * are held elsewhere while allocating memory. Therefore this is as
++ * simple as possible, using atomic counters.
++ *
++ * It doesn't do anything to the BO itself. The real work happens in
++ * restore, where we get updated page addresses. This function only
++ * ensures that GPU access to the BO is stopped.
++ */
++int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem,
++ struct mm_struct *mm)
++{
++ struct amdkfd_process_info *process_info = mem->process_info;
++ int invalid, evicted_bos;
++ int r = 0;
++
++ invalid = atomic_inc_return(&mem->invalid);
++ evicted_bos = atomic_inc_return(&process_info->evicted_bos);
++ if (evicted_bos == 1) {
++ /* First eviction, stop the queues */
++ r = kgd2kfd->quiesce_mm(NULL, mm);
++ if (r != 0)
++ pr_err("Failed to quiesce KFD\n");
++ schedule_delayed_work(&process_info->work, 1);
++ }
++
++ return r;
++}
++
++/* Update invalid userptr BOs
++ *
++ * Moves invalidated (evicted) userptr BOs from userptr_valid_list to
++ * userptr_inval_list and updates user pages for all BOs that have
++ * been invalidated since their last update.
++ */
++static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
++ struct mm_struct *mm)
++{
++ struct kgd_mem *mem, *tmp_mem;
++ struct amdgpu_bo *bo;
++ int invalid, ret;
++
++ /* Move all invalidated BOs to the userptr_inval_list and
++ * release their user pages by migration to the CPU domain
++ */
++ list_for_each_entry_safe(mem, tmp_mem,
++ &process_info->userptr_valid_list,
++ validate_list.head) {
++ if (!atomic_read(&mem->invalid))
++ continue; /* BO is still valid */
++
++ bo = mem->bo;
++
++ if (amdgpu_bo_reserve(bo, true))
++ return -EAGAIN;
++ amdgpu_ttm_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);
++ ret = ttm_bo_validate(&bo->tbo, &bo->placement, false, false);
++ amdgpu_bo_unreserve(bo);
++ if (ret) {
++ pr_err("%s: Failed to invalidate userptr BO\n",
++ __func__);
++ return -EAGAIN;
++ }
++
++ list_move_tail(&mem->validate_list.head,
++ &process_info->userptr_inval_list);
++ }
++
++ if (list_empty(&process_info->userptr_inval_list))
++ return 0; /* All evicted userptr BOs were freed */
++
++ /* Go through userptr_inval_list and update any invalid user_pages */
++ list_for_each_entry(mem, &process_info->userptr_inval_list,
++ validate_list.head) {
++ invalid = atomic_read(&mem->invalid);
++ if (!invalid)
++ /* BO hasn't been invalidated since the last
++ * revalidation attempt. Keep its BO list.
++ */
++ continue;
++
++ bo = mem->bo;
++
++ if (!mem->user_pages) {
++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0)
++ mem->user_pages =
++ drm_calloc_large(bo->tbo.ttm->num_pages,
++ sizeof(struct page *));
++#else
++ mem->user_pages =
++ kvmalloc_array(bo->tbo.ttm->num_pages,
++ sizeof(struct page *),
++ GFP_KERNEL | __GFP_ZERO);
++#endif
++ if (!mem->user_pages) {
++ pr_err("%s: Failed to allocate pages array\n",
++ __func__);
++ return -ENOMEM;
++ }
++ } else if (mem->user_pages[0]) {
++ release_pages(mem->user_pages,
++ bo->tbo.ttm->num_pages, 0);
++ }
++
++ /* Get updated user pages */
++ ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm,
++ mem->user_pages);
++ if (ret) {
++ mem->user_pages[0] = NULL;
++ pr_info("%s: Failed to get user pages: %d\n",
++ __func__, ret);
++ /* Pretend it succeeded. It will fail later
++ * with a VM fault if the GPU tries to access
++ * it. Better than hanging indefinitely with
++ * stalled user mode queues.
++ */
++ }
++
++ /* Mark the BO as valid unless it was invalidated
++ * again concurrently
++ */
++ if (atomic_cmpxchg(&mem->invalid, invalid, 0) != invalid)
++ return -EAGAIN;
++ }
++ return 0;
++}
++
++/* Validate invalid userptr BOs
++ *
++ * Validates BOs on the userptr_inval_list, and moves them back to the
++ * userptr_valid_list. Also updates GPUVM page tables with new page
++ * addresses and waits for the page table updates to complete.
++ */
++static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
++{
++ struct amdgpu_bo_list_entry *pd_bo_list_entries;
++ struct list_head resv_list, duplicates;
++ struct ww_acquire_ctx ticket;
++ struct amdgpu_sync sync;
++
++ struct amdkfd_vm *peer_vm;
++ struct kgd_mem *mem, *tmp_mem;
++ struct amdgpu_bo *bo;
++ int i, ret;
++
++ pd_bo_list_entries = kcalloc(process_info->n_vms,
++ sizeof(struct amdgpu_bo_list_entry),
++ GFP_KERNEL);
++ if (!pd_bo_list_entries) {
++ pr_err("%s: Failed to allocate PD BO list entries\n", __func__);
++ return -ENOMEM;
++ }
++
++ INIT_LIST_HEAD(&resv_list);
++ INIT_LIST_HEAD(&duplicates);
++
++ /* Get all the page directory BOs that need to be reserved */
++ i = 0;
++ list_for_each_entry(peer_vm, &process_info->vm_list_head,
++ vm_list_node)
++ amdgpu_vm_get_pd_bo(&peer_vm->base, &resv_list,
++ &pd_bo_list_entries[i++]);
++ /* Add the userptr_inval_list entries to resv_list */
++ list_for_each_entry(mem, &process_info->userptr_inval_list,
++ validate_list.head) {
++ list_add_tail(&mem->resv_list.head, &resv_list);
++ mem->resv_list.bo = mem->validate_list.bo;
++ mem->resv_list.shared = mem->validate_list.shared;
++ }
++
++ /* Reserve all BOs and page tables for validation */
++ ret = ttm_eu_reserve_buffers(&ticket, &resv_list, false, &duplicates);
++ WARN(!list_empty(&duplicates), "Duplicates should be empty");
++ if (ret)
++ goto out;
++
++ amdgpu_sync_create(&sync);
++
++ /* Avoid triggering eviction fences when unmapping invalid
++ * userptr BOs (waits for all fences, doesn't use
++ * FENCE_OWNER_VM)
++ */
++ list_for_each_entry(peer_vm, &process_info->vm_list_head,
++ vm_list_node)
++ amdgpu_amdkfd_remove_eviction_fence(peer_vm->base.root.base.bo,
++ process_info->eviction_fence,
++ NULL, NULL);
++
++ ret = process_validate_vms(process_info);
++ if (ret)
++ goto unreserve_out;
++
++ /* Validate BOs and update GPUVM page tables */
++ list_for_each_entry_safe(mem, tmp_mem,
++ &process_info->userptr_inval_list,
++ validate_list.head) {
++ struct kfd_bo_va_list *bo_va_entry;
++
++ bo = mem->bo;
++
++ /* Copy pages array and validate the BO if we got user pages */
++ if (mem->user_pages[0]) {
++ amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm,
++ mem->user_pages);
++ amdgpu_ttm_placement_from_domain(bo, mem->domain);
++ ret = ttm_bo_validate(&bo->tbo, &bo->placement,
++ false, false);
++ if (ret) {
++ pr_err("%s: failed to validate BO\n", __func__);
++ goto unreserve_out;
++ }
++ }
++
++ /* Validate succeeded, now the BO owns the pages, free
++ * our copy of the pointer array. Put this BO back on
++ * the userptr_valid_list. If we need to revalidate
++ * it, we need to start from scratch.
++ */
++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0)
++ drm_free_large(mem->user_pages);
++#else
++ kvfree(mem->user_pages);
++#endif
++ mem->user_pages = NULL;
++ list_move_tail(&mem->validate_list.head,
++ &process_info->userptr_valid_list);
++
++ /* Update mapping. If the BO was not validated
++ * (because we couldn't get user pages), this will
++ * clear the page table entries, which will result in
++ * VM faults if the GPU tries to access the invalid
++ * memory.
++ */
++ list_for_each_entry(bo_va_entry, &mem->bo_va_list, bo_list) {
++ if (!bo_va_entry->is_mapped)
++ continue;
++
++ ret = update_gpuvm_pte((struct amdgpu_device *)
++ bo_va_entry->kgd_dev,
++ bo_va_entry, &sync);
++ if (ret) {
++ pr_err("%s: update PTE failed\n", __func__);
++ /* make sure this gets validated again */
++ atomic_inc(&mem->invalid);
++ goto unreserve_out;
++ }
++ }
++ }
++unreserve_out:
++ list_for_each_entry(peer_vm, &process_info->vm_list_head,
++ vm_list_node)
++ amdgpu_bo_fence(peer_vm->base.root.base.bo,
++ &process_info->eviction_fence->base, true);
++ ttm_eu_backoff_reservation(&ticket, &resv_list);
++ amdgpu_sync_wait(&sync, false);
++ amdgpu_sync_free(&sync);
++out:
++ kfree(pd_bo_list_entries);
++
++ return ret;
++}
++
++/* Worker callback to restore evicted userptr BOs
++ *
++ * Tries to update and validate all userptr BOs. If successful and no
++ * concurrent evictions happened, the queues are restarted. Otherwise,
++ * reschedule for another attempt later.
++ */
++static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
++{
++ struct delayed_work *dwork = to_delayed_work(work);
++ struct amdkfd_process_info *process_info =
++ container_of(dwork, struct amdkfd_process_info, work);
++ struct task_struct *usertask;
++ struct mm_struct *mm;
++ int evicted_bos;
++
++ evicted_bos = atomic_read(&process_info->evicted_bos);
++ if (!evicted_bos)
++ return;
++
++ /* Reference task and mm in case of concurrent process termination */
++ usertask = get_pid_task(process_info->pid, PIDTYPE_PID);
++ if (!usertask)
++ return;
++ mm = get_task_mm(usertask);
++ if (!mm) {
++ put_task_struct(usertask);
++ return;
++ }
++
++ mutex_lock(&process_info->lock);
++
++ if (update_invalid_user_pages(process_info, mm))
++ goto unlock_out;
++ /* userptr_inval_list can be empty if all evicted userptr BOs
++ * have been freed. In that case there is nothing to validate
++ * and we can just restart the queues.
++ */
++ if (!list_empty(&process_info->userptr_inval_list)) {
++ if (atomic_read(&process_info->evicted_bos) != evicted_bos)
++ goto unlock_out; /* Concurrent eviction, try again */
++
++ if (validate_invalid_user_pages(process_info))
++ goto unlock_out;
++ }
++	/* Final check for concurrent eviction and atomic update. If
++ * another eviction happens after successful update, it will
++ * be a first eviction that calls quiesce_mm. The eviction
++ * reference counting inside KFD will handle this case.
++ */
++ if (atomic_cmpxchg(&process_info->evicted_bos, evicted_bos, 0) !=
++ evicted_bos)
++ goto unlock_out;
++ evicted_bos = 0;
++ if (kgd2kfd->resume_mm(NULL, mm)) {
++ pr_err("%s: Failed to resume KFD\n", __func__);
++ /* No recovery from this failure. Probably the CP is
++ * hanging. No point trying again.
++ */
++ }
++unlock_out:
++ mutex_unlock(&process_info->lock);
++ mmput(mm);
++ put_task_struct(usertask);
++
++ /* If validation failed, reschedule another attempt */
++ if (evicted_bos)
++ schedule_delayed_work(&process_info->work, 1);
++}
++
++/** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given
++ * KFD process identified by process_info
++ *
++ * @process_info: amdkfd_process_info of the KFD process
++ *
++ * After memory eviction, restore thread calls this function. The function
++ * should be called when the Process is still valid. BO restore involves -
++ *
++ * 1. Release old eviction fence and create new one
++ * 2. Get two copies of PD BO list from all the VMs. Keep one copy as pd_list.
++ * 3. Use the second PD list and kfd_bo_list to create a list (ctx.list) of
++ *    BOs that need to be reserved.
++ * 4. Reserve all the BOs
++ * 5. Validate the PD and PT BOs.
++ * 6. Validate all KFD BOs using kfd_bo_list, map them and attach the new fence
++ * 7. Add fence to all PD and PT BOs.
++ * 8. Unreserve all BOs
++ */
++int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
++{
++ struct amdgpu_bo_list_entry *pd_bo_list;
++ struct amdkfd_process_info *process_info = info;
++ struct amdkfd_vm *peer_vm;
++ struct kgd_mem *mem;
++ struct bo_vm_reservation_context ctx;
++ struct amdgpu_amdkfd_fence *new_fence;
++ int ret = 0, i;
++ struct list_head duplicate_save;
++ struct amdgpu_sync sync_obj;
++
++ INIT_LIST_HEAD(&duplicate_save);
++ INIT_LIST_HEAD(&ctx.list);
++ INIT_LIST_HEAD(&ctx.duplicates);
++
++ pd_bo_list = kcalloc(process_info->n_vms,
++ sizeof(struct amdgpu_bo_list_entry),
++ GFP_KERNEL);
++ if (pd_bo_list == NULL)
++ return -ENOMEM;
++
++ i = 0;
++ mutex_lock(&process_info->lock);
++ list_for_each_entry(peer_vm, &process_info->vm_list_head,
++ vm_list_node)
++ amdgpu_vm_get_pd_bo(&peer_vm->base, &ctx.list,
++ &pd_bo_list[i++]);
++
++ /* Reserve all BOs and page tables/directory. Add all BOs from
++ * kfd_bo_list to ctx.list
++ */
++ list_for_each_entry(mem, &process_info->kfd_bo_list,
++ validate_list.head) {
++
++ list_add_tail(&mem->resv_list.head, &ctx.list);
++ mem->resv_list.bo = mem->validate_list.bo;
++ mem->resv_list.shared = mem->validate_list.shared;
++ }
++
++ ret = ttm_eu_reserve_buffers(&ctx.ticket, &ctx.list,
++ false, &duplicate_save);
++ if (ret) {
++ pr_debug("Memory eviction: TTM Reserve Failed. Try again\n");
++ goto ttm_reserve_fail;
++ }
++
++ amdgpu_sync_create(&sync_obj);
++ ctx.sync = &sync_obj;
++
++ /* Validate PDs and PTs */
++ ret = process_validate_vms(process_info);
++ if (ret)
++ goto validate_map_fail;
++
++ /* Wait for PD/PTs validate to finish */
++ /* FIXME: I think this isn't needed */
++ list_for_each_entry(peer_vm, &process_info->vm_list_head,
++ vm_list_node) {
++ struct amdgpu_bo *bo = peer_vm->base.root.base.bo;
++
++ ttm_bo_wait(&bo->tbo, false, false);
++ }
++
++ /* Validate BOs and map them to GPUVM (update VM page tables). */
++ list_for_each_entry(mem, &process_info->kfd_bo_list,
++ validate_list.head) {
++
++ struct amdgpu_bo *bo = mem->bo;
++ uint32_t domain = mem->domain;
++ struct kfd_bo_va_list *bo_va_entry;
++
++ ret = amdgpu_amdkfd_bo_validate(bo, domain, false);
++ if (ret) {
++ pr_debug("Memory eviction: Validate BOs failed. Try again\n");
++ goto validate_map_fail;
++ }
++
++ list_for_each_entry(bo_va_entry, &mem->bo_va_list,
++ bo_list) {
++ ret = update_gpuvm_pte((struct amdgpu_device *)
++ bo_va_entry->kgd_dev,
++ bo_va_entry,
++ ctx.sync);
++ if (ret) {
++ pr_debug("Memory eviction: update PTE failed. Try again\n");
++ goto validate_map_fail;
++ }
++ }
++ }
++
++ amdgpu_sync_wait(ctx.sync, false);
++
++ /* Release old eviction fence and create new one, because fence only
++ * goes from unsignaled to signaled, fence cannot be reused.
++ * Use context and mm from the old fence.
++ */
++ new_fence = amdgpu_amdkfd_fence_create(
++ process_info->eviction_fence->base.context,
++ process_info->eviction_fence->mm);
++ if (!new_fence) {
++ pr_err("Failed to create eviction fence\n");
++ ret = -ENOMEM;
++ goto validate_map_fail;
++ }
++ dma_fence_put(&process_info->eviction_fence->base);
++ process_info->eviction_fence = new_fence;
++ *ef = dma_fence_get(&new_fence->base);
++
++ /* Wait for validate to finish and attach new eviction fence */
++ list_for_each_entry(mem, &process_info->kfd_bo_list,
++ validate_list.head)
++ ttm_bo_wait(&mem->bo->tbo, false, false);
++ list_for_each_entry(mem, &process_info->kfd_bo_list,
++ validate_list.head)
++ amdgpu_bo_fence(mem->bo,
++ &process_info->eviction_fence->base, true);
++
++ /* Attach eviction fence to PD / PT BOs */
++ list_for_each_entry(peer_vm, &process_info->vm_list_head,
++ vm_list_node) {
++ struct amdgpu_bo *bo = peer_vm->base.root.base.bo;
++
++ amdgpu_bo_fence(bo, &process_info->eviction_fence->base, true);
++ }
++validate_map_fail:
++ ttm_eu_backoff_reservation(&ctx.ticket, &ctx.list);
++ amdgpu_sync_free(&sync_obj);
++ttm_reserve_fail:
++ mutex_unlock(&process_info->lock);
++evict_fence_fail:
++ kfree(pd_bo_list);
++ return ret;
++}
++
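++/* Copy up to size bytes from src_mem + src_offset to dst_mem + dst_offset on
++ * the GPU's buffer-move ring, walking scattered VRAM nodes as needed
++ */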
++int amdgpu_amdkfd_copy_mem_to_mem(struct kgd_dev *kgd, struct kgd_mem *src_mem,
++ uint64_t src_offset, struct kgd_mem *dst_mem,
++ uint64_t dst_offset, uint64_t size,
++ struct dma_fence **f, uint64_t *actual_size)
++{
++ struct amdgpu_device *adev = NULL;
++ struct ttm_mem_reg *src = NULL, *dst = NULL;
++ struct ttm_buffer_object *src_ttm_bo, *dst_ttm_bo;
++ struct drm_mm_node *src_mm, *dst_mm;
++ struct amdgpu_ring *ring;
++ struct ww_acquire_ctx ticket;
++ struct list_head list;
++ struct ttm_validate_buffer resv_list[2];
++ uint64_t src_start, dst_start;
++ uint64_t src_left, dst_left, cur_copy_size, total_copy_size = 0;
++ struct dma_fence *fence = NULL;
++ int r;
++
++ if (!kgd || !src_mem || !dst_mem)
++ return -EINVAL;
++
++ if (actual_size)
++ *actual_size = 0;
++
++ adev = get_amdgpu_device(kgd);
++ src_ttm_bo = &src_mem->bo->tbo;
++ dst_ttm_bo = &dst_mem->bo->tbo;
++ src = &src_ttm_bo->mem;
++ dst = &dst_ttm_bo->mem;
++ src_mm = (struct drm_mm_node *)src->mm_node;
++ dst_mm = (struct drm_mm_node *)dst->mm_node;
++
++ ring = adev->mman.buffer_funcs_ring;
++
++ INIT_LIST_HEAD(&list);
++
++ resv_list[0].bo = src_ttm_bo;
++ resv_list[0].shared = true;
++ resv_list[1].bo = dst_ttm_bo;
++ resv_list[1].shared = true;
++
++ list_add_tail(&resv_list[0].head, &list);
++ list_add_tail(&resv_list[1].head, &list);
++
++ if (!ring->ready) {
++ pr_err("Trying to move memory with ring turned off.\n");
++ return -EINVAL;
++ }
++
++ r = ttm_eu_reserve_buffers(&ticket, &list, false, NULL);
++ if (r) {
++ pr_err("Copy buffer failed. Unable to reserve bo (%d)\n", r);
++ return r;
++ }
++
++ switch (src->mem_type) {
++ case TTM_PL_TT:
++ r = amdgpu_ttm_bind(src_ttm_bo, src);
++ if (r) {
++ DRM_ERROR("Copy failed. Cannot bind to gart\n");
++ goto copy_fail;
++ }
++ break;
++ case TTM_PL_VRAM:
++		/* VRAM could be scattered. Find the node that the given
++		 * offset falls in
++ */
++ while (src_offset >= (src_mm->size << PAGE_SHIFT)) {
++ src_offset -= (src_mm->size << PAGE_SHIFT);
++ ++src_mm;
++ }
++ break;
++ default:
++ DRM_ERROR("Unknown placement %d\n", src->mem_type);
++ r = -EINVAL;
++ goto copy_fail;
++ }
++ src_start = src_mm->start << PAGE_SHIFT;
++ src_start += src_ttm_bo->bdev->man[src->mem_type].gpu_offset;
++ src_start += src_offset;
++ src_left = (src_mm->size << PAGE_SHIFT) - src_offset;
++
++ switch (dst->mem_type) {
++ case TTM_PL_TT:
++ r = amdgpu_ttm_bind(dst_ttm_bo, dst);
++ if (r) {
++ DRM_ERROR("Copy failed. Cannot bind to gart\n");
++ goto copy_fail;
++ }
++ break;
++ case TTM_PL_VRAM:
++ while (dst_offset >= (dst_mm->size << PAGE_SHIFT)) {
++ dst_offset -= (dst_mm->size << PAGE_SHIFT);
++ ++dst_mm;
++ }
++ break;
++ default:
++ DRM_ERROR("Unknown placement %d\n", dst->mem_type);
++ r = -EINVAL;
++ goto copy_fail;
++ }
++ dst_start = dst_mm->start << PAGE_SHIFT;
++ dst_start += dst_ttm_bo->bdev->man[dst->mem_type].gpu_offset;
++ dst_start += dst_offset;
++ dst_left = (dst_mm->size << PAGE_SHIFT) - dst_offset;
++
++ do {
++ struct dma_fence *next;
++
++ /* src_left/dst_left: amount of space left in the current node
++		 * Copy the minimum of (src_left, dst_left, number of bytes
++		 * left to copy)
++ */
++ cur_copy_size = min3(src_left, dst_left,
++ (size - total_copy_size));
++
++ r = amdgpu_copy_buffer(ring, src_start, dst_start,
++ cur_copy_size, NULL, &next, false, false);
++ if (r)
++ break;
++
++ /* Just keep the last fence */
++ dma_fence_put(fence);
++ fence = next;
++
++ total_copy_size += cur_copy_size;
++		/* Requested number of bytes copied. Done. */
++ if (total_copy_size >= size)
++ break;
++
++ /* If end of src or dst node is reached, move to next node */
++ src_left -= cur_copy_size;
++ if (!src_left) {
++ ++src_mm;
++ src_start = src_mm->start << PAGE_SHIFT;
++ src_start +=
++ src_ttm_bo->bdev->man[src->mem_type].gpu_offset;
++ src_left = src_mm->size << PAGE_SHIFT;
++ } else
++ src_start += cur_copy_size;
++
++ dst_left -= cur_copy_size;
++ if (!dst_left) {
++ ++dst_mm;
++ dst_start = dst_mm->start << PAGE_SHIFT;
++ dst_start +=
++ dst_ttm_bo->bdev->man[dst->mem_type].gpu_offset;
++ dst_left = dst_mm->size << PAGE_SHIFT;
++ } else
++ dst_start += cur_copy_size;
++
++ } while (total_copy_size < size);
++
++	/* A failure could occur after a partial copy, so still fill in the
++	 * amount copied and the fence
++ */
++ if (actual_size)
++ *actual_size = total_copy_size;
++
++ if (fence) {
++ amdgpu_bo_fence(src_mem->bo, fence, true);
++ amdgpu_bo_fence(dst_mem->bo, fence, true);
++ }
++
++ if (f)
++ *f = fence;
++
++copy_fail:
++ ttm_eu_backoff_reservation(&ticket, &list);
++ return r;
++}
++
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+index 9c472c5..2be2e05 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+@@ -817,11 +817,7 @@ static struct drm_driver kms_driver = {
+ .driver_features =
+ DRIVER_USE_AGP |
+ DRIVER_HAVE_IRQ | DRIVER_IRQ_SHARED | DRIVER_GEM |
+-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0)
+ DRIVER_PRIME | DRIVER_RENDER | DRIVER_MODESET | DRIVER_SYNCOBJ,
+-#else
+- DRIVER_PRIME | DRIVER_RENDER | DRIVER_MODESET,
+-#endif
+ .load = amdgpu_driver_load_kms,
+ .open = amdgpu_driver_open_kms,
+ .postclose = amdgpu_driver_postclose_kms,
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+old mode 100644
+new mode 100755
+index 283dc1b..f421505
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+@@ -36,6 +36,7 @@
+ #include <drm/drm_cache.h>
+ #include "amdgpu.h"
+ #include "amdgpu_trace.h"
++#include "amdgpu_amdkfd.h"
+
+ static void amdgpu_ttm_bo_destroy(struct ttm_buffer_object *tbo)
+ {
+@@ -46,6 +47,8 @@ static void amdgpu_ttm_bo_destroy(struct ttm_buffer_object *tbo)
+
+ if (bo->tbo.mem.mem_type == AMDGPU_PL_DGMA_IMPORT)
+ kfree(tbo->mem.bus.addr);
++ if (bo->kfd_bo)
++ amdgpu_amdkfd_unreserve_system_memory_limit(bo);
+ amdgpu_bo_kunmap(bo);
+
+ if (bo->gem_base.import_attach)
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+old mode 100644
+new mode 100755
+index 8a91658..f73dba5
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+@@ -89,6 +89,7 @@ struct amdgpu_bo {
+
+ struct ttm_bo_kmap_obj dma_buf_vmap;
+ struct amdgpu_mn *mn;
++ struct kgd_mem *kfd_bo;
+
+ union {
+ struct list_head mn_list;
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+old mode 100644
+new mode 100755
+index 322d2529..af8e544
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+@@ -36,6 +36,7 @@
+ /* some special values for the owner field */
+ #define AMDGPU_FENCE_OWNER_UNDEFINED ((void*)0ul)
+ #define AMDGPU_FENCE_OWNER_VM ((void*)1ul)
++#define AMDGPU_FENCE_OWNER_KFD ((void *)2ul)
+
+ #define AMDGPU_FENCE_FLAG_64BIT (1 << 0)
+ #define AMDGPU_FENCE_FLAG_INT (1 << 1)
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
+old mode 100644
+new mode 100755
+index c586f44..7ee8247
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
+@@ -31,6 +31,7 @@
+ #include <drm/drmP.h>
+ #include "amdgpu.h"
+ #include "amdgpu_trace.h"
++#include "amdgpu_amdkfd.h"
+
+ struct amdgpu_sync_entry {
+ struct hlist_node node;
+@@ -84,11 +85,20 @@ static bool amdgpu_sync_same_dev(struct amdgpu_device *adev,
+ */
+ static void *amdgpu_sync_get_owner(struct dma_fence *f)
+ {
+- struct amd_sched_fence *s_fence = to_amd_sched_fence(f);
++ struct amd_sched_fence *s_fence;
++ struct amdgpu_amdkfd_fence *kfd_fence;
++
++ if (f == NULL)
++ return AMDGPU_FENCE_OWNER_UNDEFINED;
+
++ s_fence = to_amd_sched_fence(f);
+ if (s_fence)
+ return s_fence->owner;
+
++ kfd_fence = to_amdgpu_amdkfd_fence(f);
++ if (kfd_fence)
++ return AMDGPU_FENCE_OWNER_KFD;
++
+ return AMDGPU_FENCE_OWNER_UNDEFINED;
+ }
+
+@@ -171,7 +181,8 @@ int amdgpu_sync_fence(struct amdgpu_device *adev, struct amdgpu_sync *sync,
+ * @resv: reservation object with embedded fence
+ * @shared: true if we should only sync to the exclusive fence
+ *
+- * Sync to the fence
++ * Sync to the fence, except if it is a KFD eviction fence and the owner is
++ * AMDGPU_FENCE_OWNER_VM.
+ */
+ int amdgpu_sync_resv(struct amdgpu_device *adev,
+ struct amdgpu_sync *sync,
+@@ -198,11 +209,15 @@ int amdgpu_sync_resv(struct amdgpu_device *adev,
+ for (i = 0; i < flist->shared_count; ++i) {
+ f = rcu_dereference_protected(flist->shared[i],
+ reservation_object_held(resv));
++ fence_owner = amdgpu_sync_get_owner(f);
++ if (fence_owner == AMDGPU_FENCE_OWNER_KFD &&
++ owner != AMDGPU_FENCE_OWNER_UNDEFINED)
++ continue;
++
+ if (amdgpu_sync_same_dev(adev, f)) {
+ /* VM updates are only interesting
+ * for other VM updates and moves.
+ */
+- fence_owner = amdgpu_sync_get_owner(f);
+ if ((owner != AMDGPU_FENCE_OWNER_UNDEFINED) &&
+ (fence_owner != AMDGPU_FENCE_OWNER_UNDEFINED) &&
+ ((owner == AMDGPU_FENCE_OWNER_VM) !=
+@@ -297,6 +312,31 @@ struct dma_fence *amdgpu_sync_get_fence(struct amdgpu_sync *sync)
+ return NULL;
+ }
+
++int amdgpu_sync_clone(struct amdgpu_device *adev,
++ struct amdgpu_sync *source,
++ struct amdgpu_sync *clone)
++{
++ struct amdgpu_sync_entry *e;
++ struct hlist_node *tmp;
++ struct dma_fence *f;
++ int i, r;
++
++ hash_for_each_safe(source->fences, i, tmp, e, node) {
++
++ f = e->fence;
++ if (!dma_fence_is_signaled(f)) {
++ r = amdgpu_sync_fence(adev, clone, f);
++ if (r)
++ return r;
++ } else {
++ hash_del(&e->node);
++ dma_fence_put(f);
++ kmem_cache_free(amdgpu_sync_slab, e);
++ }
++ }
++ return 0;
++}
++
+ int amdgpu_sync_wait(struct amdgpu_sync *sync, bool intr)
+ {
+ struct amdgpu_sync_entry *e;
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h
+old mode 100644
+new mode 100755
+index dc76879..8e29bc7
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h
+@@ -49,6 +49,8 @@ int amdgpu_sync_resv(struct amdgpu_device *adev,
+ struct dma_fence *amdgpu_sync_peek_fence(struct amdgpu_sync *sync,
+ struct amdgpu_ring *ring);
+ struct dma_fence *amdgpu_sync_get_fence(struct amdgpu_sync *sync);
++int amdgpu_sync_clone(struct amdgpu_device *adev, struct amdgpu_sync *source,
++ struct amdgpu_sync *clone);
+ int amdgpu_sync_wait(struct amdgpu_sync *sync, bool intr);
+ void amdgpu_sync_free(struct amdgpu_sync *sync);
+ int amdgpu_sync_init(void);
+diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h
+old mode 100644
+new mode 100755
+index 9f34fab..f22f7a8
+--- a/drivers/gpu/drm/amd/amdgpu/soc15d.h
++++ b/drivers/gpu/drm/amd/amdgpu/soc15d.h
+@@ -272,6 +272,7 @@
+ # define PACKET3_INVALIDATE_TLBS_DST_SEL(x) ((x) << 0)
+ # define PACKET3_INVALIDATE_TLBS_ALL_HUB(x) ((x) << 4)
+ # define PACKET3_INVALIDATE_TLBS_PASID(x) ((x) << 5)
++# define PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(x) ((x) << 29)
+ #define PACKET3_SET_RESOURCES 0xA0
+ /* 1. header
+ * 2. CONTROL
+diff --git a/drivers/gpu/drm/amd/amdgpu/vid.h b/drivers/gpu/drm/amd/amdgpu/vid.h
+old mode 100644
+new mode 100755
+index 323e21c..d09592a
+--- a/drivers/gpu/drm/amd/amdgpu/vid.h
++++ b/drivers/gpu/drm/amd/amdgpu/vid.h
+@@ -27,6 +27,8 @@
+ #define SDMA1_REGISTER_OFFSET 0x200 /* not a register */
+ #define SDMA_MAX_INSTANCE 2
+
++#define KFD_VI_SDMA_QUEUE_OFFSET 0x80 /* not a register */
++
+ /* crtc instance offsets */
+ #define CRTC0_REGISTER_OFFSET (0x1b9c - 0x1b9c)
+ #define CRTC1_REGISTER_OFFSET (0x1d9c - 0x1b9c)
+diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
+old mode 100644
+new mode 100755
+index f55a0f8..dba08ec
+--- a/drivers/gpu/drm/amd/amdkfd/Makefile
++++ b/drivers/gpu/drm/amd/amdkfd/Makefile
+@@ -26,5 +26,3 @@ amdkfd-$(CONFIG_DEBUG_FS) += kfd_debugfs.o
+
+ obj-$(CONFIG_HSA_AMD) += amdkfd.o
+
+-AMDKFD_FULL_PATH = $(src)
+-include $(AMDKFD_FULL_PATH)/backport/Makefile
+diff --git a/drivers/gpu/drm/amd/amdkfd/backport/backport.h b/drivers/gpu/drm/amd/amdkfd/backport/backport.h
+index 8b13b98..e1f8c1d 100644
+--- a/drivers/gpu/drm/amd/amdkfd/backport/backport.h
++++ b/drivers/gpu/drm/amd/amdkfd/backport/backport.h
+@@ -2,12 +2,5 @@
+ #define AMDKFD_BACKPORT_H
+
+ #include <linux/version.h>
+-#if defined(BUILD_AS_DKMS)
+-#include <kcl/kcl_amd_asic_type.h>
+-#endif
+-#include <kcl/kcl_compat.h>
+-#include <kcl/kcl_pci.h>
+-#include <kcl/kcl_mn.h>
+-#include <kcl/kcl_fence.h>
+
+ #endif
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+index b2795af..207a05e 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+@@ -25,9 +25,7 @@
+ #include <linux/err.h>
+ #include <linux/fs.h>
+ #include <linux/sched.h>
+-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
+ #include <linux/sched/mm.h>
+-#endif
+ #include <linux/slab.h>
+ #include <linux/uaccess.h>
+ #include <linux/compat.h>
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+index 5f597a6..4e94081 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+@@ -811,11 +811,7 @@ static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size,
+ */
+ pgdat = NODE_DATA(numa_node_id);
+ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
+-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0)
+- mem_in_bytes += pgdat->node_zones[zone_type].present_pages;
+-#else
+ mem_in_bytes += pgdat->node_zones[zone_type].managed_pages;
+-#endif
+ mem_in_bytes <<= PAGE_SHIFT;
+
+ sub_type_hdr->length_low = lower_32_bits(mem_in_bytes);
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+index c6b447d..6b3a1fa 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+@@ -326,11 +326,6 @@ static void kfd_gtt_sa_fini(struct kfd_dev *kfd);
+
+ static int kfd_resume(struct kfd_dev *kfd);
+
+-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0)
+-void kfd_init_processes_srcu(void);
+-void kfd_cleanup_processes_srcu(void);
+-#endif
+-
+ static const struct kfd_device_info *lookup_device_info(unsigned short did)
+ {
+ size_t i;
+@@ -633,10 +628,6 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
+
+ kfd_ib_mem_init(kfd);
+
+-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0)
+- kfd_init_processes_srcu();
+-#endif
+-
+ if (kfd_resume(kfd)) {
+ dev_err(kfd_device, "Error resuming kfd\n");
+ goto kfd_resume_error;
+@@ -678,9 +669,6 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
+ {
+ if (kfd->init_complete) {
+ kgd2kfd_suspend(kfd);
+-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0)
+- kfd_cleanup_processes_srcu();
+-#endif
+ kfd_cwsr_fini(kfd);
+ device_queue_manager_uninit(kfd->dqm);
+ kfd_interrupt_exit(kfd);
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+index 8debe6e..7eacf42 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+@@ -24,10 +24,8 @@
+ #include <linux/slab.h>
+ #include <linux/types.h>
+ #include <linux/uaccess.h>
+-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
+ #include <linux/sched/mm.h>
+ #include <linux/sched/signal.h>
+-#endif
+ #include <linux/mman.h>
+ #include <linux/memory.h>
+ #include "kfd_priv.h"
+@@ -269,13 +267,7 @@ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id)
+ {
+ struct kfd_event *ev;
+
+-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
+- struct hlist_node *node;
+-
+- hash_for_each_possible(p->events, ev, node, events, id)
+-#else
+ hash_for_each_possible(p->events, ev, events, id)
+-#endif
+ if (ev->event_id == id)
+ return ev;
+
+@@ -420,13 +412,7 @@ static void destroy_events(struct kfd_process *p)
+ struct hlist_node *tmp;
+ unsigned int hash_bkt;
+
+-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
+- struct hlist_node *node;
+-
+- hash_for_each_safe(p->events, hash_bkt, node, tmp, ev, events)
+-#else
+ hash_for_each_safe(p->events, hash_bkt, tmp, ev, events)
+-#endif
+ destroy_event(p, ev);
+ }
+
+@@ -972,16 +958,9 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p,
+ int bkt;
+ bool send_signal = true;
+
+-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
+- struct hlist_node *node;
+- ev_data = (struct kfd_hsa_memory_exception_data *) event_data;
+-
+- hash_for_each(p->events, bkt, node, ev, events)
+-#else
+ ev_data = (struct kfd_hsa_memory_exception_data *) event_data;
+
+ hash_for_each(p->events, bkt, ev, events)
+-#endif
+ if (ev->type == type) {
+ send_signal = false;
+ dev_dbg(kfd_device,
+@@ -1114,9 +1093,6 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
+ int bkt;
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+ struct kfd_hsa_memory_exception_data memory_exception_data;
+-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
+- struct hlist_node *node;
+-#endif
+
+ if (!p)
+ return; /* Presumably process exited. */
+@@ -1136,11 +1112,7 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
+ }
+ mutex_lock(&p->event_mutex);
+
+-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
+- hash_for_each(p->events, bkt, node, ev, events) {
+-#else
+ hash_for_each(p->events, bkt, ev, events) {
+-#endif
+ if (ev->type == KFD_EVENT_TYPE_MEMORY) {
+ ev->memory_exception_data = memory_exception_data;
+ set_event(ev);
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
+index 4f4392a..47dcf4a 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
+@@ -61,11 +61,7 @@ int kfd_interrupt_init(struct kfd_dev *kfd)
+ return r;
+ }
+
+-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0)
+- kfd->ih_wq = create_rt_workqueue("KFD IH");
+-#else
+ kfd->ih_wq = alloc_workqueue("KFD IH", WQ_HIGHPRI, 1);
+-#endif
+ spin_lock_init(&kfd->interrupt_lock);
+
+ INIT_WORK(&kfd->interrupt_work, interrupt_wq);
+@@ -115,15 +111,9 @@ bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry)
+ count = kfifo_in(&kfd->ih_fifo, ih_ring_entry,
+ kfd->device_info->ih_ring_entry_size);
+ if (count != kfd->device_info->ih_ring_entry_size) {
+-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0)
+- dev_err(kfd_chardev(),
+- "Interrupt ring overflow, dropping interrupt %d\n",
+- count);
+-#else
+ dev_err_ratelimited(kfd_chardev(),
+ "Interrupt ring overflow, dropping interrupt %d\n",
+ count);
+-#endif
+ return false;
+ }
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c
+index c6be3ba..e67eb9f 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c
+@@ -192,21 +192,13 @@ int kfd_ipc_import_handle(struct kfd_dev *dev, struct kfd_process *p,
+ {
+ int r;
+ struct kfd_ipc_obj *entry, *found = NULL;
+-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
+- struct hlist_node *tmp_node;
+-#endif
+
+ mutex_lock(&kfd_ipc_handles.lock);
+ /* Convert the user provided handle to hash key and search only in that
+ * bucket
+ */
+-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
+- hlist_for_each_entry(entry, tmp_node,
+- &kfd_ipc_handles.handles[HANDLE_TO_KEY(share_handle)], node) {
+-#else
+ hlist_for_each_entry(entry,
+ &kfd_ipc_handles.handles[HANDLE_TO_KEY(share_handle)], node) {
+-#endif
+ if (!memcmp(entry->share_handle, share_handle,
+ sizeof(entry->share_handle))) {
+ found = entry;
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
+index 64bf653..5724d33 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
+@@ -465,19 +465,15 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
+
+ static int debugfs_show_mqd(struct seq_file *m, void *data)
+ {
+-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2)
+ seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
+ data, sizeof(struct cik_mqd), false);
+-#endif
+ return 0;
+ }
+
+ static int debugfs_show_mqd_sdma(struct seq_file *m, void *data)
+ {
+-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2)
+ seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
+ data, sizeof(struct cik_sdma_rlc_registers), false);
+-#endif
+ return 0;
+ }
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+index 0713cac..6c302d2 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+@@ -455,19 +455,15 @@ static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd,
+
+ static int debugfs_show_mqd(struct seq_file *m, void *data)
+ {
+-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2)
+ seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
+ data, sizeof(struct v9_mqd), false);
+-#endif
+ return 0;
+ }
+
+ static int debugfs_show_mqd_sdma(struct seq_file *m, void *data)
+ {
+-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2)
+ seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
+ data, sizeof(struct v9_sdma_mqd), false);
+-#endif
+ return 0;
+ }
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
+index a5ba6f7..5c26e5a 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
+@@ -468,19 +468,15 @@ static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd,
+
+ static int debugfs_show_mqd(struct seq_file *m, void *data)
+ {
+-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2)
+ seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
+ data, sizeof(struct vi_mqd), false);
+-#endif
+ return 0;
+ }
+
+ static int debugfs_show_mqd_sdma(struct seq_file *m, void *data)
+ {
+-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2)
+ seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
+ data, sizeof(struct vi_sdma_mqd), false);
+-#endif
+ return 0;
+ }
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+index 9fcb6fb..7cca7b4 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+@@ -410,10 +410,8 @@ int pm_debugfs_runlist(struct seq_file *m, void *data)
+ return 0;
+ }
+
+-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2)
+ seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
+ pm->ib_buffer_obj->cpu_ptr, pm->ib_size_bytes, false);
+-#endif
+
+ return 0;
+ }
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+old mode 100644
+new mode 100755
+index ebe311e..88fdfc9
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+@@ -36,11 +36,7 @@
+ #include <linux/interval_tree.h>
+ #include <linux/seq_file.h>
+ #include <linux/kref.h>
+-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0)
+-#include <linux/kfifo-new.h>
+-#else
+ #include <linux/kfifo.h>
+-#endif
+ #include <kgd_kfd_interface.h>
+
+ #include <drm/amd_rdma.h>
+@@ -727,7 +723,7 @@ struct kfd_process {
+ size_t signal_event_count;
+ bool signal_event_limit_reached;
+
+- struct rb_root bo_interval_tree;
++ struct rb_root_cached bo_interval_tree;
+
+ /* Information used for memory eviction */
+ void *process_info;
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+index b458995..c798fa3 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+@@ -23,10 +23,8 @@
+ #include <linux/mutex.h>
+ #include <linux/log2.h>
+ #include <linux/sched.h>
+-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
+ #include <linux/sched/mm.h>
+ #include <linux/sched/task.h>
+-#endif
+ #include <linux/slab.h>
+ #if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+ #include <linux/amd-iommu.h>
+@@ -50,20 +48,7 @@ struct mm_struct;
+ static DEFINE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE);
+ static DEFINE_MUTEX(kfd_processes_mutex);
+
+-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0)
+-static struct srcu_struct kfd_processes_srcu;
+-void kfd_init_processes_srcu(void)
+-{
+- init_srcu_struct(&kfd_processes_srcu);
+-}
+-
+-void kfd_cleanup_processes_srcu(void)
+-{
+- cleanup_srcu_struct(&kfd_processes_srcu);
+-}
+-#else
+ DEFINE_STATIC_SRCU(kfd_processes_srcu);
+-#endif
+
+ static struct workqueue_struct *kfd_process_wq;
+
+@@ -81,11 +66,7 @@ static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep);
+ void kfd_process_create_wq(void)
+ {
+ if (!kfd_process_wq)
+-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0)
+- kfd_process_wq = create_workqueue("kfd_process_wq");
+-#else
+ kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0);
+-#endif
+ }
+
+ void kfd_process_destroy_wq(void)
+@@ -273,15 +254,8 @@ static struct kfd_process *find_process_by_mm(const struct mm_struct *mm)
+ {
+ struct kfd_process *process;
+
+-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
+- struct hlist_node *node;
+-
+- hash_for_each_possible_rcu(kfd_processes_table, process, node,
+- kfd_processes, (uintptr_t)mm)
+-#else
+ hash_for_each_possible_rcu(kfd_processes_table, process,
+ kfd_processes, (uintptr_t)mm)
+-#endif
+ if (process->mm == mm)
+ return process;
+
+@@ -586,7 +560,7 @@ static struct kfd_process *create_process(const struct task_struct *thread,
+ if (!process)
+ goto err_alloc_process;
+
+- process->bo_interval_tree = RB_ROOT;
++ process->bo_interval_tree = RB_ROOT_CACHED;
+
+ process->pasid = kfd_pasid_alloc();
+ if (process->pasid == 0)
+@@ -1026,13 +1000,7 @@ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid)
+
+ int idx = srcu_read_lock(&kfd_processes_srcu);
+
+-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
+- struct hlist_node *node;
+-
+- hash_for_each_rcu(kfd_processes_table, temp, node, p, kfd_processes) {
+-#else
+ hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+-#endif
+ if (p->pasid == pasid) {
+ kref_get(&p->ref);
+ ret_p = p;
+@@ -1051,13 +1019,7 @@ void kfd_suspend_all_processes(void)
+ unsigned int temp;
+ int idx = srcu_read_lock(&kfd_processes_srcu);
+
+-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
+- struct hlist_node *node;
+-
+- hash_for_each_rcu(kfd_processes_table, temp, node, p, kfd_processes) {
+-#else
+ hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+-#endif
+ if (cancel_delayed_work_sync(&p->eviction_work.dwork))
+ dma_fence_put(p->eviction_work.quiesce_fence);
+ cancel_delayed_work_sync(&p->restore_work);
+@@ -1077,13 +1039,7 @@ int kfd_resume_all_processes(void)
+ unsigned int temp;
+ int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu);
+
+-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
+- struct hlist_node *node;
+-
+- hash_for_each_rcu(kfd_processes_table, temp, node, p, kfd_processes) {
+-#else
+ hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+-#endif
+ if (!schedule_delayed_work(&p->restore_work, 0)) {
+ pr_err("Restore process %d failed during resume\n",
+ p->pasid);
+@@ -1171,13 +1127,7 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data)
+
+ int idx = srcu_read_lock(&kfd_processes_srcu);
+
+-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
+- struct hlist_node *node;
+-
+- hash_for_each_rcu(kfd_processes_table, temp, node, p, kfd_processes) {
+-#else
+ hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+-#endif
+ seq_printf(m, "Process %d PASID %d:\n",
+ p->lead_thread->tgid, p->pasid);
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+index ffd8e0f..d08e3de 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+@@ -122,9 +122,7 @@ static void kfd_release_topology_device(struct kfd_topology_device *dev)
+ struct kfd_mem_properties *mem;
+ struct kfd_cache_properties *cache;
+ struct kfd_iolink_properties *iolink;
+-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0)
+ struct kfd_perf_properties *perf;
+-#endif
+
+ list_del(&dev->list);
+
+@@ -149,14 +147,12 @@ static void kfd_release_topology_device(struct kfd_topology_device *dev)
+ kfree(iolink);
+ }
+
+-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0)
+ while (dev->perf_props.next != &dev->perf_props) {
+ perf = container_of(dev->perf_props.next,
+ struct kfd_perf_properties, list);
+ list_del(&perf->list);
+ kfree(perf);
+ }
+-#endif
+
+ kfree(dev);
+ }
+@@ -192,9 +188,7 @@ struct kfd_topology_device *kfd_create_topology_device(
+ INIT_LIST_HEAD(&dev->mem_props);
+ INIT_LIST_HEAD(&dev->cache_props);
+ INIT_LIST_HEAD(&dev->io_link_props);
+-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0)
+ INIT_LIST_HEAD(&dev->perf_props);
+-#endif
+
+ list_add_tail(&dev->list, device_list);
+
+@@ -374,7 +368,6 @@ static struct kobj_type cache_type = {
+ .sysfs_ops = &cache_ops,
+ };
+
+-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0)
+ /****** Sysfs of Performance Counters ******/
+
+ struct kfd_perf_attr {
+@@ -407,7 +400,6 @@ static struct kfd_perf_attr perf_attr_iommu[] = {
+ KFD_PERF_DESC(counter_ids, 0),
+ };
+ /****************************************/
+-#endif
+
+ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
+ char *buffer)
+@@ -546,9 +538,7 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev)
+ struct kfd_iolink_properties *iolink;
+ struct kfd_cache_properties *cache;
+ struct kfd_mem_properties *mem;
+-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0)
+ struct kfd_perf_properties *perf;
+-#endif
+
+ if (dev->kobj_iolink) {
+ list_for_each_entry(iolink, &dev->io_link_props, list)
+@@ -590,7 +580,6 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev)
+ dev->kobj_mem = NULL;
+ }
+
+-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0)
+ if (dev->kobj_perf) {
+ list_for_each_entry(perf, &dev->perf_props, list) {
+ kfree(perf->attr_group);
+@@ -600,7 +589,6 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev)
+ kobject_put(dev->kobj_perf);
+ dev->kobj_perf = NULL;
+ }
+-#endif
+
+ if (dev->kobj_node) {
+ sysfs_remove_file(dev->kobj_node, &dev->attr_gpuid);
+@@ -618,11 +606,9 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
+ struct kfd_iolink_properties *iolink;
+ struct kfd_cache_properties *cache;
+ struct kfd_mem_properties *mem;
+-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0)
+ struct kfd_perf_properties *perf;
+ uint32_t num_attrs;
+ struct attribute **attrs;
+-#endif
+ int ret;
+ uint32_t i;
+
+@@ -653,11 +639,9 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
+ if (!dev->kobj_iolink)
+ return -ENOMEM;
+
+-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0)
+ dev->kobj_perf = kobject_create_and_add("perf", dev->kobj_node);
+ if (!dev->kobj_perf)
+ return -ENOMEM;
+-#endif
+
+ /*
+ * Creating sysfs files for node properties
+@@ -749,7 +733,6 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
+ i++;
+ }
+
+-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0)
+ /* All hardware blocks have the same number of attributes. */
+ num_attrs = sizeof(perf_attr_iommu)/sizeof(struct kfd_perf_attr);
+ list_for_each_entry(perf, &dev->perf_props, list) {
+@@ -775,7 +758,6 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
+ if (ret < 0)
+ return ret;
+ }
+-#endif
+
+ return 0;
+ }
+@@ -942,7 +924,6 @@ static void find_system_memory(const struct dmi_header *dm,
+ }
+ }
+
+-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0)
+ /*
+ * Performance counters information is not part of CRAT but we would like to
+ * put them in the sysfs under topology directory for Thunk to get the data.
+@@ -966,7 +947,6 @@ static int kfd_add_perf_to_topology(struct kfd_topology_device *kdev)
+
+ return 0;
+ }
+-#endif
+
+ /* kfd_add_non_crat_information - Add information that is not currently
+ * defined in CRAT but is necessary for KFD topology
+@@ -1074,11 +1054,9 @@ int kfd_topology_init(void)
+ }
+ }
+
+-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0)
+ kdev = list_first_entry(&temp_topology_device_list,
+ struct kfd_topology_device, list);
+ kfd_add_perf_to_topology(kdev);
+-#endif
+
+ down_write(&topology_lock);
+ kfd_topology_update_device_list(&temp_topology_device_list,
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+index b59b32c..f22d420 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+@@ -141,14 +141,12 @@ struct kfd_iolink_properties {
+ struct attribute attr;
+ };
+
+-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0)
+ struct kfd_perf_properties {
+ struct list_head list;
+ char block_name[16];
+ uint32_t max_concurrent;
+ struct attribute_group *attr_group;
+ };
+-#endif
+
+ struct kfd_topology_device {
+ struct list_head list;
+@@ -160,17 +158,13 @@ struct kfd_topology_device {
+ struct list_head cache_props;
+ uint32_t io_link_count;
+ struct list_head io_link_props;
+-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0)
+ struct list_head perf_props;
+-#endif
+ struct kfd_dev *gpu;
+ struct kobject *kobj_node;
+ struct kobject *kobj_mem;
+ struct kobject *kobj_cache;
+ struct kobject *kobj_iolink;
+-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0)
+ struct kobject *kobj_perf;
+-#endif
+ struct attribute attr_gpuid;
+ struct attribute attr_name;
+ struct attribute attr_props;
+diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+index 2780641..977b21b 100644
+--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+@@ -707,11 +707,7 @@ static int dm_display_resume(struct drm_device *ddev)
+
+ err:
+ DRM_ERROR("Restoring old state failed with %i\n", ret);
+-#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0)
+- drm_atomic_state_free(state);
+-#else
+ drm_atomic_state_put(state);
+-#endif
+
+ return ret;
+ }
+diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+old mode 100644
+new mode 100755
+index 36f3766..b6cf2d5
+--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
++++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+@@ -30,6 +30,7 @@
+
+ #include <linux/types.h>
+ #include <linux/bitmap.h>
++#include <linux/dma-buf.h>
+
+ struct pci_dev;
+
+@@ -40,6 +41,46 @@ struct kfd_dev;
+ struct kgd_dev;
+
+ struct kgd_mem;
++struct kfd_process_device;
++struct amdgpu_bo;
++
++enum kfd_preempt_type {
++ KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN = 0,
++ KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
++};
++
++struct kfd_vm_fault_info {
++ uint64_t page_addr;
++ uint32_t vmid;
++ uint32_t mc_id;
++ uint32_t status;
++ bool prot_valid;
++ bool prot_read;
++ bool prot_write;
++ bool prot_exec;
++};
++
++struct kfd_cu_info {
++ uint32_t num_shader_engines;
++ uint32_t num_shader_arrays_per_engine;
++ uint32_t num_cu_per_sh;
++ uint32_t cu_active_number;
++ uint32_t cu_ao_mask;
++ uint32_t simd_per_cu;
++ uint32_t max_waves_per_simd;
++ uint32_t wave_front_size;
++ uint32_t max_scratch_slots_per_cu;
++ uint32_t lds_size;
++ uint32_t cu_bitmap[4][4];
++};
++
++/* For getting GPU local memory information from KGD */
++struct kfd_local_mem_info {
++ uint64_t local_mem_size_private;
++ uint64_t local_mem_size_public;
++ uint32_t vram_width;
++ uint32_t mem_clk_max;
++};
+
+ enum kgd_memory_pool {
+ KGD_POOL_SYSTEM_CACHEABLE = 1,
+@@ -72,6 +113,21 @@ struct kgd2kfd_shared_resources {
+ /* Bit n == 1 means Queue n is available for KFD */
+ DECLARE_BITMAP(queue_bitmap, KGD_MAX_QUEUES);
+
++ /* Doorbell assignments (SOC15 and later chips only). Only
++ * specific doorbells are routed to each SDMA engine. Others
++ * are routed to IH and VCN. They are not usable by the CP.
++ *
++ * Any doorbell number D that satisfies the following condition
++ * is reserved: (D & reserved_doorbell_mask) == reserved_doorbell_val
++ *
++ * KFD currently uses 1024 (= 0x3ff) doorbells per process. If
++ * doorbells 0x0f0-0x0f7 and 0x2f0-0x2f7 are reserved, that means
++ * mask would be set to 0x1f8 and val set to 0x0f0.
++ */
++ unsigned int sdma_doorbell[2][2];
++ unsigned int reserved_doorbell_mask;
++ unsigned int reserved_doorbell_val;
++
+ /* Base address of doorbell aperture. */
+ phys_addr_t doorbell_physical_address;
+
+@@ -80,8 +136,41 @@ struct kgd2kfd_shared_resources {
+
+ /* Number of bytes at start of aperture reserved for KGD. */
+ size_t doorbell_start_offset;
++
++ /* GPUVM address space size in bytes */
++ uint64_t gpuvm_size;
+ };
+
++struct tile_config {
++ uint32_t *tile_config_ptr;
++ uint32_t *macro_tile_config_ptr;
++ uint32_t num_tile_configs;
++ uint32_t num_macro_tile_configs;
++
++ uint32_t gb_addr_config;
++ uint32_t num_banks;
++ uint32_t num_ranks;
++};
++
++/*
++ * Allocation flag domains; currently only the VRAM and GTT domains are supported
++ */
++#define ALLOC_MEM_FLAGS_VRAM (1 << 0)
++#define ALLOC_MEM_FLAGS_GTT (1 << 1)
++#define ALLOC_MEM_FLAGS_USERPTR (1 << 2)
++#define ALLOC_MEM_FLAGS_DOORBELL (1 << 3)
++
++/*
++ * Allocation flags attributes/access options.
++ */
++#define ALLOC_MEM_FLAGS_NONPAGED (1 << 31)
++#define ALLOC_MEM_FLAGS_READONLY (1 << 30)
++#define ALLOC_MEM_FLAGS_PUBLIC (1 << 29)
++#define ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28)
++#define ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27)
++#define ALLOC_MEM_FLAGS_EXECUTE_ACCESS (1 << 26)
++#define ALLOC_MEM_FLAGS_COHERENT (1 << 25)
++
+ /**
+ * struct kfd2kgd_calls
+ *
+@@ -90,7 +179,7 @@ struct kgd2kfd_shared_resources {
+ *
+ * @free_gtt_mem: Frees a buffer that was allocated on the gart aperture
+ *
+- * @get_vmem_size: Retrieves (physical) size of VRAM
++ * @get_local_mem_info: Retrieves information about GPU local memory
+ *
+ * @get_gpu_clock_counter: Retrieves GPU clock counter
+ *
+@@ -112,6 +201,12 @@ struct kgd2kfd_shared_resources {
+ * @hqd_sdma_load: Loads the SDMA mqd structure to a H/W SDMA hqd slot.
+ * used only for no HWS mode.
+ *
++ * @hqd_dump: Dumps CPC HQD registers to an array of address-value pairs.
++ * Array is allocated with kmalloc, needs to be freed with kfree by caller.
++ *
++ * @hqd_sdma_dump: Dumps SDMA HQD registers to an array of address-value pairs.
++ * Array is allocated with kmalloc, needs to be freed with kfree by caller.
++ *
+ * @hqd_is_occupies: Checks if a hqd slot is occupied.
+ *
+ * @hqd_destroy: Destructs and preempts the queue assigned to that hqd slot.
+@@ -121,8 +216,34 @@ struct kgd2kfd_shared_resources {
+ * @hqd_sdma_destroy: Destructs and preempts the SDMA queue assigned to that
+ * SDMA hqd slot.
+ *
++ * @map_memory_to_gpu: Allocates and pins BO, PD and all related PTs
++ *
++ * @unmap_memory_to_gpu: Releases and unpins BO, PD and all related PTs
++ *
+ * @get_fw_version: Returns FW versions from the header
+ *
++ * @set_num_of_requests: Sets the number of Peripheral Page Requests (PPR) sent to
++ * the IOMMU when address translation fails
++ *
++ * @get_cu_info: Retrieves activated cu info
++ *
++ * @get_dmabuf_info: Returns information about a dmabuf if it was
++ * created by the GPU driver
++ *
++ * @import_dmabuf: Imports a DMA buffer, creating a new kgd_mem object
++ * Supports only DMA buffers created by GPU driver on the same GPU
++ *
++ * @export_dmabuf: Exports a KFD BO for sharing with other processes
++ *
++ * @submit_ib: Submits an IB to the engine specified by inserting the IB to
++ * the corresponding ring (ring type).
++ *
++ * @restore_process_bos: Restores all BOs that belong to the process
++ *
++ * @copy_mem_to_mem: Copies size bytes from source BO to destination BO
++ *
++ * @get_vram_usage: Returns current VRAM usage
++ *
+ * This structure contains function pointers to services that the kgd driver
+ * provides to amdkfd driver.
+ *
+@@ -134,11 +255,23 @@ struct kfd2kgd_calls {
+
+ void (*free_gtt_mem)(struct kgd_dev *kgd, void *mem_obj);
+
+- uint64_t (*get_vmem_size)(struct kgd_dev *kgd);
++ void(*get_local_mem_info)(struct kgd_dev *kgd,
++ struct kfd_local_mem_info *mem_info);
+ uint64_t (*get_gpu_clock_counter)(struct kgd_dev *kgd);
+
+ uint32_t (*get_max_engine_clock_in_mhz)(struct kgd_dev *kgd);
+
++ int (*create_process_vm)(struct kgd_dev *kgd, void **vm,
++ void **process_info, struct dma_fence **ef);
++ void (*destroy_process_vm)(struct kgd_dev *kgd, void *vm);
++
++ int (*create_process_gpumem)(struct kgd_dev *kgd, uint64_t va, size_t size, void *vm, struct kgd_mem **mem);
++ void (*destroy_process_gpumem)(struct kgd_dev *kgd, struct kgd_mem *mem);
++
++ uint32_t (*get_process_page_dir)(void *vm);
++
++ int (*open_graphic_handle)(struct kgd_dev *kgd, uint64_t va, void *vm, int fd, uint32_t handle, struct kgd_mem **mem);
++
+ /* Register access functions */
+ void (*program_sh_mem_settings)(struct kgd_dev *kgd, uint32_t vmid,
+ uint32_t sh_mem_config, uint32_t sh_mem_ape1_base,
+@@ -151,16 +284,28 @@ struct kfd2kgd_calls {
+ uint32_t hpd_size, uint64_t hpd_gpu_addr);
+
+ int (*init_interrupts)(struct kgd_dev *kgd, uint32_t pipe_id);
++
+
+ int (*hqd_load)(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
+- uint32_t queue_id, uint32_t __user *wptr);
++ uint32_t queue_id, uint32_t __user *wptr,
++ uint32_t wptr_shift, uint32_t wptr_mask,
++ struct mm_struct *mm);
++
++ int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd,
++ uint32_t __user *wptr, struct mm_struct *mm);
++
++ int (*hqd_dump)(struct kgd_dev *kgd,
++ uint32_t pipe_id, uint32_t queue_id,
++ uint32_t (**dump)[2], uint32_t *n_regs);
+
+- int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd);
++ int (*hqd_sdma_dump)(struct kgd_dev *kgd,
++ uint32_t engine_id, uint32_t queue_id,
++ uint32_t (**dump)[2], uint32_t *n_regs);
+
+ bool (*hqd_is_occupied)(struct kgd_dev *kgd, uint64_t queue_address,
+ uint32_t pipe_id, uint32_t queue_id);
+
+- int (*hqd_destroy)(struct kgd_dev *kgd, uint32_t reset_type,
++ int (*hqd_destroy)(struct kgd_dev *kgd, void *mqd, uint32_t reset_type,
+ unsigned int timeout, uint32_t pipe_id,
+ uint32_t queue_id);
+
+@@ -168,7 +313,7 @@ struct kfd2kgd_calls {
+
+ int (*hqd_sdma_destroy)(struct kgd_dev *kgd, void *mqd,
+ unsigned int timeout);
+-
++
+ int (*address_watch_disable)(struct kgd_dev *kgd);
+ int (*address_watch_execute)(struct kgd_dev *kgd,
+ unsigned int watch_point_id,
+@@ -187,11 +332,72 @@ struct kfd2kgd_calls {
+ uint16_t (*get_atc_vmid_pasid_mapping_pasid)(
+ struct kgd_dev *kgd,
+ uint8_t vmid);
++ uint32_t (*read_vmid_from_vmfault_reg)(struct kgd_dev *kgd);
+ void (*write_vmid_invalidate_request)(struct kgd_dev *kgd,
+ uint8_t vmid);
+
++ int (*invalidate_tlbs)(struct kgd_dev *kgd, uint16_t pasid);
++
++ int (*sync_memory)(struct kgd_dev *kgd, struct kgd_mem *mem, bool intr);
++
++ int (*alloc_memory_of_gpu)(struct kgd_dev *kgd, uint64_t va,
++ uint64_t size, void *vm,
++ struct kgd_mem **mem, uint64_t *offset,
++ uint32_t flags);
++ int (*free_memory_of_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem,
++ void *vm);
++ int (*map_memory_to_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem,
++ void *vm);
++ int (*unmap_memory_to_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem,
++ void *vm);
++
+ uint16_t (*get_fw_version)(struct kgd_dev *kgd,
+ enum kgd_engine_type type);
++
++ void (*set_num_of_requests)(struct kgd_dev *kgd,
++ uint8_t num_of_requests);
++ int (*alloc_memory_of_scratch)(struct kgd_dev *kgd,
++ uint64_t va, uint32_t vmid);
++ int (*write_config_static_mem)(struct kgd_dev *kgd, bool swizzle_enable,
++ uint8_t element_size, uint8_t index_stride, uint8_t mtype);
++ void (*get_cu_info)(struct kgd_dev *kgd,
++ struct kfd_cu_info *cu_info);
++ int (*mmap_bo)(struct kgd_dev *kgd, struct vm_area_struct *vma);
++ int (*map_gtt_bo_to_kernel)(struct kgd_dev *kgd,
++ struct kgd_mem *mem, void **kptr);
++ void (*set_vm_context_page_table_base)(struct kgd_dev *kgd, uint32_t vmid,
++ uint32_t page_table_base);
++
++ int (*pin_get_sg_table_bo)(struct kgd_dev *kgd,
++ struct kgd_mem *mem, uint64_t offset,
++ uint64_t size, struct sg_table **ret_sg);
++ void (*unpin_put_sg_table_bo)(struct kgd_mem *mem,
++ struct sg_table *sg);
++
++ int (*get_dmabuf_info)(struct kgd_dev *kgd, int dma_buf_fd,
++ struct kgd_dev **dma_buf_kgd, uint64_t *bo_size,
++ void *metadata_buffer, size_t buffer_size,
++ uint32_t *metadata_size, uint32_t *flags);
++ int (*import_dmabuf)(struct kgd_dev *kgd, struct dma_buf *dmabuf,
++ uint64_t va, void *vm, struct kgd_mem **mem,
++ uint64_t *size, uint64_t *mmap_offset);
++ int (*export_dmabuf)(struct kgd_dev *kgd, void *vm, struct kgd_mem *mem,
++ struct dma_buf **dmabuf);
++
++ int (*get_vm_fault_info)(struct kgd_dev *kgd,
++ struct kfd_vm_fault_info *info);
++ int (*submit_ib)(struct kgd_dev *kgd, enum kgd_engine_type engine,
++ uint32_t vmid, uint64_t gpu_addr,
++ uint32_t *ib_cmd, uint32_t ib_len);
++ int (*get_tile_config)(struct kgd_dev *kgd,
++ struct tile_config *config);
++
++ int (*restore_process_bos)(void *process_info, struct dma_fence **ef);
++ int (*copy_mem_to_mem)(struct kgd_dev *kgd, struct kgd_mem *src_mem,
++ uint64_t src_offset, struct kgd_mem *dst_mem,
++ uint64_t dest_offset, uint64_t size,
++ struct dma_fence **f, uint64_t *actual_size);
++ uint64_t (*get_vram_usage)(struct kgd_dev *kgd);
+ };
+
+ /**
+@@ -210,6 +416,13 @@ struct kfd2kgd_calls {
+ *
+ * @resume: Notifies amdkfd about a resume action done to a kgd device
+ *
++ * @quiesce_mm: Quiesce all user queue access to specified MM address space
++ *
++ * @resume_mm: Resume user queue access to specified MM address space
++ *
++ * @schedule_evict_and_restore_process: Schedules work queue that will prepare
++ * for safe eviction of KFD BOs that belong to the specified process.
++ *
+ * This structure contains function callback pointers so the kgd driver
+ * will notify to the amdkfd about certain status changes.
+ *
+@@ -224,9 +437,13 @@ struct kgd2kfd_calls {
+ void (*interrupt)(struct kfd_dev *kfd, const void *ih_ring_entry);
+ void (*suspend)(struct kfd_dev *kfd);
+ int (*resume)(struct kfd_dev *kfd);
++ int (*quiesce_mm)(struct kfd_dev *kfd, struct mm_struct *mm);
++ int (*resume_mm)(struct kfd_dev *kfd, struct mm_struct *mm);
++ int (*schedule_evict_and_restore_process)(struct mm_struct *mm,
++ struct dma_fence *fence);
+ };
+
+ int kgd2kfd_init(unsigned interface_version,
+ const struct kgd2kfd_calls **g2f);
+
+-#endif /* KGD_KFD_INTERFACE_H_INCLUDED */
++#endif /* KGD_KFD_INTERFACE_H_INCLUDED */
+diff --git a/drivers/gpu/drm/amd/include/v9_structs.h b/drivers/gpu/drm/amd/include/v9_structs.h
+old mode 100644
+new mode 100755
+index 2fb25ab..ceaf493
+--- a/drivers/gpu/drm/amd/include/v9_structs.h
++++ b/drivers/gpu/drm/amd/include/v9_structs.h
+@@ -29,10 +29,10 @@ struct v9_sdma_mqd {
+ uint32_t sdmax_rlcx_rb_base;
+ uint32_t sdmax_rlcx_rb_base_hi;
+ uint32_t sdmax_rlcx_rb_rptr;
++ uint32_t sdmax_rlcx_rb_rptr_hi;
+ uint32_t sdmax_rlcx_rb_wptr;
++ uint32_t sdmax_rlcx_rb_wptr_hi;
+ uint32_t sdmax_rlcx_rb_wptr_poll_cntl;
+- uint32_t sdmax_rlcx_rb_wptr_poll_addr_hi;
+- uint32_t sdmax_rlcx_rb_wptr_poll_addr_lo;
+ uint32_t sdmax_rlcx_rb_rptr_addr_hi;
+ uint32_t sdmax_rlcx_rb_rptr_addr_lo;
+ uint32_t sdmax_rlcx_ib_cntl;
+@@ -44,29 +44,29 @@ struct v9_sdma_mqd {
+ uint32_t sdmax_rlcx_skip_cntl;
+ uint32_t sdmax_rlcx_context_status;
+ uint32_t sdmax_rlcx_doorbell;
+- uint32_t sdmax_rlcx_virtual_addr;
+- uint32_t sdmax_rlcx_ape1_cntl;
++ uint32_t sdmax_rlcx_status;
+ uint32_t sdmax_rlcx_doorbell_log;
+- uint32_t reserved_22;
+- uint32_t reserved_23;
+- uint32_t reserved_24;
+- uint32_t reserved_25;
+- uint32_t reserved_26;
+- uint32_t reserved_27;
+- uint32_t reserved_28;
+- uint32_t reserved_29;
+- uint32_t reserved_30;
+- uint32_t reserved_31;
+- uint32_t reserved_32;
+- uint32_t reserved_33;
+- uint32_t reserved_34;
+- uint32_t reserved_35;
+- uint32_t reserved_36;
+- uint32_t reserved_37;
+- uint32_t reserved_38;
+- uint32_t reserved_39;
+- uint32_t reserved_40;
+- uint32_t reserved_41;
++ uint32_t sdmax_rlcx_watermark;
++ uint32_t sdmax_rlcx_doorbell_offset;
++ uint32_t sdmax_rlcx_csa_addr_lo;
++ uint32_t sdmax_rlcx_csa_addr_hi;
++ uint32_t sdmax_rlcx_ib_sub_remain;
++ uint32_t sdmax_rlcx_preempt;
++ uint32_t sdmax_rlcx_dummy_reg;
++ uint32_t sdmax_rlcx_rb_wptr_poll_addr_hi;
++ uint32_t sdmax_rlcx_rb_wptr_poll_addr_lo;
++ uint32_t sdmax_rlcx_rb_aql_cntl;
++ uint32_t sdmax_rlcx_minor_ptr_update;
++ uint32_t sdmax_rlcx_midcmd_data0;
++ uint32_t sdmax_rlcx_midcmd_data1;
++ uint32_t sdmax_rlcx_midcmd_data2;
++ uint32_t sdmax_rlcx_midcmd_data3;
++ uint32_t sdmax_rlcx_midcmd_data4;
++ uint32_t sdmax_rlcx_midcmd_data5;
++ uint32_t sdmax_rlcx_midcmd_data6;
++ uint32_t sdmax_rlcx_midcmd_data7;
++ uint32_t sdmax_rlcx_midcmd_data8;
++ uint32_t sdmax_rlcx_midcmd_cntl;
+ uint32_t reserved_42;
+ uint32_t reserved_43;
+ uint32_t reserved_44;
+diff --git a/drivers/gpu/drm/amd/include/vi_structs.h b/drivers/gpu/drm/amd/include/vi_structs.h
+old mode 100644
+new mode 100755
+index 2023482..717fbae
+--- a/drivers/gpu/drm/amd/include/vi_structs.h
++++ b/drivers/gpu/drm/amd/include/vi_structs.h
+@@ -153,6 +153,8 @@ struct vi_sdma_mqd {
+ uint32_t reserved_125;
+ uint32_t reserved_126;
+ uint32_t reserved_127;
++ uint32_t sdma_engine_id;
++ uint32_t sdma_queue_id;
+ };
+
+ struct vi_mqd {
+diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
+old mode 100644
+new mode 100755
+index 2292462..82d97f3
+--- a/drivers/pci/pci.c
++++ b/drivers/pci/pci.c
+@@ -2983,6 +2983,87 @@ bool pci_acs_path_enabled(struct pci_dev *start,
+ }
+
+ /**
++ * pci_enable_atomic_ops_to_root - enable AtomicOp requests to root port
++ * @dev: the PCI device
++ *
++ * Return 0 if the device is capable of generating AtomicOp requests,
++ * all upstream bridges support AtomicOp routing, egress blocking is disabled
++ * on all upstream ports, and the root port supports 32-bit, 64-bit and/or
++ * 128-bit AtomicOp completion, or negative otherwise.
++ */
++int pci_enable_atomic_ops_to_root(struct pci_dev *dev)
++{
++ struct pci_bus *bus = dev->bus;
++
++ if (!pci_is_pcie(dev))
++ return -EINVAL;
++
++ switch (pci_pcie_type(dev)) {
++ /*
++ * PCIe 3.0, 6.15 specifies that endpoints and root ports are permitted
++ * to implement AtomicOp requester capabilities.
++ */
++ case PCI_EXP_TYPE_ENDPOINT:
++ case PCI_EXP_TYPE_LEG_END:
++ case PCI_EXP_TYPE_RC_END:
++ break;
++ default:
++ return -EINVAL;
++ }
++
++ while (bus->parent) {
++ struct pci_dev *bridge = bus->self;
++ u32 cap;
++
++ pcie_capability_read_dword(bridge, PCI_EXP_DEVCAP2, &cap);
++
++ switch (pci_pcie_type(bridge)) {
++ /*
++ * Upstream, downstream and root ports may implement AtomicOp
++ * routing capabilities. AtomicOp routing via a root port is
++ * not considered.
++ */
++ case PCI_EXP_TYPE_UPSTREAM:
++ case PCI_EXP_TYPE_DOWNSTREAM:
++ if (!(cap & PCI_EXP_DEVCAP2_ATOMIC_ROUTE))
++ return -EINVAL;
++ break;
++
++ /*
++ * Root ports are permitted to implement AtomicOp completion
++ * capabilities.
++ */
++ case PCI_EXP_TYPE_ROOT_PORT:
++ if (!(cap & (PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
++ PCI_EXP_DEVCAP2_ATOMIC_COMP64 |
++ PCI_EXP_DEVCAP2_ATOMIC_COMP128)))
++ return -EINVAL;
++ break;
++ }
++
++ /*
++ * Upstream ports may block AtomicOps on egress.
++ */
++ if (pci_pcie_type(bridge) == PCI_EXP_TYPE_UPSTREAM) {
++ u32 ctl2;
++
++ pcie_capability_read_dword(bridge, PCI_EXP_DEVCTL2,
++ &ctl2);
++ if (ctl2 & PCI_EXP_DEVCTL2_ATOMIC_BLOCK)
++ return -EINVAL;
++ }
++
++ bus = bus->parent;
++ }
++
++ pcie_capability_set_word(dev, PCI_EXP_DEVCTL2,
++ PCI_EXP_DEVCTL2_ATOMIC_REQ);
++
++ return 0;
++}
++EXPORT_SYMBOL(pci_enable_atomic_ops_to_root);
++
++/**
+ * pci_swizzle_interrupt_pin - swizzle INTx for device behind bridge
+ * @dev: the PCI device
+ * @pin: the INTx pin (1=INTA, 2=INTB, 3=INTC, 4=INTD)
+diff --git a/include/drm/amd_rdma.h b/include/drm/amd_rdma.h
+new file mode 100644
+index 0000000..b0cab3c
+--- /dev/null
++++ b/include/drm/amd_rdma.h
+@@ -0,0 +1,70 @@
++/*
++ * Copyright 2015 Advanced Micro Devices, Inc.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
++ * OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++/* @file This file defines kernel interfaces used to communicate with amdkfd */
++
++#ifndef AMD_RDMA_H_
++#define AMD_RDMA_H_
++
++
++/**
++ * Structure describing information needed for P2P access from another device
++ * to a specific location in GPU memory
++ */
++struct amd_p2p_info {
++ uint64_t va; /**< Specify user virt. address
++ * which this page table
++ * describes
++ */
++ uint64_t size; /**< Specify total size of
++ * allocation
++ */
++ struct pid *pid; /**< Specify process pid to which
++ * virtual address belongs
++ */
++ struct sg_table *pages; /**< Specify DMA/Bus addresses */
++ void *priv; /**< Pointer set by AMD kernel
++ * driver
++ */
++};
++
++/**
++ * Structure providing function pointers to support RDMA/P2P requirements
++ * for access to specific locations of GPU memory.
++ */
++struct amd_rdma_interface {
++ int (*get_pages)(uint64_t address, uint64_t length, struct pid *pid,
++ struct amd_p2p_info **amd_p2p_data,
++ void (*free_callback)(void *client_priv),
++ void *client_priv);
++ int (*put_pages)(struct amd_p2p_info **amd_p2p_data);
++ int (*is_gpu_address)(uint64_t address, struct pid *pid);
++ int (*get_page_size)(uint64_t address, uint64_t length, struct pid *pid,
++ unsigned long *page_size);
++};
++
++
++int amdkfd_query_rdma_interface(const struct amd_rdma_interface **rdma);
++
++
++#endif /* AMD_RDMA_H_ */
++
+diff --git a/include/linux/pci.h b/include/linux/pci.h
+old mode 100644
+new mode 100755
+index b1abbcc..3df545d
+--- a/include/linux/pci.h
++++ b/include/linux/pci.h
+@@ -2072,6 +2072,7 @@ void pci_request_acs(void);
+ bool pci_acs_enabled(struct pci_dev *pdev, u16 acs_flags);
+ bool pci_acs_path_enabled(struct pci_dev *start,
+ struct pci_dev *end, u16 acs_flags);
++int pci_enable_atomic_ops_to_root(struct pci_dev *dev);
+
+ #define PCI_VPD_LRDT 0x80 /* Large Resource Data Type */
+ #define PCI_VPD_LRDT_ID(x) ((x) | PCI_VPD_LRDT)
+diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
+index 5bb2b45..de5367c 100644
+--- a/include/uapi/linux/kfd_ioctl.h
++++ b/include/uapi/linux/kfd_ioctl.h
+@@ -23,15 +23,15 @@
+ #ifndef KFD_IOCTL_H_INCLUDED
+ #define KFD_IOCTL_H_INCLUDED
+
+-#include <drm/drm.h>
++#include <linux/types.h>
+ #include <linux/ioctl.h>
+
+ #define KFD_IOCTL_MAJOR_VERSION 1
+-#define KFD_IOCTL_MINOR_VERSION 1
++#define KFD_IOCTL_MINOR_VERSION 2
+
+ struct kfd_ioctl_get_version_args {
+- __u32 major_version; /* from KFD */
+- __u32 minor_version; /* from KFD */
++ uint32_t major_version; /* from KFD */
++ uint32_t minor_version; /* from KFD */
+ };
+
+ /* For kfd_ioctl_create_queue_args.queue_type. */
+@@ -43,36 +43,51 @@ struct kfd_ioctl_get_version_args {
+ #define KFD_MAX_QUEUE_PRIORITY 15
+
+ struct kfd_ioctl_create_queue_args {
+- __u64 ring_base_address; /* to KFD */
+- __u64 write_pointer_address; /* from KFD */
+- __u64 read_pointer_address; /* from KFD */
+- __u64 doorbell_offset; /* from KFD */
+-
+- __u32 ring_size; /* to KFD */
+- __u32 gpu_id; /* to KFD */
+- __u32 queue_type; /* to KFD */
+- __u32 queue_percentage; /* to KFD */
+- __u32 queue_priority; /* to KFD */
+- __u32 queue_id; /* from KFD */
+-
+- __u64 eop_buffer_address; /* to KFD */
+- __u64 eop_buffer_size; /* to KFD */
+- __u64 ctx_save_restore_address; /* to KFD */
+- __u64 ctx_save_restore_size; /* to KFD */
++ uint64_t ring_base_address; /* to KFD */
++ uint64_t write_pointer_address; /* from KFD */
++ uint64_t read_pointer_address; /* from KFD */
++ uint64_t doorbell_offset; /* from KFD */
++
++ uint32_t ring_size; /* to KFD */
++ uint32_t gpu_id; /* to KFD */
++ uint32_t queue_type; /* to KFD */
++ uint32_t queue_percentage; /* to KFD */
++ uint32_t queue_priority; /* to KFD */
++ uint32_t queue_id; /* from KFD */
++
++ uint64_t eop_buffer_address; /* to KFD */
++ uint64_t eop_buffer_size; /* to KFD */
++ uint64_t ctx_save_restore_address; /* to KFD */
++ uint32_t ctx_save_restore_size; /* to KFD */
++ uint32_t ctl_stack_size; /* to KFD */
+ };
+
+ struct kfd_ioctl_destroy_queue_args {
+- __u32 queue_id; /* to KFD */
+- __u32 pad;
++ uint32_t queue_id; /* to KFD */
++ uint32_t pad;
+ };
+
+ struct kfd_ioctl_update_queue_args {
+- __u64 ring_base_address; /* to KFD */
++ uint64_t ring_base_address; /* to KFD */
++
++ uint32_t queue_id; /* to KFD */
++ uint32_t ring_size; /* to KFD */
++ uint32_t queue_percentage; /* to KFD */
++ uint32_t queue_priority; /* to KFD */
++};
+
+- __u32 queue_id; /* to KFD */
+- __u32 ring_size; /* to KFD */
+- __u32 queue_percentage; /* to KFD */
+- __u32 queue_priority; /* to KFD */
++struct kfd_ioctl_set_cu_mask_args {
++ uint32_t queue_id; /* to KFD */
++ uint32_t num_cu_mask; /* to KFD */
++ uint64_t cu_mask_ptr; /* to KFD */
++};
++
++struct kfd_ioctl_get_queue_wave_state_args {
++ uint64_t ctl_stack_address; /* to KFD */
++ uint32_t ctl_stack_used_size; /* from KFD */
++ uint32_t save_area_used_size; /* from KFD */
++ uint32_t queue_id; /* to KFD */
++ uint32_t pad;
+ };
+
+ /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */
+@@ -80,13 +95,20 @@ struct kfd_ioctl_update_queue_args {
+ #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1
+
+ struct kfd_ioctl_set_memory_policy_args {
+- __u64 alternate_aperture_base; /* to KFD */
+- __u64 alternate_aperture_size; /* to KFD */
++ uint64_t alternate_aperture_base; /* to KFD */
++ uint64_t alternate_aperture_size; /* to KFD */
++
++ uint32_t gpu_id; /* to KFD */
++ uint32_t default_policy; /* to KFD */
++ uint32_t alternate_policy; /* to KFD */
++ uint32_t pad;
++};
+
+- __u32 gpu_id; /* to KFD */
+- __u32 default_policy; /* to KFD */
+- __u32 alternate_policy; /* to KFD */
+- __u32 pad;
++struct kfd_ioctl_set_trap_handler_args {
++ uint64_t tba_addr;
++ uint64_t tma_addr;
++ uint32_t gpu_id; /* to KFD */
++ uint32_t pad;
+ };
+
+ /*
+@@ -97,35 +119,52 @@ struct kfd_ioctl_set_memory_policy_args {
+ */
+
+ struct kfd_ioctl_get_clock_counters_args {
+- __u64 gpu_clock_counter; /* from KFD */
+- __u64 cpu_clock_counter; /* from KFD */
+- __u64 system_clock_counter; /* from KFD */
+- __u64 system_clock_freq; /* from KFD */
++ uint64_t gpu_clock_counter; /* from KFD */
++ uint64_t cpu_clock_counter; /* from KFD */
++ uint64_t system_clock_counter; /* from KFD */
++ uint64_t system_clock_freq; /* from KFD */
+
+- __u32 gpu_id; /* to KFD */
+- __u32 pad;
++ uint32_t gpu_id; /* to KFD */
++ uint32_t pad;
+ };
+
+ #define NUM_OF_SUPPORTED_GPUS 7
+
+ struct kfd_process_device_apertures {
+- __u64 lds_base; /* from KFD */
+- __u64 lds_limit; /* from KFD */
+- __u64 scratch_base; /* from KFD */
+- __u64 scratch_limit; /* from KFD */
+- __u64 gpuvm_base; /* from KFD */
+- __u64 gpuvm_limit; /* from KFD */
+- __u32 gpu_id; /* from KFD */
+- __u32 pad;
++ uint64_t lds_base; /* from KFD */
++ uint64_t lds_limit; /* from KFD */
++ uint64_t scratch_base; /* from KFD */
++ uint64_t scratch_limit; /* from KFD */
++ uint64_t gpuvm_base; /* from KFD */
++ uint64_t gpuvm_limit; /* from KFD */
++ uint32_t gpu_id; /* from KFD */
++ uint32_t pad;
+ };
+
++/* This IOCTL and the limited NUM_OF_SUPPORTED_GPUS are deprecated. Use
++ * kfd_ioctl_get_process_apertures_new instead, which supports
++ * arbitrary numbers of GPUs.
++ */
+ struct kfd_ioctl_get_process_apertures_args {
+ struct kfd_process_device_apertures
+ process_apertures[NUM_OF_SUPPORTED_GPUS];/* from KFD */
+
+ /* from KFD, should be in the range [1 - NUM_OF_SUPPORTED_GPUS] */
+- __u32 num_of_nodes;
+- __u32 pad;
++ uint32_t num_of_nodes;
++ uint32_t pad;
++};
++
++struct kfd_ioctl_get_process_apertures_new_args {
++ /* User allocated. Pointer to struct kfd_process_device_apertures
++ * filled in by Kernel
++ */
++ uint64_t kfd_process_device_apertures_ptr;
++ /* to KFD - indicates amount of memory present in
++ * kfd_process_device_apertures_ptr
++ * from KFD - Number of entries filled by KFD.
++ */
++ uint32_t num_of_nodes;
++ uint32_t pad;
+ };
+
+ #define MAX_ALLOWED_NUM_POINTS 100
+@@ -133,103 +172,245 @@ struct kfd_ioctl_get_process_apertures_args {
+ #define MAX_ALLOWED_WAC_BUFF_SIZE 128
+
+ struct kfd_ioctl_dbg_register_args {
+- __u32 gpu_id; /* to KFD */
+- __u32 pad;
++ uint32_t gpu_id; /* to KFD */
++ uint32_t pad;
+ };
+
+ struct kfd_ioctl_dbg_unregister_args {
+- __u32 gpu_id; /* to KFD */
+- __u32 pad;
++ uint32_t gpu_id; /* to KFD */
++ uint32_t pad;
+ };
+
+ struct kfd_ioctl_dbg_address_watch_args {
+- __u64 content_ptr; /* a pointer to the actual content */
+- __u32 gpu_id; /* to KFD */
+- __u32 buf_size_in_bytes; /*including gpu_id and buf_size */
++ uint64_t content_ptr; /* a pointer to the actual content */
++ uint32_t gpu_id; /* to KFD */
++ uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */
+ };
+
+ struct kfd_ioctl_dbg_wave_control_args {
+- __u64 content_ptr; /* a pointer to the actual content */
+- __u32 gpu_id; /* to KFD */
+- __u32 buf_size_in_bytes; /*including gpu_id and buf_size */
++ uint64_t content_ptr; /* a pointer to the actual content */
++ uint32_t gpu_id; /* to KFD */
++ uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */
+ };
+
+ /* Matching HSA_EVENTTYPE */
+-#define KFD_IOC_EVENT_SIGNAL 0
+-#define KFD_IOC_EVENT_NODECHANGE 1
+-#define KFD_IOC_EVENT_DEVICESTATECHANGE 2
+-#define KFD_IOC_EVENT_HW_EXCEPTION 3
+-#define KFD_IOC_EVENT_SYSTEM_EVENT 4
+-#define KFD_IOC_EVENT_DEBUG_EVENT 5
+-#define KFD_IOC_EVENT_PROFILE_EVENT 6
+-#define KFD_IOC_EVENT_QUEUE_EVENT 7
+-#define KFD_IOC_EVENT_MEMORY 8
+-
+-#define KFD_IOC_WAIT_RESULT_COMPLETE 0
+-#define KFD_IOC_WAIT_RESULT_TIMEOUT 1
+-#define KFD_IOC_WAIT_RESULT_FAIL 2
+-
+-#define KFD_SIGNAL_EVENT_LIMIT 256
++#define KFD_IOC_EVENT_SIGNAL 0
++#define KFD_IOC_EVENT_NODECHANGE 1
++#define KFD_IOC_EVENT_DEVICESTATECHANGE 2
++#define KFD_IOC_EVENT_HW_EXCEPTION 3
++#define KFD_IOC_EVENT_SYSTEM_EVENT 4
++#define KFD_IOC_EVENT_DEBUG_EVENT 5
++#define KFD_IOC_EVENT_PROFILE_EVENT 6
++#define KFD_IOC_EVENT_QUEUE_EVENT 7
++#define KFD_IOC_EVENT_MEMORY 8
++
++#define KFD_IOC_WAIT_RESULT_COMPLETE 0
++#define KFD_IOC_WAIT_RESULT_TIMEOUT 1
++#define KFD_IOC_WAIT_RESULT_FAIL 2
++
++#define KFD_SIGNAL_EVENT_LIMIT 4096
+
+ struct kfd_ioctl_create_event_args {
+- __u64 event_page_offset; /* from KFD */
+- __u32 event_trigger_data; /* from KFD - signal events only */
+- __u32 event_type; /* to KFD */
+- __u32 auto_reset; /* to KFD */
+- __u32 node_id; /* to KFD - only valid for certain
++ uint64_t event_page_offset; /* from KFD */
++ uint32_t event_trigger_data; /* from KFD - signal events only */
++ uint32_t event_type; /* to KFD */
++ uint32_t auto_reset; /* to KFD */
++ uint32_t node_id; /* to KFD - only valid for certain
+ event types */
+- __u32 event_id; /* from KFD */
+- __u32 event_slot_index; /* from KFD */
++ uint32_t event_id; /* from KFD */
++ uint32_t event_slot_index; /* from KFD */
+ };
+
+ struct kfd_ioctl_destroy_event_args {
+- __u32 event_id; /* to KFD */
+- __u32 pad;
++ uint32_t event_id; /* to KFD */
++ uint32_t pad;
+ };
+
+ struct kfd_ioctl_set_event_args {
+- __u32 event_id; /* to KFD */
+- __u32 pad;
++ uint32_t event_id; /* to KFD */
++ uint32_t pad;
+ };
+
+ struct kfd_ioctl_reset_event_args {
+- __u32 event_id; /* to KFD */
+- __u32 pad;
++ uint32_t event_id; /* to KFD */
++ uint32_t pad;
+ };
+
+ struct kfd_memory_exception_failure {
+- __u32 NotPresent; /* Page not present or supervisor privilege */
+- __u32 ReadOnly; /* Write access to a read-only page */
+- __u32 NoExecute; /* Execute access to a page marked NX */
+- __u32 pad;
++ uint32_t NotPresent; /* Page not present or supervisor privilege */
++ uint32_t ReadOnly; /* Write access to a read-only page */
++ uint32_t NoExecute; /* Execute access to a page marked NX */
++ uint32_t imprecise; /* Can't determine the exact fault address */
+ };
+
+-/* memory exception data*/
++/* memory exception data */
+ struct kfd_hsa_memory_exception_data {
+ struct kfd_memory_exception_failure failure;
+- __u64 va;
+- __u32 gpu_id;
+- __u32 pad;
++ uint64_t va;
++ uint32_t gpu_id;
++ uint32_t pad;
+ };
+
+-/* Event data*/
++/* Event data */
+ struct kfd_event_data {
+ union {
+ struct kfd_hsa_memory_exception_data memory_exception_data;
+ }; /* From KFD */
+- __u64 kfd_event_data_ext; /* pointer to an extension structure
+- for future exception types */
+- __u32 event_id; /* to KFD */
+- __u32 pad;
++ uint64_t kfd_event_data_ext; /* pointer to an extension structure
++ for future exception types */
++ uint32_t event_id; /* to KFD */
++ uint32_t pad;
+ };
+
+ struct kfd_ioctl_wait_events_args {
+- __u64 events_ptr; /* pointed to struct
++ uint64_t events_ptr; /* pointed to struct
+ kfd_event_data array, to KFD */
+- __u32 num_events; /* to KFD */
+- __u32 wait_for_all; /* to KFD */
+- __u32 timeout; /* to KFD */
+- __u32 wait_result; /* from KFD */
++ uint32_t num_events; /* to KFD */
++ uint32_t wait_for_all; /* to KFD */
++ uint32_t timeout; /* to KFD */
++ uint32_t wait_result; /* from KFD */
++};
++
++struct kfd_ioctl_alloc_memory_of_scratch_args {
++ uint64_t va_addr; /* to KFD */
++ uint64_t size; /* to KFD */
++ uint32_t gpu_id; /* to KFD */
++ uint32_t pad;
++};
++
++/* Allocation flags: memory types */
++#define KFD_IOC_ALLOC_MEM_FLAGS_VRAM (1 << 0)
++#define KFD_IOC_ALLOC_MEM_FLAGS_GTT (1 << 1)
++#define KFD_IOC_ALLOC_MEM_FLAGS_USERPTR (1 << 2)
++#define KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL (1 << 3)
++/* Allocation flags: attributes/access options */
++#define KFD_IOC_ALLOC_MEM_FLAGS_NONPAGED (1 << 31)
++#define KFD_IOC_ALLOC_MEM_FLAGS_READONLY (1 << 30)
++#define KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC (1 << 29)
++#define KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28)
++#define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27)
++#define KFD_IOC_ALLOC_MEM_FLAGS_EXECUTE_ACCESS (1 << 26)
++#define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 25)
++
++struct kfd_ioctl_alloc_memory_of_gpu_args {
++ uint64_t va_addr; /* to KFD */
++ uint64_t size; /* to KFD */
++ uint64_t handle; /* from KFD */
++ uint64_t mmap_offset; /* to KFD (userptr), from KFD (mmap offset) */
++ uint32_t gpu_id; /* to KFD */
++ uint32_t flags;
++};
++
++struct kfd_ioctl_free_memory_of_gpu_args {
++ uint64_t handle; /* to KFD */
++};
++
++struct kfd_ioctl_map_memory_to_gpu_args {
++ uint64_t handle; /* to KFD */
++ uint64_t device_ids_array_ptr; /* to KFD */
++ uint32_t device_ids_array_size; /* to KFD */
++ uint32_t pad;
++};
++
++struct kfd_ioctl_unmap_memory_from_gpu_args {
++ uint64_t handle; /* to KFD */
++ uint64_t device_ids_array_ptr; /* to KFD */
++ uint32_t device_ids_array_size; /* to KFD */
++ uint32_t pad;
++};
++
++struct kfd_ioctl_set_process_dgpu_aperture_args {
++ uint64_t dgpu_base;
++ uint64_t dgpu_limit;
++ uint32_t gpu_id;
++ uint32_t pad;
++};
++
++struct kfd_ioctl_get_dmabuf_info_args {
++ uint64_t size; /* from KFD */
++ uint64_t metadata_ptr; /* to KFD */
++ uint32_t metadata_size; /* to KFD (space allocated by user)
++ * from KFD (actual metadata size) */
++ uint32_t gpu_id; /* from KFD */
++ uint32_t flags; /* from KFD (KFD_IOC_ALLOC_MEM_FLAGS) */
++ uint32_t dmabuf_fd; /* to KFD */
++};
++
++struct kfd_ioctl_import_dmabuf_args {
++ uint64_t va_addr; /* to KFD */
++ uint64_t handle; /* from KFD */
++ uint32_t gpu_id; /* to KFD */
++ uint32_t dmabuf_fd; /* to KFD */
++};
++
++struct kfd_ioctl_ipc_export_handle_args {
++ uint64_t handle; /* to KFD */
++ uint32_t share_handle[4]; /* from KFD */
++ uint32_t gpu_id; /* to KFD */
++ uint32_t pad;
++};
++
++struct kfd_ioctl_ipc_import_handle_args {
++ uint64_t handle; /* from KFD */
++ uint64_t va_addr; /* to KFD */
++ uint64_t mmap_offset; /* from KFD */
++ uint32_t share_handle[4]; /* to KFD */
++ uint32_t gpu_id; /* to KFD */
++ uint32_t pad;
++};
++
++struct kfd_ioctl_get_tile_config_args {
++ /* to KFD: pointer to tile array */
++ uint64_t tile_config_ptr;
++ /* to KFD: pointer to macro tile array */
++ uint64_t macro_tile_config_ptr;
++ /* to KFD: array size allocated by user mode
++ * from KFD: array size filled by kernel
++ */
++ uint32_t num_tile_configs;
++ /* to KFD: array size allocated by user mode
++ * from KFD: array size filled by kernel
++ */
++ uint32_t num_macro_tile_configs;
++
++ uint32_t gpu_id; /* to KFD */
++ uint32_t gb_addr_config; /* from KFD */
++ uint32_t num_banks; /* from KFD */
++ uint32_t num_ranks; /* from KFD */
++ /* struct size can be extended later if needed
++ * without breaking ABI compatibility
++ */
++};
++
++struct kfd_memory_range {
++ uint64_t va_addr;
++ uint64_t size;
++};
++
++/* flags definitions
++ * BIT0: 0: read operation, 1: write operation.
++ * This also identifies whether the src or dst array belongs to the remote process
++ */
++#define KFD_CROSS_MEMORY_RW_BIT (1 << 0)
++#define KFD_SET_CROSS_MEMORY_READ(flags) (flags &= ~KFD_CROSS_MEMORY_RW_BIT)
++#define KFD_SET_CROSS_MEMORY_WRITE(flags) (flags |= KFD_CROSS_MEMORY_RW_BIT)
++#define KFD_IS_CROSS_MEMORY_WRITE(flags) (flags & KFD_CROSS_MEMORY_RW_BIT)
++
++struct kfd_ioctl_cross_memory_copy_args {
++ /* to KFD: Process ID of the remote process */
++ uint32_t pid;
++ /* to KFD: See above definition */
++ uint32_t flags;
++ /* to KFD: Source GPU VM range */
++ uint64_t src_mem_range_array;
++ /* to KFD: Size of above array */
++ uint64_t src_mem_array_size;
++ /* to KFD: Destination GPU VM range */
++ uint64_t dst_mem_range_array;
++ /* to KFD: Size of above array */
++ uint64_t dst_mem_array_size;
++ /* from KFD: Total amount of bytes copied */
++ uint64_t bytes_copied;
+ };
+
+
+@@ -287,7 +468,56 @@ struct kfd_ioctl_wait_events_args {
+ #define AMDKFD_IOC_DBG_WAVE_CONTROL \
+ AMDKFD_IOW(0x10, struct kfd_ioctl_dbg_wave_control_args)
+
++#define AMDKFD_IOC_ALLOC_MEMORY_OF_GPU \
++ AMDKFD_IOWR(0x11, struct kfd_ioctl_alloc_memory_of_gpu_args)
++
++#define AMDKFD_IOC_FREE_MEMORY_OF_GPU \
++ AMDKFD_IOWR(0x12, struct kfd_ioctl_free_memory_of_gpu_args)
++
++#define AMDKFD_IOC_MAP_MEMORY_TO_GPU \
++ AMDKFD_IOWR(0x13, struct kfd_ioctl_map_memory_to_gpu_args)
++
++#define AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU \
++ AMDKFD_IOWR(0x14, struct kfd_ioctl_unmap_memory_from_gpu_args)
++
++#define AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH \
++ AMDKFD_IOWR(0x15, struct kfd_ioctl_alloc_memory_of_scratch_args)
++
++#define AMDKFD_IOC_SET_CU_MASK \
++ AMDKFD_IOW(0x16, struct kfd_ioctl_set_cu_mask_args)
++
++#define AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE \
++ AMDKFD_IOW(0x17, \
++ struct kfd_ioctl_set_process_dgpu_aperture_args)
++
++#define AMDKFD_IOC_SET_TRAP_HANDLER \
++ AMDKFD_IOW(0x18, struct kfd_ioctl_set_trap_handler_args)
++
++#define AMDKFD_IOC_GET_PROCESS_APERTURES_NEW \
++ AMDKFD_IOWR(0x19, struct kfd_ioctl_get_process_apertures_new_args)
++
++#define AMDKFD_IOC_GET_DMABUF_INFO \
++ AMDKFD_IOWR(0x1A, struct kfd_ioctl_get_dmabuf_info_args)
++
++#define AMDKFD_IOC_IMPORT_DMABUF \
++ AMDKFD_IOWR(0x1B, struct kfd_ioctl_import_dmabuf_args)
++
++#define AMDKFD_IOC_GET_TILE_CONFIG \
++ AMDKFD_IOWR(0x1C, struct kfd_ioctl_get_tile_config_args)
++
++#define AMDKFD_IOC_IPC_IMPORT_HANDLE \
++ AMDKFD_IOWR(0x1D, struct kfd_ioctl_ipc_import_handle_args)
++
++#define AMDKFD_IOC_IPC_EXPORT_HANDLE \
++ AMDKFD_IOWR(0x1E, struct kfd_ioctl_ipc_export_handle_args)
++
++#define AMDKFD_IOC_CROSS_MEMORY_COPY \
++ AMDKFD_IOWR(0x1F, struct kfd_ioctl_cross_memory_copy_args)
++
++#define AMDKFD_IOC_GET_QUEUE_WAVE_STATE \
++ AMDKFD_IOWR(0x20, struct kfd_ioctl_get_queue_wave_state_args)
++
+ #define AMDKFD_COMMAND_START 0x01
+-#define AMDKFD_COMMAND_END 0x11
++#define AMDKFD_COMMAND_END 0x21
+
+ #endif
+diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
+old mode 100644
+new mode 100755
+index 87c2c84..1256851
+--- a/include/uapi/linux/pci_regs.h
++++ b/include/uapi/linux/pci_regs.h
+@@ -624,7 +624,9 @@
+ #define PCI_EXP_DEVCAP2 36 /* Device Capabilities 2 */
+ #define PCI_EXP_DEVCAP2_ARI 0x00000020 /* Alternative Routing-ID */
+ #define PCI_EXP_DEVCAP2_ATOMIC_ROUTE 0x00000040 /* Atomic Op routing */
++#define PCI_EXP_DEVCAP2_ATOMIC_COMP32 0x00000080 /* 32b AtomicOp completion */
+ #define PCI_EXP_DEVCAP2_ATOMIC_COMP64 0x00000100 /* Atomic 64-bit compare */
++#define PCI_EXP_DEVCAP2_ATOMIC_COMP128 0x00000200 /* 128b AtomicOp completion*/
+ #define PCI_EXP_DEVCAP2_LTR 0x00000800 /* Latency tolerance reporting */
+ #define PCI_EXP_DEVCAP2_OBFF_MASK 0x000c0000 /* OBFF support mechanism */
+ #define PCI_EXP_DEVCAP2_OBFF_MSG 0x00040000 /* New message signaling */
+@@ -634,6 +636,7 @@
+ #define PCI_EXP_DEVCTL2_ARI 0x0020 /* Alternative Routing-ID */
+ #define PCI_EXP_DEVCTL2_ATOMIC_REQ 0x0040 /* Set Atomic requests */
+ #define PCI_EXP_DEVCTL2_ATOMIC_EGRESS_BLOCK 0x0080 /* Block atomic egress */
++#define PCI_EXP_DEVCTL2_ATOMIC_BLOCK 0x0040 /* Block AtomicOp on egress */
+ #define PCI_EXP_DEVCTL2_IDO_REQ_EN 0x0100 /* Allow IDO for requests */
+ #define PCI_EXP_DEVCTL2_IDO_CMP_EN 0x0200 /* Allow IDO for completions */
+ #define PCI_EXP_DEVCTL2_LTR_EN 0x0400 /* Enable LTR mechanism */
+diff --git a/kernel/fork.c b/kernel/fork.c
+index a19ee25..70d8d5b 100644
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -1082,6 +1082,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
+
+ return mm;
+ }
++EXPORT_SYMBOL_GPL(mm_access);
+
+ static void complete_vfork_done(struct task_struct *tsk)
+ {
+--
+2.7.4
+