Diffstat (limited to 'meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1353-compilation-fix-for-amdkfd-porting.patch')
-rw-r--r-- | meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1353-compilation-fix-for-amdkfd-porting.patch | 8695
1 file changed, 8695 insertions, 0 deletions
diff --git a/meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1353-compilation-fix-for-amdkfd-porting.patch b/meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1353-compilation-fix-for-amdkfd-porting.patch
new file mode 100644
index 00000000..a27db153
--- /dev/null
+++ b/meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1353-compilation-fix-for-amdkfd-porting.patch
@@ -0,0 +1,8695 @@
+From 817ccd6f0987f83ddbf989602f0fbf320157f0a9 Mon Sep 17 00:00:00 2001
+From: Chaudhary Amit Kumar <chaudharyamit.kumar@amd.com>
+Date: Thu, 18 Oct 2018 12:42:04 +0530
+Subject: [PATCH 1353/4131] compilation fix for amdkfd porting
+
+Signed-off-by: Sanjay R Mehta <sanju.mehta@amd.com>
+Signed-off-by: Chaudhary Amit Kumar <chaudharyamit.kumar@amd.com>
+---
+ drivers/gpu/drm/amd/amdgpu/Makefile | 8 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 +
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 346 ++-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 185 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c | 196 ++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 537 ++++-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 590 ++++-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h | 62 +
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 1227 ++++++++++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 2578 +++++++++++++++++++++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 -
+ drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 3 +
+ drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 1 +
+ drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 +
+ drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c | 46 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h | 2 +
+ drivers/gpu/drm/amd/amdgpu/soc15d.h | 1 +
+ drivers/gpu/drm/amd/amdgpu/vid.h | 2 +
+ drivers/gpu/drm/amd/amdkfd/Makefile | 2 -
+ drivers/gpu/drm/amd/amdkfd/backport/backport.h | 7 -
+ drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 -
+ drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 4 -
+ drivers/gpu/drm/amd/amdkfd/kfd_device.c | 12 -
+ drivers/gpu/drm/amd/amdkfd/kfd_events.c | 28 -
+ drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 10 -
+ drivers/gpu/drm/amd/amdkfd/kfd_ipc.c | 8 -
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 4 -
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 4 -
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 4 -
+ drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 2 -
+ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 6 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_process.c | 52 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 22 -
+ drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 6 -
+ drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 4 -
+ drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 231 +-
+ drivers/gpu/drm/amd/include/v9_structs.h | 48 +-
+ drivers/gpu/drm/amd/include/vi_structs.h | 2 +
+ drivers/pci/pci.c | 81 +
+ include/drm/amd_rdma.h | 70 +
+ include/linux/pci.h | 1 +
+ include/uapi/linux/kfd_ioctl.h | 442 +++-
+ include/uapi/linux/pci_regs.h | 3 +
+ kernel/fork.c | 1 +
+ 44 files changed, 6315 insertions(+), 537 deletions(-)
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/Makefile
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu.h
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+ create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
+ create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h
+ create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+ create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/soc15d.h
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/vid.h
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdkfd/Makefile
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+ mode change 100644 => 100755 drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+ mode change 100644 => 100755 drivers/gpu/drm/amd/include/v9_structs.h
+ mode change 100644 => 100755 drivers/gpu/drm/amd/include/vi_structs.h
+ mode change 100644 => 100755 drivers/pci/pci.c
+ create mode 100644 include/drm/amd_rdma.h
+ mode change 100644 => 100755 include/linux/pci.h
+ mode change 100644 => 100755 include/uapi/linux/pci_regs.h
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
+old mode 100644
+new mode 100755
+index 57b8d5f..6b373d0
+--- a/drivers/gpu/drm/amd/amdgpu/Makefile
++++ b/drivers/gpu/drm/amd/amdgpu/Makefile
+@@ -32,12 +32,11 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
+ 	amdgpu_prime.o amdgpu_vm.o amdgpu_ib.o amdgpu_pll.o \
+ 	amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
+ 	amdgpu_gtt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o amdgpu_atomfirmware.o \
+-	amdgpu_queue_mgr.o amdgpu_vf_error.o amdgpu_sem.o
++	amdgpu_queue_mgr.o amdgpu_vf_error.o amdgpu_sem.o amdgpu_amdkfd_fence.o
+ 
+ # add asic specific block
+ amdgpu-$(CONFIG_DRM_AMDGPU_CIK)+= cik.o cik_ih.o kv_smc.o kv_dpm.o \
+ 	ci_smc.o ci_dpm.o dce_v8_0.o gfx_v7_0.o cik_sdma.o uvd_v4_2.o vce_v2_0.o \
+-	amdgpu_amdkfd_gfx_v7.o
+ 
+ amdgpu-$(CONFIG_DRM_AMDGPU_SI)+= si.o gmc_v6_0.o gfx_v6_0.o si_ih.o si_dma.o dce_v6_0.o si_dpm.o si_smc.o
+ 
+@@ -109,7 +108,10 @@ amdgpu-y += \
+ # add amdkfd interfaces
+ amdgpu-y += \
+ 	amdgpu_amdkfd.o \
+-	amdgpu_amdkfd_gfx_v8.o
++	amdgpu_amdkfd_gfx_v7.o \
++	amdgpu_amdkfd_gfx_v8.o \
++	amdgpu_amdkfd_gfx_v9.o \
++	amdgpu_amdkfd_gpuvm.o
+ 
+ # add cgs
+ amdgpu-y += amdgpu_cgs.o
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+old mode 100644
+new mode 100755
+index fe23de8..bcf95e7
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+@@ -184,6 +184,7 @@ struct amdgpu_cs_parser;
+ struct amdgpu_job;
+ struct amdgpu_irq_src;
+ struct amdgpu_fpriv;
++struct kfd_vm_fault_info;
+ struct amdgpu_bo_va_mapping;
+ 
+ enum amdgpu_cp_irq {
+@@ -403,6 +404,7 @@ struct amdgpu_gem_object {
+ 	struct amdgpu_bo *bo;
+ };
+ 
++struct kgd_mem;
+ #define gem_to_amdgpu_bo(gobj) container_of((gobj), struct amdgpu_gem_object, base)->bo
+ 
+ void amdgpu_gem_object_free(struct drm_gem_object *obj);
+@@ -543,6 +545,9 @@ struct amdgpu_mc {
+ 	u64 private_aperture_end;
+ 	/* protects concurrent invalidation */
+ 	spinlock_t invalidate_lock;
++
++	struct kfd_vm_fault_info *vm_fault_info;
++	atomic_t vm_fault_info_updated;
+ };
+ 
+ /*
+@@ -961,6 +966,7 @@ struct amdgpu_gfx_config {
+ };
+ 
+ struct amdgpu_cu_info {
++	uint32_t simd_per_cu;
+ 	uint32_t max_waves_per_simd;
+ 	uint32_t wave_front_size;
+ 	uint32_t max_scratch_slots_per_cu;
+@@ -1649,6 +1655,7 @@ struct amdgpu_device {
+ 	/* record hw reset is performed */
+ 	bool has_hw_reset;
+ 	u8 reset_magic[AMDGPU_RESET_MAGIC_NUM];
++	spinlock_t tlb_invalidation_lock;
+ 
+ 	/* record last mm index being written through WREG32*/
+ 	unsigned long last_mm_index;
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+old mode 100644
+new mode 100755
+index 7ec1915..ec8141f
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+@@ -20,23 +20,29 @@
+  * OTHER DEALINGS IN THE SOFTWARE.
+  */
+ 
++#undef pr_fmt
++#define pr_fmt(fmt) "kfd2kgd: " fmt
++
+ #include "amdgpu_amdkfd.h"
+-#include "amd_shared.h"
++#include <linux/dma-buf.h>
+ #include <drm/drmP.h>
+ #include "amdgpu.h"
+ #include "amdgpu_gfx.h"
+ #include <linux/module.h>
+ 
+-const struct kfd2kgd_calls *kfd2kgd;
++#define AMDKFD_SKIP_UNCOMPILED_CODE 1
++
+ const struct kgd2kfd_calls *kgd2kfd;
+-bool (*kgd2kfd_init_p)(unsigned, const struct kgd2kfd_calls**);
++bool (*kgd2kfd_init_p)(unsigned int, const struct kgd2kfd_calls**);
++
++unsigned int global_compute_vmid_bitmap = 0xFF00;
+ 
+ int amdgpu_amdkfd_init(void)
+ {
+ 	int ret;
+ 
+ #if defined(CONFIG_HSA_AMD_MODULE)
+-	int (*kgd2kfd_init_p)(unsigned, const struct kgd2kfd_calls**);
++	int (*kgd2kfd_init_p)(unsigned int, const struct kgd2kfd_calls**);
+ 
+ 	kgd2kfd_init_p = symbol_request(kgd2kfd_init);
+ 
+@@ -57,56 +63,68 @@ int amdgpu_amdkfd_init(void)
+ #else
+ 	ret = -ENOENT;
+ #endif
+-
++	amdgpu_amdkfd_gpuvm_init_mem_limits();
+ 	return ret;
+ }
+ 
+-bool amdgpu_amdkfd_load_interface(struct amdgpu_device *adev)
++void amdgpu_amdkfd_fini(void)
+ {
++	if (kgd2kfd) {
++		kgd2kfd->exit();
++		symbol_put(kgd2kfd_init);
++	}
++}
++
++void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev)
++{
++	const struct kfd2kgd_calls *kfd2kgd;
++
++	if (!kgd2kfd)
++		return;
++
+ 	switch (adev->asic_type) {
+ #ifdef CONFIG_DRM_AMDGPU_CIK
+ 	case CHIP_KAVERI:
++	case CHIP_HAWAII:
+ 		kfd2kgd = amdgpu_amdkfd_gfx_7_get_functions();
+ 		break;
+ #endif
+ 	case CHIP_CARRIZO:
++	case CHIP_TONGA:
++	case CHIP_FIJI:
++	case CHIP_POLARIS10:
++	case CHIP_POLARIS11:
+ 		kfd2kgd = amdgpu_amdkfd_gfx_8_0_get_functions();
+ 		break;
++	case CHIP_VEGA10:
++	case CHIP_RAVEN:
++		kfd2kgd = amdgpu_amdkfd_gfx_9_0_get_functions();
++		break;
+ 	default:
+-		return false;
+-	}
+-
+-	return true;
+-}
+-
+-void amdgpu_amdkfd_fini(void)
+-{
+-	if (kgd2kfd) {
+-		kgd2kfd->exit();
+-		symbol_put(kgd2kfd_init);
++		dev_info(adev->dev, "kfd not supported on this ASIC\n");
++		return;
+ 	}
+-}
+ 
+-void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev)
+-{
+-	if (kgd2kfd)
+-		adev->kfd = kgd2kfd->probe((struct kgd_dev *)adev,
+-					   adev->pdev, kfd2kgd);
++	adev->kfd = kgd2kfd->probe((struct kgd_dev *)adev,
++				   adev->pdev, kfd2kgd);
+ }
+ 
+ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
+ {
+ 	int i;
+ 	int last_valid_bit;
++
+ 	if (adev->kfd) {
+ 		struct kgd2kfd_shared_resources gpu_resources = {
+-			.compute_vmid_bitmap = 0xFF00,
++			.compute_vmid_bitmap = global_compute_vmid_bitmap,
+ 			.num_pipe_per_mec = adev->gfx.mec.num_pipe_per_mec,
+-			.num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe
++			.num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe,
++			.gpuvm_size = (uint64_t)amdgpu_vm_size << 30
+ 		};
+ 
+ 		/* this is going to have a few of the MSBs set that we need to
+-		 * clear */
++		 * clear
++		 */
+ 		bitmap_complement(gpu_resources.queue_bitmap,
+ 				  adev->gfx.mec.queue_bitmap,
+ 				  KGD_MAX_QUEUES);
+@@ -120,7 +138,8 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
+ 				  gpu_resources.queue_bitmap);
+ 
+ 		/* According to linux/bitmap.h we shouldn't use bitmap_clear if
+-		 * nbits is not compile time constant */
++		 * nbits is not compile time constant
++		 */
+ 		last_valid_bit = 1 /* only first MEC can have compute queues */
+ 				* adev->gfx.mec.num_pipe_per_mec
+ 				* adev->gfx.mec.num_queue_per_pipe;
+@@ -131,6 +150,28 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
+ 			&gpu_resources.doorbell_physical_address,
+ 			&gpu_resources.doorbell_aperture_size,
+ 			&gpu_resources.doorbell_start_offset);
++		if (adev->asic_type >= CHIP_VEGA10) {
++			/* On SOC15 the BIF is involved in routing
++			 * doorbells using the low 12 bits of the
++			 * address. Communicate the assignments to
++			 * KFD. KFD uses two doorbell pages per
++			 * process in case of 64-bit doorbells so we
++			 * can use each doorbell assignment twice.
++			 */
++			gpu_resources.sdma_doorbell[0][0] =
++				AMDGPU_DOORBELL64_sDMA_ENGINE0;
++			gpu_resources.sdma_doorbell[0][1] =
++				AMDGPU_DOORBELL64_sDMA_ENGINE0 + 0x200;
++			gpu_resources.sdma_doorbell[1][0] =
++				AMDGPU_DOORBELL64_sDMA_ENGINE1;
++			gpu_resources.sdma_doorbell[1][1] =
++				AMDGPU_DOORBELL64_sDMA_ENGINE1 + 0x200;
++			/* Doorbells 0x0f0-0ff and 0x2f0-2ff are reserved for
++			 * SDMA, IH and VCN. So don't use them for the CP.
++			 */
++			gpu_resources.reserved_doorbell_mask = 0x1f0;
++			gpu_resources.reserved_doorbell_val = 0x0f0;
++		}
+ 
+ 		kgd2kfd->device_init(adev->kfd, &gpu_resources);
+ 	}
+@@ -167,24 +208,81 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev)
+ 	return r;
+ }
+ 
++int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
++			    uint32_t vmid, uint64_t gpu_addr,
++			    uint32_t *ib_cmd, uint32_t ib_len)
++{
++	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
++	struct amdgpu_job *job;
++	struct amdgpu_ib *ib;
++	struct amdgpu_ring *ring;
++	struct dma_fence *f = NULL;
++	int ret;
++
++	switch (engine) {
++	case KGD_ENGINE_MEC1:
++		ring = &adev->gfx.compute_ring[0];
++		break;
++	case KGD_ENGINE_SDMA1:
++		ring = &adev->sdma.instance[0].ring;
++		break;
++	case KGD_ENGINE_SDMA2:
++		ring = &adev->sdma.instance[1].ring;
++		break;
++	default:
++		pr_err("Invalid engine in IB submission: %d\n", engine);
++		ret = -EINVAL;
++		goto err;
++	}
++
++	ret = amdgpu_job_alloc(adev, 1, &job, NULL);
++	if (ret)
++		goto err;
++
++	ib = &job->ibs[0];
++	memset(ib, 0, sizeof(struct amdgpu_ib));
++
++	ib->gpu_addr = gpu_addr;
++	ib->ptr = ib_cmd;
++	ib->length_dw = ib_len;
++	/* This works for NO_HWS. TODO: need to handle without knowing VMID */
++	job->vm_id = vmid;
++
++	ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);
++	if (ret) {
++		DRM_ERROR("amdgpu: failed to schedule IB.\n");
++		goto err_ib_sched;
++	}
++
++	ret = dma_fence_wait(f, false);
++
++err_ib_sched:
++	dma_fence_put(f);
++	amdgpu_job_free(job);
++err:
++	return ret;
++}
++
++u32 pool_to_domain(enum kgd_memory_pool p)
++{
++	switch (p) {
++	case KGD_POOL_FRAMEBUFFER: return AMDGPU_GEM_DOMAIN_VRAM;
++	default: return AMDGPU_GEM_DOMAIN_GTT;
++	}
++}
++
+ int alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
+ 			void **mem_obj, uint64_t *gpu_addr,
+ 			void **cpu_ptr)
+ {
+ 	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+-	struct kgd_mem **mem = (struct kgd_mem **) mem_obj;
++	struct amdgpu_bo *bo = NULL;
+ 	int r;
+-
+-	BUG_ON(kgd == NULL);
+-	BUG_ON(gpu_addr == NULL);
+-	BUG_ON(cpu_ptr == NULL);
+-
+-	*mem = kmalloc(sizeof(struct kgd_mem), GFP_KERNEL);
+-	if ((*mem) == NULL)
+-		return -ENOMEM;
++	uint64_t gpu_addr_tmp = 0;
++	void *cpu_ptr_tmp = NULL;
+ 
+ 	r = amdgpu_bo_create(adev, size, PAGE_SIZE, true, AMDGPU_GEM_DOMAIN_GTT,
+-			AMDGPU_GEM_CREATE_CPU_GTT_USWC, NULL, NULL, &(*mem)->bo);
++			AMDGPU_GEM_CREATE_CPU_GTT_USWC, NULL, NULL, 0, &bo);
+ 	if (r) {
+ 		dev_err(adev->dev,
+ 			"failed to allocate BO for amdkfd (%d)\n", r);
+@@ -192,64 +290,87 @@ int alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
+ 	}
+ 
+ 	/* map the buffer */
+-	r = amdgpu_bo_reserve((*mem)->bo, true);
++	r = amdgpu_bo_reserve(bo, true);
+ 	if (r) {
+ 		dev_err(adev->dev, "(%d) failed to reserve bo for amdkfd\n", r);
+ 		goto allocate_mem_reserve_bo_failed;
+ 	}
+ 
+-	r = amdgpu_bo_pin((*mem)->bo, AMDGPU_GEM_DOMAIN_GTT,
+-				&(*mem)->gpu_addr);
++	r = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT,
++				&gpu_addr_tmp);
+ 	if (r) {
+ 		dev_err(adev->dev, "(%d) failed to pin bo for amdkfd\n", r);
+ 		goto allocate_mem_pin_bo_failed;
+ 	}
+-	*gpu_addr = (*mem)->gpu_addr;
+ 
+-	r = amdgpu_bo_kmap((*mem)->bo, &(*mem)->cpu_ptr);
++	r = amdgpu_bo_kmap(bo, &cpu_ptr_tmp);
+ 	if (r) {
+ 		dev_err(adev->dev,
+ 			"(%d) failed to map bo to kernel for amdkfd\n", r);
+ 		goto allocate_mem_kmap_bo_failed;
+ 	}
+-	*cpu_ptr = (*mem)->cpu_ptr;
+ 
+-	amdgpu_bo_unreserve((*mem)->bo);
++	*mem_obj = bo;
++	*gpu_addr = gpu_addr_tmp;
++	*cpu_ptr = cpu_ptr_tmp;
++
++	amdgpu_bo_unreserve(bo);
+ 
+ 	return 0;
+ 
+ allocate_mem_kmap_bo_failed:
+-	amdgpu_bo_unpin((*mem)->bo);
++	amdgpu_bo_unpin(bo);
+ allocate_mem_pin_bo_failed:
+-	amdgpu_bo_unreserve((*mem)->bo);
++	amdgpu_bo_unreserve(bo);
+ allocate_mem_reserve_bo_failed:
+-	amdgpu_bo_unref(&(*mem)->bo);
++	amdgpu_bo_unref(&bo);
+ 
+ 	return r;
+ }
+ 
+ void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj)
+ {
+-	struct kgd_mem *mem = (struct kgd_mem *) mem_obj;
++	struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj;
+ 
+-	BUG_ON(mem == NULL);
+-
+-	amdgpu_bo_reserve(mem->bo, true);
+-	amdgpu_bo_kunmap(mem->bo);
+-	amdgpu_bo_unpin(mem->bo);
+-	amdgpu_bo_unreserve(mem->bo);
+-	amdgpu_bo_unref(&(mem->bo));
+-	kfree(mem);
++	amdgpu_bo_reserve(bo, true);
++	amdgpu_bo_kunmap(bo);
++	amdgpu_bo_unpin(bo);
++	amdgpu_bo_unreserve(bo);
++	amdgpu_bo_unref(&(bo));
+ }
+ 
+-uint64_t get_vmem_size(struct kgd_dev *kgd)
++void get_local_mem_info(struct kgd_dev *kgd,
++			struct kfd_local_mem_info *mem_info)
+ {
+-	struct amdgpu_device *adev =
+-		(struct amdgpu_device *)kgd;
++	uint64_t address_mask;
++	resource_size_t aper_limit;
++	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+ 
+-	BUG_ON(kgd == NULL);
++	address_mask = adev->dev->dma_mask ? ~*adev->dev->dma_mask :
++			~((1ULL << 32) - 1);
++	aper_limit = adev->mc.aper_base + adev->mc.aper_size;
++
++	memset(mem_info, 0, sizeof(*mem_info));
++	if (!(adev->mc.aper_base & address_mask ||
++	      aper_limit & address_mask)) {
++		mem_info->local_mem_size_public = adev->mc.visible_vram_size;
++		mem_info->local_mem_size_private = adev->mc.real_vram_size -
++				adev->mc.visible_vram_size;
++	} else {
++		mem_info->local_mem_size_public = 0;
++		mem_info->local_mem_size_private = adev->mc.real_vram_size;
++	}
++	mem_info->vram_width = adev->mc.vram_width;
+ 
+-	return adev->mc.real_vram_size;
++	pr_debug("Address base: 0x%llx limit 0x%llx public 0x%llx private 0x%llx\n",
++			adev->mc.aper_base, aper_limit,
++			mem_info->local_mem_size_public,
++			mem_info->local_mem_size_private);
++
++	if (amdgpu_sriov_vf(adev))
++		mem_info->mem_clk_max = adev->clock.default_mclk / 100;
++	else
++		mem_info->mem_clk_max = amdgpu_dpm_get_mclk(adev, false) / 100;
+ }
+ 
+ uint64_t get_gpu_clock_counter(struct kgd_dev *kgd)
+@@ -271,3 +392,106 @@ uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd)
+ 
+ 	return amdgpu_dpm_get_sclk(adev, false) / 100;
+ }
++
++void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info)
++{
++	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
++	struct amdgpu_cu_info acu_info = adev->gfx.cu_info;
++
++	memset(cu_info, 0, sizeof(*cu_info));
++	if (sizeof(cu_info->cu_bitmap) != sizeof(acu_info.bitmap))
++		return;
++
++	cu_info->cu_active_number = acu_info.number;
++	cu_info->cu_ao_mask = acu_info.ao_cu_mask;
++	memcpy(&cu_info->cu_bitmap[0], &acu_info.bitmap[0],
++	       sizeof(acu_info.bitmap));
++	cu_info->num_shader_engines = adev->gfx.config.max_shader_engines;
++	cu_info->num_shader_arrays_per_engine = adev->gfx.config.max_sh_per_se;
++	cu_info->num_cu_per_sh = adev->gfx.config.max_cu_per_sh;
++	cu_info->simd_per_cu = acu_info.simd_per_cu;
++	cu_info->max_waves_per_simd = acu_info.max_waves_per_simd;
++	cu_info->wave_front_size = acu_info.wave_front_size;
++	cu_info->max_scratch_slots_per_cu = acu_info.max_scratch_slots_per_cu;
++	cu_info->lds_size = acu_info.lds_size;
++}
++
++int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd,
++				  struct kgd_dev **dma_buf_kgd,
++				  uint64_t *bo_size, void *metadata_buffer,
++				  size_t buffer_size, uint32_t *metadata_size,
++				  uint32_t *flags)
++{
++	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
++	struct dma_buf *dma_buf;
++	struct drm_gem_object *obj;
++	struct amdgpu_bo *bo;
++	uint64_t metadata_flags;
++	int r = -EINVAL;
++
++	dma_buf = dma_buf_get(dma_buf_fd);
++	if (IS_ERR(dma_buf))
++		return PTR_ERR(dma_buf);
++
++	if (dma_buf->ops != &drm_gem_prime_dmabuf_ops)
++		/* Can't handle non-graphics buffers */
++		goto out_put;
++
++	obj = dma_buf->priv;
++	if (obj->dev->driver != adev->ddev->driver)
++		/* Can't handle buffers from different drivers */
++		goto out_put;
++
++	adev = obj->dev->dev_private;
++	bo = gem_to_amdgpu_bo(obj);
++	if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM |
++				       AMDGPU_GEM_DOMAIN_GTT |
++				       AMDGPU_GEM_DOMAIN_DGMA)))
++		/* Only VRAM, GTT and DGMA BOs are supported */
++		goto out_put;
++
++	r = 0;
++	if (dma_buf_kgd)
++		*dma_buf_kgd = (struct kgd_dev *)adev;
++	if (bo_size)
++		*bo_size = amdgpu_bo_size(bo);
++	if (metadata_size)
++		*metadata_size = bo->metadata_size;
++	if (metadata_buffer)
++		r = amdgpu_bo_get_metadata(bo, metadata_buffer, buffer_size,
++					   metadata_size, &metadata_flags);
++	if (flags) {
++		/* If the preferred domain is DGMA, set flags to VRAM because
++		 * KFD doesn't support allocating DGMA memory
++		 */
++		*flags = (bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM |
++				AMDGPU_GEM_DOMAIN_DGMA)) ?
++			ALLOC_MEM_FLAGS_VRAM : ALLOC_MEM_FLAGS_GTT;
++
++		if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)
++			*flags |= ALLOC_MEM_FLAGS_PUBLIC;
++	}
++
++out_put:
++	dma_buf_put(dma_buf);
++	return r;
++}
++
++uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd)
++{
++	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
++	uint64_t usage =
++		amdgpu_vram_mgr_usage(&adev->mman.bdev.man[TTM_PL_VRAM]);
++	return usage;
++}
++
++bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev,
++			u32 vmid)
++{
++	if (adev->kfd) {
++		if ((1 << vmid) & global_compute_vmid_bitmap)
++			return true;
++	}
++
++	return false;
++}
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+old mode 100644
+new mode 100755
+index 6d3a10b..b259ba7
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+@@ -27,20 +27,109 @@
+ 
+ #include <linux/types.h>
+ #include <linux/mm.h>
++#include <linux/workqueue.h>
++#include <linux/mmu_context.h>
+ #include <kgd_kfd_interface.h>
++#include "amdgpu.h"
++
++extern const struct kgd2kfd_calls *kgd2kfd;
+ 
+ struct amdgpu_device;
+ 
++struct kfd_bo_va_list {
++	struct list_head bo_list;
++	struct amdgpu_bo_va *bo_va;
++	void *kgd_dev;
++	bool is_mapped;
++	bool map_fail;
++	uint64_t va;
++	uint64_t pte_flags;
++};
++
+ struct kgd_mem {
++	struct mutex lock;
+ 	struct amdgpu_bo *bo;
+-	uint64_t gpu_addr;
+-	void *cpu_ptr;
++	struct list_head bo_va_list;
++	/* protected by amdkfd_process_info.lock */
++	struct ttm_validate_buffer validate_list;
++	struct ttm_validate_buffer resv_list;
++	uint32_t domain;
++	unsigned int mapped_to_gpu_memory;
++	void *kptr;
++	uint64_t va;
++
++	uint32_t mapping_flags;
++
++	atomic_t invalid;
++	struct amdkfd_process_info *process_info;
++	struct page **user_pages;
++
++	struct amdgpu_sync sync;
++
++	/* flags bitfield */
++	bool coherent : 1;
++	bool no_substitute : 1;
++	bool aql_queue : 1;
++};
++
++/* KFD Memory Eviction */
++struct amdgpu_amdkfd_fence {
++	struct dma_fence base;
++	void *mm;
++	spinlock_t lock;
++	char timeline_name[TASK_COMM_LEN];
++};
++
++struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context,
++						 void *mm);
++bool amd_kfd_fence_check_mm(struct dma_fence *f, void *mm);
++struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f);
++
++struct amdkfd_process_info {
++	/* List head of all VMs that belong to a KFD process */
++	struct list_head vm_list_head;
++	/* List head for all KFD BOs that belong to a KFD process. */
++	struct list_head kfd_bo_list;
++	/* List of userptr BOs that are valid or invalid */
++	struct list_head userptr_valid_list;
++	struct list_head userptr_inval_list;
++	/* Lock to protect kfd_bo_list */
++	struct mutex lock;
++
++	/* Number of VMs */
++	unsigned int n_vms;
++	/* Eviction Fence */
++	struct amdgpu_amdkfd_fence *eviction_fence;
++
++	/* MMU-notifier related fields */
++	atomic_t evicted_bos;
++	struct delayed_work work;
++	struct pid *pid;
++};
++
++/* struct amdkfd_vm -
++ * For Memory Eviction KGD requires a mechanism to keep track of all KFD BOs
++ * belonging to a KFD process. All the VMs belonging to the same process point
++ * to the same amdkfd_process_info.
++ */
++struct amdkfd_vm {
++	/* Keep base as the first parameter for pointer compatibility between
++	 * amdkfd_vm and amdgpu_vm.
++	 */
++	struct amdgpu_vm base;
++
++	/* List node in amdkfd_process_info.vm_list_head*/
++	struct list_head vm_list_node;
++
++	struct amdgpu_device *adev;
++	/* Points to the KFD process VM info*/
++	struct amdkfd_process_info *process_info;
+ };
+ 
++
+ int amdgpu_amdkfd_init(void);
+ void amdgpu_amdkfd_fini(void);
+ 
+-bool amdgpu_amdkfd_load_interface(struct amdgpu_device *adev);
+ 
+ void amdgpu_amdkfd_suspend(struct amdgpu_device *adev);
+ int amdgpu_amdkfd_resume(struct amdgpu_device *adev);
+@@ -50,17 +139,105 @@ void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev);
+ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev);
+ void amdgpu_amdkfd_device_fini(struct amdgpu_device *adev);
+ 
++int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, struct mm_struct *mm);
++int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
++				uint32_t vmid, uint64_t gpu_addr,
++				uint32_t *ib_cmd, uint32_t ib_len);
++int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
++					    struct dma_fence **ef);
+ struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void);
+ struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void);
++struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void);
++int amdgpu_amdkfd_copy_mem_to_mem(struct kgd_dev *kgd, struct kgd_mem *src_mem,
++			uint64_t src_offset, struct kgd_mem *dst_mem,
++			uint64_t dest_offset, uint64_t size, struct dma_fence **f,
++			uint64_t *actual_size);
++
++bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev,
++				u32 vmid);
+ 
+ /* Shared API */
++int map_bo(struct amdgpu_device *rdev, uint64_t va, void *vm,
++		struct amdgpu_bo *bo, struct amdgpu_bo_va **bo_va);
+ int alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
+ 			void **mem_obj, uint64_t *gpu_addr,
+ 			void **cpu_ptr);
+ void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj);
+-uint64_t get_vmem_size(struct kgd_dev *kgd);
++void get_local_mem_info(struct kgd_dev *kgd,
++			struct kfd_local_mem_info *mem_info);
+ uint64_t get_gpu_clock_counter(struct kgd_dev *kgd);
+ 
+ uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd);
++void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info);
++int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd,
++				  struct kgd_dev **dmabuf_kgd,
++				  uint64_t *bo_size, void *metadata_buffer,
++				  size_t buffer_size, uint32_t *metadata_size,
++				  uint32_t *flags);
++uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd);
++
++#define read_user_wptr(mmptr, wptr, dst)	\
++	({	\
++		bool valid = false;	\
++		if ((mmptr) && (wptr)) {	\
++			if ((mmptr) == current->mm) {	\
++				valid = !get_user((dst), (wptr));	\
++			} else if (current->mm == NULL) {	\
++				use_mm(mmptr);	\
++				valid = !get_user((dst), (wptr));	\
++				unuse_mm(mmptr);	\
++			}	\
++		}	\
++		valid;	\
++	})
++
++/* GPUVM API */
++int amdgpu_amdkfd_gpuvm_sync_memory(
++		struct kgd_dev *kgd, struct kgd_mem *mem, bool intr);
++int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
++		struct kgd_dev *kgd, uint64_t va, uint64_t size,
++		void *vm, struct kgd_mem **mem,
++		uint64_t *offset, uint32_t flags);
++int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
++		struct kgd_dev *kgd, struct kgd_mem *mem, void *vm);
++int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
++		struct kgd_dev *kgd, struct kgd_mem *mem, void *vm);
++int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
++		struct kgd_dev *kgd, struct kgd_mem *mem, void *vm);
+ 
++int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm,
++					  void **process_info,
++					  struct dma_fence **ef);
++void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm);
++
++uint32_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm);
++
++int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd,
++					  struct kfd_vm_fault_info *info);
++
++int amdgpu_amdkfd_gpuvm_mmap_bo(
++		struct kgd_dev *kgd, struct vm_area_struct *vma);
++
++int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd,
++		struct kgd_mem *mem, void **kptr);
++
++int amdgpu_amdkfd_gpuvm_pin_get_sg_table(struct kgd_dev *kgd,
++		struct kgd_mem *mem, uint64_t offset,
++		uint64_t size, struct sg_table **ret_sg);
++void amdgpu_amdkfd_gpuvm_unpin_put_sg_table(
++		struct kgd_mem *mem, struct sg_table *sg);
++int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd,
++				      struct dma_buf *dmabuf,
++				      uint64_t va, void *vm,
++				      struct kgd_mem **mem, uint64_t *size,
++				      uint64_t *mmap_offset);
++int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_dev *kgd, void *vm,
++				      struct kgd_mem *mem,
++				      struct dma_buf **dmabuf);
++int amdgpu_amdkfd_gpuvm_evict_mem(struct kgd_mem *mem, struct mm_struct *mm);
++int amdgpu_amdkfd_gpuvm_restore_mem(struct kgd_mem *mem, struct mm_struct *mm);
++
++void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
++void amdgpu_amdkfd_unreserve_system_memory_limit(struct amdgpu_bo *bo);
+ #endif /* AMDGPU_AMDKFD_H_INCLUDED */
++
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
+new file mode 100644
+index 0000000..3961937
+--- /dev/null
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
+@@ -0,0 +1,196 @@
++/*
++ * Copyright 2016 Advanced Micro Devices, Inc.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
++ * OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#include <linux/spinlock.h>
++#include <linux/atomic.h>
++#include <linux/stacktrace.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include "amdgpu_amdkfd.h"
++
++const struct dma_fence_ops amd_kfd_fence_ops;
++static atomic_t fence_seq = ATOMIC_INIT(0);
++
++static int amd_kfd_fence_signal(struct dma_fence *f);
++
++/* Eviction Fence
++ * Fence helper functions to deal with KFD memory eviction.
++ * Big Idea - Since KFD submissions are done by user queues, a BO cannot be
++ * evicted unless all the user queues for that process are evicted.
++ *
++ * All the BOs in a process share an eviction fence. When process X wants
++ * to map VRAM memory but TTM can't find enough space, TTM will attempt to
++ * evict BOs from its LRU list. TTM checks if the BO is valuable to evict
++ * by calling ttm_bo_driver->eviction_valuable().
++ *
++ * ttm_bo_driver->eviction_valuable() - will return false if the BO belongs
++ * to process X. Otherwise, it will return true to indicate BO can be
++ * evicted by TTM.
++ *
++ * If ttm_bo_driver->eviction_valuable returns true, then TTM will continue
++ * the evcition process for that BO by calling ttm_bo_evict --> amdgpu_bo_move
++ * --> amdgpu_copy_buffer(). This sets up job in GPU scheduler.
++ *
++ * GPU Scheduler (amd_sched_main) - sets up a cb (fence_add_callback) to
++ * nofity when the BO is free to move. fence_add_callback --> enable_signaling
++ * --> amdgpu_amdkfd_fence.enable_signaling
++ *
++ * amdgpu_amdkfd_fence.enable_signaling - Start a work item that will quiesce
++ * user queues and signal fence. The work item will also start another delayed
++ * work item to restore BOs
++ */
++
++struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context,
++						 void *mm)
++{
++	struct amdgpu_amdkfd_fence *fence = NULL;
++
++	fence = kzalloc(sizeof(*fence), GFP_KERNEL);
++	if (fence == NULL)
++		return NULL;
++
++	/* mm_struct mm is used as void pointer to identify the parent
++	 * KFD process. Don't dereference it. Fence and any threads using
++	 * mm is guranteed to be released before process termination.
++	 */
++	fence->mm = mm;
++	get_task_comm(fence->timeline_name, current);
++	spin_lock_init(&fence->lock);
++
++	dma_fence_init(&fence->base, &amd_kfd_fence_ops, &fence->lock,
++		   context, atomic_inc_return(&fence_seq));
++
++	return fence;
++}
++
++struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f)
++{
++	struct amdgpu_amdkfd_fence *fence;
++
++	if (!f)
++		return NULL;
++
++	fence = container_of(f, struct amdgpu_amdkfd_fence, base);
++	if (fence && f->ops == &amd_kfd_fence_ops)
++		return fence;
++
++	return NULL;
++}
++
++static const char *amd_kfd_fence_get_driver_name(struct dma_fence *f)
++{
++	return "amdgpu_amdkfd_fence";
++}
++
++static const char *amd_kfd_fence_get_timeline_name(struct dma_fence *f)
++{
++	struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
++
++	return fence->timeline_name;
++}
++
++/**
++ * amd_kfd_fence_enable_signaling - This gets called when TTM wants to evict
++ *  a KFD BO and schedules a job to move the BO.
++ *  If fence is already signaled return true.
++ *  If fence is not signaled schedule a evict KFD process work item.
++ */
++static bool amd_kfd_fence_enable_signaling(struct dma_fence *f)
++{
++	struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
++
++	if (!fence)
++		return false;
++
++	if (dma_fence_is_signaled(f))
++		return true;
++
++	if (!kgd2kfd->schedule_evict_and_restore_process(
++				(struct mm_struct *)fence->mm, f))
++		return true;
++
++	return false;
++}
++
++static int amd_kfd_fence_signal(struct dma_fence *f)
++{
++	unsigned long flags;
++	int ret;
++
++	spin_lock_irqsave(f->lock, flags);
++	/* Set enabled bit so cb will called */
++	set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &f->flags);
++	ret = dma_fence_signal_locked(f);
++	spin_unlock_irqrestore(f->lock, flags);
++
++	return ret;
++}
++
++/**
++ * amd_kfd_fence_release - callback that fence can be freed
++ *
++ * @fence: fence
++ *
++ * This function is called when the reference count becomes zero.
++ * It just RCU schedules freeing up the fence.
++*/
++static void amd_kfd_fence_release(struct dma_fence *f)
++{
++	struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
++	/* Unconditionally signal the fence. The process is getting
++	 * terminated.
++	 */
++	if (WARN_ON(!fence))
++		return;	/* Not an amdgpu_amdkfd_fence */
++
++	amd_kfd_fence_signal(f);
++	kfree_rcu(f, rcu);
++}
++
++/**
++ * amd_kfd_fence_check_mm - Check if @mm is same as that of the fence @f
++ *  if same return TRUE else return FALSE.
++ *
++ * @f: [IN] fence
++ * @mm: [IN] mm that needs to be verified
++*/
++bool amd_kfd_fence_check_mm(struct dma_fence *f, void *mm)
++{
++	struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
++
++	if (!fence)
++		return false;
++	else if (fence->mm == mm)
++		return true;
++
++	return false;
++}
++
++const struct dma_fence_ops amd_kfd_fence_ops = {
++	.get_driver_name = amd_kfd_fence_get_driver_name,
++	.get_timeline_name = amd_kfd_fence_get_timeline_name,
++	.enable_signaling = amd_kfd_fence_enable_signaling,
++	.signaled = NULL,
++	.wait = dma_fence_default_wait,
++	.release = amd_kfd_fence_release,
++};
++
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
+old mode 100644
+new mode 100755
+index 5748504..6964ece
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
+@@ -20,6 +20,9 @@
+  * OTHER DEALINGS IN THE SOFTWARE.
+  */
+ 
++#undef pr_fmt
++#define pr_fmt(fmt) "kfd2kgd: " fmt
++
+ #include <linux/fdtable.h>
+ #include <linux/uaccess.h>
+ #include <linux/firmware.h>
+@@ -39,6 +42,14 @@
+ #include "gmc/gmc_7_1_sh_mask.h"
+ #include "cik_structs.h"
+ 
++#define AMDKFD_SKIP_UNCOMPILED_CODE 1
++
++enum hqd_dequeue_request_type {
++	NO_ACTION = 0,
++	DRAIN_PIPE,
++	RESET_WAVES
++};
++
+ enum {
+ 	MAX_TRAPID = 8,		/* 3 bits in the bitfield. */
+ 	MAX_WATCH_ADDRESSES = 4
+@@ -55,8 +66,8 @@ enum {
+ enum {
+ 	ADDRESS_WATCH_REG_CNTL_ATC_BIT = 0x10000000UL,
+ 	ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK = 0x00FFFFFF,
+-	ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENSION = 0x03000000,
+-	/* extend the mask to 26 bits to match the low address field */
++	ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENTION = 0x03000000,
++	/* extend the mask to 26 bits in order to match the low address field */
+ 	ADDRESS_WATCH_REG_ADDLOW_SHIFT = 6,
+ 	ADDRESS_WATCH_REG_ADDHIGH_MASK = 0xFFFF
+ };
+@@ -81,30 +92,42 @@ union TCP_WATCH_CNTL_BITS {
+ 	float f32All;
+ };
+ 
++static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm,
++		int fd, uint32_t handle, struct kgd_mem **mem);
++
++static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type);
++
+ /*
+  * Register access functions
+  */
+ 
+ static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
+-		uint32_t sh_mem_config, uint32_t sh_mem_ape1_base,
+-		uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases);
+-
++		uint32_t sh_mem_config, uint32_t sh_mem_ape1_base,
++		uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases);
+ static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
+-		unsigned int vmid);
+-
++		unsigned int vmid);
+ static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id,
+-		uint32_t hpd_size, uint64_t hpd_gpu_addr);
++		uint32_t hpd_size, uint64_t hpd_gpu_addr);
+ static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id);
+ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
+-			uint32_t queue_id, uint32_t __user *wptr);
+-static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd);
++			uint32_t queue_id, uint32_t __user *wptr,
++			uint32_t wptr_shift, uint32_t wptr_mask,
++			struct mm_struct *mm);
++static int kgd_hqd_dump(struct kgd_dev *kgd,
++			uint32_t pipe_id, uint32_t queue_id,
++			uint32_t (**dump)[2], uint32_t *n_regs);
++static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
++			     uint32_t __user *wptr, struct mm_struct *mm);
++static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
++			     uint32_t engine_id, uint32_t queue_id,
++			     uint32_t (**dump)[2], uint32_t *n_regs);
+ static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
+-				uint32_t pipe_id, uint32_t queue_id);
+-
+-static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type,
++		uint32_t pipe_id, uint32_t queue_id);
++static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd);
++static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
++			enum kfd_preempt_type reset_type,
+ 			unsigned int utimeout, uint32_t pipe_id,
+ 			uint32_t queue_id);
+-static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd);
+ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
+ 				unsigned int utimeout);
+ static int kgd_address_watch_disable(struct kgd_dev *kgd);
+@@ -124,21 +147,60 @@ static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, uint8_t vmid);
+ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd,
+ 							uint8_t vmid);
+ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid);
++static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid);
++static void set_num_of_requests(struct kgd_dev *dev, uint8_t num_of_req);
++static int alloc_memory_of_scratch(struct kgd_dev *kgd,
++		uint64_t va, uint32_t vmid);
++static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable,
++		uint8_t element_size, uint8_t index_stride, uint8_t mtype);
++static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
++		uint32_t page_table_base);
++static uint32_t read_vmid_from_vmfault_reg(struct kgd_dev *kgd);
++
++/* Because of REG_GET_FIELD() being used, we put this function in the
++ * asic specific file.
++ */
++static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd,
++		struct tile_config *config)
++{
++	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+ 
+-static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type);
++	config->gb_addr_config = adev->gfx.config.gb_addr_config;
++	config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg,
++				MC_ARB_RAMCFG, NOOFBANK);
++	config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg,
++				MC_ARB_RAMCFG, NOOFRANKS);
++
++	config->tile_config_ptr = adev->gfx.config.tile_mode_array;
++	config->num_tile_configs =
++			ARRAY_SIZE(adev->gfx.config.tile_mode_array);
++	config->macro_tile_config_ptr =
++			adev->gfx.config.macrotile_mode_array;
++	config->num_macro_tile_configs =
++			ARRAY_SIZE(adev->gfx.config.macrotile_mode_array);
++
++
++	return 0;
++}
+ 
+ static const struct kfd2kgd_calls kfd2kgd = {
+ 	.init_gtt_mem_allocation = alloc_gtt_mem,
+ 	.free_gtt_mem = free_gtt_mem,
+-	.get_vmem_size = get_vmem_size,
++	.get_local_mem_info = get_local_mem_info,
+ 	.get_gpu_clock_counter = get_gpu_clock_counter,
+ 	.get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz,
++	.create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm,
++	.destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm,
++	.get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir,
++	.open_graphic_handle = open_graphic_handle,
+ 	.program_sh_mem_settings = kgd_program_sh_mem_settings,
+ 	.set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
+ 	.init_pipeline = kgd_init_pipeline,
+ 	.init_interrupts = kgd_init_interrupts,
+ 	.hqd_load = kgd_hqd_load,
+ 	.hqd_sdma_load = kgd_hqd_sdma_load,
++	.hqd_dump = kgd_hqd_dump,
++	.hqd_sdma_dump = kgd_hqd_sdma_dump,
+ 	.hqd_is_occupied = kgd_hqd_is_occupied,
+ 	.hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
+ 	.hqd_destroy = kgd_hqd_destroy,
+@@ -147,17 +209,50 @@ static const struct kfd2kgd_calls kfd2kgd = {
+ 	.address_watch_execute = kgd_address_watch_execute,
+ 	.wave_control_execute = kgd_wave_control_execute,
+ 	.address_watch_get_offset = kgd_address_watch_get_offset,
+-	.get_atc_vmid_pasid_mapping_pasid = get_atc_vmid_pasid_mapping_pasid,
+-	.get_atc_vmid_pasid_mapping_valid = get_atc_vmid_pasid_mapping_valid,
++	.get_atc_vmid_pasid_mapping_pasid =
++			get_atc_vmid_pasid_mapping_pasid,
++	.get_atc_vmid_pasid_mapping_valid =
++			get_atc_vmid_pasid_mapping_valid,
++	.read_vmid_from_vmfault_reg = read_vmid_from_vmfault_reg,
+ 	.write_vmid_invalidate_request = write_vmid_invalidate_request,
+-	.get_fw_version = get_fw_version
++	.invalidate_tlbs = invalidate_tlbs,
++	.sync_memory = amdgpu_amdkfd_gpuvm_sync_memory,
++	.alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu,
++	.free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu,
++	.map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu,
++	.unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu,
++	.get_fw_version = get_fw_version,
++	.set_num_of_requests = set_num_of_requests,
++	.get_cu_info = get_cu_info,
++	.alloc_memory_of_scratch = alloc_memory_of_scratch,
++	.write_config_static_mem = write_config_static_mem,
++	.mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo,
++	.map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel,
++	.set_vm_context_page_table_base = set_vm_context_page_table_base,
++	.pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table,
++	.unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table,
++	.get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info,
++	.import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf,
++	.export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf,
++	.get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info,
++	.submit_ib = amdgpu_amdkfd_submit_ib,
++	.get_tile_config = amdgpu_amdkfd_get_tile_config,
++	.restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos,
++	.copy_mem_to_mem = amdgpu_amdkfd_copy_mem_to_mem,
++	.get_vram_usage = amdgpu_amdkfd_get_vram_usage
+ };
+ 
+-struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void)
++struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions()
+ {
+ 	return (struct kfd2kgd_calls *)&kfd2kgd;
+ }
+ 
++static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm,
++		int fd, uint32_t handle, struct kgd_mem **mem)
++{
++	return 0;
++}
++
+ static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd)
+ {
+ 	return (struct amdgpu_device *)kgd;
+@@ -186,7 +281,7 @@ static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id,
+ {
+ 	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+ 
+-	uint32_t mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
++	uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
+ 	uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
+ 
+ 	lock_srbm(kgd, mec, pipe, queue_id, 0);
+@@ -222,12 +317,12 @@ static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
+ 
+ 	/*
+ 	 * We have to assume that there is no outstanding mapping.
+-	 * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because
+-	 * a mapping is in progress or because a mapping finished and the
+-	 * SW cleared it. So the protocol is to always wait & clear.
++	 * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because a
++	 * mapping is in progress or because a mapping finished and the SW
++	 * cleared it. So the protocol is to always wait & clear.
+ 	 */
+-	uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid |
+-					ATC_VMID0_PASID_MAPPING__VALID_MASK;
++	uint32_t pasid_mapping = (pasid == 0) ? 0 :
++		(uint32_t)pasid | ATC_VMID0_PASID_MAPPING__VALID_MASK;
+ 
+ 	WREG32(mmATC_VMID0_PASID_MAPPING + vmid, pasid_mapping);
+ 
+@@ -273,8 +368,7 @@ static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m)
+ 
+ 	retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET +
+ 			m->sdma_queue_id * KFD_CIK_SDMA_QUEUE_OFFSET;
+-
+-	pr_debug("kfd: sdma base address: 0x%x\n", retval);
++	pr_debug("sdma base address: 0x%x\n", retval);
+ 
+ 	return retval;
+ }
+@@ -290,26 +384,91 @@ static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd)
+ }
+ 
+ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
+-			uint32_t queue_id, uint32_t __user *wptr)
++			uint32_t queue_id, uint32_t __user *wptr,
++			uint32_t wptr_shift, uint32_t wptr_mask,
++			struct mm_struct *mm)
+ {
+ 	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+-	uint32_t wptr_shadow, is_wptr_shadow_valid;
+ 	struct cik_mqd *m;
++	uint32_t *mqd_hqd;
++	uint32_t reg, wptr_val, data;
++	bool valid_wptr = false;
+ 
+ 	m = get_mqd(mqd);
+ 
+-	is_wptr_shadow_valid = !get_user(wptr_shadow, wptr);
+-	if (is_wptr_shadow_valid)
+-		m->cp_hqd_pq_wptr = wptr_shadow;
++	acquire_queue(kgd, pipe_id, queue_id);
++
++	/* HQD registers extend from CP_MQD_BASE_ADDR to CP_MQD_CONTROL. */
++	mqd_hqd = &m->cp_mqd_base_addr_lo;
++
++	for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_MQD_CONTROL; reg++)
++		WREG32(reg, mqd_hqd[reg - mmCP_MQD_BASE_ADDR]);
++
++	/* Copy userspace write pointer value to register.
++	 * Activate doorbell logic to monitor subsequent changes.
++	 */
++	data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control,
++			     CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
++	WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, data);
++
++	/* read_user_ptr may take the mm->mmap_sem.
++	 * release srbm_mutex to avoid circular dependency between
++	 * srbm_mutex->mm_sem->reservation_ww_class_mutex->srbm_mutex.
++	 */
++	release_queue(kgd);
++	valid_wptr = read_user_wptr(mm, wptr, wptr_val);
+ 	acquire_queue(kgd, pipe_id, queue_id);
+-	gfx_v7_0_mqd_commit(adev, m);
++	if (valid_wptr)
++		WREG32(mmCP_HQD_PQ_WPTR, (wptr_val << wptr_shift) & wptr_mask);
++
++	data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);
++	WREG32(mmCP_HQD_ACTIVE, data);
++
++
+ 	release_queue(kgd);
+ 
+ 	return 0;
+ }
+ 
+-static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd)
++static int kgd_hqd_dump(struct kgd_dev *kgd,
++			uint32_t pipe_id, uint32_t queue_id,
++			uint32_t (**dump)[2], uint32_t *n_regs)
++{
++	struct amdgpu_device *adev = get_amdgpu_device(kgd);
++	uint32_t i = 0, reg;
++#define HQD_N_REGS (35+4)
++#define DUMP_REG(addr) do {	\
++		if (WARN_ON_ONCE(i >= HQD_N_REGS))	\
++			break;	\
++		(*dump)[i][0] = (addr) << 2;	\
++		(*dump)[i++][1] = RREG32(addr);	\
++	} while (0)
++
++	*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
++	if (*dump == NULL)
++		return -ENOMEM;
++
++	acquire_queue(kgd, pipe_id, queue_id);
++
++	DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE0);
++	DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE1);
++	DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE2);
++	DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE3);
++
++	for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_MQD_CONTROL; reg++)
++		DUMP_REG(reg);
++
++	release_queue(kgd);
++
++	WARN_ON_ONCE(i != HQD_N_REGS);
++	*n_regs = i;
++
++	return 0;
++}
++
++static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
++			     uint32_t __user *wptr, struct mm_struct *mm)
+ {
+ 	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+ 	struct cik_sdma_rlc_registers *m;
+@@ -320,17 +479,17 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd)
+ 	m = get_sdma_mqd(mqd);
+ 	sdma_base_addr = get_sdma_base_addr(m);
+ 
+-	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
+-		m->sdma_rlc_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));
++	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
++	       m->sdma_rlc_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));
+ 
+-	end_jiffies = msecs_to_jiffies(2000) + jiffies;
+ 	while (true) {
+-		data = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
+-		if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
+-			break;
+-		if (time_after(jiffies, end_jiffies))
+-			return -ETIME;
+-		usleep_range(500, 1000);
++		temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
++		if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
++			break;
++		if (timeout == 0)
++			return -ETIME;
++		msleep(10);
++		timeout -= 10;
+ 	}
+ 	if (m->sdma_engine_id) {
+ 		data = RREG32(mmSDMA1_GFX_CONTEXT_CNTL);
+@@ -344,25 +503,59 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd)
+ 		WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data);
+ 	}
+ 
+-	WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL,
+-			m->sdma_rlc_doorbell);
+-	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0);
+-	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0);
+-	WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR,
+-			m->sdma_rlc_virtual_addr);
+-	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdma_rlc_rb_base);
++	data = REG_SET_FIELD(m->sdma_rlc_doorbell, SDMA0_RLC0_DOORBELL,
++			     ENABLE, 1);
++	WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data);
++	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdma_rlc_rb_rptr);
++	if (read_user_wptr(mm, wptr, data))
++		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, data);
++	else
++		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
++		       m->sdma_rlc_rb_rptr);
++
++	WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR,
++				m->sdma_rlc_virtual_addr);
++	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdma_rlc_rb_base);
++
+ 	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI,
+ 			m->sdma_rlc_rb_base_hi);
+ 	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO,
+ 			m->sdma_rlc_rb_rptr_addr_lo);
+ 	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
+ 			m->sdma_rlc_rb_rptr_addr_hi);
+-	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
+-			m->sdma_rlc_rb_cntl);
+-
++	data = REG_SET_FIELD(m->sdma_rlc_rb_cntl, SDMA0_RLC0_RB_CNTL,
++			     RB_ENABLE, 1);
++	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data);
+ 	return 0;
+ }
+ 
++static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
++			     uint32_t engine_id, uint32_t queue_id,
++			     uint32_t (**dump)[2], uint32_t *n_regs)
++{
++	struct amdgpu_device *adev = get_amdgpu_device(kgd);
++	uint32_t sdma_offset = engine_id * SDMA1_REGISTER_OFFSET +
++		queue_id * KFD_CIK_SDMA_QUEUE_OFFSET;
++	uint32_t i = 0, reg;
++#undef HQD_N_REGS
++#define HQD_N_REGS (19+4)
++
++	*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
++	if (*dump == NULL)
++		return -ENOMEM;
++
++	for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
++		DUMP_REG(sdma_offset + reg);
++	for (reg = mmSDMA0_RLC0_VIRTUAL_ADDR; reg <= mmSDMA0_RLC0_WATERMARK;
++	     reg++)
++		DUMP_REG(sdma_offset + reg);
++
++	WARN_ON_ONCE(i != HQD_N_REGS);
++	*n_regs = i;
++
++	return 0;
++}
++
+ static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
+ 				uint32_t pipe_id, uint32_t queue_id)
+ {
+@@ -403,30 +596,99 @@ static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd)
+ 	return false;
+ }
+ 
+-static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type,
++static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
++			enum kfd_preempt_type reset_type,
+ 			unsigned int utimeout, uint32_t pipe_id,
+ 			uint32_t queue_id)
+ {
+ 	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+ 	uint32_t temp;
+-	int timeout = utimeout;
++	enum hqd_dequeue_request_type type;
++	unsigned long flags, end_jiffies;
++	int retry;
+ 
+ 	acquire_queue(kgd, pipe_id, queue_id);
+ 	WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, 0);
+ 
+-	WREG32(mmCP_HQD_DEQUEUE_REQUEST, reset_type);
++	switch (reset_type) {
++	case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:
++		type = DRAIN_PIPE;
++		break;
++	case KFD_PREEMPT_TYPE_WAVEFRONT_RESET:
++		type = RESET_WAVES;
++		break;
++	default:
++		type = DRAIN_PIPE;
++		break;
++	}
++
++	/* Workaround: If IQ timer is active and the wait time is close to or
++	 * equal to 0, dequeueing is not safe. Wait until either the wait time
++	 * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is
++	 * cleared before continuing. Also, ensure wait times are set to at
++	 * least 0x3.
++ */ ++ local_irq_save(flags); ++ preempt_disable(); ++ retry = 5000; /* wait for 500 usecs at maximum */ ++ while (true) { ++ temp = RREG32(mmCP_HQD_IQ_TIMER); ++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) { ++ pr_debug("HW is processing IQ\n"); ++ goto loop; ++ } ++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) { ++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE) ++ == 3) /* SEM-rearm is safe */ ++ break; ++ /* Wait time 3 is safe for CP, but our MMIO read/write ++ * time is close to 1 microsecond, so check for 10 to ++ * leave more buffer room ++ */ ++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME) ++ >= 10) ++ break; ++ pr_debug("IQ timer is active\n"); ++ } else ++ break; ++loop: ++ if (!retry) { ++ pr_err("CP HQD IQ timer status time out\n"); ++ break; ++ } ++ ndelay(100); ++ --retry; ++ } ++ retry = 1000; ++ while (true) { ++ temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST); ++ if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK)) ++ break; ++ pr_debug("Dequeue request is pending\n"); + ++ if (!retry) { ++ pr_err("CP HQD dequeue request time out\n"); ++ break; ++ } ++ ndelay(100); ++ --retry; ++ } ++ local_irq_restore(flags); ++ preempt_enable(); ++ ++ WREG32(mmCP_HQD_DEQUEUE_REQUEST, type); ++ ++ end_jiffies = (utimeout * HZ / 1000) + jiffies; + while (true) { + temp = RREG32(mmCP_HQD_ACTIVE); +- if (temp & CP_HQD_ACTIVE__ACTIVE_MASK) ++ if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) + break; +- if (timeout <= 0) { +- pr_err("kfd: cp queue preemption time out.\n"); ++ if (time_after(jiffies, end_jiffies)) { ++ pr_err("cp queue preemption time out\n"); + release_queue(kgd); + return -ETIME; + } +- msleep(20); +- timeout -= 20; ++ usleep_range(500, 1000); + } + + release_queue(kgd); +@@ -440,7 +702,7 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, + struct cik_sdma_rlc_registers *m; + uint32_t sdma_base_addr; + uint32_t temp; +- int timeout = utimeout; ++ unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; + + m = get_sdma_mqd(mqd); + sdma_base_addr = get_sdma_base_addr(m); +@@ -451,12 +713,11 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, + + while (true) { + temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); +- if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT) ++ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) + break; +- if (timeout <= 0) ++ if (time_after(jiffies, end_jiffies)) + return -ETIME; +- msleep(20); +- timeout -= 20; ++ usleep_range(500, 1000); + } + + WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); +@@ -464,6 +725,8 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, + RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | + SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); + ++ m->sdma_rlc_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); ++ + return 0; + } + +@@ -481,8 +744,9 @@ static int kgd_address_watch_disable(struct kgd_dev *kgd) + + /* Turning off this address until we set all the registers */ + for (i = 0; i < MAX_WATCH_ADDRESSES; i++) +- WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX + +- ADDRESS_WATCH_REG_CNTL], cntl.u32All); ++ WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX ++ + ADDRESS_WATCH_REG_CNTL], ++ cntl.u32All); + + return 0; + } +@@ -500,20 +764,24 @@ static int kgd_address_watch_execute(struct kgd_dev *kgd, + + /* Turning off this watch point until we set all the registers */ + cntl.bitfields.valid = 0; +- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + +- ADDRESS_WATCH_REG_CNTL], cntl.u32All); ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX ++ + 
ADDRESS_WATCH_REG_CNTL], ++ cntl.u32All); + +- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + +- ADDRESS_WATCH_REG_ADDR_HI], addr_hi); ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX ++ + ADDRESS_WATCH_REG_ADDR_HI], ++ addr_hi); + +- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + +- ADDRESS_WATCH_REG_ADDR_LO], addr_lo); ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX ++ + ADDRESS_WATCH_REG_ADDR_LO], ++ addr_lo); + + /* Enable the watch point */ + cntl.bitfields.valid = 1; + +- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + +- ADDRESS_WATCH_REG_CNTL], cntl.u32All); ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX ++ + ADDRESS_WATCH_REG_CNTL], ++ cntl.u32All); + + return 0; + } +@@ -567,7 +835,7 @@ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + + reg = RREG32(mmATC_VMID0_PASID_MAPPING + vmid); +- return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; ++ return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; + } + + static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) +@@ -577,52 +845,90 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) + WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); + } + ++static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid) ++{ ++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; ++ int vmid; ++ ++ for (vmid = 0; vmid < 16; vmid++) { ++ if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) ++ continue; ++ if (RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & ++ ATC_VMID0_PASID_MAPPING__VALID_MASK) { ++ if ((RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & ++ ATC_VMID0_PASID_MAPPING__PASID_MASK) == pasid) { ++ WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); ++ break; ++ } ++ } ++ } ++ ++ return 0; ++} ++ ++static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, ++ uint8_t element_size, uint8_t index_stride, uint8_t mtype) ++{ ++ uint32_t reg; ++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; ++ ++ reg = swizzle_enable << SH_STATIC_MEM_CONFIG__SWIZZLE_ENABLE__SHIFT | ++ element_size << SH_STATIC_MEM_CONFIG__ELEMENT_SIZE__SHIFT | ++ index_stride << SH_STATIC_MEM_CONFIG__INDEX_STRIDE__SHIFT | ++ mtype << SH_STATIC_MEM_CONFIG__PRIVATE_MTYPE__SHIFT; ++ ++ WREG32(mmSH_STATIC_MEM_CONFIG, reg); ++ return 0; ++} ++static int alloc_memory_of_scratch(struct kgd_dev *kgd, ++ uint64_t va, uint32_t vmid) ++{ ++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; ++ ++ lock_srbm(kgd, 0, 0, 0, vmid); ++ WREG32(mmSH_HIDDEN_PRIVATE_BASE_VMID, va); ++ unlock_srbm(kgd); ++ ++ return 0; ++} ++ ++ + static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) + { + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + const union amdgpu_firmware_header *hdr; + +- BUG_ON(kgd == NULL); +- + switch (type) { + case KGD_ENGINE_PFP: +- hdr = (const union amdgpu_firmware_header *) +- adev->gfx.pfp_fw->data; ++ hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data; + break; + + case KGD_ENGINE_ME: +- hdr = (const union amdgpu_firmware_header *) +- adev->gfx.me_fw->data; ++ hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data; + break; + + case KGD_ENGINE_CE: +- hdr = (const union amdgpu_firmware_header *) +- adev->gfx.ce_fw->data; ++ hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data; + break; + + case KGD_ENGINE_MEC1: +- hdr = (const union amdgpu_firmware_header *) +- adev->gfx.mec_fw->data; ++ hdr = (const union 
amdgpu_firmware_header *)adev->gfx.mec_fw->data; + break; + + case KGD_ENGINE_MEC2: +- hdr = (const union amdgpu_firmware_header *) +- adev->gfx.mec2_fw->data; ++ hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data; + break; + + case KGD_ENGINE_RLC: +- hdr = (const union amdgpu_firmware_header *) +- adev->gfx.rlc_fw->data; ++ hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data; + break; + + case KGD_ENGINE_SDMA1: +- hdr = (const union amdgpu_firmware_header *) +- adev->sdma.instance[0].fw->data; ++ hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data; + break; + + case KGD_ENGINE_SDMA2: +- hdr = (const union amdgpu_firmware_header *) +- adev->sdma.instance[1].fw->data; ++ hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data; + break; + + default: +@@ -636,3 +942,42 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) + return hdr->common.ucode_version; + } + ++static void set_num_of_requests(struct kgd_dev *dev, uint8_t num_of_req) ++{ ++ uint32_t value; ++ struct amdgpu_device *adev = get_amdgpu_device(dev); ++ ++ value = RREG32(mmATC_ATS_DEBUG); ++ value &= ~ATC_ATS_DEBUG__NUM_REQUESTS_AT_ERR_MASK; ++ value |= (num_of_req << ATC_ATS_DEBUG__NUM_REQUESTS_AT_ERR__SHIFT); ++ ++ WREG32(mmATC_ATS_DEBUG, value); ++} ++ ++static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, ++ uint32_t page_table_base) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ /* TODO: Don't use hardcoded VMIDs */ ++ if (vmid < 8 || vmid > 15) { ++ pr_err("trying to set page table base for wrong VMID\n"); ++ return; ++ } ++ WREG32(mmVM_CONTEXT8_PAGE_TABLE_BASE_ADDR + vmid - 8, page_table_base); ++} ++ ++/** ++ * read_vmid_from_vmfault_reg - read the faulting vmid from register ++ * ++ * @kgd: kgd device pointer ++ * ++ * Returns the VMID field of VM_CONTEXT1_PROTECTION_FAULT_STATUS (CIK). ++ */ ++static uint32_t read_vmid_from_vmfault_reg(struct kgd_dev *kgd) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ ++ uint32_t status = RREG32(mmVM_CONTEXT1_PROTECTION_FAULT_STATUS); ++ ++ return REG_GET_FIELD(status, VM_CONTEXT1_PROTECTION_FAULT_STATUS, VMID); ++} +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +old mode 100644 +new mode 100755 +index c5044d5..2ff10e9 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +@@ -20,6 +20,9 @@ + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + ++#undef pr_fmt ++#define pr_fmt(fmt) "kfd2kgd: " fmt ++ + #include <linux/module.h> + #include <linux/fdtable.h> + #include <linux/uaccess.h> +@@ -28,7 +31,7 @@ + #include "amdgpu.h" + #include "amdgpu_amdkfd.h" + #include "amdgpu_ucode.h" +-#include "gfx_v8_0.h" ++#include "amdgpu_amdkfd_gfx_v8.h" + #include "gca/gfx_8_0_sh_mask.h" + #include "gca/gfx_8_0_d.h" + #include "gca/gfx_8_0_enum.h" +@@ -39,7 +42,31 @@ + #include "vi_structs.h" + #include "vid.h" + +-struct cik_sdma_rlc_registers; ++enum hqd_dequeue_request_type { ++ NO_ACTION = 0, ++ DRAIN_PIPE, ++ RESET_WAVES, ++ SAVE_WAVES ++}; ++ ++static const uint32_t watchRegs[MAX_WATCH_ADDRESSES * ADDRESS_WATCH_REG_MAX] = { ++ mmTCP_WATCH0_ADDR_H, mmTCP_WATCH0_ADDR_L, mmTCP_WATCH0_CNTL, ++ mmTCP_WATCH1_ADDR_H, mmTCP_WATCH1_ADDR_L, mmTCP_WATCH1_CNTL, ++ mmTCP_WATCH2_ADDR_H, mmTCP_WATCH2_ADDR_L, mmTCP_WATCH2_CNTL, ++ mmTCP_WATCH3_ADDR_H, mmTCP_WATCH3_ADDR_L, mmTCP_WATCH3_CNTL ++}; ++ ++ ++struct vi_sdma_mqd; ++ ++static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, ++ void *vm, struct kgd_mem **mem); ++static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem); ++ ++static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, ++ int fd, uint32_t handle, struct kgd_mem **mem); ++ ++static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); + + /* + * Register access functions +@@ -55,17 +82,26 @@ static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, + uint32_t hpd_size, uint64_t hpd_gpu_addr); + static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); + static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, +- uint32_t queue_id, uint32_t __user *wptr); +-static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd); ++ uint32_t queue_id, uint32_t __user *wptr, ++ uint32_t wptr_shift, uint32_t wptr_mask, ++ struct mm_struct *mm); ++static int kgd_hqd_dump(struct kgd_dev *kgd, ++ uint32_t pipe_id, uint32_t queue_id, ++ uint32_t (**dump)[2], uint32_t *n_regs); ++static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, ++ uint32_t __user *wptr, struct mm_struct *mm); ++static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, ++ uint32_t engine_id, uint32_t queue_id, ++ uint32_t (**dump)[2], uint32_t *n_regs); + static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, + uint32_t pipe_id, uint32_t queue_id); + static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); +-static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, ++static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, ++ enum kfd_preempt_type reset_type, + unsigned int utimeout, uint32_t pipe_id, + uint32_t queue_id); + static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, + unsigned int utimeout); +-static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); + static int kgd_address_watch_disable(struct kgd_dev *kgd); + static int kgd_address_watch_execute(struct kgd_dev *kgd, + unsigned int watch_point_id, +@@ -84,20 +120,61 @@ static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, + static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, + uint8_t vmid); + static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); +-static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); ++static void set_num_of_requests(struct kgd_dev *kgd, ++ uint8_t num_of_requests); ++static int alloc_memory_of_scratch(struct kgd_dev *kgd, 
++ uint64_t va, uint32_t vmid); ++static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, ++ uint8_t element_size, uint8_t index_stride, uint8_t mtype); ++static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, ++ uint32_t page_table_base); ++static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid); ++ ++/* Because of REG_GET_FIELD() being used, we put this function in the ++ * asic specific file. ++ */ ++static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, ++ struct tile_config *config) ++{ ++ struct amdgpu_device *adev = (struct amdgpu_device *)kgd; ++ ++ config->gb_addr_config = adev->gfx.config.gb_addr_config; ++ config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, ++ MC_ARB_RAMCFG, NOOFBANK); ++ config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, ++ MC_ARB_RAMCFG, NOOFRANKS); ++ ++ config->tile_config_ptr = adev->gfx.config.tile_mode_array; ++ config->num_tile_configs = ++ ARRAY_SIZE(adev->gfx.config.tile_mode_array); ++ config->macro_tile_config_ptr = ++ adev->gfx.config.macrotile_mode_array; ++ config->num_macro_tile_configs = ++ ARRAY_SIZE(adev->gfx.config.macrotile_mode_array); ++ ++ return 0; ++} + + static const struct kfd2kgd_calls kfd2kgd = { + .init_gtt_mem_allocation = alloc_gtt_mem, + .free_gtt_mem = free_gtt_mem, +- .get_vmem_size = get_vmem_size, ++ .get_local_mem_info = get_local_mem_info, + .get_gpu_clock_counter = get_gpu_clock_counter, + .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, ++ .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm, ++ .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm, ++ .create_process_gpumem = create_process_gpumem, ++ .destroy_process_gpumem = destroy_process_gpumem, ++ .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir, ++ .open_graphic_handle = open_graphic_handle, + .program_sh_mem_settings = kgd_program_sh_mem_settings, + .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, + .init_pipeline = kgd_init_pipeline, + .init_interrupts = kgd_init_interrupts, + .hqd_load = kgd_hqd_load, + .hqd_sdma_load = kgd_hqd_sdma_load, ++ .hqd_dump = kgd_hqd_dump, ++ .hqd_sdma_dump = kgd_hqd_sdma_dump, + .hqd_is_occupied = kgd_hqd_is_occupied, + .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, + .hqd_destroy = kgd_hqd_destroy, +@@ -111,14 +188,56 @@ static const struct kfd2kgd_calls kfd2kgd = { + .get_atc_vmid_pasid_mapping_valid = + get_atc_vmid_pasid_mapping_valid, + .write_vmid_invalidate_request = write_vmid_invalidate_request, +- .get_fw_version = get_fw_version ++ .invalidate_tlbs = invalidate_tlbs, ++ .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory, ++ .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, ++ .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, ++ .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, ++ .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu, ++ .get_fw_version = get_fw_version, ++ .set_num_of_requests = set_num_of_requests, ++ .get_cu_info = get_cu_info, ++ .alloc_memory_of_scratch = alloc_memory_of_scratch, ++ .write_config_static_mem = write_config_static_mem, ++ .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo, ++ .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel, ++ .set_vm_context_page_table_base = set_vm_context_page_table_base, ++ .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table, ++ .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table, ++ .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info, ++ .import_dmabuf = 
amdgpu_amdkfd_gpuvm_import_dmabuf, ++ .export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf, ++ .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info, ++ .submit_ib = amdgpu_amdkfd_submit_ib, ++ .get_tile_config = amdgpu_amdkfd_get_tile_config, ++ .restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos, ++ .copy_mem_to_mem = amdgpu_amdkfd_copy_mem_to_mem, ++ .get_vram_usage = amdgpu_amdkfd_get_vram_usage + }; + +-struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void) ++struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions() + { + return (struct kfd2kgd_calls *)&kfd2kgd; + } + ++static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, ++ void *vm, struct kgd_mem **mem) ++{ ++ return 0; ++} ++ ++/* Destroys the GPU allocation and frees the kgd_mem structure */ ++static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem) ++{ ++ ++} ++ ++static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, ++ int fd, uint32_t handle, struct kgd_mem **mem) ++{ ++ return 0; ++} ++ + static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) + { + return (struct amdgpu_device *)kgd; +@@ -147,7 +266,7 @@ static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, + { + struct amdgpu_device *adev = get_amdgpu_device(kgd); + +- uint32_t mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; ++ uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; + uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); + + lock_srbm(kgd, mec, pipe, queue_id, 0); +@@ -216,21 +335,28 @@ static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) + uint32_t mec; + uint32_t pipe; + +- mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; ++ mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; + pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); + + lock_srbm(kgd, mec, pipe, 0, 0); + +- WREG32(mmCPC_INT_CNTL, CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK); ++ WREG32(mmCPC_INT_CNTL, CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | ++ CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); + + unlock_srbm(kgd); + + return 0; + } + +-static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m) ++static inline uint32_t get_sdma_base_addr(struct vi_sdma_mqd *m) + { +- return 0; ++ uint32_t retval; ++ ++ retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET + ++ m->sdma_queue_id * KFD_VI_SDMA_QUEUE_OFFSET; ++ pr_debug("sdma base address: 0x%x\n", retval); ++ ++ return retval; + } + + static inline struct vi_mqd *get_mqd(void *mqd) +@@ -238,9 +364,9 @@ static inline struct vi_mqd *get_mqd(void *mqd) + return (struct vi_mqd *)mqd; + } + +-static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) ++static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) + { +- return (struct cik_sdma_rlc_registers *)mqd; ++ return (struct vi_sdma_mqd *)mqd; + } + + static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, +@@ -252,16 +378,18 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, + struct vi_mqd *m; + uint32_t *mqd_hqd; + uint32_t reg, wptr_val, data; ++ bool valid_wptr = false; + + m = get_mqd(mqd); + + acquire_queue(kgd, pipe_id, queue_id); +- /*HIQ is set during driver init period with vmid set to 0. For SRIOV +- * world switching support let the RLC know about the HIQ. +- * +- * Workaround: This causes reboots on CZ. Disable this on CZ, which +- * doesn't support SRIOV anyway. +- */ ++ ++ /* HIQ is set during driver init period with vmid set to 0. 
For SRIOV ++ * world switching support let the RLC know about the HIQ. ++ * ++ * Workaround: This causes reboots on CZ. Disable this on CZ, which ++ * doesn't support SRIOV anyway. ++ */ + if (m->cp_hqd_vmid == 0 && + adev->asic_type != CHIP_CARRIZO) { + uint32_t value, mec, pipe; +@@ -304,7 +432,14 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, + CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); + WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, data); + +- if (read_user_wptr(mm, wptr, wptr_val)) ++ /* read_user_ptr may take the mm->mmap_sem. ++ * release srbm_mutex to avoid circular dependency between ++ * srbm_mutex->mm_sem->reservation_ww_class_mutex->srbm_mutex. ++ */ ++ release_queue(kgd); ++ valid_wptr = read_user_wptr(mm, wptr, wptr_val); ++ acquire_queue(kgd, pipe_id, queue_id); ++ if (valid_wptr) + WREG32(mmCP_HQD_PQ_WPTR, (wptr_val << wptr_shift) & wptr_mask); + + data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); +@@ -315,8 +450,138 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, + return 0; + } + +-static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) ++static int kgd_hqd_dump(struct kgd_dev *kgd, ++ uint32_t pipe_id, uint32_t queue_id, ++ uint32_t (**dump)[2], uint32_t *n_regs) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ uint32_t i = 0, reg; ++#define HQD_N_REGS (54+4) ++#define DUMP_REG(addr) do { \ ++ if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ ++ break; \ ++ (*dump)[i][0] = (addr) << 2; \ ++ (*dump)[i++][1] = RREG32(addr); \ ++ } while (0) ++ ++ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); ++ if (*dump == NULL) ++ return -ENOMEM; ++ ++ acquire_queue(kgd, pipe_id, queue_id); ++ ++ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE0); ++ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE1); ++ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE2); ++ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE3); ++ ++ for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_HQD_EOP_DONES; reg++) ++ DUMP_REG(reg); ++ ++ release_queue(kgd); ++ ++ WARN_ON_ONCE(i != HQD_N_REGS); ++ *n_regs = i; ++ ++ return 0; ++} ++ ++static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, ++ uint32_t __user *wptr, struct mm_struct *mm) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ struct vi_sdma_mqd *m; ++ uint32_t sdma_base_addr; ++ uint32_t temp, timeout = 2000; ++ uint32_t data; ++ ++ m = get_sdma_mqd(mqd); ++ sdma_base_addr = get_sdma_base_addr(m); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, ++ m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); ++ ++ while (true) { ++ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); ++ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) ++ break; ++ if (timeout == 0) ++ return -ETIME; ++ msleep(10); ++ timeout -= 10; ++ } ++ if (m->sdma_engine_id) { ++ data = RREG32(mmSDMA1_GFX_CONTEXT_CNTL); ++ data = REG_SET_FIELD(data, SDMA1_GFX_CONTEXT_CNTL, ++ RESUME_CTX, 0); ++ WREG32(mmSDMA1_GFX_CONTEXT_CNTL, data); ++ } else { ++ data = RREG32(mmSDMA0_GFX_CONTEXT_CNTL); ++ data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, ++ RESUME_CTX, 0); ++ WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data); ++ } ++ ++ data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL, ++ ENABLE, 1); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr); ++ ++ if (read_user_wptr(mm, wptr, data)) ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, data); ++ else ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, ++ m->sdmax_rlcx_rb_rptr); ++ ++ 
WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, ++ m->sdmax_rlcx_virtual_addr); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, ++ m->sdmax_rlcx_rb_base_hi); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, ++ m->sdmax_rlcx_rb_rptr_addr_lo); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, ++ m->sdmax_rlcx_rb_rptr_addr_hi); ++ ++ data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL, ++ RB_ENABLE, 1); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data); ++ ++ return 0; ++} ++ ++static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, ++ uint32_t engine_id, uint32_t queue_id, ++ uint32_t (**dump)[2], uint32_t *n_regs) + { ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ uint32_t sdma_offset = engine_id * SDMA1_REGISTER_OFFSET + ++ queue_id * KFD_VI_SDMA_QUEUE_OFFSET; ++ uint32_t i = 0, reg; ++#undef HQD_N_REGS ++#define HQD_N_REGS (19+4+2+3+7) ++ ++ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); ++ if (*dump == NULL) ++ return -ENOMEM; ++ ++ for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) ++ DUMP_REG(sdma_offset + reg); ++ for (reg = mmSDMA0_RLC0_VIRTUAL_ADDR; reg <= mmSDMA0_RLC0_WATERMARK; ++ reg++) ++ DUMP_REG(sdma_offset + reg); ++ for (reg = mmSDMA0_RLC0_CSA_ADDR_LO; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; ++ reg++) ++ DUMP_REG(sdma_offset + reg); ++ for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; reg <= mmSDMA0_RLC0_DUMMY_REG; ++ reg++) ++ DUMP_REG(sdma_offset + reg); ++ for (reg = mmSDMA0_RLC0_MIDCMD_DATA0; reg <= mmSDMA0_RLC0_MIDCMD_CNTL; ++ reg++) ++ DUMP_REG(sdma_offset + reg); ++ ++ WARN_ON_ONCE(i != HQD_N_REGS); ++ *n_regs = i; ++ + return 0; + } + +@@ -345,7 +610,7 @@ static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, + static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) + { + struct amdgpu_device *adev = get_amdgpu_device(kgd); +- struct cik_sdma_rlc_registers *m; ++ struct vi_sdma_mqd *m; + uint32_t sdma_base_addr; + uint32_t sdma_rlc_rb_cntl; + +@@ -360,29 +625,102 @@ static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) + return false; + } + +-static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, ++static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, ++ enum kfd_preempt_type reset_type, + unsigned int utimeout, uint32_t pipe_id, + uint32_t queue_id) + { + struct amdgpu_device *adev = get_amdgpu_device(kgd); + uint32_t temp; +- int timeout = utimeout; ++ enum hqd_dequeue_request_type type; ++ unsigned long flags, end_jiffies; ++ int retry; ++ struct vi_mqd *m = get_mqd(mqd); + + acquire_queue(kgd, pipe_id, queue_id); + +- WREG32(mmCP_HQD_DEQUEUE_REQUEST, reset_type); ++ if (m->cp_hqd_vmid == 0) ++ WREG32_FIELD(RLC_CP_SCHEDULERS, scheduler1, 0); + ++ switch (reset_type) { ++ case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: ++ type = DRAIN_PIPE; ++ break; ++ case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: ++ type = RESET_WAVES; ++ break; ++ default: ++ type = DRAIN_PIPE; ++ break; ++ } ++ ++ /* Workaround: If IQ timer is active and the wait time is close to or ++ * equal to 0, dequeueing is not safe. Wait until either the wait time ++ * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is ++ * cleared before continuing. Also, ensure wait times are set to at ++ * least 0x3. 
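++ * (Note: unlike the GFX7 variant, the VI path above first clears the ++ * HIQ's RLC_CP_SCHEDULERS scheduler1 entry for vmid 0 before issuing ++ * the dequeue request.)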
++ */ ++ local_irq_save(flags); ++ preempt_disable(); ++ retry = 5000; /* wait for 500 usecs at maximum */ ++ while (true) { ++ temp = RREG32(mmCP_HQD_IQ_TIMER); ++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) { ++ pr_debug("HW is processing IQ\n"); ++ goto loop; ++ } ++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) { ++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE) ++ == 3) /* SEM-rearm is safe */ ++ break; ++ /* Wait time 3 is safe for CP, but our MMIO read/write ++ * time is close to 1 microsecond, so check for 10 to ++ * leave more buffer room ++ */ ++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME) ++ >= 10) ++ break; ++ pr_debug("IQ timer is active\n"); ++ } else ++ break; ++loop: ++ if (!retry) { ++ pr_err("CP HQD IQ timer status time out\n"); ++ break; ++ } ++ ndelay(100); ++ --retry; ++ } ++ retry = 1000; ++ while (true) { ++ temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST); ++ if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK)) ++ break; ++ pr_debug("Dequeue request is pending\n"); ++ ++ if (!retry) { ++ pr_err("CP HQD dequeue request time out\n"); ++ break; ++ } ++ ndelay(100); ++ --retry; ++ } ++ local_irq_restore(flags); ++ preempt_enable(); ++ ++ WREG32(mmCP_HQD_DEQUEUE_REQUEST, type); ++ ++ end_jiffies = (utimeout * HZ / 1000) + jiffies; + while (true) { + temp = RREG32(mmCP_HQD_ACTIVE); +- if (temp & CP_HQD_ACTIVE__ACTIVE_MASK) ++ if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) + break; +- if (timeout <= 0) { +- pr_err("kfd: cp queue preemption time out.\n"); ++ if (time_after(jiffies, end_jiffies)) { ++ pr_err("cp queue preemption time out.\n"); + release_queue(kgd); + return -ETIME; + } +- msleep(20); +- timeout -= 20; ++ usleep_range(500, 1000); + } + + release_queue(kgd); +@@ -393,10 +731,10 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, + unsigned int utimeout) + { + struct amdgpu_device *adev = get_amdgpu_device(kgd); +- struct cik_sdma_rlc_registers *m; ++ struct vi_sdma_mqd *m; + uint32_t sdma_base_addr; + uint32_t temp; +- int timeout = utimeout; ++ unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; + + m = get_sdma_mqd(mqd); + sdma_base_addr = get_sdma_base_addr(m); +@@ -407,18 +745,19 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, + + while (true) { + temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); +- if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT) ++ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) + break; +- if (timeout <= 0) ++ if (time_after(jiffies, end_jiffies)) + return -ETIME; +- msleep(20); +- timeout -= 20; ++ usleep_range(500, 1000); + } + + WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, 0); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, ++ RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | ++ SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); ++ ++ m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); + + return 0; + } +@@ -440,7 +779,7 @@ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + + reg = RREG32(mmATC_VMID0_PASID_MAPPING + vmid); +- return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; ++ return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; + } + + static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) +@@ -450,8 +789,83 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) + 
WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); + } + ++/* ++ * FIXME: The Polaris test failed with this packet while Fiji works fine. ++ * The CP spec does not officially support invalidation with a specified ++ * pasid in the packet, so disable it for V8. ++ * ++ */ ++#ifdef V8_SUPPORT_IT_OFFICIAL ++static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid) ++{ ++ signed long r; ++ struct dma_fence *f; ++ struct amdgpu_ring *ring = &adev->gfx.kiq.ring; ++ ++ mutex_lock(&adev->gfx.kiq.ring_mutex); ++ amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs packet */ ++ amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0)); ++ amdgpu_ring_write(ring, ++ PACKET3_INVALIDATE_TLBS_DST_SEL(1) | ++ PACKET3_INVALIDATE_TLBS_PASID(pasid)); ++ amdgpu_fence_emit(ring, &f); ++ amdgpu_ring_commit(ring); ++ mutex_unlock(&adev->gfx.kiq.ring_mutex); ++ ++ r = dma_fence_wait(f, false); ++ if (r) ++ DRM_ERROR("wait for kiq fence error: %ld.\n", r); ++ dma_fence_put(f); ++ ++ return r; ++} ++#endif ++static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid) ++{ ++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; ++ int vmid; ++ ++#ifdef V8_SUPPORT_IT_OFFICIAL ++ struct amdgpu_ring *ring = &adev->gfx.kiq.ring; ++ ++ if (ring->ready) ++ return invalidate_tlbs_with_kiq(adev, pasid); ++#endif ++ ++ for (vmid = 0; vmid < 16; vmid++) { ++ if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) ++ continue; ++ if (RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & ++ ATC_VMID0_PASID_MAPPING__VALID_MASK) { ++ if ((RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & ++ ATC_VMID0_PASID_MAPPING__PASID_MASK) == pasid) { ++ WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); ++ break; ++ } ++ } ++ } ++ ++ return 0; ++} ++ + static int kgd_address_watch_disable(struct kgd_dev *kgd) + { ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ union TCP_WATCH_CNTL_BITS cntl; ++ unsigned int i; ++ ++ cntl.u32All = 0; ++ ++ cntl.bitfields.valid = 0; ++ cntl.bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK; ++ cntl.bitfields.atc = 1; ++ ++ /* Turning off this address until we set all the registers */ ++ for (i = 0; i < MAX_WATCH_ADDRESSES; i++) ++ WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX ++ + ADDRESS_WATCH_REG_CNTL], ++ cntl.u32All); ++ + return 0; + } + +@@ -461,6 +875,32 @@ static int kgd_address_watch_execute(struct kgd_dev *kgd, + uint32_t addr_hi, + uint32_t addr_lo) + { ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ union TCP_WATCH_CNTL_BITS cntl; ++ ++ cntl.u32All = cntl_val; ++ ++ /* Turning off this watch point until we set all the registers */ ++ cntl.bitfields.valid = 0; ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX ++ + ADDRESS_WATCH_REG_CNTL], ++ cntl.u32All); ++ ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX ++ + ADDRESS_WATCH_REG_ADDR_HI], ++ addr_hi); ++ ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX ++ + ADDRESS_WATCH_REG_ADDR_LO], ++ addr_lo); ++ ++ /* Enable the watch point */ ++ cntl.bitfields.valid = 1; ++ ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX ++ + ADDRESS_WATCH_REG_CNTL], ++ cntl.u32All); ++ + return 0; + } + +@@ -493,6 +933,32 @@ static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, + unsigned int watch_point_id, + unsigned int reg_offset) + { ++ return watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + reg_offset]; ++} ++ ++static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, ++ uint8_t element_size, uint8_t index_stride, uint8_t mtype) ++{ ++ uint32_t reg; ++ struct amdgpu_device *adev 
= (struct amdgpu_device *) kgd; ++ ++ reg = swizzle_enable << SH_STATIC_MEM_CONFIG__SWIZZLE_ENABLE__SHIFT | ++ element_size << SH_STATIC_MEM_CONFIG__ELEMENT_SIZE__SHIFT | ++ index_stride << SH_STATIC_MEM_CONFIG__INDEX_STRIDE__SHIFT | ++ mtype << SH_STATIC_MEM_CONFIG__PRIVATE_MTYPE__SHIFT; ++ ++ WREG32(mmSH_STATIC_MEM_CONFIG, reg); ++ return 0; ++} ++static int alloc_memory_of_scratch(struct kgd_dev *kgd, ++ uint64_t va, uint32_t vmid) ++{ ++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; ++ ++ lock_srbm(kgd, 0, 0, 0, vmid); ++ WREG32(mmSH_HIDDEN_PRIVATE_BASE_VMID, va); ++ unlock_srbm(kgd); ++ + return 0; + } + +@@ -501,47 +967,45 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + const union amdgpu_firmware_header *hdr; + +- BUG_ON(kgd == NULL); +- + switch (type) { + case KGD_ENGINE_PFP: + hdr = (const union amdgpu_firmware_header *) +- adev->gfx.pfp_fw->data; ++ adev->gfx.pfp_fw->data; + break; + + case KGD_ENGINE_ME: + hdr = (const union amdgpu_firmware_header *) +- adev->gfx.me_fw->data; ++ adev->gfx.me_fw->data; + break; + + case KGD_ENGINE_CE: + hdr = (const union amdgpu_firmware_header *) +- adev->gfx.ce_fw->data; ++ adev->gfx.ce_fw->data; + break; + + case KGD_ENGINE_MEC1: + hdr = (const union amdgpu_firmware_header *) +- adev->gfx.mec_fw->data; ++ adev->gfx.mec_fw->data; + break; + + case KGD_ENGINE_MEC2: + hdr = (const union amdgpu_firmware_header *) +- adev->gfx.mec2_fw->data; ++ adev->gfx.mec2_fw->data; + break; + + case KGD_ENGINE_RLC: + hdr = (const union amdgpu_firmware_header *) +- adev->gfx.rlc_fw->data; ++ adev->gfx.rlc_fw->data; + break; + + case KGD_ENGINE_SDMA1: + hdr = (const union amdgpu_firmware_header *) +- adev->sdma.instance[0].fw->data; ++ adev->sdma.instance[0].fw->data; + break; + + case KGD_ENGINE_SDMA2: + hdr = (const union amdgpu_firmware_header *) +- adev->sdma.instance[1].fw->data; ++ adev->sdma.instance[1].fw->data; + break; + + default: +@@ -554,3 +1018,21 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) + /* Only 12 bit in use*/ + return hdr->common.ucode_version; + } ++ ++static void set_num_of_requests(struct kgd_dev *kgd, ++ uint8_t num_of_requests) ++{ ++ pr_debug("This is a stub\n"); ++} ++ ++static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, ++ uint32_t page_table_base) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ /* TODO: Don't use hardcoded VMIDs */ ++ if (vmid < 8 || vmid > 15) { ++ pr_err("trying to set page table base for wrong VMID\n"); ++ return; ++ } ++ WREG32(mmVM_CONTEXT8_PAGE_TABLE_BASE_ADDR + vmid - 8, page_table_base); ++} +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h +new file mode 100644 +index 0000000..3c94919 +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h +@@ -0,0 +1,62 @@ ++/* ++ * Copyright 2015 Advanced Micro Devices, Inc. 
++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#ifndef AMDGPU_AMDKFD_GFX_V8_H_INCLUDED ++#define AMDGPU_AMDKFD_GFX_V8_H_INCLUDED ++ ++#include <linux/types.h> ++ ++enum { ++ MAX_TRAPID = 8, /* 3 bits in the bitfield. */ ++ MAX_WATCH_ADDRESSES = 4 ++}; ++ ++enum { ++ ADDRESS_WATCH_REG_ADDR_HI = 0, ++ ADDRESS_WATCH_REG_ADDR_LO, ++ ADDRESS_WATCH_REG_CNTL, ++ ADDRESS_WATCH_REG_MAX ++}; ++ ++/* not defined in the VI reg file */ ++enum { ++ ADDRESS_WATCH_REG_CNTL_ATC_BIT = 0x10000000UL, ++ ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK = 0x00FFFFFF, ++ ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENSION = 0x03000000, ++ /* extend the mask to 26 bits in order to match the low address field */ ++ ADDRESS_WATCH_REG_ADDLOW_SHIFT = 6, ++ ADDRESS_WATCH_REG_ADDHIGH_MASK = 0xFFFF ++}; ++ ++union TCP_WATCH_CNTL_BITS { ++ struct { ++ uint32_t mask:24; ++ uint32_t vmid:4; ++ uint32_t atc:1; ++ uint32_t mode:2; ++ uint32_t valid:1; ++ } bitfields, bits; ++ uint32_t u32All; ++ signed int i32All; ++ float f32All; ++}; ++#endif +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +new file mode 100644 +index 0000000..edbae19 +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +@@ -0,0 +1,1227 @@ ++/* ++ * Copyright 2014 Advanced Micro Devices, Inc. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. 
++ */ ++#undef pr_fmt ++#define pr_fmt(fmt) "kfd2kgd: " fmt ++ ++#include <linux/module.h> ++#include <linux/fdtable.h> ++#include <linux/uaccess.h> ++#include <linux/firmware.h> ++#include <drm/drmP.h> ++#include "amdgpu.h" ++#include "amdgpu_amdkfd.h" ++#include "amdgpu_ucode.h" ++#include "amdgpu_amdkfd_gfx_v8.h" ++#include "vega10/soc15ip.h" ++#include "vega10/GC/gc_9_0_offset.h" ++#include "vega10/GC/gc_9_0_sh_mask.h" ++#include "vega10/vega10_enum.h" ++#include "vega10/SDMA0/sdma0_4_0_offset.h" ++#include "vega10/SDMA0/sdma0_4_0_sh_mask.h" ++#include "vega10/SDMA1/sdma1_4_0_offset.h" ++#include "vega10/SDMA1/sdma1_4_0_sh_mask.h" ++#include "vega10/ATHUB/athub_1_0_offset.h" ++#include "vega10/ATHUB/athub_1_0_sh_mask.h" ++#include "vega10/OSSSYS/osssys_4_0_offset.h" ++#include "vega10/OSSSYS/osssys_4_0_sh_mask.h" ++#include "soc15_common.h" ++#include "v9_structs.h" ++#include "soc15.h" ++#include "soc15d.h" ++ ++/* HACK: MMHUB and GC both have VM-related register with the same ++ * names but different offsets. Define the MMHUB register we need here ++ * with a prefix. A proper solution would be to move the functions ++ * programming these registers into gfx_v9_0.c and mmhub_v1_0.c ++ * respectively. ++ */ ++#define mmMMHUB_VM_INVALIDATE_ENG16_REQ 0x06f3 ++#define mmMMHUB_VM_INVALIDATE_ENG16_REQ_BASE_IDX 0 ++ ++#define mmMMHUB_VM_INVALIDATE_ENG16_ACK 0x0705 ++#define mmMMHUB_VM_INVALIDATE_ENG16_ACK_BASE_IDX 0 ++ ++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32 0x072b ++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32_BASE_IDX 0 ++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32 0x072c ++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32_BASE_IDX 0 ++ ++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32 0x074b ++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32_BASE_IDX 0 ++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32 0x074c ++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32_BASE_IDX 0 ++ ++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32 0x076b ++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32_BASE_IDX 0 ++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32 0x076c ++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32_BASE_IDX 0 ++ ++#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32 0x0727 ++#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32_BASE_IDX 0 ++#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32 0x0728 ++#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32_BASE_IDX 0 ++ ++enum hqd_dequeue_request_type { ++ NO_ACTION = 0, ++ DRAIN_PIPE, ++ RESET_WAVES, ++ SAVE_WAVES ++}; ++ ++static const uint32_t watchRegs[MAX_WATCH_ADDRESSES * ADDRESS_WATCH_REG_MAX] = { ++ mmTCP_WATCH0_ADDR_H, mmTCP_WATCH0_ADDR_L, mmTCP_WATCH0_CNTL, ++ mmTCP_WATCH1_ADDR_H, mmTCP_WATCH1_ADDR_L, mmTCP_WATCH1_CNTL, ++ mmTCP_WATCH2_ADDR_H, mmTCP_WATCH2_ADDR_L, mmTCP_WATCH2_CNTL, ++ mmTCP_WATCH3_ADDR_H, mmTCP_WATCH3_ADDR_L, mmTCP_WATCH3_CNTL ++}; ++ ++ ++static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, ++ void *vm, struct kgd_mem **mem); ++static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem); ++ ++static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, ++ int fd, uint32_t handle, struct kgd_mem **mem); ++ ++static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); ++ ++/* ++ * Register access functions ++ */ ++ ++static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, ++ uint32_t sh_mem_config, ++ uint32_t sh_mem_ape1_base, uint32_t 
sh_mem_ape1_limit, ++ uint32_t sh_mem_bases); ++static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, ++ unsigned int vmid); ++static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, ++ uint32_t hpd_size, uint64_t hpd_gpu_addr); ++static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); ++static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, ++ uint32_t queue_id, uint32_t __user *wptr, ++ uint32_t wptr_shift, uint32_t wptr_mask, ++ struct mm_struct *mm); ++static int kgd_hqd_dump(struct kgd_dev *kgd, ++ uint32_t pipe_id, uint32_t queue_id, ++ uint32_t (**dump)[2], uint32_t *n_regs); ++static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, ++ uint32_t __user *wptr, struct mm_struct *mm); ++static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, ++ uint32_t engine_id, uint32_t queue_id, ++ uint32_t (**dump)[2], uint32_t *n_regs); ++static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, ++ uint32_t pipe_id, uint32_t queue_id); ++static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); ++static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, ++ enum kfd_preempt_type reset_type, ++ unsigned int utimeout, uint32_t pipe_id, ++ uint32_t queue_id); ++static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, ++ unsigned int utimeout); ++static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); ++static uint32_t get_watch_base_addr(void); ++static int kgd_address_watch_disable(struct kgd_dev *kgd); ++static int kgd_address_watch_execute(struct kgd_dev *kgd, ++ unsigned int watch_point_id, ++ uint32_t cntl_val, ++ uint32_t addr_hi, ++ uint32_t addr_lo); ++static int kgd_wave_control_execute(struct kgd_dev *kgd, ++ uint32_t gfx_index_val, ++ uint32_t sq_cmd); ++static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, ++ unsigned int watch_point_id, ++ unsigned int reg_offset); ++ ++static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, ++ uint8_t vmid); ++static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, ++ uint8_t vmid); ++static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); ++static void set_num_of_requests(struct kgd_dev *kgd, ++ uint8_t num_of_requests); ++static int alloc_memory_of_scratch(struct kgd_dev *kgd, ++ uint64_t va, uint32_t vmid); ++static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, ++ uint8_t element_size, uint8_t index_stride, uint8_t mtype); ++static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, ++ uint32_t page_table_base); ++static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid); ++ ++/* Because of REG_GET_FIELD() being used, we put this function in the ++ * asic specific file. ++ */ ++static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, ++ struct tile_config *config) ++{ ++ struct amdgpu_device *adev = (struct amdgpu_device *)kgd; ++ ++ config->gb_addr_config = adev->gfx.config.gb_addr_config; ++#if 0 ++/* TODO - confirm REG_GET_FIELD x2, should be OK as is... 
but ++ * MC_ARB_RAMCFG register doesn't exist on Vega10 - initial amdgpu ++ * changes commented out related code, doing the same here for now but ++ * need to sync with Ken et al ++ */ ++ config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, ++ MC_ARB_RAMCFG, NOOFBANK); ++ config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, ++ MC_ARB_RAMCFG, NOOFRANKS); ++#endif ++ ++ config->tile_config_ptr = adev->gfx.config.tile_mode_array; ++ config->num_tile_configs = ++ ARRAY_SIZE(adev->gfx.config.tile_mode_array); ++ config->macro_tile_config_ptr = ++ adev->gfx.config.macrotile_mode_array; ++ config->num_macro_tile_configs = ++ ARRAY_SIZE(adev->gfx.config.macrotile_mode_array); ++ ++ return 0; ++} ++ ++static const struct kfd2kgd_calls kfd2kgd = { ++ .init_gtt_mem_allocation = alloc_gtt_mem, ++ .free_gtt_mem = free_gtt_mem, ++ .get_local_mem_info = get_local_mem_info, ++ .get_gpu_clock_counter = get_gpu_clock_counter, ++ .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, ++ .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm, ++ .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm, ++ .create_process_gpumem = create_process_gpumem, ++ .destroy_process_gpumem = destroy_process_gpumem, ++ .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir, ++ .open_graphic_handle = open_graphic_handle, ++ .program_sh_mem_settings = kgd_program_sh_mem_settings, ++ .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, ++ .init_pipeline = kgd_init_pipeline, ++ .init_interrupts = kgd_init_interrupts, ++ .hqd_load = kgd_hqd_load, ++ .hqd_sdma_load = kgd_hqd_sdma_load, ++ .hqd_dump = kgd_hqd_dump, ++ .hqd_sdma_dump = kgd_hqd_sdma_dump, ++ .hqd_is_occupied = kgd_hqd_is_occupied, ++ .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, ++ .hqd_destroy = kgd_hqd_destroy, ++ .hqd_sdma_destroy = kgd_hqd_sdma_destroy, ++ .address_watch_disable = kgd_address_watch_disable, ++ .address_watch_execute = kgd_address_watch_execute, ++ .wave_control_execute = kgd_wave_control_execute, ++ .address_watch_get_offset = kgd_address_watch_get_offset, ++ .get_atc_vmid_pasid_mapping_pasid = ++ get_atc_vmid_pasid_mapping_pasid, ++ .get_atc_vmid_pasid_mapping_valid = ++ get_atc_vmid_pasid_mapping_valid, ++ .write_vmid_invalidate_request = write_vmid_invalidate_request, ++ .invalidate_tlbs = invalidate_tlbs, ++ .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory, ++ .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, ++ .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, ++ .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, ++ .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu, ++ .get_fw_version = get_fw_version, ++ .set_num_of_requests = set_num_of_requests, ++ .get_cu_info = get_cu_info, ++ .alloc_memory_of_scratch = alloc_memory_of_scratch, ++ .write_config_static_mem = write_config_static_mem, ++ .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo, ++ .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel, ++ .set_vm_context_page_table_base = set_vm_context_page_table_base, ++ .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table, ++ .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table, ++ .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info, ++ .import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf, ++ .export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf, ++ .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info, ++ .submit_ib = amdgpu_amdkfd_submit_ib, ++ .get_tile_config = amdgpu_amdkfd_get_tile_config, ++ .restore_process_bos = 
amdgpu_amdkfd_gpuvm_restore_process_bos, ++ .copy_mem_to_mem = amdgpu_amdkfd_copy_mem_to_mem, ++ .get_vram_usage = amdgpu_amdkfd_get_vram_usage ++}; ++ ++struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions() ++{ ++ return (struct kfd2kgd_calls *)&kfd2kgd; ++} ++ ++static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, ++ void *vm, struct kgd_mem **mem) ++{ ++ return 0; ++} ++ ++/* Destroys the GPU allocation and frees the kgd_mem structure */ ++static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem) ++{ ++ ++} ++ ++static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, ++ int fd, uint32_t handle, struct kgd_mem **mem) ++{ ++ return 0; ++} ++ ++static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) ++{ ++ return (struct amdgpu_device *)kgd; ++} ++ ++static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe, ++ uint32_t queue, uint32_t vmid) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ ++ mutex_lock(&adev->srbm_mutex); ++ soc15_grbm_select(adev, mec, pipe, queue, vmid); ++} ++ ++static void unlock_srbm(struct kgd_dev *kgd) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ ++ soc15_grbm_select(adev, 0, 0, 0, 0); ++ mutex_unlock(&adev->srbm_mutex); ++} ++ ++static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, ++ uint32_t queue_id) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ ++ uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; ++ uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); ++ ++ lock_srbm(kgd, mec, pipe, queue_id, 0); ++} ++ ++static uint32_t get_queue_mask(struct amdgpu_device *adev, ++ uint32_t pipe_id, uint32_t queue_id) ++{ ++ unsigned int bit = (pipe_id * adev->gfx.mec.num_pipe_per_mec + ++ queue_id) & 31; ++ ++ return ((uint32_t)1) << bit; ++} ++ ++static void release_queue(struct kgd_dev *kgd) ++{ ++ unlock_srbm(kgd); ++} ++ ++static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, ++ uint32_t sh_mem_config, ++ uint32_t sh_mem_ape1_base, ++ uint32_t sh_mem_ape1_limit, ++ uint32_t sh_mem_bases) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ ++ lock_srbm(kgd, 0, 0, 0, vmid); ++ ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config); ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases); ++ /* APE1 no longer exists on GFX9 */ ++ ++ unlock_srbm(kgd); ++} ++ ++static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, ++ unsigned int vmid) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ ++ /* ++ * We have to assume that there is no outstanding mapping. ++ * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because ++ * a mapping is in progress or because a mapping finished ++ * and the SW cleared it. ++ * So the protocol is to always wait & clear. ++ */ ++ uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | ++ ATC_VMID0_PASID_MAPPING__VALID_MASK; ++ ++ /* ++ * need to do this twice, once for gfx and once for mmhub ++ * for ATC add 16 to VMID for mmhub, for IH different registers. ++ * ATC_VMID0..15 registers are separate from ATC_VMID16..31. 
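++ * Concretely, the writes below map ATC_VMID0_PASID_MAPPING + vmid for ++ * GC (update-status bit vmid) and ATC_VMID16_PASID_MAPPING + vmid for ++ * MMHUB (update-status bit vmid + 16), mirroring each mapping into ++ * IH_VMID_0_LUT and IH_VMID_0_LUT_MM for the IH block.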
++ */ ++ ++ WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid, ++ pasid_mapping); ++ ++ while (!(RREG32(SOC15_REG_OFFSET( ++ ATHUB, 0, ++ mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & ++ (1U << vmid))) ++ cpu_relax(); ++ ++ WREG32(SOC15_REG_OFFSET(ATHUB, 0, ++ mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), ++ 1U << vmid); ++ ++ /* Mapping vmid to pasid also for IH block */ ++ WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid, ++ pasid_mapping); ++ ++ WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid, ++ pasid_mapping); ++ ++ while (!(RREG32(SOC15_REG_OFFSET( ++ ATHUB, 0, ++ mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & ++ (1U << (vmid + 16)))) ++ cpu_relax(); ++ ++ WREG32(SOC15_REG_OFFSET(ATHUB, 0, ++ mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), ++ 1U << (vmid + 16)); ++ ++ /* Mapping vmid to pasid also for IH block */ ++ WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid, ++ pasid_mapping); ++ return 0; ++} ++ ++static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, ++ uint32_t hpd_size, uint64_t hpd_gpu_addr) ++{ ++ /* amdgpu owns the per-pipe state */ ++ return 0; ++} ++ ++/* TODO - RING0 form of field is obsolete, seems to date back to SI ++ * but still works ++ */ ++ ++static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ uint32_t mec; ++ uint32_t pipe; ++ ++ mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; ++ pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); ++ ++ lock_srbm(kgd, mec, pipe, 0, 0); ++ ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCPC_INT_CNTL), ++ CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | ++ CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); ++ ++ unlock_srbm(kgd); ++ ++ return 0; ++} ++ ++static uint32_t get_sdma_base_addr(unsigned int engine_id, ++ unsigned int queue_id) ++{ ++ static const uint32_t base[2] = { ++ SOC15_REG_OFFSET(SDMA0, 0, ++ mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL, ++ SOC15_REG_OFFSET(SDMA1, 0, ++ mmSDMA1_RLC0_RB_CNTL) - mmSDMA1_RLC0_RB_CNTL ++ }; ++ uint32_t retval; ++ ++ retval = base[engine_id] + queue_id * (mmSDMA0_RLC1_RB_CNTL - ++ mmSDMA0_RLC0_RB_CNTL); ++ ++ pr_debug("sdma base address: 0x%x\n", retval); ++ ++ return retval; ++} ++ ++static uint32_t get_watch_base_addr(void) ++{ ++ uint32_t retval = SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) - ++ mmTCP_WATCH0_ADDR_H; ++ ++ pr_debug("kfd: reg watch base address: 0x%x\n", retval); ++ ++ return retval; ++} ++ ++static inline struct v9_mqd *get_mqd(void *mqd) ++{ ++ return (struct v9_mqd *)mqd; ++} ++ ++static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) ++{ ++ return (struct v9_sdma_mqd *)mqd; ++} ++ ++static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, ++ uint32_t queue_id, uint32_t __user *wptr, ++ uint32_t wptr_shift, uint32_t wptr_mask, ++ struct mm_struct *mm) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ struct v9_mqd *m; ++ uint32_t *mqd_hqd; ++ uint32_t reg, hqd_base, data; ++ ++ m = get_mqd(mqd); ++ ++ acquire_queue(kgd, pipe_id, queue_id); ++ ++ /* HIQ is set during driver init period with vmid set to 0*/ ++ if (m->cp_hqd_vmid == 0) { ++ uint32_t value, mec, pipe; ++ ++ mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; ++ pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); ++ ++ pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n", ++ mec, pipe, queue_id); ++ value = RREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS)); ++ value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1, ++ ((mec << 5) | 
(pipe << 3) | queue_id | 0x80)); ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS), value); ++ } ++ ++ /* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */ ++ mqd_hqd = &m->cp_mqd_base_addr_lo; ++ hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); ++ ++ for (reg = hqd_base; ++ reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) ++ WREG32(reg, mqd_hqd[reg - hqd_base]); ++ ++ ++ /* Activate doorbell logic before triggering WPTR poll. */ ++ data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control, ++ CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data); ++ ++ if (wptr) { ++ /* Don't read wptr with get_user because the user ++ * context may not be accessible (if this function ++ * runs in a work queue). Instead trigger a one-shot ++ * polling read from memory in the CP. This assumes ++ * that wptr is GPU-accessible in the queue's VMID via ++ * ATC or SVM. WPTR==RPTR before starting the poll so ++ * the CP starts fetching new commands from the right ++ * place. ++ * ++ * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit ++ * tricky. Assume that the queue didn't overflow. The ++ * number of valid bits in the 32-bit RPTR depends on ++ * the queue size. The remaining bits are taken from ++ * the saved 64-bit WPTR. If the WPTR wrapped, add the ++ * queue size. ++ */ ++ uint32_t queue_size = ++ 2 << REG_GET_FIELD(m->cp_hqd_pq_control, ++ CP_HQD_PQ_CONTROL, QUEUE_SIZE); ++ uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1); ++ ++ if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr) ++ guessed_wptr += queue_size; ++ guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1); ++ guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32; ++ ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO), ++ lower_32_bits(guessed_wptr)); ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI), ++ upper_32_bits(guessed_wptr)); ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR), ++ lower_32_bits((uint64_t)wptr)); ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI), ++ upper_32_bits((uint64_t)wptr)); ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1), ++ get_queue_mask(adev, pipe_id, queue_id)); ++ } ++ ++ /* Start the EOP fetcher */ ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR), ++ REG_SET_FIELD(m->cp_hqd_eop_rptr, ++ CP_HQD_EOP_RPTR, INIT_FETCHER, 1)); ++ ++ data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data); ++ ++ release_queue(kgd); ++ ++ return 0; ++} ++ ++static int kgd_hqd_dump(struct kgd_dev *kgd, ++ uint32_t pipe_id, uint32_t queue_id, ++ uint32_t (**dump)[2], uint32_t *n_regs) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ uint32_t i = 0, reg; ++#define HQD_N_REGS 56 ++#define DUMP_REG(addr) do { \ ++ if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ ++ break; \ ++ (*dump)[i][0] = (addr) << 2; \ ++ (*dump)[i++][1] = RREG32(addr); \ ++ } while (0) ++ ++ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); ++ if (*dump == NULL) ++ return -ENOMEM; ++ ++ acquire_queue(kgd, pipe_id, queue_id); ++ ++ for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); ++ reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) ++ DUMP_REG(reg); ++ ++ release_queue(kgd); ++ ++ WARN_ON_ONCE(i != HQD_N_REGS); ++ *n_regs = i; ++ ++ return 0; ++} ++ ++static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, ++ uint32_t __user *wptr, struct mm_struct *mm) ++{ ++ struct amdgpu_device *adev = 
get_amdgpu_device(kgd); ++ struct v9_sdma_mqd *m; ++ uint32_t sdma_base_addr, sdmax_gfx_context_cntl; ++ uint32_t temp, timeout = 2000; ++ uint32_t data; ++ uint64_t data64; ++ uint64_t __user *wptr64 = (uint64_t __user *)wptr; ++ ++ m = get_sdma_mqd(mqd); ++ sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id, ++ m->sdma_queue_id); ++ sdmax_gfx_context_cntl = m->sdma_engine_id ? ++ SOC15_REG_OFFSET(SDMA1, 0, mmSDMA1_GFX_CONTEXT_CNTL) : ++ SOC15_REG_OFFSET(SDMA0, 0, mmSDMA0_GFX_CONTEXT_CNTL); ++ ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, ++ m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); ++ ++ while (true) { ++ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); ++ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) ++ break; ++ if (timeout == 0) ++ return -ETIME; ++ msleep(10); ++ timeout -= 10; ++ } ++ data = RREG32(sdmax_gfx_context_cntl); ++ data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, ++ RESUME_CTX, 0); ++ WREG32(sdmax_gfx_context_cntl, data); ++ ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL_OFFSET, ++ m->sdmax_rlcx_doorbell_offset); ++ ++ data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL, ++ ENABLE, 1); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI, ++ m->sdmax_rlcx_rb_rptr_hi); ++ ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1); ++ if (read_user_wptr(mm, wptr64, data64)) { ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, ++ lower_32_bits(data64)); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI, ++ upper_32_bits(data64)); ++ } else { ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, ++ m->sdmax_rlcx_rb_rptr); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI, ++ m->sdmax_rlcx_rb_rptr_hi); ++ } ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0); ++ ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, ++ m->sdmax_rlcx_rb_base_hi); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, ++ m->sdmax_rlcx_rb_rptr_addr_lo); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, ++ m->sdmax_rlcx_rb_rptr_addr_hi); ++ ++ data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL, ++ RB_ENABLE, 1); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data); ++ ++ return 0; ++} ++ ++static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, ++ uint32_t engine_id, uint32_t queue_id, ++ uint32_t (**dump)[2], uint32_t *n_regs) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ uint32_t sdma_base_addr = get_sdma_base_addr(engine_id, queue_id); ++ uint32_t i = 0, reg; ++#undef HQD_N_REGS ++#define HQD_N_REGS (19+6+7+10) ++ ++ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); ++ if (*dump == NULL) ++ return -ENOMEM; ++ ++ for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) ++ DUMP_REG(sdma_base_addr + reg); ++ for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++) ++ DUMP_REG(sdma_base_addr + reg); ++ for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; ++ reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++) ++ DUMP_REG(sdma_base_addr + reg); ++ for (reg = mmSDMA0_RLC0_MIDCMD_DATA0; ++ reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++) ++ DUMP_REG(sdma_base_addr + reg); ++ ++ WARN_ON_ONCE(i != HQD_N_REGS); ++ *n_regs = i; ++ ++ return 0; ++} ++ ++static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, ++ uint32_t pipe_id, uint32_t queue_id) ++{ ++ struct amdgpu_device *adev = 
get_amdgpu_device(kgd); ++ uint32_t act; ++ bool retval = false; ++ uint32_t low, high; ++ ++ acquire_queue(kgd, pipe_id, queue_id); ++ act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); ++ if (act) { ++ low = lower_32_bits(queue_address >> 8); ++ high = upper_32_bits(queue_address >> 8); ++ ++ if (low == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE)) && ++ high == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE_HI))) ++ retval = true; ++ } ++ release_queue(kgd); ++ return retval; ++} ++ ++static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ struct v9_sdma_mqd *m; ++ uint32_t sdma_base_addr; ++ uint32_t sdma_rlc_rb_cntl; ++ ++ m = get_sdma_mqd(mqd); ++ sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id, ++ m->sdma_queue_id); ++ ++ sdma_rlc_rb_cntl = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL); ++ ++ if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK) ++ return true; ++ ++ return false; ++} ++ ++static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, ++ enum kfd_preempt_type reset_type, ++ unsigned int utimeout, uint32_t pipe_id, ++ uint32_t queue_id) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ enum hqd_dequeue_request_type type; ++ unsigned long end_jiffies; ++ uint32_t temp; ++ struct v9_mqd *m = get_mqd(mqd); ++ ++#if 0 ++ unsigned long flags; ++ int retry; ++#endif ++ ++ acquire_queue(kgd, pipe_id, queue_id); ++ ++ if (m->cp_hqd_vmid == 0) ++ WREG32_FIELD15(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0); ++ ++ switch (reset_type) { ++ case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: ++ type = DRAIN_PIPE; ++ break; ++ case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: ++ type = RESET_WAVES; ++ break; ++ default: ++ type = DRAIN_PIPE; ++ break; ++ } ++ ++#if 0 /* Is this still needed? */ ++ /* Workaround: If IQ timer is active and the wait time is close to or ++ * equal to 0, dequeueing is not safe. Wait until either the wait time ++ * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is ++ * cleared before continuing. Also, ensure wait times are set to at ++ * least 0x3. 
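++	 *
++	 * For scale (derived from the retry counts used below): each
++	 * retry is one ndelay(100), i.e. 100 ns of busy-waiting, so the
++	 * loops are bounded at roughly
++	 *   IQ-timer wait:     5000 retries * 100 ns = 500 usecs
++	 *   IQ_REQ_PEND wait:  1000 retries * 100 ns = 100 usecs
++	 * both with IRQs off and preemption disabled, which is why the
++	 * budgets are kept this small.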
++ */ ++ local_irq_save(flags); ++ preempt_disable(); ++ retry = 5000; /* wait for 500 usecs at maximum */ ++ while (true) { ++ temp = RREG32(mmCP_HQD_IQ_TIMER); ++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) { ++ pr_debug("HW is processing IQ\n"); ++ goto loop; ++ } ++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) { ++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE) ++ == 3) /* SEM-rearm is safe */ ++ break; ++ /* Wait time 3 is safe for CP, but our MMIO read/write ++ * time is close to 1 microsecond, so check for 10 to ++ * leave more buffer room ++ */ ++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME) ++ >= 10) ++ break; ++ pr_debug("IQ timer is active\n"); ++ } else ++ break; ++loop: ++ if (!retry) { ++ pr_err("CP HQD IQ timer status time out\n"); ++ break; ++ } ++ ndelay(100); ++ --retry; ++ } ++ retry = 1000; ++ while (true) { ++ temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST); ++ if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK)) ++ break; ++ pr_debug("Dequeue request is pending\n"); ++ ++ if (!retry) { ++ pr_err("CP HQD dequeue request time out\n"); ++ break; ++ } ++ ndelay(100); ++ --retry; ++ } ++ local_irq_restore(flags); ++ preempt_enable(); ++#endif ++ ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type); ++ ++ end_jiffies = (utimeout * HZ / 1000) + jiffies; ++ while (true) { ++ temp = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); ++ if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) ++ break; ++ if (time_after(jiffies, end_jiffies)) { ++ pr_err("cp queue preemption time out.\n"); ++ release_queue(kgd); ++ return -ETIME; ++ } ++ usleep_range(500, 1000); ++ } ++ ++ release_queue(kgd); ++ return 0; ++} ++ ++static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, ++ unsigned int utimeout) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ struct v9_sdma_mqd *m; ++ uint32_t sdma_base_addr; ++ uint32_t temp; ++ unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; ++ ++ m = get_sdma_mqd(mqd); ++ sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id, ++ m->sdma_queue_id); ++ ++ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL); ++ temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK; ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, temp); ++ ++ while (true) { ++ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); ++ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) ++ break; ++ if (time_after(jiffies, end_jiffies)) ++ return -ETIME; ++ usleep_range(500, 1000); ++ } ++ ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, ++ RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | ++ SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); ++ ++ m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); ++ m->sdmax_rlcx_rb_rptr_hi = ++ RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI); ++ ++ return 0; ++} ++ ++static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, ++ uint8_t vmid) ++{ ++ uint32_t reg; ++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; ++ ++ reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) ++ + vmid); ++ return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; ++} ++ ++static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, ++ uint8_t vmid) ++{ ++ uint32_t reg; ++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; ++ ++ reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) ++ + vmid); ++ return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; ++} ++ ++static void write_vmid_invalidate_request(struct kgd_dev 
*kgd, uint8_t vmid) ++{ ++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; ++ uint32_t req = (1 << vmid) | ++ (1 << VM_INVALIDATE_ENG16_REQ__FLUSH_TYPE__SHIFT) | /* light */ ++ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PTES_MASK | ++ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE0_MASK | ++ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE1_MASK | ++ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE2_MASK | ++ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L1_PTES_MASK; ++ ++ spin_lock(&adev->tlb_invalidation_lock); ++ ++ /* Use light weight invalidation. ++ * ++ * TODO 1: agree on the right set of invalidation registers for ++ * KFD use. Use the last one for now. Invalidate both GC and ++ * MMHUB. ++ * ++ * TODO 2: support range-based invalidation, requires kfg2kgd ++ * interface change ++ */ ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_LO32), ++ 0xffffffff); ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_HI32), ++ 0x0000001f); ++ ++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, ++ mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32), ++ 0xffffffff); ++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, ++ mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32), ++ 0x0000001f); ++ ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_REQ), req); ++ ++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_INVALIDATE_ENG16_REQ), ++ req); ++ ++ while (!(RREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ACK)) & ++ (1 << vmid))) ++ cpu_relax(); ++ ++ while (!(RREG32(SOC15_REG_OFFSET(MMHUB, 0, ++ mmMMHUB_VM_INVALIDATE_ENG16_ACK)) & ++ (1 << vmid))) ++ cpu_relax(); ++ ++ spin_unlock(&adev->tlb_invalidation_lock); ++ ++} ++ ++static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid) ++{ ++ signed long r; ++ struct dma_fence *f; ++ struct amdgpu_ring *ring = &adev->gfx.kiq.ring; ++ ++ mutex_lock(&adev->gfx.kiq.ring_mutex); ++ amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs package*/ ++ amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0)); ++ amdgpu_ring_write(ring, ++ PACKET3_INVALIDATE_TLBS_DST_SEL(1) | ++ PACKET3_INVALIDATE_TLBS_ALL_HUB(1) | ++ PACKET3_INVALIDATE_TLBS_PASID(pasid) | ++ PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(2)); ++ amdgpu_fence_emit(ring, &f); ++ amdgpu_ring_commit(ring); ++ mutex_unlock(&adev->gfx.kiq.ring_mutex); ++ ++ r = dma_fence_wait(f, false); ++ if (r) ++ DRM_ERROR("wait for kiq fence error: %ld.\n", r); ++ dma_fence_put(f); ++ ++ return r; ++} ++ ++static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid) ++{ ++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; ++ int vmid; ++ struct amdgpu_ring *ring = &adev->gfx.kiq.ring; ++ ++ if (ring->ready) ++ return invalidate_tlbs_with_kiq(adev, pasid); ++ ++ for (vmid = 0; vmid < 16; vmid++) { ++ if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) ++ continue; ++ if (get_atc_vmid_pasid_mapping_valid(kgd, vmid)) { ++ if (get_atc_vmid_pasid_mapping_pasid(kgd, vmid) ++ == pasid) { ++ write_vmid_invalidate_request(kgd, vmid); ++ break; ++ } ++ } ++ } ++ ++ return 0; ++} ++ ++static int kgd_address_watch_disable(struct kgd_dev *kgd) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ union TCP_WATCH_CNTL_BITS cntl; ++ unsigned int i; ++ uint32_t watch_base_addr; ++ ++ cntl.u32All = 0; ++ ++ cntl.bitfields.valid = 0; ++ cntl.bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK; ++ cntl.bitfields.atc = 1; ++ ++ watch_base_addr = get_watch_base_addr(); ++ /* Turning off this address until we set all the registers */ ++ for (i = 0; i < MAX_WATCH_ADDRESSES; i++) ++ WREG32(watch_base_addr + ++ watchRegs[i * 
ADDRESS_WATCH_REG_MAX + ++ ADDRESS_WATCH_REG_CNTL], ++ cntl.u32All); ++ ++ return 0; ++} ++ ++static int kgd_address_watch_execute(struct kgd_dev *kgd, ++ unsigned int watch_point_id, ++ uint32_t cntl_val, ++ uint32_t addr_hi, ++ uint32_t addr_lo) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ union TCP_WATCH_CNTL_BITS cntl; ++ uint32_t watch_base_addr; ++ ++ watch_base_addr = get_watch_base_addr(); ++ cntl.u32All = cntl_val; ++ ++ /* Turning off this watch point until we set all the registers */ ++ cntl.bitfields.valid = 0; ++ WREG32(watch_base_addr + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], ++ cntl.u32All); ++ ++ WREG32(watch_base_addr + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_HI], ++ addr_hi); ++ ++ WREG32(watch_base_addr + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_LO], ++ addr_lo); ++ ++ /* Enable the watch point */ ++ cntl.bitfields.valid = 1; ++ ++ WREG32(watch_base_addr + ++ watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ++ ADDRESS_WATCH_REG_CNTL], ++ cntl.u32All); ++ ++ return 0; ++} ++ ++static int kgd_wave_control_execute(struct kgd_dev *kgd, ++ uint32_t gfx_index_val, ++ uint32_t sq_cmd) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ uint32_t data = 0; ++ ++ mutex_lock(&adev->grbm_idx_mutex); ++ ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), gfx_index_val); ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CMD), sq_cmd); ++ ++ data = REG_SET_FIELD(data, GRBM_GFX_INDEX, ++ INSTANCE_BROADCAST_WRITES, 1); ++ data = REG_SET_FIELD(data, GRBM_GFX_INDEX, ++ SH_BROADCAST_WRITES, 1); ++ data = REG_SET_FIELD(data, GRBM_GFX_INDEX, ++ SE_BROADCAST_WRITES, 1); ++ ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data); ++ mutex_unlock(&adev->grbm_idx_mutex); ++ ++ return 0; ++} ++ ++static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, ++ unsigned int watch_point_id, ++ unsigned int reg_offset) ++{ ++ return get_watch_base_addr() + ++ watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + reg_offset]; ++} ++ ++static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, ++ uint8_t element_size, uint8_t index_stride, uint8_t mtype) ++{ ++ /* No longer needed on GFXv9. These values are now hard-coded, ++ * except for the MTYPE which comes from the page table. ++ */ ++ ++ return 0; ++} ++static int alloc_memory_of_scratch(struct kgd_dev *kgd, ++ uint64_t va, uint32_t vmid) ++{ ++ /* No longer needed on GFXv9. The scratch base address is ++ * passed to the shader by the CP. It's the user mode driver's ++ * responsibility. ++ */ ++ ++ return 0; ++} ++ ++/* FIXME: Does this need to be ASIC-specific code? 
*/
++static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type)
++{
++	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
++	const union amdgpu_firmware_header *hdr;
++
++	switch (type) {
++	case KGD_ENGINE_PFP:
++		hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data;
++		break;
++
++	case KGD_ENGINE_ME:
++		hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data;
++		break;
++
++	case KGD_ENGINE_CE:
++		hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data;
++		break;
++
++	case KGD_ENGINE_MEC1:
++		hdr = (const union amdgpu_firmware_header *)adev->gfx.mec_fw->data;
++		break;
++
++	case KGD_ENGINE_MEC2:
++		hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data;
++		break;
++
++	case KGD_ENGINE_RLC:
++		hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data;
++		break;
++
++	case KGD_ENGINE_SDMA1:
++		hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data;
++		break;
++
++	case KGD_ENGINE_SDMA2:
++		hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data;
++		break;
++
++	default:
++		return 0;
++	}
++
++	if (hdr == NULL)
++		return 0;
++
++	/* Only 12 bits in use */
++	return hdr->common.ucode_version;
++}
++
++static void set_num_of_requests(struct kgd_dev *kgd,
++			uint8_t num_of_requests)
++{
++	pr_debug("This is a stub\n");
++}
++
++static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
++		uint32_t page_table_base)
++{
++	struct amdgpu_device *adev = get_amdgpu_device(kgd);
++	uint64_t base = (uint64_t)page_table_base << PAGE_SHIFT |
++		AMDGPU_PTE_VALID;
++
++	/* TODO: Don't use hardcoded VMIDs */
++	if (vmid < 8 || vmid > 15) {
++		pr_err("trying to set page table base for wrong VMID %u\n",
++		       vmid);
++		return;
++	}
++
++	/* TODO: take advantage of per-process address space size. For
++	 * now, all processes share the same address space size, like
++	 * on GFX8 and older.
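++	 *
++	 * For illustration, with 4 KB pages (PAGE_SHIFT == 12) a
++	 * page_table_base of 0x12345 produces
++	 *   base = (0x12345ULL << 12) | AMDGPU_PTE_VALID = 0x12345001,
++	 * i.e. the page-directory address with the valid bit set, which
++	 * is what the PAGE_TABLE_BASE_ADDR_LO32/HI32 pair below expects.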
++ */ ++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0); ++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0); ++ ++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2), ++ lower_32_bits(adev->vm_manager.max_pfn - 1)); ++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2), ++ upper_32_bits(adev->vm_manager.max_pfn - 1)); ++ ++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base)); ++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base)); ++ ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0); ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0); ++ ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2), ++ lower_32_bits(adev->vm_manager.max_pfn - 1)); ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2), ++ upper_32_bits(adev->vm_manager.max_pfn - 1)); ++ ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base)); ++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base)); ++} +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +new file mode 100644 +index 0000000..7df892d +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +@@ -0,0 +1,2578 @@ ++/* ++ * Copyright 2014 Advanced Micro Devices, Inc. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#undef pr_fmt ++#define pr_fmt(fmt) "kfd2kgd: " fmt ++ ++#include <linux/module.h> ++#include <linux/fdtable.h> ++#include <linux/uaccess.h> ++#include <linux/firmware.h> ++#include <linux/list.h> ++#include <linux/sched/mm.h> ++#include <drm/drmP.h> ++#include <linux/dma-buf.h> ++#include <linux/pagemap.h> ++#include "amdgpu_amdkfd.h" ++#include "amdgpu_ucode.h" ++#include "gca/gfx_8_0_sh_mask.h" ++#include "gca/gfx_8_0_d.h" ++#include "gca/gfx_8_0_enum.h" ++#include "oss/oss_3_0_sh_mask.h" ++#include "oss/oss_3_0_d.h" ++#include "gmc/gmc_8_1_sh_mask.h" ++#include "gmc/gmc_8_1_d.h" ++ ++/* Special VM and GART address alignment needed for VI pre-Fiji due to ++ * a HW bug. 
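++ *
++ * The alignment is 32 KB (0x8000). Judging by the byte_align selection
++ * in __alloc_memory_of_gpu() below, it is applied to VI-family ASICs
++ * other than Fiji and Polaris10/11.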
++ */
++#define VI_BO_SIZE_ALIGN (0x8000)
++
++/* BO flag to indicate a KFD userptr BO */
++#define AMDGPU_AMDKFD_USERPTR_BO (1ULL << 63)
++
++/* Impose limit on how much memory KFD can use */
++struct kfd_mem_usage_limit {
++	uint64_t max_system_mem_limit;
++	uint64_t max_userptr_mem_limit;
++	int64_t system_mem_used;
++	int64_t userptr_mem_used;
++	spinlock_t mem_limit_lock;
++};
++
++static struct kfd_mem_usage_limit kfd_mem_limit;
++
++/* Struct used for amdgpu_amdkfd_bo_validate */
++struct amdgpu_vm_parser {
++	uint32_t domain;
++	bool wait;
++};
++
++static const char * const domain_bit_to_string[] = {
++	"CPU",
++	"GTT",
++	"VRAM",
++	"GDS",
++	"GWS",
++	"OA"
++};
++
++#define domain_string(domain) domain_bit_to_string[ffs(domain)-1]
++
++static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work);
++
++
++static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd)
++{
++	return (struct amdgpu_device *)kgd;
++}
++
++static bool check_if_add_bo_to_vm(struct amdgpu_vm *avm,
++		struct kgd_mem *mem)
++{
++	struct kfd_bo_va_list *entry;
++
++	list_for_each_entry(entry, &mem->bo_va_list, bo_list)
++		if (entry->bo_va->base.vm == avm)
++			return false;
++
++	return true;
++}
++
++/* Set memory usage limits. Currently, the limits are
++ * System (kernel) memory - 15/16th System RAM
++ * Userptr memory - 15/16th System RAM
++ */
++void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
++{
++	struct sysinfo si;
++	uint64_t mem;
++
++	si_meminfo(&si);
++	mem = si.totalram - si.totalhigh;
++	mem *= si.mem_unit;
++
++	spin_lock_init(&kfd_mem_limit.mem_limit_lock);
++	kfd_mem_limit.max_system_mem_limit = mem - (mem >> 4); /* 15/16 */
++	kfd_mem_limit.max_userptr_mem_limit = mem - (mem >> 4); /* 15/16 */
++	pr_debug("Kernel memory limit %lluM, userptr limit %lluM\n",
++		(kfd_mem_limit.max_system_mem_limit >> 20),
++		(kfd_mem_limit.max_userptr_mem_limit >> 20));
++}
++
++static int amdgpu_amdkfd_reserve_system_mem_limit(struct amdgpu_device *adev,
++		uint64_t size, u32 domain)
++{
++	size_t acc_size;
++	int ret = 0;
++
++	acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size,
++				       sizeof(struct amdgpu_bo));
++
++	spin_lock(&kfd_mem_limit.mem_limit_lock);
++	if (domain == AMDGPU_GEM_DOMAIN_GTT) {
++		if (kfd_mem_limit.system_mem_used + (acc_size + size) >
++			kfd_mem_limit.max_system_mem_limit) {
++			ret = -ENOMEM;
++			goto err_no_mem;
++		}
++		kfd_mem_limit.system_mem_used += (acc_size + size);
++	} else if (domain == AMDGPU_GEM_DOMAIN_CPU) {
++		if ((kfd_mem_limit.system_mem_used + acc_size >
++			kfd_mem_limit.max_system_mem_limit) ||
++			(kfd_mem_limit.userptr_mem_used + (size + acc_size) >
++			kfd_mem_limit.max_userptr_mem_limit)) {
++			ret = -ENOMEM;
++			goto err_no_mem;
++		}
++		kfd_mem_limit.system_mem_used += acc_size;
++		kfd_mem_limit.userptr_mem_used += size;
++	}
++err_no_mem:
++	spin_unlock(&kfd_mem_limit.mem_limit_lock);
++	return ret;
++}
++
++static void unreserve_system_mem_limit(struct amdgpu_device *adev,
++		uint64_t size, u32 domain)
++{
++	size_t acc_size;
++
++	acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size,
++				       sizeof(struct amdgpu_bo));
++
++	spin_lock(&kfd_mem_limit.mem_limit_lock);
++	if (domain == AMDGPU_GEM_DOMAIN_GTT) {
++		kfd_mem_limit.system_mem_used -= (acc_size + size);
++	} else if (domain == AMDGPU_GEM_DOMAIN_CPU) {
++		kfd_mem_limit.system_mem_used -= acc_size;
++		kfd_mem_limit.userptr_mem_used -= size;
++	}
++	WARN_ONCE(kfd_mem_limit.system_mem_used < 0,
++		  "kfd system memory accounting unbalanced");
++	WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0,
++		  "kfd userptr memory accounting unbalanced");
++
++	spin_unlock(&kfd_mem_limit.mem_limit_lock);
++}
++
++void amdgpu_amdkfd_unreserve_system_memory_limit(struct amdgpu_bo *bo)
++{
++	spin_lock(&kfd_mem_limit.mem_limit_lock);
++
++	if (bo->flags & AMDGPU_AMDKFD_USERPTR_BO) {
++		kfd_mem_limit.system_mem_used -= bo->tbo.acc_size;
++		kfd_mem_limit.userptr_mem_used -= amdgpu_bo_size(bo);
++	} else if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT) {
++		kfd_mem_limit.system_mem_used -=
++			(bo->tbo.acc_size + amdgpu_bo_size(bo));
++	}
++	WARN_ONCE(kfd_mem_limit.system_mem_used < 0,
++		  "kfd system memory accounting unbalanced");
++	WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0,
++		  "kfd userptr memory accounting unbalanced");
++
++	spin_unlock(&kfd_mem_limit.mem_limit_lock);
++}
++
++
++/* amdgpu_amdkfd_remove_eviction_fence - Removes eviction fence(s) from BO's
++ * reservation object.
++ *
++ * @bo: [IN] Remove eviction fence(s) from this BO
++ * @ef: [IN] If ef is specified, then this eviction fence is removed if it
++ * is present in the shared list.
++ * @ef_list: [OUT] Returns list of eviction fences. These fences are removed
++ * from BO's reservation object shared list.
++ * @ef_count: [OUT] Number of fences in ef_list.
++ *
++ * NOTE: If called with ef_list, then amdgpu_amdkfd_add_eviction_fence must be
++ * called to restore the eviction fences and to avoid a memory leak. This is
++ * useful for shared BOs.
++ * NOTE: Must be called with BO reserved i.e. bo->tbo.resv->lock held.
++ */
++static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo,
++					struct amdgpu_amdkfd_fence *ef,
++					struct amdgpu_amdkfd_fence ***ef_list,
++					unsigned int *ef_count)
++{
++	struct reservation_object_list *fobj;
++	struct reservation_object *resv;
++	unsigned int i = 0, j = 0, k = 0, shared_count;
++	unsigned int count = 0;
++	struct amdgpu_amdkfd_fence **fence_list;
++
++	if (!ef && !ef_list)
++		return -EINVAL;
++
++	if (ef_list) {
++		*ef_list = NULL;
++		*ef_count = 0;
++	}
++
++	resv = bo->tbo.resv;
++	fobj = reservation_object_get_list(resv);
++
++	if (!fobj)
++		return 0;
++
++	preempt_disable();
++	write_seqcount_begin(&resv->seq);
++
++	/* Go through all the shared fences in the reservation object. If
++	 * ef is specified and it exists in the list, remove it and reduce the
++	 * count. If ef is not specified, then get the count of eviction fences
++	 * present.
++	 */
++	shared_count = fobj->shared_count;
++	for (i = 0; i < shared_count; ++i) {
++		struct dma_fence *f;
++
++		f = rcu_dereference_protected(fobj->shared[i],
++					      reservation_object_held(resv));
++
++		if (ef) {
++			if (f->context == ef->base.context) {
++				dma_fence_put(f);
++				fobj->shared_count--;
++			} else
++				RCU_INIT_POINTER(fobj->shared[j++], f);
++
++		} else if (to_amdgpu_amdkfd_fence(f))
++			count++;
++	}
++	write_seqcount_end(&resv->seq);
++	preempt_enable();
++
++	if (ef || !count)
++		return 0;
++
++	/* Alloc memory for count number of eviction fence pointers. Fill the
++	 * ef_list array and ef_count
++	 */
++
++	fence_list = kcalloc(count, sizeof(struct amdgpu_amdkfd_fence *),
++			     GFP_KERNEL);
++	if (!fence_list)
++		return -ENOMEM;
++
++	preempt_disable();
++	write_seqcount_begin(&resv->seq);
++
++	j = 0;
++	for (i = 0; i < shared_count; ++i) {
++		struct dma_fence *f;
++		struct amdgpu_amdkfd_fence *efence;
++
++		f = rcu_dereference_protected(fobj->shared[i],
++			reservation_object_held(resv));
++
++		efence = to_amdgpu_amdkfd_fence(f);
++		if (efence) {
++			fence_list[k++] = efence;
++			fobj->shared_count--;
++		} else
++			RCU_INIT_POINTER(fobj->shared[j++], f);
++	}
++
++	write_seqcount_end(&resv->seq);
++	preempt_enable();
++
++	*ef_list = fence_list;
++	*ef_count = k;
++
++	return 0;
++}
++
++/* amdgpu_amdkfd_add_eviction_fence - Adds eviction fence(s) back into BO's
++ * reservation object.
++ *
++ * @bo: [IN] Add eviction fences to this BO
++ * @ef_list: [IN] List of eviction fences to be added
++ * @ef_count: [IN] Number of fences in ef_list.
++ *
++ * NOTE: Must call amdgpu_amdkfd_remove_eviction_fence before calling this
++ * function.
++ */
++static void amdgpu_amdkfd_add_eviction_fence(struct amdgpu_bo *bo,
++				struct amdgpu_amdkfd_fence **ef_list,
++				unsigned int ef_count)
++{
++	int i;
++
++	if (!ef_list || !ef_count)
++		return;
++
++	for (i = 0; i < ef_count; i++) {
++		amdgpu_bo_fence(bo, &ef_list[i]->base, true);
++		/* Re-adding the fence takes an additional reference. Drop that
++		 * reference.
++		 */
++		dma_fence_put(&ef_list[i]->base);
++	}
++
++	kfree(ef_list);
++}
++
++static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain,
++				     bool wait)
++{
++	int ret;
++
++	if (WARN(amdgpu_ttm_tt_get_usermm(bo->tbo.ttm),
++		 "Called with userptr BO"))
++		return -EINVAL;
++
++	amdgpu_ttm_placement_from_domain(bo, domain);
++
++	ret = ttm_bo_validate(&bo->tbo, &bo->placement, false, false);
++	if (ret)
++		goto validate_fail;
++	if (wait) {
++		struct amdgpu_amdkfd_fence **ef_list;
++		unsigned int ef_count;
++
++		ret = amdgpu_amdkfd_remove_eviction_fence(bo, NULL, &ef_list,
++							  &ef_count);
++		if (ret)
++			goto validate_fail;
++
++		ttm_bo_wait(&bo->tbo, false, false);
++		amdgpu_amdkfd_add_eviction_fence(bo, ef_list, ef_count);
++	}
++
++validate_fail:
++	return ret;
++}
++
++static int amdgpu_amdkfd_validate(void *param, struct amdgpu_bo *bo)
++{
++	struct amdgpu_vm_parser *p = param;
++
++	return amdgpu_amdkfd_bo_validate(bo, p->domain, p->wait);
++}
++
++/* vm_validate_pt_pd_bos - Validate page table and directory BOs
++ *
++ * Also updates page directory entries so we don't need to do this
++ * again later until the page directory is validated again (e.g. after
++ * an eviction or allocating new page tables).
++ */
++static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm)
++{
++	struct amdgpu_bo *pd = vm->root.base.bo;
++	struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev);
++	struct amdgpu_vm_parser param;
++	int ret;
++
++	param.domain = AMDGPU_GEM_DOMAIN_VRAM;
++	param.wait = false;
++
++	ret = amdgpu_vm_validate_pt_bos(adev, vm, amdgpu_amdkfd_validate,
++					&param);
++	if (ret) {
++		pr_err("amdgpu: failed to validate PT BOs\n");
++		return ret;
++	}
++
++	ret = amdgpu_amdkfd_validate(&param, pd);
++	if (ret) {
++		pr_err("amdgpu: failed to validate PD\n");
++		return ret;
++	}
++
++	ret = amdgpu_vm_update_directories(adev, vm);
++	if (ret != 0)
++		return ret;
++
++	return 0;
++}
++
++/* add_bo_to_vm - Add a BO to a VM
++ *
++ * Everything that needs to be done only once when a BO is first added
++ * to a VM. It can later be mapped and unmapped many times without
++ * repeating these steps.
++ *
++ * 1. Allocate and initialize BO VA entry data structure
++ * 2. Add BO to the VM
++ * 3. Determine ASIC-specific PTE flags
++ * 4. Alloc page tables and directories if needed
++ * 4a. Validate new page tables and directories and update directories
++ */
++static int add_bo_to_vm(struct amdgpu_device *adev, struct kgd_mem *mem,
++		struct amdgpu_vm *avm, bool is_aql,
++		struct kfd_bo_va_list **p_bo_va_entry)
++{
++	int ret;
++	struct kfd_bo_va_list *bo_va_entry;
++	struct amdkfd_vm *kvm = container_of(avm,
++					     struct amdkfd_vm, base);
++	struct amdgpu_bo *pd = avm->root.base.bo;
++	struct amdgpu_bo *bo = mem->bo;
++	uint64_t va = mem->va;
++	struct list_head *list_bo_va = &mem->bo_va_list;
++	unsigned long bo_size = bo->tbo.mem.size;
++
++	if (!va) {
++		pr_err("Invalid VA when adding BO to VM\n");
++		return -EINVAL;
++	}
++
++	if (is_aql)
++		va += bo_size;
++
++	bo_va_entry = kzalloc(sizeof(*bo_va_entry), GFP_KERNEL);
++	if (!bo_va_entry)
++		return -ENOMEM;
++
++	pr_debug("\t add VA 0x%llx - 0x%llx to vm %p\n", va,
++			va + bo_size, avm);
++
++	/* Add BO to VM internal data structures */
++	bo_va_entry->bo_va = amdgpu_vm_bo_add(adev, avm, bo);
++	if (bo_va_entry->bo_va == NULL) {
++		ret = -EINVAL;
++		pr_err("Failed to add BO object to VM. ret == %d\n",
++				ret);
++		goto err_vmadd;
++	}
++
++	bo_va_entry->va = va;
++	bo_va_entry->pte_flags = amdgpu_vm_get_pte_flags(adev,
++							 mem->mapping_flags);
++	bo_va_entry->kgd_dev = (void *)adev;
++	list_add(&bo_va_entry->bo_list, list_bo_va);
++
++	if (p_bo_va_entry)
++		*p_bo_va_entry = bo_va_entry;
++
++	/* Allocate new page tables if needed and validate
++	 * them. Clearing of new page tables and validation need to wait
++	 * on move fences. We don't want that to trigger the eviction
++	 * fence, so remove it temporarily.
++	 */
++	amdgpu_amdkfd_remove_eviction_fence(pd,
++					kvm->process_info->eviction_fence,
++					NULL, NULL);
++
++	ret = amdgpu_vm_alloc_pts(adev, avm, va, amdgpu_bo_size(bo));
++	if (ret) {
++		pr_err("Failed to allocate pts, err=%d\n", ret);
++		goto err_alloc_pts;
++	}
++
++	ret = vm_validate_pt_pd_bos(avm);
++	if (ret != 0) {
++		pr_err("validate_pt_pd_bos() failed\n");
++		goto err_alloc_pts;
++	}
++
++	/* Add the eviction fence back */
++	amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true);
++
++	return 0;
++
++err_alloc_pts:
++	amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true);
++	amdgpu_vm_bo_rmv(adev, bo_va_entry->bo_va);
++	list_del(&bo_va_entry->bo_list);
++err_vmadd:
++	kfree(bo_va_entry);
++	return ret;
++}
++
++static void remove_bo_from_vm(struct amdgpu_device *adev,
++		struct kfd_bo_va_list *entry, unsigned long size)
++{
++	pr_debug("\t remove VA 0x%llx - 0x%llx in entry %p\n",
++			entry->va,
++			entry->va + size, entry);
++	amdgpu_vm_bo_rmv(adev, entry->bo_va);
++	list_del(&entry->bo_list);
++	kfree(entry);
++}
++
++static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem,
++			struct amdkfd_process_info *process_info,
++			bool userptr)
++{
++	struct ttm_validate_buffer *entry = &mem->validate_list;
++	struct amdgpu_bo *bo = mem->bo;
++
++	INIT_LIST_HEAD(&entry->head);
++	entry->shared = true;
++	entry->bo = &bo->tbo;
++	mutex_lock(&process_info->lock);
++	if (userptr)
++		list_add_tail(&entry->head, &process_info->userptr_valid_list);
++	else
++		list_add_tail(&entry->head, &process_info->kfd_bo_list);
++	mutex_unlock(&process_info->lock);
++}
++
++/* Initializes user pages.
It registers the MMU notifier and validates ++ * the userptr BO in the GTT domain. ++ * ++ * The BO must already be on the userptr_valid_list. Otherwise an ++ * eviction and restore may happen that leaves the new BO unmapped ++ * with the user mode queues running. ++ * ++ * Takes the process_info->lock to protect against concurrent restore ++ * workers. ++ * ++ * Returns 0 for success, negative errno for errors. ++ */ ++static int init_user_pages(struct kgd_mem *mem, struct mm_struct *mm, ++ uint64_t user_addr) ++{ ++ struct amdkfd_process_info *process_info = mem->process_info; ++ struct amdgpu_bo *bo = mem->bo; ++ int ret = 0; ++ ++ mutex_lock(&process_info->lock); ++ ++ ret = amdgpu_ttm_tt_set_userptr(bo->tbo.ttm, user_addr, 0); ++ if (ret) { ++ pr_err("%s: Failed to set userptr: %d\n", __func__, ret); ++ goto out; ++ } ++ ++ ret = amdgpu_mn_register(bo, user_addr); ++ if (ret) { ++ pr_err("%s: Failed to register MMU notifier: %d\n", ++ __func__, ret); ++ goto out; ++ } ++ ++ /* If no restore worker is running concurrently, user_pages ++ * should not be allocated ++ */ ++ WARN(mem->user_pages, "Leaking user_pages array"); ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) ++ mem->user_pages = drm_calloc_large(bo->tbo.ttm->num_pages, ++ sizeof(struct page *)); ++#else ++ mem->user_pages = kvmalloc_array(bo->tbo.ttm->num_pages, ++ sizeof(struct page *), ++ GFP_KERNEL | __GFP_ZERO); ++#endif ++ if (!mem->user_pages) { ++ pr_err("%s: Failed to allocate pages array\n", __func__); ++ ret = -ENOMEM; ++ goto unregister_out; ++ } ++ ++ ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, mem->user_pages); ++ if (ret) { ++ pr_err("%s: Failed to get user pages: %d\n", __func__, ret); ++ goto free_out; ++ } ++ ++ amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, mem->user_pages); ++ ++ ret = amdgpu_bo_reserve(bo, true); ++ if (ret) { ++ pr_err("%s: Failed to reserve BO\n", __func__); ++ goto release_out; ++ } ++ amdgpu_ttm_placement_from_domain(bo, mem->domain); ++ ret = ttm_bo_validate(&bo->tbo, &bo->placement, ++ true, false); ++ if (ret) ++ pr_err("%s: failed to validate BO\n", __func__); ++ amdgpu_bo_unreserve(bo); ++ ++release_out: ++ if (ret) ++ release_pages(mem->user_pages, bo->tbo.ttm->num_pages, 0); ++free_out: ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) ++ drm_free_large(mem->user_pages); ++#else ++ kvfree(mem->user_pages); ++#endif ++ mem->user_pages = NULL; ++unregister_out: ++ if (ret) ++ amdgpu_mn_unregister(bo); ++out: ++ mutex_unlock(&process_info->lock); ++ return ret; ++} ++ ++static int __map_bo_to_kernel(struct amdgpu_bo *bo, u32 domain, void **kptr) ++{ ++ int ret; ++ ++ ret = amdgpu_bo_reserve(bo, true); ++ if (ret) { ++ pr_err("Failed to reserve bo. ret %d\n", ret); ++ return ret; ++ } ++ ++ ret = amdgpu_bo_pin(bo, domain, NULL); ++ if (ret) { ++ pr_err("Failed to pin bo. ret %d\n", ret); ++ goto pin_failed; ++ } ++ ++ ret = amdgpu_bo_kmap(bo, kptr); ++ if (ret) { ++ pr_err("Failed to map bo to kernel. 
ret %d\n", ret); ++ goto kmap_failed; ++ } ++ ++ amdgpu_bo_unreserve(bo); ++ ++ return ret; ++ ++kmap_failed: ++ amdgpu_bo_unpin(bo); ++pin_failed: ++ amdgpu_bo_unreserve(bo); ++ ++ return ret; ++} ++ ++static int __alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va, ++ uint64_t size, void *vm, struct kgd_mem **mem, ++ uint64_t *offset, u32 domain, u64 flags, ++ struct sg_table *sg, bool aql_queue, ++ bool readonly, bool execute, bool coherent, bool no_sub, ++ bool userptr) ++{ ++ struct amdgpu_device *adev; ++ int ret; ++ struct amdgpu_bo *bo; ++ uint64_t user_addr = 0; ++ int byte_align; ++ u32 alloc_domain; ++ uint32_t mapping_flags; ++ struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm; ++ ++ if (aql_queue) ++ size = size >> 1; ++ if (userptr) { ++ if (!offset || !*offset) ++ return -EINVAL; ++ user_addr = *offset; ++ } ++ ++ adev = get_amdgpu_device(kgd); ++ byte_align = (adev->family == AMDGPU_FAMILY_VI && ++ adev->asic_type != CHIP_FIJI && ++ adev->asic_type != CHIP_POLARIS10 && ++ adev->asic_type != CHIP_POLARIS11) ? ++ VI_BO_SIZE_ALIGN : 1; ++ ++ *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); ++ if (*mem == NULL) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ INIT_LIST_HEAD(&(*mem)->bo_va_list); ++ mutex_init(&(*mem)->lock); ++ (*mem)->coherent = coherent; ++ (*mem)->no_substitute = no_sub; ++ (*mem)->aql_queue = aql_queue; ++ ++ mapping_flags = AMDGPU_VM_PAGE_READABLE; ++ if (!readonly) ++ mapping_flags |= AMDGPU_VM_PAGE_WRITEABLE; ++ if (execute) ++ mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE; ++ if (coherent) ++ mapping_flags |= AMDGPU_VM_MTYPE_UC; ++ else ++ mapping_flags |= AMDGPU_VM_MTYPE_NC; ++ ++ (*mem)->mapping_flags = mapping_flags; ++ ++ alloc_domain = userptr ? AMDGPU_GEM_DOMAIN_CPU : domain; ++ ++ amdgpu_sync_create(&(*mem)->sync); ++ ++ ret = amdgpu_amdkfd_reserve_system_mem_limit(adev, size, alloc_domain); ++ if (ret) { ++ pr_err("Insufficient system memory\n"); ++ goto err_bo_create; ++ } ++ ++ pr_debug("\t create BO VA 0x%llx size 0x%llx domain %s\n", ++ va, size, domain_string(alloc_domain)); ++ ++ /* Allocate buffer object. Userptr objects need to start out ++ * in the CPU domain, get moved to GTT when pinned. ++ */ ++ ret = amdgpu_bo_create(adev, size, byte_align, false, ++ alloc_domain, ++ flags, sg, NULL, 0, &bo); ++ if (ret != 0) { ++ pr_err("Failed to create BO on domain %s. ret %d\n", ++ domain_string(alloc_domain), ret); ++ unreserve_system_mem_limit(adev, size, alloc_domain); ++ goto err_bo_create; ++ } ++ bo->kfd_bo = *mem; ++ (*mem)->bo = bo; ++ if (userptr) ++ bo->flags |= AMDGPU_AMDKFD_USERPTR_BO; ++ ++ (*mem)->va = va; ++ (*mem)->domain = domain; ++ (*mem)->mapped_to_gpu_memory = 0; ++ (*mem)->process_info = kfd_vm->process_info; ++ add_kgd_mem_to_kfd_bo_list(*mem, kfd_vm->process_info, userptr); ++ ++ if (userptr) { ++ ret = init_user_pages(*mem, current->mm, user_addr); ++ if (ret) { ++ mutex_lock(&kfd_vm->process_info->lock); ++ list_del(&(*mem)->validate_list.head); ++ mutex_unlock(&kfd_vm->process_info->lock); ++ goto allocate_init_user_pages_failed; ++ } ++ } ++ ++ if (offset) ++ *offset = amdgpu_bo_mmap_offset(bo); ++ ++ return 0; ++ ++allocate_init_user_pages_failed: ++ amdgpu_bo_unref(&bo); ++err_bo_create: ++ kfree(*mem); ++err: ++ return ret; ++} ++ ++/* Reserving a BO and its page table BOs must happen atomically to ++ * avoid deadlocks. When updating userptrs we need to temporarily ++ * back-off the reservation and then reacquire it. Track all the ++ * reservation info in a context structure. 
Buffers can be mapped to
++ * multiple VMs simultaneously (buffers being restored on multiple
++ * GPUs).
++ */
++struct bo_vm_reservation_context {
++	struct amdgpu_bo_list_entry kfd_bo;
++	unsigned int n_vms;
++	struct amdgpu_bo_list_entry *vm_pd;
++	struct ww_acquire_ctx ticket;
++	struct list_head list, duplicates;
++	struct amdgpu_sync *sync;
++	bool reserved;
++};
++
++/**
++ * reserve_bo_and_vm - reserve a BO and a VM unconditionally.
++ * @mem: KFD BO structure.
++ * @vm: the VM to reserve.
++ * @ctx: the struct that will be used in unreserve_bo_and_vms().
++ */
++static int reserve_bo_and_vm(struct kgd_mem *mem,
++			      struct amdgpu_vm *vm,
++			      struct bo_vm_reservation_context *ctx)
++{
++	struct amdgpu_bo *bo = mem->bo;
++	int ret;
++
++	WARN_ON(!vm);
++
++	ctx->reserved = false;
++	ctx->n_vms = 1;
++	ctx->sync = &mem->sync;
++
++	INIT_LIST_HEAD(&ctx->list);
++	INIT_LIST_HEAD(&ctx->duplicates);
++
++	ctx->vm_pd = kzalloc(sizeof(struct amdgpu_bo_list_entry)
++			     * ctx->n_vms, GFP_KERNEL);
++	if (ctx->vm_pd == NULL)
++		return -ENOMEM;
++
++	ctx->kfd_bo.robj = bo;
++	ctx->kfd_bo.priority = 0;
++	ctx->kfd_bo.tv.bo = &bo->tbo;
++	ctx->kfd_bo.tv.shared = true;
++	ctx->kfd_bo.user_pages = NULL;
++	list_add(&ctx->kfd_bo.tv.head, &ctx->list);
++
++	amdgpu_vm_get_pd_bo(vm, &ctx->list, &ctx->vm_pd[0]);
++
++	ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list,
++				     false, &ctx->duplicates);
++	if (!ret)
++		ctx->reserved = true;
++	else
++		pr_err("Failed to reserve buffers in ttm\n");
++
++	if (ret) {
++		kfree(ctx->vm_pd);
++		ctx->vm_pd = NULL;
++	}
++
++	return ret;
++}
++
++enum VA_TYPE {
++	VA_NOT_MAPPED = 0,
++	VA_MAPPED,
++	VA_DO_NOT_CARE,
++};
++
++/**
++ * reserve_bo_and_cond_vms - reserve a BO and some VMs that the BO has been
++ * added to, conditionally based on map_type.
++ * @mem: KFD BO structure.
++ * @vm: the VM to reserve. If NULL, then all VMs associated with the BO
++ * are used. Otherwise, a single VM associated with the BO.
++ * @map_type: the mapping status that will be used to filter the VMs.
++ * @ctx: the struct that will be used in unreserve_bo_and_vms().
++ */ ++static int reserve_bo_and_cond_vms(struct kgd_mem *mem, ++ struct amdgpu_vm *vm, enum VA_TYPE map_type, ++ struct bo_vm_reservation_context *ctx) ++{ ++ struct amdgpu_bo *bo = mem->bo; ++ struct kfd_bo_va_list *entry; ++ unsigned int i; ++ int ret; ++ ++ ctx->reserved = false; ++ ctx->n_vms = 0; ++ ctx->vm_pd = NULL; ++ ctx->sync = &mem->sync; ++ ++ INIT_LIST_HEAD(&ctx->list); ++ INIT_LIST_HEAD(&ctx->duplicates); ++ ++ list_for_each_entry(entry, &mem->bo_va_list, bo_list) { ++ if ((vm && vm != entry->bo_va->base.vm) || ++ (entry->is_mapped != map_type ++ && map_type != VA_DO_NOT_CARE)) ++ continue; ++ ++ ctx->n_vms++; ++ } ++ ++ if (ctx->n_vms != 0) { ++ ctx->vm_pd = kzalloc(sizeof(struct amdgpu_bo_list_entry) ++ * ctx->n_vms, GFP_KERNEL); ++ if (ctx->vm_pd == NULL) ++ return -ENOMEM; ++ } ++ ++ ctx->kfd_bo.robj = bo; ++ ctx->kfd_bo.priority = 0; ++ ctx->kfd_bo.tv.bo = &bo->tbo; ++ ctx->kfd_bo.tv.shared = true; ++ ctx->kfd_bo.user_pages = NULL; ++ list_add(&ctx->kfd_bo.tv.head, &ctx->list); ++ ++ i = 0; ++ list_for_each_entry(entry, &mem->bo_va_list, bo_list) { ++ if ((vm && vm != entry->bo_va->base.vm) || ++ (entry->is_mapped != map_type ++ && map_type != VA_DO_NOT_CARE)) ++ continue; ++ ++ amdgpu_vm_get_pd_bo(entry->bo_va->base.vm, &ctx->list, ++ &ctx->vm_pd[i]); ++ i++; ++ } ++ ++ ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list, ++ false, &ctx->duplicates); ++ if (!ret) ++ ctx->reserved = true; ++ else ++ pr_err("Failed to reserve buffers in ttm.\n"); ++ ++ if (ret) { ++ kfree(ctx->vm_pd); ++ ctx->vm_pd = NULL; ++ } ++ ++ return ret; ++} ++ ++static int unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx, ++ bool wait, bool intr) ++{ ++ int ret = 0; ++ ++ if (wait) ++ ret = amdgpu_sync_wait(ctx->sync, intr); ++ ++ if (ctx->reserved) ++ ttm_eu_backoff_reservation(&ctx->ticket, &ctx->list); ++ kfree(ctx->vm_pd); ++ ++ ctx->sync = NULL; ++ ++ ctx->reserved = false; ++ ctx->vm_pd = NULL; ++ ++ return ret; ++} ++ ++static int unmap_bo_from_gpuvm(struct amdgpu_device *adev, ++ struct kfd_bo_va_list *entry, ++ struct amdgpu_sync *sync) ++{ ++ struct amdgpu_bo_va *bo_va = entry->bo_va; ++ struct amdgpu_vm *vm = bo_va->base.vm; ++ struct amdkfd_vm *kvm = container_of(vm, struct amdkfd_vm, base); ++ struct amdgpu_bo *pd = vm->root.base.bo; ++ ++ /* Remove eviction fence from PD (and thereby from PTs too as they ++ * share the resv. object. Otherwise during PT update job (see ++ * amdgpu_vm_bo_update_mapping), eviction fence will get added to ++ * job->sync object ++ */ ++ amdgpu_amdkfd_remove_eviction_fence(pd, ++ kvm->process_info->eviction_fence, ++ NULL, NULL); ++ amdgpu_vm_bo_unmap(adev, bo_va, entry->va); ++ ++ amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update); ++ ++ /* Add the eviction fence back */ ++ amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true); ++ ++ amdgpu_sync_fence(adev, sync, bo_va->last_pt_update); ++ ++ /* Sync objects can't handle multiple GPUs (contexts) updating ++ * sync->last_vm_update. Fortunately we don't need it for ++ * KFD's purposes, so we can just drop that fence. 
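++	 * The page-table update fence itself was already added to the
++	 * sync object by amdgpu_sync_fence() above, so waiting on the
++	 * sync object still covers this unmap; only the last_vm_update
++	 * shortcut is discarded here.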
++ */ ++ if (sync->last_vm_update) { ++ dma_fence_put(sync->last_vm_update); ++ sync->last_vm_update = NULL; ++ } ++ ++ return 0; ++} ++ ++static int update_gpuvm_pte(struct amdgpu_device *adev, ++ struct kfd_bo_va_list *entry, ++ struct amdgpu_sync *sync) ++{ ++ int ret; ++ struct amdgpu_vm *vm; ++ struct amdgpu_bo_va *bo_va; ++ struct amdgpu_bo *bo; ++ ++ bo_va = entry->bo_va; ++ vm = bo_va->base.vm; ++ bo = bo_va->base.bo; ++ ++ /* Update the page tables */ ++ ret = amdgpu_vm_bo_update(adev, bo_va, false); ++ if (ret != 0) { ++ pr_err("amdgpu_vm_bo_update failed\n"); ++ return ret; ++ } ++ ++ amdgpu_sync_fence(adev, sync, bo_va->last_pt_update); ++ ++ /* Sync objects can't handle multiple GPUs (contexts) updating ++ * sync->last_vm_update. Fortunately we don't need it for ++ * KFD's purposes, so we can just drop that fence. ++ */ ++ if (sync->last_vm_update) { ++ dma_fence_put(sync->last_vm_update); ++ sync->last_vm_update = NULL; ++ } ++ ++ return 0; ++} ++ ++static int map_bo_to_gpuvm(struct amdgpu_device *adev, ++ struct kfd_bo_va_list *entry, struct amdgpu_sync *sync, ++ bool no_update_pte) ++{ ++ int ret; ++ ++ /* Set virtual address for the allocation */ ++ ret = amdgpu_vm_bo_map(adev, entry->bo_va, entry->va, 0, ++ amdgpu_bo_size(entry->bo_va->base.bo), entry->pte_flags); ++ if (ret != 0) { ++ pr_err("Failed to map VA 0x%llx in vm. ret %d\n", ++ entry->va, ret); ++ return ret; ++ } ++ ++ if (no_update_pte) ++ return 0; ++ ++ ret = update_gpuvm_pte(adev, entry, sync); ++ if (ret != 0) { ++ pr_err("update_gpuvm_pte() failed\n"); ++ goto update_gpuvm_pte_failed; ++ } ++ ++ return 0; ++ ++update_gpuvm_pte_failed: ++ unmap_bo_from_gpuvm(adev, entry, sync); ++ return ret; ++} ++ ++static struct sg_table *create_doorbell_sg(uint64_t addr, uint32_t size) ++{ ++ struct sg_table *sg = kmalloc(sizeof(*sg), GFP_KERNEL); ++ ++ if (!sg) ++ return NULL; ++ if (sg_alloc_table(sg, 1, GFP_KERNEL)) { ++ kfree(sg); ++ return NULL; ++ } ++ sg->sgl->dma_address = addr; ++ sg->sgl->length = size; ++#ifdef CONFIG_NEED_SG_DMA_LENGTH ++ sg->sgl->dma_length = size; ++#endif ++ return sg; ++} ++ ++int amdgpu_amdkfd_gpuvm_sync_memory( ++ struct kgd_dev *kgd, struct kgd_mem *mem, bool intr) ++{ ++ int ret = 0; ++ struct amdgpu_sync sync; ++ struct amdgpu_device *adev; ++ ++ adev = get_amdgpu_device(kgd); ++ amdgpu_sync_create(&sync); ++ ++ mutex_lock(&mem->lock); ++ amdgpu_sync_clone(adev, &mem->sync, &sync); ++ mutex_unlock(&mem->lock); ++ ++ ret = amdgpu_sync_wait(&sync, intr); ++ amdgpu_sync_free(&sync); ++ return ret; ++} ++ ++#define BOOL_TO_STR(b) (b == true) ? "true" : "false" ++ ++int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( ++ struct kgd_dev *kgd, uint64_t va, uint64_t size, ++ void *vm, struct kgd_mem **mem, ++ uint64_t *offset, uint32_t flags) ++{ ++ bool aql_queue, public, readonly, execute, coherent, no_sub, userptr; ++ u64 alloc_flag; ++ uint32_t domain; ++ uint64_t *temp_offset; ++ struct sg_table *sg = NULL; ++ ++ if (!(flags & ALLOC_MEM_FLAGS_NONPAGED)) { ++ pr_err("current hw doesn't support paged memory\n"); ++ return -EINVAL; ++ } ++ ++ domain = 0; ++ alloc_flag = 0; ++ temp_offset = NULL; ++ ++ aql_queue = (flags & ALLOC_MEM_FLAGS_AQL_QUEUE_MEM) ? true : false; ++ public = (flags & ALLOC_MEM_FLAGS_PUBLIC) ? true : false; ++ readonly = (flags & ALLOC_MEM_FLAGS_READONLY) ? true : false; ++ execute = (flags & ALLOC_MEM_FLAGS_EXECUTE_ACCESS) ? true : false; ++ coherent = (flags & ALLOC_MEM_FLAGS_COHERENT) ? true : false; ++ no_sub = (flags & ALLOC_MEM_FLAGS_NO_SUBSTITUTE) ? 
true : false; ++ userptr = (flags & ALLOC_MEM_FLAGS_USERPTR) ? true : false; ++ ++ /* ++ * Check on which domain to allocate BO ++ */ ++ if (flags & ALLOC_MEM_FLAGS_VRAM) { ++ domain = AMDGPU_GEM_DOMAIN_VRAM; ++ alloc_flag = AMDGPU_GEM_CREATE_NO_CPU_ACCESS; ++ if (public) { ++ alloc_flag = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; ++ temp_offset = offset; ++ } ++ alloc_flag |= AMDGPU_GEM_CREATE_VRAM_CLEARED; ++ } else if (flags & (ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_USERPTR)) { ++ domain = AMDGPU_GEM_DOMAIN_GTT; ++ alloc_flag = 0; ++ temp_offset = offset; ++ } else if (flags & ALLOC_MEM_FLAGS_DOORBELL) { ++ domain = AMDGPU_GEM_DOMAIN_GTT; ++ alloc_flag = 0; ++ temp_offset = offset; ++ if (size > UINT_MAX) ++ return -EINVAL; ++ sg = create_doorbell_sg(*offset, size); ++ if (!sg) ++ return -ENOMEM; ++ } ++ ++ if (offset && !userptr) ++ *offset = 0; ++ ++ pr_debug("Allocate VA 0x%llx - 0x%llx domain %s aql %s\n", ++ va, va + size, domain_string(domain), ++ BOOL_TO_STR(aql_queue)); ++ ++ pr_debug("\t alloc_flag 0x%llx public %s readonly %s execute %s coherent %s no_sub %s\n", ++ alloc_flag, BOOL_TO_STR(public), ++ BOOL_TO_STR(readonly), BOOL_TO_STR(execute), ++ BOOL_TO_STR(coherent), BOOL_TO_STR(no_sub)); ++ ++ return __alloc_memory_of_gpu(kgd, va, size, vm, mem, ++ temp_offset, domain, ++ alloc_flag, sg, ++ aql_queue, readonly, execute, ++ coherent, no_sub, userptr); ++} ++ ++int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( ++ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) ++{ ++ struct amdgpu_device *adev; ++ struct kfd_bo_va_list *entry, *tmp; ++ struct bo_vm_reservation_context ctx; ++ int ret = 0; ++ struct ttm_validate_buffer *bo_list_entry; ++ struct amdkfd_process_info *process_info; ++ unsigned long bo_size; ++ ++ adev = get_amdgpu_device(kgd); ++ process_info = ((struct amdkfd_vm *)vm)->process_info; ++ ++ bo_size = mem->bo->tbo.mem.size; ++ ++ mutex_lock(&mem->lock); ++ ++ if (mem->mapped_to_gpu_memory > 0) { ++ pr_err("BO VA 0x%llx size 0x%lx is already mapped to vm %p.\n", ++ mem->va, bo_size, vm); ++ mutex_unlock(&mem->lock); ++ return -EBUSY; ++ } ++ ++ mutex_unlock(&mem->lock); ++ /* lock is not needed after this, since mem is unused and will ++ * be freed anyway ++ */ ++ ++ /* No more MMU notifiers */ ++ amdgpu_mn_unregister(mem->bo); ++ ++ /* Make sure restore workers don't access the BO any more */ ++ bo_list_entry = &mem->validate_list; ++ mutex_lock(&process_info->lock); ++ list_del(&bo_list_entry->head); ++ mutex_unlock(&process_info->lock); ++ ++ /* Free user pages if necessary */ ++ if (mem->user_pages) { ++ pr_debug("%s: Freeing user_pages array\n", __func__); ++ if (mem->user_pages[0]) ++ release_pages(mem->user_pages, ++ mem->bo->tbo.ttm->num_pages, 0); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) ++ drm_free_large(mem->user_pages); ++#else ++ kvfree(mem->user_pages); ++#endif ++ } ++ ++ ret = reserve_bo_and_cond_vms(mem, NULL, VA_DO_NOT_CARE, &ctx); ++ if (unlikely(ret != 0)) ++ return ret; ++ ++ /* The eviction fence should be removed by the last unmap. 
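++	 * (The last unmap is when mapped_to_gpu_memory drops to zero in
++	 * amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(), which unfences the
++	 * BO, so the removal below should normally be a no-op.)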
++ * TODO: Log an error condition if the bo still has the eviction fence ++ * attached ++ */ ++ amdgpu_amdkfd_remove_eviction_fence(mem->bo, ++ process_info->eviction_fence, ++ NULL, NULL); ++ pr_debug("Release VA 0x%llx - 0x%llx\n", mem->va, ++ mem->va + bo_size * (1 + mem->aql_queue)); ++ ++ /* Remove from VM internal data structures */ ++ list_for_each_entry_safe(entry, tmp, &mem->bo_va_list, bo_list) { ++ remove_bo_from_vm((struct amdgpu_device *)entry->kgd_dev, ++ entry, bo_size); ++ } ++ ++ ret = unreserve_bo_and_vms(&ctx, false, false); ++ ++ /* Free the sync object */ ++ amdgpu_sync_free(&mem->sync); ++ ++ /* If the SG is not NULL, it's one we created for a doorbell ++ * BO. We need to free it. ++ */ ++ if (mem->bo->tbo.sg) { ++ sg_free_table(mem->bo->tbo.sg); ++ kfree(mem->bo->tbo.sg); ++ } ++ ++ /* Free the BO*/ ++ amdgpu_bo_unref(&mem->bo); ++ kfree(mem); ++ ++ return ret; ++} ++ ++int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( ++ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) ++{ ++ struct amdgpu_device *adev; ++ int ret; ++ struct amdgpu_bo *bo; ++ uint32_t domain; ++ struct kfd_bo_va_list *entry; ++ struct bo_vm_reservation_context ctx; ++ struct kfd_bo_va_list *bo_va_entry = NULL; ++ struct kfd_bo_va_list *bo_va_entry_aql = NULL; ++ struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm; ++ unsigned long bo_size; ++ bool is_invalid_userptr; ++ ++ adev = get_amdgpu_device(kgd); ++ ++ /* Make sure restore is not running concurrently. Since we ++ * don't map invalid userptr BOs, we rely on the next restore ++ * worker to do the mapping ++ */ ++ mutex_lock(&mem->process_info->lock); ++ ++ /* Lock mmap-sem. If we find an invalid userptr BO, we can be ++ * sure that the MMU notifier is no longer running ++ * concurrently and the queues are actually stopped ++ */ ++ down_read(¤t->mm->mmap_sem); ++ is_invalid_userptr = atomic_read(&mem->invalid); ++ up_read(¤t->mm->mmap_sem); ++ ++ mutex_lock(&mem->lock); ++ ++ bo = mem->bo; ++ ++ if (!bo) { ++ pr_err("Invalid BO when mapping memory to GPU\n"); ++ return -EINVAL; ++ } ++ ++ domain = mem->domain; ++ bo_size = bo->tbo.mem.size; ++ ++ pr_debug("Map VA 0x%llx - 0x%llx to vm %p domain %s\n", ++ mem->va, ++ mem->va + bo_size * (1 + mem->aql_queue), ++ vm, domain_string(domain)); ++ ++ ret = reserve_bo_and_vm(mem, vm, &ctx); ++ if (unlikely(ret != 0)) ++ goto bo_reserve_failed; ++ ++ /* Userptr can be marked as "not invalid", but not actually be ++ * validated yet (still in the system domain). In that case ++ * the queues are still stopped and we can leave mapping for ++ * the next restore worker ++ */ ++ if (bo->tbo.mem.mem_type == TTM_PL_SYSTEM) ++ is_invalid_userptr = true; ++ ++ if (check_if_add_bo_to_vm((struct amdgpu_vm *)vm, mem)) { ++ ret = add_bo_to_vm(adev, mem, (struct amdgpu_vm *)vm, false, ++ &bo_va_entry); ++ if (ret != 0) ++ goto add_bo_to_vm_failed; ++ if (mem->aql_queue) { ++ ret = add_bo_to_vm(adev, mem, (struct amdgpu_vm *)vm, ++ true, &bo_va_entry_aql); ++ if (ret != 0) ++ goto add_bo_to_vm_failed_aql; ++ } ++ } ++ ++ if (mem->mapped_to_gpu_memory == 0 && ++ !amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) { ++ /* Validate BO only once. The eviction fence gets added to BO ++ * the first time it is mapped. Validate will wait for all ++ * background evictions to complete. 
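++		 * In other words, validation is tied to the first mapping
++		 * only: mapped_to_gpu_memory counts mappings across all
++		 * VMs, so a BO that is already mapped somewhere is assumed
++		 * resident and is not validated or fenced again.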
++ */ ++ ret = amdgpu_amdkfd_bo_validate(bo, domain, true); ++ if (ret) { ++ pr_debug("Validate failed\n"); ++ goto map_bo_to_gpuvm_failed; ++ } ++ } ++ ++ list_for_each_entry(entry, &mem->bo_va_list, bo_list) { ++ if (entry->bo_va->base.vm == vm && !entry->is_mapped) { ++ pr_debug("\t map VA 0x%llx - 0x%llx in entry %p\n", ++ entry->va, entry->va + bo_size, ++ entry); ++ ++ ret = map_bo_to_gpuvm(adev, entry, ctx.sync, ++ is_invalid_userptr); ++ if (ret != 0) { ++ pr_err("Failed to map radeon bo to gpuvm\n"); ++ goto map_bo_to_gpuvm_failed; ++ } ++ entry->is_mapped = true; ++ mem->mapped_to_gpu_memory++; ++ pr_debug("\t INC mapping count %d\n", ++ mem->mapped_to_gpu_memory); ++ } ++ } ++ ++ if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm) == NULL) ++ amdgpu_bo_fence(bo, ++ &kfd_vm->process_info->eviction_fence->base, ++ true); ++ ret = unreserve_bo_and_vms(&ctx, false, false); ++ ++ mutex_unlock(&mem->process_info->lock); ++ mutex_unlock(&mem->lock); ++ return ret; ++ ++map_bo_to_gpuvm_failed: ++ if (bo_va_entry_aql) ++ remove_bo_from_vm(adev, bo_va_entry_aql, bo_size); ++add_bo_to_vm_failed_aql: ++ if (bo_va_entry) ++ remove_bo_from_vm(adev, bo_va_entry, bo_size); ++add_bo_to_vm_failed: ++ unreserve_bo_and_vms(&ctx, false, false); ++bo_reserve_failed: ++ mutex_unlock(&mem->process_info->lock); ++ mutex_unlock(&mem->lock); ++ return ret; ++} ++ ++static u64 get_vm_pd_gpu_offset(void *vm) ++{ ++ struct amdgpu_vm *avm = (struct amdgpu_vm *) vm; ++ struct amdgpu_device *adev = ++ amdgpu_ttm_adev(avm->root.base.bo->tbo.bdev); ++ u64 offset; ++ ++ BUG_ON(avm == NULL); ++ ++ amdgpu_bo_reserve(avm->root.base.bo, false); ++ ++ offset = amdgpu_bo_gpu_offset(avm->root.base.bo); ++ ++ amdgpu_bo_unreserve(avm->root.base.bo); ++ ++ /* On some ASICs the FB doesn't start at 0. Adjust FB offset ++ * to an actual MC address. 
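++	 * E.g. on ASICs that map VRAM at a non-zero MC base address,
++	 * get_vm_pde() is expected to fold that base into the returned
++	 * offset; without the callback the BO offset is assumed to be a
++	 * usable MC address already.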
++static u64 get_vm_pd_gpu_offset(void *vm)
++{
++ struct amdgpu_vm *avm = (struct amdgpu_vm *) vm;
++ struct amdgpu_device *adev;
++ u64 offset;
++
++ BUG_ON(avm == NULL);
++
++ adev = amdgpu_ttm_adev(avm->root.base.bo->tbo.bdev);
++
++ amdgpu_bo_reserve(avm->root.base.bo, false);
++
++ offset = amdgpu_bo_gpu_offset(avm->root.base.bo);
++
++ amdgpu_bo_unreserve(avm->root.base.bo);
++
++ /* On some ASICs the FB doesn't start at 0. Adjust FB offset
++ * to an actual MC address.
++ */
++ if (adev->gart.gart_funcs->get_vm_pde)
++ offset = amdgpu_gart_get_vm_pde(adev, offset);
++
++ return offset;
++}
++
++int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm,
++ void **process_info,
++ struct dma_fence **ef)
++{
++ int ret;
++ struct amdkfd_vm *new_vm;
++ struct amdkfd_process_info *info;
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++
++ new_vm = kzalloc(sizeof(*new_vm), GFP_KERNEL);
++ if (new_vm == NULL)
++ return -ENOMEM;
++
++ /* Initialize the VM context, allocate the page directory and zero it */
++ ret = amdgpu_vm_init(adev, &new_vm->base, AMDGPU_VM_CONTEXT_COMPUTE);
++ if (ret != 0) {
++ pr_err("Failed to init vm, ret %d\n", ret);
++ /* Undo everything related to the new VM context */
++ goto vm_init_fail;
++ }
++ new_vm->adev = adev;
++
++ if (!*process_info) {
++ info = kzalloc(sizeof(*info), GFP_KERNEL);
++ if (!info) {
++ pr_err("Failed to create amdkfd_process_info\n");
++ ret = -ENOMEM;
++ goto alloc_process_info_fail;
++ }
++
++ mutex_init(&info->lock);
++ INIT_LIST_HEAD(&info->vm_list_head);
++ INIT_LIST_HEAD(&info->kfd_bo_list);
++ INIT_LIST_HEAD(&info->userptr_valid_list);
++ INIT_LIST_HEAD(&info->userptr_inval_list);
++
++ info->eviction_fence =
++ amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1),
++ current->mm);
++ if (info->eviction_fence == NULL) {
++ pr_err("Failed to create eviction fence\n");
++ ret = -ENOMEM;
++ goto create_evict_fence_fail;
++ }
++
++ info->pid = get_task_pid(current->group_leader,
++ PIDTYPE_PID);
++ atomic_set(&info->evicted_bos, 0);
++ INIT_DELAYED_WORK(&info->work,
++ amdgpu_amdkfd_restore_userptr_worker);
++
++ *process_info = info;
++ *ef = dma_fence_get(&info->eviction_fence->base);
++ }
++
++ new_vm->process_info = *process_info;
++
++ mutex_lock(&new_vm->process_info->lock);
++ list_add_tail(&new_vm->vm_list_node,
++ &(new_vm->process_info->vm_list_head));
++ new_vm->process_info->n_vms++;
++ mutex_unlock(&new_vm->process_info->lock);
++
++ *vm = (void *) new_vm;
++
++ pr_debug("Created process vm %p\n", *vm);
++
++ return ret;
++
++create_evict_fence_fail:
++ kfree(info);
++alloc_process_info_fail:
++ amdgpu_vm_fini(adev, &new_vm->base);
++vm_init_fail:
++ kfree(new_vm);
++ return ret;
++}
++
++void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm)
++{
++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
++ struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *) vm;
++ struct amdgpu_vm *avm = &kfd_vm->base;
++ struct amdgpu_bo *pd;
++ struct amdkfd_process_info *process_info;
++
++ if (WARN_ON(!kgd || !vm))
++ return;
++
++ pr_debug("Destroying process vm %p\n", vm);
++ /* Release eviction fence from PD */
++ pd = avm->root.base.bo;
++ amdgpu_bo_reserve(pd, false);
++ amdgpu_bo_fence(pd, NULL, false);
++ amdgpu_bo_unreserve(pd);
++
++ process_info = kfd_vm->process_info;
++
++ mutex_lock(&process_info->lock);
++ process_info->n_vms--;
++ list_del(&kfd_vm->vm_list_node);
++ mutex_unlock(&process_info->lock);
++
++ /* Release per-process resources */
++ if (!process_info->n_vms) {
++ WARN_ON(!list_empty(&process_info->kfd_bo_list));
++ WARN_ON(!list_empty(&process_info->userptr_valid_list));
++ WARN_ON(!list_empty(&process_info->userptr_inval_list));
++
++ cancel_delayed_work_sync(&process_info->work);
++ dma_fence_put(&process_info->eviction_fence->base);
++ put_pid(process_info->pid);
++ kfree(process_info);
++ }
++
++ /* Release the VM context */
++ amdgpu_vm_fini(adev, avm);
++ kfree(vm);
++}
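The create/destroy pair above brackets the per-process VM lifecycle. A minimal sketch of a hypothetical consumer follows; the helper name is an assumption, while the NULL-initialized process_info (so the first create allocates the shared per-process state) and both entry-point signatures come from the patch itself.

/* Hypothetical lifecycle sketch, not part of the patch. */
static int example_process_vm_lifecycle(struct kgd_dev *kgd)
{
        void *vm, *process_info = NULL; /* NULL: first VM of this process */
        struct dma_fence *ef = NULL;
        int r;

        r = amdgpu_amdkfd_gpuvm_create_process_vm(kgd, &vm, &process_info,
                                                  &ef);
        if (r)
                return r;

        /* ... allocate, map and use buffers in this VM ... */

        dma_fence_put(ef); /* drop the eviction-fence reference we got */
        amdgpu_amdkfd_gpuvm_destroy_process_vm(kgd, vm);
        return 0;
}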
++uint32_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm)
++{
++ return get_vm_pd_gpu_offset(vm) >> AMDGPU_GPU_PAGE_SHIFT;
++}
++
++int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd,
++ struct kfd_vm_fault_info *mem)
++{
++ struct amdgpu_device *adev;
++
++ adev = (struct amdgpu_device *) kgd;
++ if (atomic_read(&adev->mc.vm_fault_info_updated) == 1) {
++ *mem = *adev->mc.vm_fault_info;
++ mb();
++ atomic_set(&adev->mc.vm_fault_info_updated, 0);
++ }
++ return 0;
++}
++
++static bool is_mem_on_local_device(struct kgd_dev *kgd,
++ struct list_head *bo_va_list, void *vm)
++{
++ struct kfd_bo_va_list *entry;
++
++ list_for_each_entry(entry, bo_va_list, bo_list) {
++ if (entry->kgd_dev == kgd && entry->bo_va->base.vm == vm)
++ return true;
++ }
++
++ return false;
++}
++
++int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
++ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm)
++{
++ struct kfd_bo_va_list *entry;
++ struct amdgpu_device *adev;
++ unsigned int mapped_before;
++ int ret = 0;
++ struct bo_vm_reservation_context ctx;
++ struct amdkfd_process_info *process_info;
++ unsigned long bo_size;
++
++ adev = (struct amdgpu_device *) kgd;
++ process_info = ((struct amdkfd_vm *)vm)->process_info;
++
++ bo_size = mem->bo->tbo.mem.size;
++
++ mutex_lock(&mem->lock);
++
++ /*
++ * Make sure that this BO is mapped on KGD before unmapping it
++ */
++ if (!is_mem_on_local_device(kgd, &mem->bo_va_list, vm)) {
++ ret = -EINVAL;
++ goto out;
++ }
++
++ if (mem->mapped_to_gpu_memory == 0) {
++ pr_debug("BO VA 0x%llx size 0x%lx is not mapped to vm %p\n",
++ mem->va, bo_size, vm);
++ ret = -EINVAL;
++ goto out;
++ }
++ mapped_before = mem->mapped_to_gpu_memory;
++
++ ret = reserve_bo_and_cond_vms(mem, vm, VA_MAPPED, &ctx);
++ if (unlikely(ret != 0))
++ goto out;
++
++ pr_debug("Unmap VA 0x%llx - 0x%llx from vm %p\n",
++ mem->va,
++ mem->va + bo_size * (1 + mem->aql_queue),
++ vm);
++
++ list_for_each_entry(entry, &mem->bo_va_list, bo_list) {
++ if (entry->bo_va->base.vm == vm && entry->is_mapped) {
++ pr_debug("\t unmap VA 0x%llx - 0x%llx from entry %p\n",
++ entry->va,
++ entry->va + bo_size,
++ entry);
++
++ ret = unmap_bo_from_gpuvm(adev, entry, ctx.sync);
++ if (ret == 0) {
++ entry->is_mapped = false;
++ } else {
++ pr_err("Failed to unmap VA 0x%llx\n",
++ mem->va);
++ goto unreserve_out;
++ }
++
++ mem->mapped_to_gpu_memory--;
++ pr_debug("\t DEC mapping count %d\n",
++ mem->mapped_to_gpu_memory);
++ }
++ }
++
++ /* If BO is unmapped from all VMs, unfence it. It can be evicted if
++ * required.
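++ * The eviction fence was attached when the BO was first mapped
++ * (see amdgpu_amdkfd_gpuvm_map_memory_to_gpu() above).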
++ */
++ if (mem->mapped_to_gpu_memory == 0 &&
++ !amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm))
++ amdgpu_amdkfd_remove_eviction_fence(mem->bo,
++ process_info->eviction_fence,
++ NULL, NULL);
++
++ if (mapped_before == mem->mapped_to_gpu_memory) {
++ pr_debug("BO VA 0x%llx size 0x%lx is not mapped to vm %p\n",
++ mem->va, bo_size, vm);
++ ret = -EINVAL;
++ }
++
++unreserve_out:
++ unreserve_bo_and_vms(&ctx, false, false);
++out:
++ mutex_unlock(&mem->lock);
++ return ret;
++}
++
++int amdgpu_amdkfd_gpuvm_mmap_bo(struct kgd_dev *kgd, struct vm_area_struct *vma)
++{
++ struct amdgpu_device *adev;
++
++ adev = get_amdgpu_device(kgd);
++ if (!adev) {
++ pr_err("Could not get amdgpu device in %s\n", __func__);
++ return -ENODEV;
++ }
++
++ return amdgpu_bo_mmap(NULL, vma, &adev->mman.bdev);
++}
++
++int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd,
++ struct kgd_mem *mem, void **kptr)
++{
++ int ret;
++ struct amdgpu_bo *bo = mem->bo;
++
++ if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) {
++ pr_err("userptr can't be mapped to kernel\n");
++ return -EINVAL;
++ }
++
++ /* delete kgd_mem from kfd_bo_list to avoid re-validating
++ * this BO when it is restored after eviction.
++ */
++ mutex_lock(&mem->process_info->lock);
++
++ list_del_init(&mem->validate_list.head);
++
++ ret = __map_bo_to_kernel(bo, AMDGPU_GEM_DOMAIN_GTT, kptr);
++ if (!ret)
++ mem->kptr = *kptr;
++
++ mutex_unlock(&mem->process_info->lock);
++
++ return ret;
++}
++
++static int pin_bo_wo_map(struct kgd_mem *mem)
++{
++ struct amdgpu_bo *bo = mem->bo;
++ int ret = 0;
++
++ ret = amdgpu_bo_reserve(bo, false);
++ if (unlikely(ret != 0))
++ return ret;
++
++ ret = amdgpu_bo_pin(bo, mem->domain, NULL);
++ amdgpu_bo_unreserve(bo);
++
++ return ret;
++}
++
++static void unpin_bo_wo_map(struct kgd_mem *mem)
++{
++ struct amdgpu_bo *bo = mem->bo;
++ int ret = 0;
++
++ ret = amdgpu_bo_reserve(bo, false);
++ if (unlikely(ret != 0))
++ return;
++
++ amdgpu_bo_unpin(bo);
++ amdgpu_bo_unreserve(bo);
++}
++
++#define AMD_GPU_PAGE_SHIFT PAGE_SHIFT
++#define AMD_GPU_PAGE_SIZE (_AC(1, UL) << AMD_GPU_PAGE_SHIFT)
++
++static int get_sg_table(struct amdgpu_device *adev,
++ struct kgd_mem *mem, uint64_t offset,
++ uint64_t size, struct sg_table **ret_sg)
++{
++ struct amdgpu_bo *bo = mem->bo;
++ struct sg_table *sg = NULL;
++ unsigned long bus_addr;
++ unsigned int chunks;
++ unsigned int i;
++ struct scatterlist *s;
++ uint64_t offset_in_page;
++ unsigned int page_size;
++ int ret;
++
++ sg = kmalloc(sizeof(*sg), GFP_KERNEL);
++ if (!sg) {
++ ret = -ENOMEM;
++ goto out;
++ }
++
++ if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM)
++ page_size = AMD_GPU_PAGE_SIZE;
++ else
++ page_size = PAGE_SIZE;
++
++ offset_in_page = offset & (page_size - 1);
++ chunks = (size + offset_in_page + page_size - 1)
++ / page_size;
++
++ ret = sg_alloc_table(sg, chunks, GFP_KERNEL);
++ if (unlikely(ret))
++ goto out;
++
++ if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM) {
++ bus_addr = bo->tbo.offset + adev->mc.aper_base + offset;
++
++ for_each_sg(sg->sgl, s, sg->orig_nents, i) {
++ uint64_t chunk_size, length;
++
++ chunk_size = page_size - offset_in_page;
++ length = min(size, chunk_size);
++
++ sg_set_page(s, NULL, length, offset_in_page);
++ s->dma_address = bus_addr;
++ s->dma_length = length;
++
++ size -= length;
++ offset_in_page = 0;
++ bus_addr += length;
++ }
++ } else {
++ struct page **pages;
++ unsigned int cur_page;
++
++ pages = bo->tbo.ttm->pages;
++
++ cur_page = offset / page_size;
++ for_each_sg(sg->sgl, s, sg->orig_nents, i) {
++ uint64_t chunk_size, length;
++
++ chunk_size = page_size - offset_in_page;
++ length = min(size, chunk_size);
++
++ sg_set_page(s, pages[cur_page], length, offset_in_page);
++ s->dma_address = page_to_phys(pages[cur_page]);
++ s->dma_length = length;
++
++ size -= length;
++ offset_in_page = 0;
++ cur_page++;
++ }
++ }
++
++ *ret_sg = sg;
++ return 0;
++out:
++ kfree(sg);
++ *ret_sg = NULL;
++ return ret;
++}
++
++int amdgpu_amdkfd_gpuvm_pin_get_sg_table(struct kgd_dev *kgd,
++ struct kgd_mem *mem, uint64_t offset,
++ uint64_t size, struct sg_table **ret_sg)
++{
++ int ret;
++ struct amdgpu_device *adev;
++
++ ret = pin_bo_wo_map(mem);
++ if (unlikely(ret != 0))
++ return ret;
++
++ adev = get_amdgpu_device(kgd);
++
++ ret = get_sg_table(adev, mem, offset, size, ret_sg);
++ if (ret)
++ unpin_bo_wo_map(mem);
++
++ return ret;
++}
++
++void amdgpu_amdkfd_gpuvm_unpin_put_sg_table(
++ struct kgd_mem *mem, struct sg_table *sg)
++{
++ sg_free_table(sg);
++ kfree(sg);
++
++ unpin_bo_wo_map(mem);
++}
++
++int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd,
++ struct dma_buf *dma_buf,
++ uint64_t va, void *vm,
++ struct kgd_mem **mem, uint64_t *size,
++ uint64_t *mmap_offset)
++{
++ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
++ struct drm_gem_object *obj;
++ struct amdgpu_bo *bo;
++ struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm;
++
++ if (dma_buf->ops != &drm_gem_prime_dmabuf_ops)
++ /* Can't handle non-graphics buffers */
++ return -EINVAL;
++
++ obj = dma_buf->priv;
++ if (obj->dev->dev_private != adev)
++ /* Can't handle buffers from other devices */
++ return -EINVAL;
++
++ bo = gem_to_amdgpu_bo(obj);
++ if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM |
++ AMDGPU_GEM_DOMAIN_GTT |
++ AMDGPU_GEM_DOMAIN_DGMA)))
++ /* Only VRAM, GTT and DGMA BOs are supported */
++ return -EINVAL;
++
++ *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL);
++ if (*mem == NULL)
++ return -ENOMEM;
++
++ if (size)
++ *size = amdgpu_bo_size(bo);
++
++ if (mmap_offset)
++ *mmap_offset = amdgpu_bo_mmap_offset(bo);
++
++ INIT_LIST_HEAD(&(*mem)->bo_va_list);
++ mutex_init(&(*mem)->lock);
++ (*mem)->mapping_flags =
++ AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE |
++ AMDGPU_VM_PAGE_EXECUTABLE | AMDGPU_VM_MTYPE_NC;
++
++ (*mem)->bo = amdgpu_bo_ref(bo);
++ (*mem)->va = va;
++ if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM)
++ (*mem)->domain = AMDGPU_GEM_DOMAIN_VRAM;
++ else if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_GTT)
++ (*mem)->domain = AMDGPU_GEM_DOMAIN_GTT;
++ else
++ (*mem)->domain = AMDGPU_GEM_DOMAIN_DGMA;
++ (*mem)->mapped_to_gpu_memory = 0;
++ (*mem)->process_info = kfd_vm->process_info;
++ add_kgd_mem_to_kfd_bo_list(*mem, kfd_vm->process_info, false);
++ amdgpu_sync_create(&(*mem)->sync);
++
++ return 0;
++}
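The pin/unpin pair above is symmetric, and the chunk arithmetic in get_sg_table() is easiest to check with concrete numbers. Below is a hypothetical caller with made-up offset and size values, assuming a GTT BO with 4 KiB pages; only the two amdgpu_amdkfd_gpuvm_* functions are from the patch.

/* Hypothetical caller sketch; the offset/size values are made up. */
static int example_pin_and_get_sg(struct kgd_dev *kgd, struct kgd_mem *mem)
{
        struct sg_table *sg;
        int r;

        /* offset 0x1800 -> offset_in_page = 0x800, so a 0x3000-byte
         * request needs (0x3000 + 0x800 + 0xfff) / 0x1000 = 4 chunks:
         * 0x800 + 0x1000 + 0x1000 + 0x800 bytes.
         */
        r = amdgpu_amdkfd_gpuvm_pin_get_sg_table(kgd, mem, 0x1800, 0x3000,
                                                 &sg);
        if (r)
                return r;

        /* ... program a DMA engine or peer device from sg ... */

        amdgpu_amdkfd_gpuvm_unpin_put_sg_table(mem, sg);
        return 0;
}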
++int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_dev *kgd, void *vm,
++ struct kgd_mem *mem,
++ struct dma_buf **dmabuf)
++{
++ struct amdgpu_device *adev = NULL;
++ struct amdgpu_bo *bo = NULL;
++ struct drm_gem_object *gobj = NULL;
++
++ if (!dmabuf || !kgd || !vm || !mem)
++ return -EINVAL;
++
++ adev = get_amdgpu_device(kgd);
++ bo = mem->bo;
++
++ gobj = amdgpu_gem_prime_foreign_bo(adev, bo);
++ if (gobj == NULL) {
++ pr_err("Export BO failed. Unable to find/create GEM object\n");
++ return -EINVAL;
++ }
++
++ *dmabuf = amdgpu_gem_prime_export(adev->ddev, gobj, 0);
++ return 0;
++}
++
++static int process_validate_vms(struct amdkfd_process_info *process_info)
++{
++ struct amdkfd_vm *peer_vm;
++ int ret;
++
++ list_for_each_entry(peer_vm, &process_info->vm_list_head,
++ vm_list_node) {
++ ret = vm_validate_pt_pd_bos(&peer_vm->base);
++ if (ret)
++ return ret;
++ }
++
++ return 0;
++}
++
++/* Evict a userptr BO by stopping the queues if necessary
++ *
++ * Runs in MMU notifier, may be in RECLAIM_FS context. This means it
++ * cannot do any memory allocations, and cannot take any locks that
++ * are held elsewhere while allocating memory. Therefore this is as
++ * simple as possible, using atomic counters.
++ *
++ * It doesn't do anything to the BO itself. The real work happens in
++ * restore, where we get updated page addresses. This function only
++ * ensures that GPU access to the BO is stopped.
++ */
++int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem,
++ struct mm_struct *mm)
++{
++ struct amdkfd_process_info *process_info = mem->process_info;
++ int invalid, evicted_bos;
++ int r = 0;
++
++ invalid = atomic_inc_return(&mem->invalid);
++ evicted_bos = atomic_inc_return(&process_info->evicted_bos);
++ if (evicted_bos == 1) {
++ /* First eviction, stop the queues */
++ r = kgd2kfd->quiesce_mm(NULL, mm);
++ if (r != 0)
++ pr_err("Failed to quiesce KFD\n");
++ schedule_delayed_work(&process_info->work, 1);
++ }
++
++ return r;
++}
++
++/* Update invalid userptr BOs
++ *
++ * Moves invalidated (evicted) userptr BOs from userptr_valid_list to
++ * userptr_inval_list and updates user pages for all BOs that have
++ * been invalidated since their last update.
++ */
++static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
++ struct mm_struct *mm)
++{
++ struct kgd_mem *mem, *tmp_mem;
++ struct amdgpu_bo *bo;
++ int invalid, ret;
++
++ /* Move all invalidated BOs to the userptr_inval_list and
++ * release their user pages by migration to the CPU domain
++ */
++ list_for_each_entry_safe(mem, tmp_mem,
++ &process_info->userptr_valid_list,
++ validate_list.head) {
++ if (!atomic_read(&mem->invalid))
++ continue; /* BO is still valid */
++
++ bo = mem->bo;
++
++ if (amdgpu_bo_reserve(bo, true))
++ return -EAGAIN;
++ amdgpu_ttm_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);
++ ret = ttm_bo_validate(&bo->tbo, &bo->placement, false, false);
++ amdgpu_bo_unreserve(bo);
++ if (ret) {
++ pr_err("%s: Failed to invalidate userptr BO\n",
++ __func__);
++ return -EAGAIN;
++ }
++
++ list_move_tail(&mem->validate_list.head,
++ &process_info->userptr_inval_list);
++ }
++
++ if (list_empty(&process_info->userptr_inval_list))
++ return 0; /* All evicted userptr BOs were freed */
++
++ /* Go through userptr_inval_list and update any invalid user_pages */
++ list_for_each_entry(mem, &process_info->userptr_inval_list,
++ validate_list.head) {
++ invalid = atomic_read(&mem->invalid);
++ if (!invalid)
++ /* BO hasn't been invalidated since the last
++ * revalidation attempt. Keep its BO list.
++ */ ++ continue; ++ ++ bo = mem->bo; ++ ++ if (!mem->user_pages) { ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) ++ mem->user_pages = ++ drm_calloc_large(bo->tbo.ttm->num_pages, ++ sizeof(struct page *)); ++#else ++ mem->user_pages = ++ kvmalloc_array(bo->tbo.ttm->num_pages, ++ sizeof(struct page *), ++ GFP_KERNEL | __GFP_ZERO); ++#endif ++ if (!mem->user_pages) { ++ pr_err("%s: Failed to allocate pages array\n", ++ __func__); ++ return -ENOMEM; ++ } ++ } else if (mem->user_pages[0]) { ++ release_pages(mem->user_pages, ++ bo->tbo.ttm->num_pages, 0); ++ } ++ ++ /* Get updated user pages */ ++ ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, ++ mem->user_pages); ++ if (ret) { ++ mem->user_pages[0] = NULL; ++ pr_info("%s: Failed to get user pages: %d\n", ++ __func__, ret); ++ /* Pretend it succeeded. It will fail later ++ * with a VM fault if the GPU tries to access ++ * it. Better than hanging indefinitely with ++ * stalled user mode queues. ++ */ ++ } ++ ++ /* Mark the BO as valid unless it was invalidated ++ * again concurrently ++ */ ++ if (atomic_cmpxchg(&mem->invalid, invalid, 0) != invalid) ++ return -EAGAIN; ++ } ++ return 0; ++} ++ ++/* Validate invalid userptr BOs ++ * ++ * Validates BOs on the userptr_inval_list, and moves them back to the ++ * userptr_valid_list. Also updates GPUVM page tables with new page ++ * addresses and waits for the page table updates to complete. ++ */ ++static int validate_invalid_user_pages(struct amdkfd_process_info *process_info) ++{ ++ struct amdgpu_bo_list_entry *pd_bo_list_entries; ++ struct list_head resv_list, duplicates; ++ struct ww_acquire_ctx ticket; ++ struct amdgpu_sync sync; ++ ++ struct amdkfd_vm *peer_vm; ++ struct kgd_mem *mem, *tmp_mem; ++ struct amdgpu_bo *bo; ++ int i, ret; ++ ++ pd_bo_list_entries = kcalloc(process_info->n_vms, ++ sizeof(struct amdgpu_bo_list_entry), ++ GFP_KERNEL); ++ if (!pd_bo_list_entries) { ++ pr_err("%s: Failed to allocate PD BO list entries\n", __func__); ++ return -ENOMEM; ++ } ++ ++ INIT_LIST_HEAD(&resv_list); ++ INIT_LIST_HEAD(&duplicates); ++ ++ /* Get all the page directory BOs that need to be reserved */ ++ i = 0; ++ list_for_each_entry(peer_vm, &process_info->vm_list_head, ++ vm_list_node) ++ amdgpu_vm_get_pd_bo(&peer_vm->base, &resv_list, ++ &pd_bo_list_entries[i++]); ++ /* Add the userptr_inval_list entries to resv_list */ ++ list_for_each_entry(mem, &process_info->userptr_inval_list, ++ validate_list.head) { ++ list_add_tail(&mem->resv_list.head, &resv_list); ++ mem->resv_list.bo = mem->validate_list.bo; ++ mem->resv_list.shared = mem->validate_list.shared; ++ } ++ ++ /* Reserve all BOs and page tables for validation */ ++ ret = ttm_eu_reserve_buffers(&ticket, &resv_list, false, &duplicates); ++ WARN(!list_empty(&duplicates), "Duplicates should be empty"); ++ if (ret) ++ goto out; ++ ++ amdgpu_sync_create(&sync); ++ ++ /* Avoid triggering eviction fences when unmapping invalid ++ * userptr BOs (waits for all fences, doesn't use ++ * FENCE_OWNER_VM) ++ */ ++ list_for_each_entry(peer_vm, &process_info->vm_list_head, ++ vm_list_node) ++ amdgpu_amdkfd_remove_eviction_fence(peer_vm->base.root.base.bo, ++ process_info->eviction_fence, ++ NULL, NULL); ++ ++ ret = process_validate_vms(process_info); ++ if (ret) ++ goto unreserve_out; ++ ++ /* Validate BOs and update GPUVM page tables */ ++ list_for_each_entry_safe(mem, tmp_mem, ++ &process_info->userptr_inval_list, ++ validate_list.head) { ++ struct kfd_bo_va_list *bo_va_entry; ++ ++ bo = mem->bo; ++ ++ /* Copy pages array and validate the BO if we 
got user pages */
++ if (mem->user_pages[0]) {
++ amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm,
++ mem->user_pages);
++ amdgpu_ttm_placement_from_domain(bo, mem->domain);
++ ret = ttm_bo_validate(&bo->tbo, &bo->placement,
++ false, false);
++ if (ret) {
++ pr_err("%s: Failed to validate BO\n", __func__);
++ goto unreserve_out;
++ }
++ }
++
++ /* Validate succeeded, now the BO owns the pages, free
++ * our copy of the pointer array. Put this BO back on
++ * the userptr_valid_list. If we need to revalidate
++ * it, we need to start from scratch.
++ */
++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0)
++ drm_free_large(mem->user_pages);
++#else
++ kvfree(mem->user_pages);
++#endif
++ mem->user_pages = NULL;
++ list_move_tail(&mem->validate_list.head,
++ &process_info->userptr_valid_list);
++
++ /* Update mapping. If the BO was not validated
++ * (because we couldn't get user pages), this will
++ * clear the page table entries, which will result in
++ * VM faults if the GPU tries to access the invalid
++ * memory.
++ */
++ list_for_each_entry(bo_va_entry, &mem->bo_va_list, bo_list) {
++ if (!bo_va_entry->is_mapped)
++ continue;
++
++ ret = update_gpuvm_pte((struct amdgpu_device *)
++ bo_va_entry->kgd_dev,
++ bo_va_entry, &sync);
++ if (ret) {
++ pr_err("%s: update PTE failed\n", __func__);
++ /* make sure this gets validated again */
++ atomic_inc(&mem->invalid);
++ goto unreserve_out;
++ }
++ }
++ }
++unreserve_out:
++ list_for_each_entry(peer_vm, &process_info->vm_list_head,
++ vm_list_node)
++ amdgpu_bo_fence(peer_vm->base.root.base.bo,
++ &process_info->eviction_fence->base, true);
++ ttm_eu_backoff_reservation(&ticket, &resv_list);
++ amdgpu_sync_wait(&sync, false);
++ amdgpu_sync_free(&sync);
++out:
++ kfree(pd_bo_list_entries);
++
++ return ret;
++}
++
++/* Worker callback to restore evicted userptr BOs
++ *
++ * Tries to update and validate all userptr BOs. If successful and no
++ * concurrent evictions happened, the queues are restarted. Otherwise,
++ * reschedule for another attempt later.
++ */
++static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
++{
++ struct delayed_work *dwork = to_delayed_work(work);
++ struct amdkfd_process_info *process_info =
++ container_of(dwork, struct amdkfd_process_info, work);
++ struct task_struct *usertask;
++ struct mm_struct *mm;
++ int evicted_bos;
++
++ evicted_bos = atomic_read(&process_info->evicted_bos);
++ if (!evicted_bos)
++ return;
++
++ /* Reference task and mm in case of concurrent process termination */
++ usertask = get_pid_task(process_info->pid, PIDTYPE_PID);
++ if (!usertask)
++ return;
++ mm = get_task_mm(usertask);
++ if (!mm) {
++ put_task_struct(usertask);
++ return;
++ }
++
++ mutex_lock(&process_info->lock);
++
++ if (update_invalid_user_pages(process_info, mm))
++ goto unlock_out;
++ /* userptr_inval_list can be empty if all evicted userptr BOs
++ * have been freed. In that case there is nothing to validate
++ * and we can just restart the queues.
++ */
++ if (!list_empty(&process_info->userptr_inval_list)) {
++ if (atomic_read(&process_info->evicted_bos) != evicted_bos)
++ goto unlock_out; /* Concurrent eviction, try again */
++
++ if (validate_invalid_user_pages(process_info))
++ goto unlock_out;
++ }
++ /* Final check for concurrent eviction and atomic update. If
++ * another eviction happens after successful update, it will
++ * be a first eviction that calls quiesce_mm. The eviction
++ * reference counting inside KFD will handle this case.
++ */
++ if (atomic_cmpxchg(&process_info->evicted_bos, evicted_bos, 0) !=
++ evicted_bos)
++ goto unlock_out;
++ evicted_bos = 0;
++ if (kgd2kfd->resume_mm(NULL, mm)) {
++ pr_err("%s: Failed to resume KFD\n", __func__);
++ /* No recovery from this failure. Probably the CP is
++ * hanging. No point trying again.
++ */
++ }
++unlock_out:
++ mutex_unlock(&process_info->lock);
++ mmput(mm);
++ put_task_struct(usertask);
++
++ /* If validation failed, reschedule another attempt */
++ if (evicted_bos)
++ schedule_delayed_work(&process_info->work, 1);
++}
++
++/** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given
++ * KFD process identified by process_info
++ *
++ * @process_info: amdkfd_process_info of the KFD process
++ *
++ * After memory eviction, the restore thread calls this function. It must
++ * be called while the process is still valid. BO restore involves:
++ *
++ * 1. Release old eviction fence and create new one
++ * 2. Get two copies of PD BO list from all the VMs. Keep one copy as pd_list.
++ * 3. Use the second PD list and kfd_bo_list to create a list (ctx.list) of
++ * BOs that need to be reserved.
++ * 4. Reserve all the BOs
++ * 5. Validate PD and PT BOs.
++ * 6. Validate all KFD BOs using kfd_bo_list, map them and add a new fence
++ * 7. Add fence to all PD and PT BOs.
++ * 8. Unreserve all BOs
++ */
++
++int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
++{
++ struct amdgpu_bo_list_entry *pd_bo_list;
++ struct amdkfd_process_info *process_info = info;
++ struct amdkfd_vm *peer_vm;
++ struct kgd_mem *mem;
++ struct bo_vm_reservation_context ctx;
++ struct amdgpu_amdkfd_fence *new_fence;
++ int ret = 0, i;
++ struct list_head duplicate_save;
++ struct amdgpu_sync sync_obj;
++
++ INIT_LIST_HEAD(&duplicate_save);
++ INIT_LIST_HEAD(&ctx.list);
++ INIT_LIST_HEAD(&ctx.duplicates);
++
++ pd_bo_list = kcalloc(process_info->n_vms,
++ sizeof(struct amdgpu_bo_list_entry),
++ GFP_KERNEL);
++ if (pd_bo_list == NULL)
++ return -ENOMEM;
++
++ i = 0;
++ mutex_lock(&process_info->lock);
++ list_for_each_entry(peer_vm, &process_info->vm_list_head,
++ vm_list_node)
++ amdgpu_vm_get_pd_bo(&peer_vm->base, &ctx.list,
++ &pd_bo_list[i++]);
++
++ /* Reserve all BOs and page tables/directory. Add all BOs from
++ * kfd_bo_list to ctx.list
++ */
++ list_for_each_entry(mem, &process_info->kfd_bo_list,
++ validate_list.head) {
++
++ list_add_tail(&mem->resv_list.head, &ctx.list);
++ mem->resv_list.bo = mem->validate_list.bo;
++ mem->resv_list.shared = mem->validate_list.shared;
++ }
++
++ ret = ttm_eu_reserve_buffers(&ctx.ticket, &ctx.list,
++ false, &duplicate_save);
++ if (ret) {
++ pr_debug("Memory eviction: TTM Reserve Failed. Try again\n");
++ goto ttm_reserve_fail;
++ }
++
++ amdgpu_sync_create(&sync_obj);
++ ctx.sync = &sync_obj;
++
++ /* Validate PDs and PTs */
++ ret = process_validate_vms(process_info);
++ if (ret)
++ goto validate_map_fail;
++
++ /* Wait for PD/PTs validate to finish */
++ /* FIXME: I think this isn't needed */
++ list_for_each_entry(peer_vm, &process_info->vm_list_head,
++ vm_list_node) {
++ struct amdgpu_bo *bo = peer_vm->base.root.base.bo;
++
++ ttm_bo_wait(&bo->tbo, false, false);
++ }
++
++ /* Validate BOs and map them to GPUVM (update VM page tables).
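++ * This loop is step 6 of the sequence documented above; the new
++ * eviction fence is attached further below.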
++ */
++ list_for_each_entry(mem, &process_info->kfd_bo_list,
++ validate_list.head) {
++
++ struct amdgpu_bo *bo = mem->bo;
++ uint32_t domain = mem->domain;
++ struct kfd_bo_va_list *bo_va_entry;
++
++ ret = amdgpu_amdkfd_bo_validate(bo, domain, false);
++ if (ret) {
++ pr_debug("Memory eviction: Validate BOs failed. Try again\n");
++ goto validate_map_fail;
++ }
++
++ list_for_each_entry(bo_va_entry, &mem->bo_va_list,
++ bo_list) {
++ ret = update_gpuvm_pte((struct amdgpu_device *)
++ bo_va_entry->kgd_dev,
++ bo_va_entry,
++ ctx.sync);
++ if (ret) {
++ pr_debug("Memory eviction: update PTE failed. Try again\n");
++ goto validate_map_fail;
++ }
++ }
++ }
++
++ amdgpu_sync_wait(ctx.sync, false);
++
++ /* Release the old eviction fence and create a new one. Because a
++ * fence only goes from unsignaled to signaled, it cannot be reused.
++ * Use the context and mm from the old fence.
++ */
++ new_fence = amdgpu_amdkfd_fence_create(
++ process_info->eviction_fence->base.context,
++ process_info->eviction_fence->mm);
++ if (!new_fence) {
++ pr_err("Failed to create eviction fence\n");
++ ret = -ENOMEM;
++ goto validate_map_fail;
++ }
++ dma_fence_put(&process_info->eviction_fence->base);
++ process_info->eviction_fence = new_fence;
++ *ef = dma_fence_get(&new_fence->base);
++
++ /* Wait for validate to finish and attach new eviction fence */
++ list_for_each_entry(mem, &process_info->kfd_bo_list,
++ validate_list.head)
++ ttm_bo_wait(&mem->bo->tbo, false, false);
++ list_for_each_entry(mem, &process_info->kfd_bo_list,
++ validate_list.head)
++ amdgpu_bo_fence(mem->bo,
++ &process_info->eviction_fence->base, true);
++
++ /* Attach eviction fence to PD / PT BOs */
++ list_for_each_entry(peer_vm, &process_info->vm_list_head,
++ vm_list_node) {
++ struct amdgpu_bo *bo = peer_vm->base.root.base.bo;
++
++ amdgpu_bo_fence(bo, &process_info->eviction_fence->base, true);
++ }
++validate_map_fail:
++ ttm_eu_backoff_reservation(&ctx.ticket, &ctx.list);
++ amdgpu_sync_free(&sync_obj);
++ttm_reserve_fail:
++ mutex_unlock(&process_info->lock);
++ kfree(pd_bo_list);
++ return ret;
++}
++
++int amdgpu_amdkfd_copy_mem_to_mem(struct kgd_dev *kgd, struct kgd_mem *src_mem,
++ uint64_t src_offset, struct kgd_mem *dst_mem,
++ uint64_t dst_offset, uint64_t size,
++ struct dma_fence **f, uint64_t *actual_size)
++{
++ struct amdgpu_device *adev = NULL;
++ struct ttm_mem_reg *src = NULL, *dst = NULL;
++ struct ttm_buffer_object *src_ttm_bo, *dst_ttm_bo;
++ struct drm_mm_node *src_mm, *dst_mm;
++ struct amdgpu_ring *ring;
++ struct ww_acquire_ctx ticket;
++ struct list_head list;
++ struct ttm_validate_buffer resv_list[2];
++ uint64_t src_start, dst_start;
++ uint64_t src_left, dst_left, cur_copy_size, total_copy_size = 0;
++ struct dma_fence *fence = NULL;
++ int r;
++
++ if (!kgd || !src_mem || !dst_mem)
++ return -EINVAL;
++
++ if (actual_size)
++ *actual_size = 0;
++
++ adev = get_amdgpu_device(kgd);
++ src_ttm_bo = &src_mem->bo->tbo;
++ dst_ttm_bo = &dst_mem->bo->tbo;
++ src = &src_ttm_bo->mem;
++ dst = &dst_ttm_bo->mem;
++ src_mm = (struct drm_mm_node *)src->mm_node;
++ dst_mm = (struct drm_mm_node *)dst->mm_node;
++
++ ring = adev->mman.buffer_funcs_ring;
++
++ INIT_LIST_HEAD(&list);
++
++ resv_list[0].bo = src_ttm_bo;
++ resv_list[0].shared = true;
++ resv_list[1].bo = dst_ttm_bo;
++ resv_list[1].shared = true;
++
++ list_add_tail(&resv_list[0].head, &list);
++ list_add_tail(&resv_list[1].head, &list);
++
++ if (!ring->ready) {
++ pr_err("Trying to move memory with ring turned off.\n");
++ return
-EINVAL; ++ } ++ ++ r = ttm_eu_reserve_buffers(&ticket, &list, false, NULL); ++ if (r) { ++ pr_err("Copy buffer failed. Unable to reserve bo (%d)\n", r); ++ return r; ++ } ++ ++ switch (src->mem_type) { ++ case TTM_PL_TT: ++ r = amdgpu_ttm_bind(src_ttm_bo, src); ++ if (r) { ++ DRM_ERROR("Copy failed. Cannot bind to gart\n"); ++ goto copy_fail; ++ } ++ break; ++ case TTM_PL_VRAM: ++ /* VRAM could be scattered. Find the node in which the offset ++ * belongs to ++ */ ++ while (src_offset >= (src_mm->size << PAGE_SHIFT)) { ++ src_offset -= (src_mm->size << PAGE_SHIFT); ++ ++src_mm; ++ } ++ break; ++ default: ++ DRM_ERROR("Unknown placement %d\n", src->mem_type); ++ r = -EINVAL; ++ goto copy_fail; ++ } ++ src_start = src_mm->start << PAGE_SHIFT; ++ src_start += src_ttm_bo->bdev->man[src->mem_type].gpu_offset; ++ src_start += src_offset; ++ src_left = (src_mm->size << PAGE_SHIFT) - src_offset; ++ ++ switch (dst->mem_type) { ++ case TTM_PL_TT: ++ r = amdgpu_ttm_bind(dst_ttm_bo, dst); ++ if (r) { ++ DRM_ERROR("Copy failed. Cannot bind to gart\n"); ++ goto copy_fail; ++ } ++ break; ++ case TTM_PL_VRAM: ++ while (dst_offset >= (dst_mm->size << PAGE_SHIFT)) { ++ dst_offset -= (dst_mm->size << PAGE_SHIFT); ++ ++dst_mm; ++ } ++ break; ++ default: ++ DRM_ERROR("Unknown placement %d\n", dst->mem_type); ++ r = -EINVAL; ++ goto copy_fail; ++ } ++ dst_start = dst_mm->start << PAGE_SHIFT; ++ dst_start += dst_ttm_bo->bdev->man[dst->mem_type].gpu_offset; ++ dst_start += dst_offset; ++ dst_left = (dst_mm->size << PAGE_SHIFT) - dst_offset; ++ ++ do { ++ struct dma_fence *next; ++ ++ /* src_left/dst_left: amount of space left in the current node ++ * Copy minimum of (src_left, dst_left, amount of bytes left to ++ * copy) ++ */ ++ cur_copy_size = min3(src_left, dst_left, ++ (size - total_copy_size)); ++ ++ r = amdgpu_copy_buffer(ring, src_start, dst_start, ++ cur_copy_size, NULL, &next, false, false); ++ if (r) ++ break; ++ ++ /* Just keep the last fence */ ++ dma_fence_put(fence); ++ fence = next; ++ ++ total_copy_size += cur_copy_size; ++ /* Required amount of bytes copied. Done. */ ++ if (total_copy_size >= size) ++ break; ++ ++ /* If end of src or dst node is reached, move to next node */ ++ src_left -= cur_copy_size; ++ if (!src_left) { ++ ++src_mm; ++ src_start = src_mm->start << PAGE_SHIFT; ++ src_start += ++ src_ttm_bo->bdev->man[src->mem_type].gpu_offset; ++ src_left = src_mm->size << PAGE_SHIFT; ++ } else ++ src_start += cur_copy_size; ++ ++ dst_left -= cur_copy_size; ++ if (!dst_left) { ++ ++dst_mm; ++ dst_start = dst_mm->start << PAGE_SHIFT; ++ dst_start += ++ dst_ttm_bo->bdev->man[dst->mem_type].gpu_offset; ++ dst_left = dst_mm->size << PAGE_SHIFT; ++ } else ++ dst_start += cur_copy_size; ++ ++ } while (total_copy_size < size); ++ ++ /* Failure could occur after partial copy. 
So still fill in the amount copied
++ * and the fence.
++ */
++ if (actual_size)
++ *actual_size = total_copy_size;
++
++ if (fence) {
++ amdgpu_bo_fence(src_mem->bo, fence, true);
++ amdgpu_bo_fence(dst_mem->bo, fence, true);
++ }
++
++ if (f)
++ *f = fence;
++
++copy_fail:
++ ttm_eu_backoff_reservation(&ticket, &list);
++ return r;
++}
++
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+index 9c472c5..2be2e05 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+@@ -817,11 +817,7 @@ static struct drm_driver kms_driver = {
+ .driver_features =
+ DRIVER_USE_AGP |
+ DRIVER_HAVE_IRQ | DRIVER_IRQ_SHARED | DRIVER_GEM |
+-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0)
+ DRIVER_PRIME | DRIVER_RENDER | DRIVER_MODESET | DRIVER_SYNCOBJ,
+-#else
+- DRIVER_PRIME | DRIVER_RENDER | DRIVER_MODESET,
+-#endif
+ .load = amdgpu_driver_load_kms,
+ .open = amdgpu_driver_open_kms,
+ .postclose = amdgpu_driver_postclose_kms,
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+old mode 100644
+new mode 100755
+index 283dc1b..f421505
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+@@ -36,6 +36,7 @@
+ #include <drm/drm_cache.h>
+ #include "amdgpu.h"
+ #include "amdgpu_trace.h"
++#include "amdgpu_amdkfd.h"
+
+ static void amdgpu_ttm_bo_destroy(struct ttm_buffer_object *tbo)
+ {
+@@ -46,6 +47,8 @@ static void amdgpu_ttm_bo_destroy(struct ttm_buffer_object *tbo)
+
+ if (bo->tbo.mem.mem_type == AMDGPU_PL_DGMA_IMPORT)
+ kfree(tbo->mem.bus.addr);
++ if (bo->kfd_bo)
++ amdgpu_amdkfd_unreserve_system_memory_limit(bo);
+ amdgpu_bo_kunmap(bo);
+
+ if (bo->gem_base.import_attach)
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+old mode 100644
+new mode 100755
+index 8a91658..f73dba5
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+@@ -89,6 +89,7 @@ struct amdgpu_bo {
+
+ struct ttm_bo_kmap_obj dma_buf_vmap;
+ struct amdgpu_mn *mn;
++ struct kgd_mem *kfd_bo;
+
+ union {
+ struct list_head mn_list;
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+old mode 100644
+new mode 100755
+index 322d2529..af8e544
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+@@ -36,6 +36,7 @@
+ /* some special values for the owner field */
+ #define AMDGPU_FENCE_OWNER_UNDEFINED ((void*)0ul)
+ #define AMDGPU_FENCE_OWNER_VM ((void*)1ul)
++#define AMDGPU_FENCE_OWNER_KFD ((void *)2ul)
+
+ #define AMDGPU_FENCE_FLAG_64BIT (1 << 0)
+ #define AMDGPU_FENCE_FLAG_INT (1 << 1)
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
+old mode 100644
+new mode 100755
+index c586f44..7ee8247
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
+@@ -31,6 +31,7 @@
+ #include <drm/drmP.h>
+ #include "amdgpu.h"
+ #include "amdgpu_trace.h"
++#include "amdgpu_amdkfd.h"
+
+ struct amdgpu_sync_entry {
+ struct hlist_node node;
+@@ -84,11 +85,20 @@ static bool amdgpu_sync_same_dev(struct amdgpu_device *adev,
+ */
+ static void *amdgpu_sync_get_owner(struct dma_fence *f)
+ {
+- struct amd_sched_fence *s_fence = to_amd_sched_fence(f);
++ struct amd_sched_fence *s_fence;
++ struct amdgpu_amdkfd_fence *kfd_fence;
++
++ if (f == NULL)
++ return AMDGPU_FENCE_OWNER_UNDEFINED;
+
++ s_fence = 
to_amd_sched_fence(f);
+ if (s_fence)
+ return s_fence->owner;
+
++ kfd_fence = to_amdgpu_amdkfd_fence(f);
++ if (kfd_fence)
++ return AMDGPU_FENCE_OWNER_KFD;
++
+ return AMDGPU_FENCE_OWNER_UNDEFINED;
+ }
+
+@@ -171,7 +181,8 @@ int amdgpu_sync_fence(struct amdgpu_device *adev, struct amdgpu_sync *sync,
+ * @resv: reservation object with embedded fence
+ * @shared: true if we should only sync to the exclusive fence
+ *
+- * Sync to the fence
++ * Sync to the fence, except if it is a KFD eviction fence and the owner
++ * is not AMDGPU_FENCE_OWNER_UNDEFINED.
+ */
+ int amdgpu_sync_resv(struct amdgpu_device *adev,
+ struct amdgpu_sync *sync,
+@@ -198,11 +209,15 @@ int amdgpu_sync_resv(struct amdgpu_device *adev,
+ for (i = 0; i < flist->shared_count; ++i) {
+ f = rcu_dereference_protected(flist->shared[i],
+ reservation_object_held(resv));
++ fence_owner = amdgpu_sync_get_owner(f);
++ if (fence_owner == AMDGPU_FENCE_OWNER_KFD &&
++ owner != AMDGPU_FENCE_OWNER_UNDEFINED)
++ continue;
++
+ if (amdgpu_sync_same_dev(adev, f)) {
+ /* VM updates are only interesting
+ * for other VM updates and moves.
+ */
+- fence_owner = amdgpu_sync_get_owner(f);
+ if ((owner != AMDGPU_FENCE_OWNER_UNDEFINED) &&
+ (fence_owner != AMDGPU_FENCE_OWNER_UNDEFINED) &&
+ ((owner == AMDGPU_FENCE_OWNER_VM) !=
+@@ -297,6 +312,31 @@ struct dma_fence *amdgpu_sync_get_fence(struct amdgpu_sync *sync)
+ return NULL;
+ }
+
++int amdgpu_sync_clone(struct amdgpu_device *adev,
++ struct amdgpu_sync *source,
++ struct amdgpu_sync *clone)
++{
++ struct amdgpu_sync_entry *e;
++ struct hlist_node *tmp;
++ struct dma_fence *f;
++ int i, r;
++
++ hash_for_each_safe(source->fences, i, tmp, e, node) {
++
++ f = e->fence;
++ if (!dma_fence_is_signaled(f)) {
++ r = amdgpu_sync_fence(adev, clone, f);
++ if (r)
++ return r;
++ } else {
++ hash_del(&e->node);
++ dma_fence_put(f);
++ kmem_cache_free(amdgpu_sync_slab, e);
++ }
++ }
++ return 0;
++}
++
+ int amdgpu_sync_wait(struct amdgpu_sync *sync, bool intr)
+ {
+ struct amdgpu_sync_entry *e;
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h
+old mode 100644
+new mode 100755
+index dc76879..8e29bc7
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h
+@@ -49,6 +49,8 @@ int amdgpu_sync_resv(struct amdgpu_device *adev,
+ struct dma_fence *amdgpu_sync_peek_fence(struct amdgpu_sync *sync,
+ struct amdgpu_ring *ring);
+ struct dma_fence *amdgpu_sync_get_fence(struct amdgpu_sync *sync);
++int amdgpu_sync_clone(struct amdgpu_device *adev, struct amdgpu_sync *source,
++ struct amdgpu_sync *clone);
+ int amdgpu_sync_wait(struct amdgpu_sync *sync, bool intr);
+ void amdgpu_sync_free(struct amdgpu_sync *sync);
+ int amdgpu_sync_init(void);
+diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h
+old mode 100644
+new mode 100755
+index 9f34fab..f22f7a8
+--- a/drivers/gpu/drm/amd/amdgpu/soc15d.h
++++ b/drivers/gpu/drm/amd/amdgpu/soc15d.h
+@@ -272,6 +272,7 @@
+ # define PACKET3_INVALIDATE_TLBS_DST_SEL(x) ((x) << 0)
+ # define PACKET3_INVALIDATE_TLBS_ALL_HUB(x) ((x) << 4)
+ # define PACKET3_INVALIDATE_TLBS_PASID(x) ((x) << 5)
++# define PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(x) ((x) << 29)
+#define PACKET3_SET_RESOURCES 0xA0
+ /* 1. header
+ * 2. 
CONTROL +diff --git a/drivers/gpu/drm/amd/amdgpu/vid.h b/drivers/gpu/drm/amd/amdgpu/vid.h +old mode 100644 +new mode 100755 +index 323e21c..d09592a +--- a/drivers/gpu/drm/amd/amdgpu/vid.h ++++ b/drivers/gpu/drm/amd/amdgpu/vid.h +@@ -27,6 +27,8 @@ + #define SDMA1_REGISTER_OFFSET 0x200 /* not a register */ + #define SDMA_MAX_INSTANCE 2 + ++#define KFD_VI_SDMA_QUEUE_OFFSET 0x80 /* not a register */ ++ + /* crtc instance offsets */ + #define CRTC0_REGISTER_OFFSET (0x1b9c - 0x1b9c) + #define CRTC1_REGISTER_OFFSET (0x1d9c - 0x1b9c) +diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile +old mode 100644 +new mode 100755 +index f55a0f8..dba08ec +--- a/drivers/gpu/drm/amd/amdkfd/Makefile ++++ b/drivers/gpu/drm/amd/amdkfd/Makefile +@@ -26,5 +26,3 @@ amdkfd-$(CONFIG_DEBUG_FS) += kfd_debugfs.o + + obj-$(CONFIG_HSA_AMD) += amdkfd.o + +-AMDKFD_FULL_PATH = $(src) +-include $(AMDKFD_FULL_PATH)/backport/Makefile +diff --git a/drivers/gpu/drm/amd/amdkfd/backport/backport.h b/drivers/gpu/drm/amd/amdkfd/backport/backport.h +index 8b13b98..e1f8c1d 100644 +--- a/drivers/gpu/drm/amd/amdkfd/backport/backport.h ++++ b/drivers/gpu/drm/amd/amdkfd/backport/backport.h +@@ -2,12 +2,5 @@ + #define AMDKFD_BACKPORT_H + + #include <linux/version.h> +-#if defined(BUILD_AS_DKMS) +-#include <kcl/kcl_amd_asic_type.h> +-#endif +-#include <kcl/kcl_compat.h> +-#include <kcl/kcl_pci.h> +-#include <kcl/kcl_mn.h> +-#include <kcl/kcl_fence.h> + + #endif +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +index b2795af..207a05e 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +@@ -25,9 +25,7 @@ + #include <linux/err.h> + #include <linux/fs.h> + #include <linux/sched.h> +-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) + #include <linux/sched/mm.h> +-#endif + #include <linux/slab.h> + #include <linux/uaccess.h> + #include <linux/compat.h> +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +index 5f597a6..4e94081 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +@@ -811,11 +811,7 @@ static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size, + */ + pgdat = NODE_DATA(numa_node_id); + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) +-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) +- mem_in_bytes += pgdat->node_zones[zone_type].present_pages; +-#else + mem_in_bytes += pgdat->node_zones[zone_type].managed_pages; +-#endif + mem_in_bytes <<= PAGE_SHIFT; + + sub_type_hdr->length_low = lower_32_bits(mem_in_bytes); +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c +index c6b447d..6b3a1fa 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c +@@ -326,11 +326,6 @@ static void kfd_gtt_sa_fini(struct kfd_dev *kfd); + + static int kfd_resume(struct kfd_dev *kfd); + +-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) +-void kfd_init_processes_srcu(void); +-void kfd_cleanup_processes_srcu(void); +-#endif +- + static const struct kfd_device_info *lookup_device_info(unsigned short did) + { + size_t i; +@@ -633,10 +628,6 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, + + kfd_ib_mem_init(kfd); + +-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) +- kfd_init_processes_srcu(); +-#endif +- + if (kfd_resume(kfd)) { + dev_err(kfd_device, "Error resuming kfd\n"); + goto kfd_resume_error; +@@ -678,9 +669,6 @@ void 
kgd2kfd_device_exit(struct kfd_dev *kfd) + { + if (kfd->init_complete) { + kgd2kfd_suspend(kfd); +-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) +- kfd_cleanup_processes_srcu(); +-#endif + kfd_cwsr_fini(kfd); + device_queue_manager_uninit(kfd->dqm); + kfd_interrupt_exit(kfd); +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c +index 8debe6e..7eacf42 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c +@@ -24,10 +24,8 @@ + #include <linux/slab.h> + #include <linux/types.h> + #include <linux/uaccess.h> +-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) + #include <linux/sched/mm.h> + #include <linux/sched/signal.h> +-#endif + #include <linux/mman.h> + #include <linux/memory.h> + #include "kfd_priv.h" +@@ -269,13 +267,7 @@ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id) + { + struct kfd_event *ev; + +-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) +- struct hlist_node *node; +- +- hash_for_each_possible(p->events, ev, node, events, id) +-#else + hash_for_each_possible(p->events, ev, events, id) +-#endif + if (ev->event_id == id) + return ev; + +@@ -420,13 +412,7 @@ static void destroy_events(struct kfd_process *p) + struct hlist_node *tmp; + unsigned int hash_bkt; + +-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) +- struct hlist_node *node; +- +- hash_for_each_safe(p->events, hash_bkt, node, tmp, ev, events) +-#else + hash_for_each_safe(p->events, hash_bkt, tmp, ev, events) +-#endif + destroy_event(p, ev); + } + +@@ -972,16 +958,9 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, + int bkt; + bool send_signal = true; + +-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) +- struct hlist_node *node; +- ev_data = (struct kfd_hsa_memory_exception_data *) event_data; +- +- hash_for_each(p->events, bkt, node, ev, events) +-#else + ev_data = (struct kfd_hsa_memory_exception_data *) event_data; + + hash_for_each(p->events, bkt, ev, events) +-#endif + if (ev->type == type) { + send_signal = false; + dev_dbg(kfd_device, +@@ -1114,9 +1093,6 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, + int bkt; + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct kfd_hsa_memory_exception_data memory_exception_data; +-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) +- struct hlist_node *node; +-#endif + + if (!p) + return; /* Presumably process exited. 
*/ +@@ -1136,11 +1112,7 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, + } + mutex_lock(&p->event_mutex); + +-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) +- hash_for_each(p->events, bkt, node, ev, events) { +-#else + hash_for_each(p->events, bkt, ev, events) { +-#endif + if (ev->type == KFD_EVENT_TYPE_MEMORY) { + ev->memory_exception_data = memory_exception_data; + set_event(ev); +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +index 4f4392a..47dcf4a 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +@@ -61,11 +61,7 @@ int kfd_interrupt_init(struct kfd_dev *kfd) + return r; + } + +-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) +- kfd->ih_wq = create_rt_workqueue("KFD IH"); +-#else + kfd->ih_wq = alloc_workqueue("KFD IH", WQ_HIGHPRI, 1); +-#endif + spin_lock_init(&kfd->interrupt_lock); + + INIT_WORK(&kfd->interrupt_work, interrupt_wq); +@@ -115,15 +111,9 @@ bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry) + count = kfifo_in(&kfd->ih_fifo, ih_ring_entry, + kfd->device_info->ih_ring_entry_size); + if (count != kfd->device_info->ih_ring_entry_size) { +-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) +- dev_err(kfd_chardev(), +- "Interrupt ring overflow, dropping interrupt %d\n", +- count); +-#else + dev_err_ratelimited(kfd_chardev(), + "Interrupt ring overflow, dropping interrupt %d\n", + count); +-#endif + return false; + } + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c +index c6be3ba..e67eb9f 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c +@@ -192,21 +192,13 @@ int kfd_ipc_import_handle(struct kfd_dev *dev, struct kfd_process *p, + { + int r; + struct kfd_ipc_obj *entry, *found = NULL; +-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) +- struct hlist_node *tmp_node; +-#endif + + mutex_lock(&kfd_ipc_handles.lock); + /* Convert the user provided handle to hash key and search only in that + * bucket + */ +-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) +- hlist_for_each_entry(entry, tmp_node, +- &kfd_ipc_handles.handles[HANDLE_TO_KEY(share_handle)], node) { +-#else + hlist_for_each_entry(entry, + &kfd_ipc_handles.handles[HANDLE_TO_KEY(share_handle)], node) { +-#endif + if (!memcmp(entry->share_handle, share_handle, + sizeof(entry->share_handle))) { + found = entry; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +index 64bf653..5724d33 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +@@ -465,19 +465,15 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, + + static int debugfs_show_mqd(struct seq_file *m, void *data) + { +-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct cik_mqd), false); +-#endif + return 0; + } + + static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) + { +-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct cik_sdma_rlc_registers), false); +-#endif + return 0; + } + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +index 0713cac..6c302d2 100644 +--- 
a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +@@ -455,19 +455,15 @@ static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, + + static int debugfs_show_mqd(struct seq_file *m, void *data) + { +-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct v9_mqd), false); +-#endif + return 0; + } + + static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) + { +-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct v9_sdma_mqd), false); +-#endif + return 0; + } + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +index a5ba6f7..5c26e5a 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +@@ -468,19 +468,15 @@ static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, + + static int debugfs_show_mqd(struct seq_file *m, void *data) + { +-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct vi_mqd), false); +-#endif + return 0; + } + + static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) + { +-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct vi_sdma_mqd), false); +-#endif + return 0; + } + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +index 9fcb6fb..7cca7b4 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +@@ -410,10 +410,8 @@ int pm_debugfs_runlist(struct seq_file *m, void *data) + return 0; + } + +-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + pm->ib_buffer_obj->cpu_ptr, pm->ib_size_bytes, false); +-#endif + + return 0; + } +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +old mode 100644 +new mode 100755 +index ebe311e..88fdfc9 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +@@ -36,11 +36,7 @@ + #include <linux/interval_tree.h> + #include <linux/seq_file.h> + #include <linux/kref.h> +-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) +-#include <linux/kfifo-new.h> +-#else + #include <linux/kfifo.h> +-#endif + #include <kgd_kfd_interface.h> + + #include <drm/amd_rdma.h> +@@ -727,7 +723,7 @@ struct kfd_process { + size_t signal_event_count; + bool signal_event_limit_reached; + +- struct rb_root bo_interval_tree; ++ struct rb_root_cached bo_interval_tree; + + /* Information used for memory eviction */ + void *process_info; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c +index b458995..c798fa3 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c +@@ -23,10 +23,8 @@ + #include <linux/mutex.h> + #include <linux/log2.h> + #include <linux/sched.h> +-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) + #include <linux/sched/mm.h> + #include <linux/sched/task.h> +-#endif + #include <linux/slab.h> + #if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) + #include <linux/amd-iommu.h> +@@ 
-50,20 +48,7 @@ struct mm_struct; + static DEFINE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE); + static DEFINE_MUTEX(kfd_processes_mutex); + +-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) +-static struct srcu_struct kfd_processes_srcu; +-void kfd_init_processes_srcu(void) +-{ +- init_srcu_struct(&kfd_processes_srcu); +-} +- +-void kfd_cleanup_processes_srcu(void) +-{ +- cleanup_srcu_struct(&kfd_processes_srcu); +-} +-#else + DEFINE_STATIC_SRCU(kfd_processes_srcu); +-#endif + + static struct workqueue_struct *kfd_process_wq; + +@@ -81,11 +66,7 @@ static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep); + void kfd_process_create_wq(void) + { + if (!kfd_process_wq) +-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) +- kfd_process_wq = create_workqueue("kfd_process_wq"); +-#else + kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0); +-#endif + } + + void kfd_process_destroy_wq(void) +@@ -273,15 +254,8 @@ static struct kfd_process *find_process_by_mm(const struct mm_struct *mm) + { + struct kfd_process *process; + +-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) +- struct hlist_node *node; +- +- hash_for_each_possible_rcu(kfd_processes_table, process, node, +- kfd_processes, (uintptr_t)mm) +-#else + hash_for_each_possible_rcu(kfd_processes_table, process, + kfd_processes, (uintptr_t)mm) +-#endif + if (process->mm == mm) + return process; + +@@ -586,7 +560,7 @@ static struct kfd_process *create_process(const struct task_struct *thread, + if (!process) + goto err_alloc_process; + +- process->bo_interval_tree = RB_ROOT; ++ process->bo_interval_tree = RB_ROOT_CACHED; + + process->pasid = kfd_pasid_alloc(); + if (process->pasid == 0) +@@ -1026,13 +1000,7 @@ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) + + int idx = srcu_read_lock(&kfd_processes_srcu); + +-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) +- struct hlist_node *node; +- +- hash_for_each_rcu(kfd_processes_table, temp, node, p, kfd_processes) { +-#else + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { +-#endif + if (p->pasid == pasid) { + kref_get(&p->ref); + ret_p = p; +@@ -1051,13 +1019,7 @@ void kfd_suspend_all_processes(void) + unsigned int temp; + int idx = srcu_read_lock(&kfd_processes_srcu); + +-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) +- struct hlist_node *node; +- +- hash_for_each_rcu(kfd_processes_table, temp, node, p, kfd_processes) { +-#else + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { +-#endif + if (cancel_delayed_work_sync(&p->eviction_work.dwork)) + dma_fence_put(p->eviction_work.quiesce_fence); + cancel_delayed_work_sync(&p->restore_work); +@@ -1077,13 +1039,7 @@ int kfd_resume_all_processes(void) + unsigned int temp; + int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu); + +-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) +- struct hlist_node *node; +- +- hash_for_each_rcu(kfd_processes_table, temp, node, p, kfd_processes) { +-#else + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { +-#endif + if (!schedule_delayed_work(&p->restore_work, 0)) { + pr_err("Restore process %d failed during resume\n", + p->pasid); +@@ -1171,13 +1127,7 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data) + + int idx = srcu_read_lock(&kfd_processes_srcu); + +-#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) +- struct hlist_node *node; +- +- hash_for_each_rcu(kfd_processes_table, temp, node, p, kfd_processes) { +-#else + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { +-#endif + 
seq_printf(m, "Process %d PASID %d:\n", + p->lead_thread->tgid, p->pasid); + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +index ffd8e0f..d08e3de 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +@@ -122,9 +122,7 @@ static void kfd_release_topology_device(struct kfd_topology_device *dev) + struct kfd_mem_properties *mem; + struct kfd_cache_properties *cache; + struct kfd_iolink_properties *iolink; +-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) + struct kfd_perf_properties *perf; +-#endif + + list_del(&dev->list); + +@@ -149,14 +147,12 @@ static void kfd_release_topology_device(struct kfd_topology_device *dev) + kfree(iolink); + } + +-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) + while (dev->perf_props.next != &dev->perf_props) { + perf = container_of(dev->perf_props.next, + struct kfd_perf_properties, list); + list_del(&perf->list); + kfree(perf); + } +-#endif + + kfree(dev); + } +@@ -192,9 +188,7 @@ struct kfd_topology_device *kfd_create_topology_device( + INIT_LIST_HEAD(&dev->mem_props); + INIT_LIST_HEAD(&dev->cache_props); + INIT_LIST_HEAD(&dev->io_link_props); +-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) + INIT_LIST_HEAD(&dev->perf_props); +-#endif + + list_add_tail(&dev->list, device_list); + +@@ -374,7 +368,6 @@ static struct kobj_type cache_type = { + .sysfs_ops = &cache_ops, + }; + +-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) + /****** Sysfs of Performance Counters ******/ + + struct kfd_perf_attr { +@@ -407,7 +400,6 @@ static struct kfd_perf_attr perf_attr_iommu[] = { + KFD_PERF_DESC(counter_ids, 0), + }; + /****************************************/ +-#endif + + static ssize_t node_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +@@ -546,9 +538,7 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) + struct kfd_iolink_properties *iolink; + struct kfd_cache_properties *cache; + struct kfd_mem_properties *mem; +-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) + struct kfd_perf_properties *perf; +-#endif + + if (dev->kobj_iolink) { + list_for_each_entry(iolink, &dev->io_link_props, list) +@@ -590,7 +580,6 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) + dev->kobj_mem = NULL; + } + +-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) + if (dev->kobj_perf) { + list_for_each_entry(perf, &dev->perf_props, list) { + kfree(perf->attr_group); +@@ -600,7 +589,6 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) + kobject_put(dev->kobj_perf); + dev->kobj_perf = NULL; + } +-#endif + + if (dev->kobj_node) { + sysfs_remove_file(dev->kobj_node, &dev->attr_gpuid); +@@ -618,11 +606,9 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, + struct kfd_iolink_properties *iolink; + struct kfd_cache_properties *cache; + struct kfd_mem_properties *mem; +-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) + struct kfd_perf_properties *perf; + uint32_t num_attrs; + struct attribute **attrs; +-#endif + int ret; + uint32_t i; + +@@ -653,11 +639,9 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, + if (!dev->kobj_iolink) + return -ENOMEM; + +-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) + dev->kobj_perf = kobject_create_and_add("perf", dev->kobj_node); + if (!dev->kobj_perf) + return -ENOMEM; +-#endif + + /* + * Creating sysfs files for node properties +@@ -749,7 +733,6 @@ static int kfd_build_sysfs_node_entry(struct 
kfd_topology_device *dev, + i++; + } + +-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) + /* All hardware blocks have the same number of attributes. */ + num_attrs = sizeof(perf_attr_iommu)/sizeof(struct kfd_perf_attr); + list_for_each_entry(perf, &dev->perf_props, list) { +@@ -775,7 +758,6 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, + if (ret < 0) + return ret; + } +-#endif + + return 0; + } +@@ -942,7 +924,6 @@ static void find_system_memory(const struct dmi_header *dm, + } + } + +-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) + /* + * Performance counters information is not part of CRAT but we would like to + * put them in the sysfs under topology directory for Thunk to get the data. +@@ -966,7 +947,6 @@ static int kfd_add_perf_to_topology(struct kfd_topology_device *kdev) + + return 0; + } +-#endif + + /* kfd_add_non_crat_information - Add information that is not currently + * defined in CRAT but is necessary for KFD topology +@@ -1074,11 +1054,9 @@ int kfd_topology_init(void) + } + } + +-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) + kdev = list_first_entry(&temp_topology_device_list, + struct kfd_topology_device, list); + kfd_add_perf_to_topology(kdev); +-#endif + + down_write(&topology_lock); + kfd_topology_update_device_list(&temp_topology_device_list, +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +index b59b32c..f22d420 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +@@ -141,14 +141,12 @@ struct kfd_iolink_properties { + struct attribute attr; + }; + +-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) + struct kfd_perf_properties { + struct list_head list; + char block_name[16]; + uint32_t max_concurrent; + struct attribute_group *attr_group; + }; +-#endif + + struct kfd_topology_device { + struct list_head list; +@@ -160,17 +158,13 @@ struct kfd_topology_device { + struct list_head cache_props; + uint32_t io_link_count; + struct list_head io_link_props; +-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) + struct list_head perf_props; +-#endif + struct kfd_dev *gpu; + struct kobject *kobj_node; + struct kobject *kobj_mem; + struct kobject *kobj_cache; + struct kobject *kobj_iolink; +-#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) + struct kobject *kobj_perf; +-#endif + struct attribute attr_gpuid; + struct attribute attr_name; + struct attribute attr_props; +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +index 2780641..977b21b 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +@@ -707,11 +707,7 @@ static int dm_display_resume(struct drm_device *ddev) + + err: + DRM_ERROR("Restoring old state failed with %i\n", ret); +-#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) +- drm_atomic_state_free(state); +-#else + drm_atomic_state_put(state); +-#endif + + return ret; + } +diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h +old mode 100644 +new mode 100755 +index 36f3766..b6cf2d5 +--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h ++++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h +@@ -30,6 +30,7 @@ + + #include <linux/types.h> + #include <linux/bitmap.h> ++#include <linux/dma-buf.h> + + struct pci_dev; + +@@ -40,6 +41,46 @@ struct kfd_dev; + struct kgd_dev; + + struct kgd_mem; ++struct kfd_process_device; ++struct amdgpu_bo; ++ 
++enum kfd_preempt_type { ++ KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN = 0, ++ KFD_PREEMPT_TYPE_WAVEFRONT_RESET, ++}; ++ ++struct kfd_vm_fault_info { ++ uint64_t page_addr; ++ uint32_t vmid; ++ uint32_t mc_id; ++ uint32_t status; ++ bool prot_valid; ++ bool prot_read; ++ bool prot_write; ++ bool prot_exec; ++}; ++ ++struct kfd_cu_info { ++ uint32_t num_shader_engines; ++ uint32_t num_shader_arrays_per_engine; ++ uint32_t num_cu_per_sh; ++ uint32_t cu_active_number; ++ uint32_t cu_ao_mask; ++ uint32_t simd_per_cu; ++ uint32_t max_waves_per_simd; ++ uint32_t wave_front_size; ++ uint32_t max_scratch_slots_per_cu; ++ uint32_t lds_size; ++ uint32_t cu_bitmap[4][4]; ++}; ++ ++/* For getting GPU local memory information from KGD */ ++struct kfd_local_mem_info { ++ uint64_t local_mem_size_private; ++ uint64_t local_mem_size_public; ++ uint32_t vram_width; ++ uint32_t mem_clk_max; ++}; + + enum kgd_memory_pool { + KGD_POOL_SYSTEM_CACHEABLE = 1, +@@ -72,6 +113,21 @@ struct kgd2kfd_shared_resources { + /* Bit n == 1 means Queue n is available for KFD */ + DECLARE_BITMAP(queue_bitmap, KGD_MAX_QUEUES); + ++ /* Doorbell assignments (SOC15 and later chips only). Only ++ * specific doorbells are routed to each SDMA engine. Others ++ * are routed to IH and VCN. They are not usable by the CP. ++ * ++ * Any doorbell number D that satisfies the following condition ++ * is reserved: (D & reserved_doorbell_mask) == reserved_doorbell_val ++ * ++ * KFD currently uses 1024 (= 0x3ff) doorbells per process. If ++ * doorbells 0x0f0-0x0f7 and 0x2f0-0x2f7 are reserved, that means ++ * mask would be set to 0x1f8 and val set to 0x0f0. ++ */ ++ unsigned int sdma_doorbell[2][2]; ++ unsigned int reserved_doorbell_mask; ++ unsigned int reserved_doorbell_val; ++ + /* Base address of doorbell aperture. */ + phys_addr_t doorbell_physical_address; + +@@ -80,8 +136,41 @@ struct kgd2kfd_shared_resources { + /* Number of bytes at start of aperture reserved for KGD. */ + size_t doorbell_start_offset; ++ ++ /* GPUVM address space size in bytes */ ++ uint64_t gpuvm_size; + }; + ++struct tile_config { ++ uint32_t *tile_config_ptr; ++ uint32_t *macro_tile_config_ptr; ++ uint32_t num_tile_configs; ++ uint32_t num_macro_tile_configs; ++ ++ uint32_t gb_addr_config; ++ uint32_t num_banks; ++ uint32_t num_ranks; ++}; ++ ++/* ++ * Allocation flag domains currently only VRAM and GTT domain supported ++ */ ++#define ALLOC_MEM_FLAGS_VRAM (1 << 0) ++#define ALLOC_MEM_FLAGS_GTT (1 << 1) ++#define ALLOC_MEM_FLAGS_USERPTR (1 << 2) ++#define ALLOC_MEM_FLAGS_DOORBELL (1 << 3) ++ ++/* ++ * Allocation flags attributes/access options. ++ */ ++#define ALLOC_MEM_FLAGS_NONPAGED (1 << 31) ++#define ALLOC_MEM_FLAGS_READONLY (1 << 30) ++#define ALLOC_MEM_FLAGS_PUBLIC (1 << 29) ++#define ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28) ++#define ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27) ++#define ALLOC_MEM_FLAGS_EXECUTE_ACCESS (1 << 26) ++#define ALLOC_MEM_FLAGS_COHERENT (1 << 25) ++ + /** + * struct kfd2kgd_calls + * +@@ -90,7 +179,7 @@ struct kgd2kfd_shared_resources { + * + * @free_gtt_mem: Frees a buffer that was allocated on the gart aperture + * +- * @get_vmem_size: Retrieves (physical) size of VRAM ++ * @get_local_mem_info: Retrieves information about GPU local memory + * + * @get_gpu_clock_counter: Retrieves GPU clock counter + * +@@ -112,6 +201,12 @@ struct kgd2kfd_shared_resources { + * @hqd_sdma_load: Loads the SDMA mqd structure to a H/W SDMA hqd slot. + * used only for no HWS mode. + * ++ * @hqd_dump: Dumps CPC HQD registers to an array of address-value pairs. 
++ * Array is allocated with kmalloc, needs to be freed with kfree by caller. ++ * ++ * @hqd_sdma_dump: Dumps SDMA HQD registers to an array of address-value pairs. ++ * Array is allocated with kmalloc, needs to be freed with kfree by caller. ++ * + * @hqd_is_occupies: Checks if a hqd slot is occupied. + * + * @hqd_destroy: Destructs and preempts the queue assigned to that hqd slot. + * +@@ -121,8 +216,34 @@ struct kgd2kfd_shared_resources { + * @hqd_sdma_destroy: Destructs and preempts the SDMA queue assigned to that + * SDMA hqd slot. + * ++ * @map_memory_to_gpu: Allocates and pins BO, PD and all related PTs ++ * ++ * @unmap_memory_to_gpu: Releases and unpins BO, PD and all related PTs ++ * + * @get_fw_version: Returns FW versions from the header + * ++ * @set_num_of_requests: Sets number of Peripheral Page Request (PPR) sent to ++ * IOMMU when address translation failed ++ * ++ * @get_cu_info: Retrieves activated cu info ++ * ++ * @get_dmabuf_info: Returns information about a dmabuf if it was ++ * created by the GPU driver ++ * ++ * @import_dmabuf: Imports a DMA buffer, creating a new kgd_mem object ++ * Supports only DMA buffers created by GPU driver on the same GPU ++ * ++ * @export_dmabuf: Exports a KFD BO for sharing with other processes ++ * ++ * @submit_ib: Submits an IB to the engine specified by inserting the IB to ++ * the corresponding ring (ring type). ++ * ++ * @restore_process_bos: Restores all BOs that belong to the process ++ * ++ * @copy_mem_to_mem: Copies size bytes from source BO to destination BO ++ * ++ * @get_vram_usage: Returns current VRAM usage ++ * + * This structure contains function pointers to services that the kgd driver + * provides to amdkfd driver. + * +@@ -134,11 +255,23 @@ struct kfd2kgd_calls { + + void (*free_gtt_mem)(struct kgd_dev *kgd, void *mem_obj); + +- uint64_t (*get_vmem_size)(struct kgd_dev *kgd); ++ void(*get_local_mem_info)(struct kgd_dev *kgd, ++ struct kfd_local_mem_info *mem_info); + uint64_t (*get_gpu_clock_counter)(struct kgd_dev *kgd); + + uint32_t (*get_max_engine_clock_in_mhz)(struct kgd_dev *kgd); + ++ int (*create_process_vm)(struct kgd_dev *kgd, void **vm, ++ void **process_info, struct dma_fence **ef); ++ void (*destroy_process_vm)(struct kgd_dev *kgd, void *vm); ++ ++ int (*create_process_gpumem)(struct kgd_dev *kgd, uint64_t va, size_t size, void *vm, struct kgd_mem **mem); ++ void (*destroy_process_gpumem)(struct kgd_dev *kgd, struct kgd_mem *mem); ++ ++ uint32_t (*get_process_page_dir)(void *vm); ++ ++ int (*open_graphic_handle)(struct kgd_dev *kgd, uint64_t va, void *vm, int fd, uint32_t handle, struct kgd_mem **mem); ++ + /* Register access functions */ + void (*program_sh_mem_settings)(struct kgd_dev *kgd, uint32_t vmid, + uint32_t sh_mem_config, uint32_t sh_mem_ape1_base, +@@ -151,16 +284,28 @@ struct kfd2kgd_calls { + uint32_t hpd_size, uint64_t hpd_gpu_addr); + + int (*init_interrupts)(struct kgd_dev *kgd, uint32_t pipe_id); ++ + + int (*hqd_load)(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, +- uint32_t queue_id, uint32_t __user *wptr); ++ uint32_t queue_id, uint32_t __user *wptr, ++ uint32_t wptr_shift, uint32_t wptr_mask, ++ struct mm_struct *mm); ++ ++ int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd, ++ uint32_t __user *wptr, struct mm_struct *mm); ++ ++ int (*hqd_dump)(struct kgd_dev *kgd, ++ uint32_t pipe_id, uint32_t queue_id, ++ uint32_t (**dump)[2], uint32_t *n_regs); + +- int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd); ++ int (*hqd_sdma_dump)(struct kgd_dev *kgd, ++ uint32_t engine_id, uint32_t 
queue_id, ++ uint32_t (**dump)[2], uint32_t *n_regs); + + bool (*hqd_is_occupied)(struct kgd_dev *kgd, uint64_t queue_address, + uint32_t pipe_id, uint32_t queue_id); + +- int (*hqd_destroy)(struct kgd_dev *kgd, uint32_t reset_type, ++ int (*hqd_destroy)(struct kgd_dev *kgd, void *mqd, uint32_t reset_type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id); + +@@ -168,7 +313,7 @@ struct kfd2kgd_calls { + + int (*hqd_sdma_destroy)(struct kgd_dev *kgd, void *mqd, + unsigned int timeout); +- ++ + int (*address_watch_disable)(struct kgd_dev *kgd); + int (*address_watch_execute)(struct kgd_dev *kgd, + unsigned int watch_point_id, +@@ -187,11 +332,72 @@ struct kfd2kgd_calls { + uint16_t (*get_atc_vmid_pasid_mapping_pasid)( + struct kgd_dev *kgd, + uint8_t vmid); ++ uint32_t (*read_vmid_from_vmfault_reg)(struct kgd_dev *kgd); + void (*write_vmid_invalidate_request)(struct kgd_dev *kgd, + uint8_t vmid); + ++ int (*invalidate_tlbs)(struct kgd_dev *kgd, uint16_t pasid); ++ ++ int (*sync_memory)(struct kgd_dev *kgd, struct kgd_mem *mem, bool intr); ++ ++ int (*alloc_memory_of_gpu)(struct kgd_dev *kgd, uint64_t va, ++ uint64_t size, void *vm, ++ struct kgd_mem **mem, uint64_t *offset, ++ uint32_t flags); ++ int (*free_memory_of_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem, ++ void *vm); ++ int (*map_memory_to_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem, ++ void *vm); ++ int (*unmap_memory_to_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem, ++ void *vm); ++ + uint16_t (*get_fw_version)(struct kgd_dev *kgd, + enum kgd_engine_type type); ++ ++ void (*set_num_of_requests)(struct kgd_dev *kgd, ++ uint8_t num_of_requests); ++ int (*alloc_memory_of_scratch)(struct kgd_dev *kgd, ++ uint64_t va, uint32_t vmid); ++ int (*write_config_static_mem)(struct kgd_dev *kgd, bool swizzle_enable, ++ uint8_t element_size, uint8_t index_stride, uint8_t mtype); ++ void (*get_cu_info)(struct kgd_dev *kgd, ++ struct kfd_cu_info *cu_info); ++ int (*mmap_bo)(struct kgd_dev *kgd, struct vm_area_struct *vma); ++ int (*map_gtt_bo_to_kernel)(struct kgd_dev *kgd, ++ struct kgd_mem *mem, void **kptr); ++ void (*set_vm_context_page_table_base)(struct kgd_dev *kgd, uint32_t vmid, ++ uint32_t page_table_base); ++ ++ int (*pin_get_sg_table_bo)(struct kgd_dev *kgd, ++ struct kgd_mem *mem, uint64_t offset, ++ uint64_t size, struct sg_table **ret_sg); ++ void (*unpin_put_sg_table_bo)(struct kgd_mem *mem, ++ struct sg_table *sg); ++ ++ int (*get_dmabuf_info)(struct kgd_dev *kgd, int dma_buf_fd, ++ struct kgd_dev **dma_buf_kgd, uint64_t *bo_size, ++ void *metadata_buffer, size_t buffer_size, ++ uint32_t *metadata_size, uint32_t *flags); ++ int (*import_dmabuf)(struct kgd_dev *kgd, struct dma_buf *dmabuf, ++ uint64_t va, void *vm, struct kgd_mem **mem, ++ uint64_t *size, uint64_t *mmap_offset); ++ int (*export_dmabuf)(struct kgd_dev *kgd, void *vm, struct kgd_mem *mem, ++ struct dma_buf **dmabuf); ++ ++ int (*get_vm_fault_info)(struct kgd_dev *kgd, ++ struct kfd_vm_fault_info *info); ++ int (*submit_ib)(struct kgd_dev *kgd, enum kgd_engine_type engine, ++ uint32_t vmid, uint64_t gpu_addr, ++ uint32_t *ib_cmd, uint32_t ib_len); ++ int (*get_tile_config)(struct kgd_dev *kgd, ++ struct tile_config *config); ++ ++ int (*restore_process_bos)(void *process_info, struct dma_fence **ef); ++ int (*copy_mem_to_mem)(struct kgd_dev *kgd, struct kgd_mem *src_mem, ++ uint64_t src_offset, struct kgd_mem *dst_mem, ++ uint64_t dest_offset, uint64_t size, ++ struct dma_fence **f, uint64_t *actual_size); ++ uint64_t (*get_vram_usage)(struct kgd_dev 
*kgd); + }; + + /** +@@ -210,6 +416,13 @@ struct kfd2kgd_calls { + * + * @resume: Notifies amdkfd about a resume action done to a kgd device + * ++ * @quiesce_mm: Quiesce all user queue access to specified MM address space ++ * ++ * @resume_mm: Resume user queue access to specified MM address space ++ * ++ * @schedule_evict_and_restore_process: Schedules work queue that will prepare ++ * for safe eviction of KFD BOs that belong to the specified process. ++ * + * This structure contains function callback pointers so the kgd driver + * will notify to the amdkfd about certain status changes. + * +@@ -224,9 +437,13 @@ struct kgd2kfd_calls { + void (*interrupt)(struct kfd_dev *kfd, const void *ih_ring_entry); + void (*suspend)(struct kfd_dev *kfd); + int (*resume)(struct kfd_dev *kfd); ++ int (*quiesce_mm)(struct kfd_dev *kfd, struct mm_struct *mm); ++ int (*resume_mm)(struct kfd_dev *kfd, struct mm_struct *mm); ++ int (*schedule_evict_and_restore_process)(struct mm_struct *mm, ++ struct dma_fence *fence); + }; + + int kgd2kfd_init(unsigned interface_version, + const struct kgd2kfd_calls **g2f); + +-#endif /* KGD_KFD_INTERFACE_H_INCLUDED */ ++#endif /* KGD_KFD_INTERFACE_H_INCLUDED */ +diff --git a/drivers/gpu/drm/amd/include/v9_structs.h b/drivers/gpu/drm/amd/include/v9_structs.h +old mode 100644 +new mode 100755 +index 2fb25ab..ceaf493 +--- a/drivers/gpu/drm/amd/include/v9_structs.h ++++ b/drivers/gpu/drm/amd/include/v9_structs.h +@@ -29,10 +29,10 @@ struct v9_sdma_mqd { + uint32_t sdmax_rlcx_rb_base; + uint32_t sdmax_rlcx_rb_base_hi; + uint32_t sdmax_rlcx_rb_rptr; ++ uint32_t sdmax_rlcx_rb_rptr_hi; + uint32_t sdmax_rlcx_rb_wptr; ++ uint32_t sdmax_rlcx_rb_wptr_hi; + uint32_t sdmax_rlcx_rb_wptr_poll_cntl; +- uint32_t sdmax_rlcx_rb_wptr_poll_addr_hi; +- uint32_t sdmax_rlcx_rb_wptr_poll_addr_lo; + uint32_t sdmax_rlcx_rb_rptr_addr_hi; + uint32_t sdmax_rlcx_rb_rptr_addr_lo; + uint32_t sdmax_rlcx_ib_cntl; +@@ -44,29 +44,29 @@ struct v9_sdma_mqd { + uint32_t sdmax_rlcx_skip_cntl; + uint32_t sdmax_rlcx_context_status; + uint32_t sdmax_rlcx_doorbell; +- uint32_t sdmax_rlcx_virtual_addr; +- uint32_t sdmax_rlcx_ape1_cntl; ++ uint32_t sdmax_rlcx_status; + uint32_t sdmax_rlcx_doorbell_log; +- uint32_t reserved_22; +- uint32_t reserved_23; +- uint32_t reserved_24; +- uint32_t reserved_25; +- uint32_t reserved_26; +- uint32_t reserved_27; +- uint32_t reserved_28; +- uint32_t reserved_29; +- uint32_t reserved_30; +- uint32_t reserved_31; +- uint32_t reserved_32; +- uint32_t reserved_33; +- uint32_t reserved_34; +- uint32_t reserved_35; +- uint32_t reserved_36; +- uint32_t reserved_37; +- uint32_t reserved_38; +- uint32_t reserved_39; +- uint32_t reserved_40; +- uint32_t reserved_41; ++ uint32_t sdmax_rlcx_watermark; ++ uint32_t sdmax_rlcx_doorbell_offset; ++ uint32_t sdmax_rlcx_csa_addr_lo; ++ uint32_t sdmax_rlcx_csa_addr_hi; ++ uint32_t sdmax_rlcx_ib_sub_remain; ++ uint32_t sdmax_rlcx_preempt; ++ uint32_t sdmax_rlcx_dummy_reg; ++ uint32_t sdmax_rlcx_rb_wptr_poll_addr_hi; ++ uint32_t sdmax_rlcx_rb_wptr_poll_addr_lo; ++ uint32_t sdmax_rlcx_rb_aql_cntl; ++ uint32_t sdmax_rlcx_minor_ptr_update; ++ uint32_t sdmax_rlcx_midcmd_data0; ++ uint32_t sdmax_rlcx_midcmd_data1; ++ uint32_t sdmax_rlcx_midcmd_data2; ++ uint32_t sdmax_rlcx_midcmd_data3; ++ uint32_t sdmax_rlcx_midcmd_data4; ++ uint32_t sdmax_rlcx_midcmd_data5; ++ uint32_t sdmax_rlcx_midcmd_data6; ++ uint32_t sdmax_rlcx_midcmd_data7; ++ uint32_t sdmax_rlcx_midcmd_data8; ++ uint32_t sdmax_rlcx_midcmd_cntl; + uint32_t reserved_42; + uint32_t reserved_43; + 
uint32_t reserved_44; +diff --git a/drivers/gpu/drm/amd/include/vi_structs.h b/drivers/gpu/drm/amd/include/vi_structs.h +old mode 100644 +new mode 100755 +index 2023482..717fbae +--- a/drivers/gpu/drm/amd/include/vi_structs.h ++++ b/drivers/gpu/drm/amd/include/vi_structs.h +@@ -153,6 +153,8 @@ struct vi_sdma_mqd { + uint32_t reserved_125; + uint32_t reserved_126; + uint32_t reserved_127; ++ uint32_t sdma_engine_id; ++ uint32_t sdma_queue_id; + }; + + struct vi_mqd { +diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c +old mode 100644 +new mode 100755 +index 2292462..82d97f3 +--- a/drivers/pci/pci.c ++++ b/drivers/pci/pci.c +@@ -2983,6 +2983,87 @@ bool pci_acs_path_enabled(struct pci_dev *start, + } + + /** ++ * pci_enable_atomic_ops_to_root - enable AtomicOp requests to root port ++ * @dev: the PCI device ++ * ++ * Return 0 if the device is capable of generating AtomicOp requests, ++ * all upstream bridges support AtomicOp routing, egress blocking is disabled ++ * on all upstream ports, and the root port supports 32-bit, 64-bit and/or ++ * 128-bit AtomicOp completion, or negative otherwise. ++ */ ++int pci_enable_atomic_ops_to_root(struct pci_dev *dev) ++{ ++ struct pci_bus *bus = dev->bus; ++ ++ if (!pci_is_pcie(dev)) ++ return -EINVAL; ++ ++ switch (pci_pcie_type(dev)) { ++ /* ++ * PCIe 3.0, 6.15 specifies that endpoints and root ports are permitted ++ * to implement AtomicOp requester capabilities. ++ */ ++ case PCI_EXP_TYPE_ENDPOINT: ++ case PCI_EXP_TYPE_LEG_END: ++ case PCI_EXP_TYPE_RC_END: ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ while (bus->parent) { ++ struct pci_dev *bridge = bus->self; ++ u32 cap; ++ ++ pcie_capability_read_dword(bridge, PCI_EXP_DEVCAP2, &cap); ++ ++ switch (pci_pcie_type(bridge)) { ++ /* ++ * Upstream, downstream and root ports may implement AtomicOp ++ * routing capabilities. AtomicOp routing via a root port is ++ * not considered. ++ */ ++ case PCI_EXP_TYPE_UPSTREAM: ++ case PCI_EXP_TYPE_DOWNSTREAM: ++ if (!(cap & PCI_EXP_DEVCAP2_ATOMIC_ROUTE)) ++ return -EINVAL; ++ break; ++ ++ /* ++ * Root ports are permitted to implement AtomicOp completion ++ * capabilities. ++ */ ++ case PCI_EXP_TYPE_ROOT_PORT: ++ if (!(cap & (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | ++ PCI_EXP_DEVCAP2_ATOMIC_COMP64 | ++ PCI_EXP_DEVCAP2_ATOMIC_COMP128))) ++ return -EINVAL; ++ break; ++ } ++ ++ /* ++ * Upstream ports may block AtomicOps on egress. ++ */ ++ if (pci_pcie_type(bridge) == PCI_EXP_TYPE_UPSTREAM) { ++ u32 ctl2; ++ ++ pcie_capability_read_dword(bridge, PCI_EXP_DEVCTL2, ++ &ctl2); ++ if (ctl2 & PCI_EXP_DEVCTL2_ATOMIC_BLOCK) ++ return -EINVAL; ++ } ++ ++ bus = bus->parent; ++ } ++ ++ pcie_capability_set_word(dev, PCI_EXP_DEVCTL2, ++ PCI_EXP_DEVCTL2_ATOMIC_REQ); ++ ++ return 0; ++} ++EXPORT_SYMBOL(pci_enable_atomic_ops_to_root); ++ ++/** + * pci_swizzle_interrupt_pin - swizzle INTx for device behind bridge + * @dev: the PCI device + * @pin: the INTx pin (1=INTA, 2=INTB, 3=INTC, 4=INTD) +diff --git a/include/drm/amd_rdma.h b/include/drm/amd_rdma.h +new file mode 100644 +index 0000000..b0cab3c +--- /dev/null ++++ b/include/drm/amd_rdma.h +@@ -0,0 +1,70 @@ ++/* ++ * Copyright 2015 Advanced Micro Devices, Inc. 
++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++/* @file This file defines kernel interfaces to communicate with amdkfd */ ++ ++#ifndef AMD_RDMA_H_ ++#define AMD_RDMA_H_ ++ ++ ++/** ++ * Structure describing information needed for P2P access from another device ++ * to a specific location of GPU memory ++ */ ++struct amd_p2p_info { ++ uint64_t va; /**< Specify user virt. address ++ * which this page table ++ * describes ++ */ ++ uint64_t size; /**< Specify total size of ++ * allocation ++ */ ++ struct pid *pid; /**< Specify process pid to which ++ * virtual address belongs ++ */ ++ struct sg_table *pages; /**< Specify DMA/Bus addresses */ ++ void *priv; /**< Pointer set by AMD kernel ++ * driver ++ */ ++}; ++ ++/** ++ * Structure providing function pointers to support rdma/p2p requirements. 
++ * to specific location of GPU memory ++ */ ++struct amd_rdma_interface { ++ int (*get_pages)(uint64_t address, uint64_t length, struct pid *pid, ++ struct amd_p2p_info **amd_p2p_data, ++ void (*free_callback)(void *client_priv), ++ void *client_priv); ++ int (*put_pages)(struct amd_p2p_info **amd_p2p_data); ++ int (*is_gpu_address)(uint64_t address, struct pid *pid); ++ int (*get_page_size)(uint64_t address, uint64_t length, struct pid *pid, ++ unsigned long *page_size); ++}; ++ ++ ++int amdkfd_query_rdma_interface(const struct amd_rdma_interface **rdma); ++ ++ ++#endif /* AMD_RDMA_H_ */ ++ +diff --git a/include/linux/pci.h b/include/linux/pci.h +old mode 100644 +new mode 100755 +index b1abbcc..3df545d +--- a/include/linux/pci.h ++++ b/include/linux/pci.h +@@ -2072,6 +2072,7 @@ void pci_request_acs(void); + bool pci_acs_enabled(struct pci_dev *pdev, u16 acs_flags); + bool pci_acs_path_enabled(struct pci_dev *start, + struct pci_dev *end, u16 acs_flags); ++int pci_enable_atomic_ops_to_root(struct pci_dev *dev); + + #define PCI_VPD_LRDT 0x80 /* Large Resource Data Type */ + #define PCI_VPD_LRDT_ID(x) ((x) | PCI_VPD_LRDT) +diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h +index 5bb2b45..de5367c 100644 +--- a/include/uapi/linux/kfd_ioctl.h ++++ b/include/uapi/linux/kfd_ioctl.h +@@ -23,15 +23,15 @@ + #ifndef KFD_IOCTL_H_INCLUDED + #define KFD_IOCTL_H_INCLUDED + +-#include <drm/drm.h> ++#include <linux/types.h> + #include <linux/ioctl.h> + + #define KFD_IOCTL_MAJOR_VERSION 1 +-#define KFD_IOCTL_MINOR_VERSION 1 ++#define KFD_IOCTL_MINOR_VERSION 2 + + struct kfd_ioctl_get_version_args { +- __u32 major_version; /* from KFD */ +- __u32 minor_version; /* from KFD */ ++ uint32_t major_version; /* from KFD */ ++ uint32_t minor_version; /* from KFD */ + }; + + /* For kfd_ioctl_create_queue_args.queue_type. 
*/ +@@ -43,36 +43,51 @@ struct kfd_ioctl_get_version_args { + #define KFD_MAX_QUEUE_PRIORITY 15 + + struct kfd_ioctl_create_queue_args { +- __u64 ring_base_address; /* to KFD */ +- __u64 write_pointer_address; /* from KFD */ +- __u64 read_pointer_address; /* from KFD */ +- __u64 doorbell_offset; /* from KFD */ +- +- __u32 ring_size; /* to KFD */ +- __u32 gpu_id; /* to KFD */ +- __u32 queue_type; /* to KFD */ +- __u32 queue_percentage; /* to KFD */ +- __u32 queue_priority; /* to KFD */ +- __u32 queue_id; /* from KFD */ +- +- __u64 eop_buffer_address; /* to KFD */ +- __u64 eop_buffer_size; /* to KFD */ +- __u64 ctx_save_restore_address; /* to KFD */ +- __u64 ctx_save_restore_size; /* to KFD */ ++ uint64_t ring_base_address; /* to KFD */ ++ uint64_t write_pointer_address; /* from KFD */ ++ uint64_t read_pointer_address; /* from KFD */ ++ uint64_t doorbell_offset; /* from KFD */ ++ ++ uint32_t ring_size; /* to KFD */ ++ uint32_t gpu_id; /* to KFD */ ++ uint32_t queue_type; /* to KFD */ ++ uint32_t queue_percentage; /* to KFD */ ++ uint32_t queue_priority; /* to KFD */ ++ uint32_t queue_id; /* from KFD */ ++ ++ uint64_t eop_buffer_address; /* to KFD */ ++ uint64_t eop_buffer_size; /* to KFD */ ++ uint64_t ctx_save_restore_address; /* to KFD */ ++ uint32_t ctx_save_restore_size; /* to KFD */ ++ uint32_t ctl_stack_size; /* to KFD */ + }; + + struct kfd_ioctl_destroy_queue_args { +- __u32 queue_id; /* to KFD */ +- __u32 pad; ++ uint32_t queue_id; /* to KFD */ ++ uint32_t pad; + }; + + struct kfd_ioctl_update_queue_args { +- __u64 ring_base_address; /* to KFD */ ++ uint64_t ring_base_address; /* to KFD */ ++ ++ uint32_t queue_id; /* to KFD */ ++ uint32_t ring_size; /* to KFD */ ++ uint32_t queue_percentage; /* to KFD */ ++ uint32_t queue_priority; /* to KFD */ ++}; + +- __u32 queue_id; /* to KFD */ +- __u32 ring_size; /* to KFD */ +- __u32 queue_percentage; /* to KFD */ +- __u32 queue_priority; /* to KFD */ ++struct kfd_ioctl_set_cu_mask_args { ++ uint32_t queue_id; /* to KFD */ ++ uint32_t num_cu_mask; /* to KFD */ ++ uint64_t cu_mask_ptr; /* to KFD */ ++}; ++ ++struct kfd_ioctl_get_queue_wave_state_args { ++ uint64_t ctl_stack_address; /* to KFD */ ++ uint32_t ctl_stack_used_size; /* from KFD */ ++ uint32_t save_area_used_size; /* from KFD */ ++ uint32_t queue_id; /* to KFD */ ++ uint32_t pad; + }; + + /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */ +@@ -80,13 +95,20 @@ struct kfd_ioctl_update_queue_args { + #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1 + + struct kfd_ioctl_set_memory_policy_args { +- __u64 alternate_aperture_base; /* to KFD */ +- __u64 alternate_aperture_size; /* to KFD */ ++ uint64_t alternate_aperture_base; /* to KFD */ ++ uint64_t alternate_aperture_size; /* to KFD */ ++ ++ uint32_t gpu_id; /* to KFD */ ++ uint32_t default_policy; /* to KFD */ ++ uint32_t alternate_policy; /* to KFD */ ++ uint32_t pad; ++}; + +- __u32 gpu_id; /* to KFD */ +- __u32 default_policy; /* to KFD */ +- __u32 alternate_policy; /* to KFD */ +- __u32 pad; ++struct kfd_ioctl_set_trap_handler_args { ++ uint64_t tba_addr; ++ uint64_t tma_addr; ++ uint32_t gpu_id; /* to KFD */ ++ uint32_t pad; + }; + + /* +@@ -97,35 +119,52 @@ struct kfd_ioctl_set_memory_policy_args { + */ + + struct kfd_ioctl_get_clock_counters_args { +- __u64 gpu_clock_counter; /* from KFD */ +- __u64 cpu_clock_counter; /* from KFD */ +- __u64 system_clock_counter; /* from KFD */ +- __u64 system_clock_freq; /* from KFD */ ++ uint64_t gpu_clock_counter; /* from KFD */ ++ uint64_t cpu_clock_counter; /* from KFD 
*/ ++ uint64_t system_clock_counter; /* from KFD */ ++ uint64_t system_clock_freq; /* from KFD */ + +- __u32 gpu_id; /* to KFD */ +- __u32 pad; ++ uint32_t gpu_id; /* to KFD */ ++ uint32_t pad; + }; + + #define NUM_OF_SUPPORTED_GPUS 7 + + struct kfd_process_device_apertures { +- __u64 lds_base; /* from KFD */ +- __u64 lds_limit; /* from KFD */ +- __u64 scratch_base; /* from KFD */ +- __u64 scratch_limit; /* from KFD */ +- __u64 gpuvm_base; /* from KFD */ +- __u64 gpuvm_limit; /* from KFD */ +- __u32 gpu_id; /* from KFD */ +- __u32 pad; ++ uint64_t lds_base; /* from KFD */ ++ uint64_t lds_limit; /* from KFD */ ++ uint64_t scratch_base; /* from KFD */ ++ uint64_t scratch_limit; /* from KFD */ ++ uint64_t gpuvm_base; /* from KFD */ ++ uint64_t gpuvm_limit; /* from KFD */ ++ uint32_t gpu_id; /* from KFD */ ++ uint32_t pad; + }; + ++/* This IOCTL and the limited NUM_OF_SUPPORTED_GPUS is deprecated. Use ++ * kfd_ioctl_get_process_apertures_new instead, which supports ++ * arbitrary numbers of GPUs. ++ */ + struct kfd_ioctl_get_process_apertures_args { + struct kfd_process_device_apertures + process_apertures[NUM_OF_SUPPORTED_GPUS];/* from KFD */ + + /* from KFD, should be in the range [1 - NUM_OF_SUPPORTED_GPUS] */ +- __u32 num_of_nodes; +- __u32 pad; ++ uint32_t num_of_nodes; ++ uint32_t pad; ++}; ++ ++struct kfd_ioctl_get_process_apertures_new_args { ++ /* User allocated. Pointer to struct kfd_process_device_apertures ++ * filled in by Kernel ++ */ ++ uint64_t kfd_process_device_apertures_ptr; ++ /* to KFD - indicates amount of memory present in ++ * kfd_process_device_apertures_ptr ++ * from KFD - Number of entries filled by KFD. ++ */ ++ uint32_t num_of_nodes; ++ uint32_t pad; + }; + + #define MAX_ALLOWED_NUM_POINTS 100 +@@ -133,103 +172,245 @@ struct kfd_ioctl_get_process_apertures_args { + #define MAX_ALLOWED_WAC_BUFF_SIZE 128 + + struct kfd_ioctl_dbg_register_args { +- __u32 gpu_id; /* to KFD */ +- __u32 pad; ++ uint32_t gpu_id; /* to KFD */ ++ uint32_t pad; + }; + + struct kfd_ioctl_dbg_unregister_args { +- __u32 gpu_id; /* to KFD */ +- __u32 pad; ++ uint32_t gpu_id; /* to KFD */ ++ uint32_t pad; + }; + + struct kfd_ioctl_dbg_address_watch_args { +- __u64 content_ptr; /* a pointer to the actual content */ +- __u32 gpu_id; /* to KFD */ +- __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ ++ uint64_t content_ptr; /* a pointer to the actual content */ ++ uint32_t gpu_id; /* to KFD */ ++ uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ + }; + + struct kfd_ioctl_dbg_wave_control_args { +- __u64 content_ptr; /* a pointer to the actual content */ +- __u32 gpu_id; /* to KFD */ +- __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ ++ uint64_t content_ptr; /* a pointer to the actual content */ ++ uint32_t gpu_id; /* to KFD */ ++ uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ + }; + + /* Matching HSA_EVENTTYPE */ +-#define KFD_IOC_EVENT_SIGNAL 0 +-#define KFD_IOC_EVENT_NODECHANGE 1 +-#define KFD_IOC_EVENT_DEVICESTATECHANGE 2 +-#define KFD_IOC_EVENT_HW_EXCEPTION 3 +-#define KFD_IOC_EVENT_SYSTEM_EVENT 4 +-#define KFD_IOC_EVENT_DEBUG_EVENT 5 +-#define KFD_IOC_EVENT_PROFILE_EVENT 6 +-#define KFD_IOC_EVENT_QUEUE_EVENT 7 +-#define KFD_IOC_EVENT_MEMORY 8 +- +-#define KFD_IOC_WAIT_RESULT_COMPLETE 0 +-#define KFD_IOC_WAIT_RESULT_TIMEOUT 1 +-#define KFD_IOC_WAIT_RESULT_FAIL 2 +- +-#define KFD_SIGNAL_EVENT_LIMIT 256 ++#define KFD_IOC_EVENT_SIGNAL 0 ++#define KFD_IOC_EVENT_NODECHANGE 1 ++#define KFD_IOC_EVENT_DEVICESTATECHANGE 2 ++#define 
KFD_IOC_EVENT_HW_EXCEPTION 3 ++#define KFD_IOC_EVENT_SYSTEM_EVENT 4 ++#define KFD_IOC_EVENT_DEBUG_EVENT 5 ++#define KFD_IOC_EVENT_PROFILE_EVENT 6 ++#define KFD_IOC_EVENT_QUEUE_EVENT 7 ++#define KFD_IOC_EVENT_MEMORY 8 ++ ++#define KFD_IOC_WAIT_RESULT_COMPLETE 0 ++#define KFD_IOC_WAIT_RESULT_TIMEOUT 1 ++#define KFD_IOC_WAIT_RESULT_FAIL 2 ++ ++#define KFD_SIGNAL_EVENT_LIMIT 4096 + + struct kfd_ioctl_create_event_args { +- __u64 event_page_offset; /* from KFD */ +- __u32 event_trigger_data; /* from KFD - signal events only */ +- __u32 event_type; /* to KFD */ +- __u32 auto_reset; /* to KFD */ +- __u32 node_id; /* to KFD - only valid for certain ++ uint64_t event_page_offset; /* from KFD */ ++ uint32_t event_trigger_data; /* from KFD - signal events only */ ++ uint32_t event_type; /* to KFD */ ++ uint32_t auto_reset; /* to KFD */ ++ uint32_t node_id; /* to KFD - only valid for certain + event types */ +- __u32 event_id; /* from KFD */ +- __u32 event_slot_index; /* from KFD */ ++ uint32_t event_id; /* from KFD */ ++ uint32_t event_slot_index; /* from KFD */ + }; + + struct kfd_ioctl_destroy_event_args { +- __u32 event_id; /* to KFD */ +- __u32 pad; ++ uint32_t event_id; /* to KFD */ ++ uint32_t pad; + }; + + struct kfd_ioctl_set_event_args { +- __u32 event_id; /* to KFD */ +- __u32 pad; ++ uint32_t event_id; /* to KFD */ ++ uint32_t pad; + }; + + struct kfd_ioctl_reset_event_args { +- __u32 event_id; /* to KFD */ +- __u32 pad; ++ uint32_t event_id; /* to KFD */ ++ uint32_t pad; + }; + + struct kfd_memory_exception_failure { +- __u32 NotPresent; /* Page not present or supervisor privilege */ +- __u32 ReadOnly; /* Write access to a read-only page */ +- __u32 NoExecute; /* Execute access to a page marked NX */ +- __u32 pad; ++ uint32_t NotPresent; /* Page not present or supervisor privilege */ ++ uint32_t ReadOnly; /* Write access to a read-only page */ ++ uint32_t NoExecute; /* Execute access to a page marked NX */ ++ uint32_t imprecise; /* Can't determine the exact fault address */ + }; + +-/* memory exception data*/ ++/* memory exception data */ + struct kfd_hsa_memory_exception_data { + struct kfd_memory_exception_failure failure; +- __u64 va; +- __u32 gpu_id; +- __u32 pad; ++ uint64_t va; ++ uint32_t gpu_id; ++ uint32_t pad; + }; + +-/* Event data*/ ++/* Event data */ + struct kfd_event_data { + union { + struct kfd_hsa_memory_exception_data memory_exception_data; + }; /* From KFD */ +- __u64 kfd_event_data_ext; /* pointer to an extension structure +- for future exception types */ +- __u32 event_id; /* to KFD */ +- __u32 pad; ++ uint64_t kfd_event_data_ext; /* pointer to an extension structure ++ for future exception types */ ++ uint32_t event_id; /* to KFD */ ++ uint32_t pad; + }; + + struct kfd_ioctl_wait_events_args { +- __u64 events_ptr; /* pointed to struct ++ uint64_t events_ptr; /* pointed to struct + kfd_event_data array, to KFD */ +- __u32 num_events; /* to KFD */ +- __u32 wait_for_all; /* to KFD */ +- __u32 timeout; /* to KFD */ +- __u32 wait_result; /* from KFD */ ++ uint32_t num_events; /* to KFD */ ++ uint32_t wait_for_all; /* to KFD */ ++ uint32_t timeout; /* to KFD */ ++ uint32_t wait_result; /* from KFD */ ++}; ++ ++struct kfd_ioctl_alloc_memory_of_scratch_args { ++ uint64_t va_addr; /* to KFD */ ++ uint64_t size; /* to KFD */ ++ uint32_t gpu_id; /* to KFD */ ++ uint32_t pad; ++}; ++ ++/* Allocation flags: memory types */ ++#define KFD_IOC_ALLOC_MEM_FLAGS_VRAM (1 << 0) ++#define KFD_IOC_ALLOC_MEM_FLAGS_GTT (1 << 1) ++#define KFD_IOC_ALLOC_MEM_FLAGS_USERPTR (1 << 2) ++#define 
KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL (1 << 3) ++/* Allocation flags: attributes/access options */ ++#define KFD_IOC_ALLOC_MEM_FLAGS_NONPAGED (1 << 31) ++#define KFD_IOC_ALLOC_MEM_FLAGS_READONLY (1 << 30) ++#define KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC (1 << 29) ++#define KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28) ++#define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27) ++#define KFD_IOC_ALLOC_MEM_FLAGS_EXECUTE_ACCESS (1 << 26) ++#define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 25) ++ ++struct kfd_ioctl_alloc_memory_of_gpu_args { ++ uint64_t va_addr; /* to KFD */ ++ uint64_t size; /* to KFD */ ++ uint64_t handle; /* from KFD */ ++ uint64_t mmap_offset; /* to KFD (userptr), from KFD (mmap offset) */ ++ uint32_t gpu_id; /* to KFD */ ++ uint32_t flags; ++}; ++ ++struct kfd_ioctl_free_memory_of_gpu_args { ++ uint64_t handle; /* to KFD */ ++}; ++ ++struct kfd_ioctl_map_memory_to_gpu_args { ++ uint64_t handle; /* to KFD */ ++ uint64_t device_ids_array_ptr; /* to KFD */ ++ uint32_t device_ids_array_size; /* to KFD */ ++ uint32_t pad; ++}; ++ ++struct kfd_ioctl_unmap_memory_from_gpu_args { ++ uint64_t handle; /* to KFD */ ++ uint64_t device_ids_array_ptr; /* to KFD */ ++ uint32_t device_ids_array_size; /* to KFD */ ++ uint32_t pad; ++}; ++ ++struct kfd_ioctl_set_process_dgpu_aperture_args { ++ uint64_t dgpu_base; ++ uint64_t dgpu_limit; ++ uint32_t gpu_id; ++ uint32_t pad; ++}; ++ ++struct kfd_ioctl_get_dmabuf_info_args { ++ uint64_t size; /* from KFD */ ++ uint64_t metadata_ptr; /* to KFD */ ++ uint32_t metadata_size; /* to KFD (space allocated by user) ++ * from KFD (actual metadata size) */ ++ uint32_t gpu_id; /* from KFD */ ++ uint32_t flags; /* from KFD (KFD_IOC_ALLOC_MEM_FLAGS) */ ++ uint32_t dmabuf_fd; /* to KFD */ ++}; ++ ++struct kfd_ioctl_import_dmabuf_args { ++ uint64_t va_addr; /* to KFD */ ++ uint64_t handle; /* from KFD */ ++ uint32_t gpu_id; /* to KFD */ ++ uint32_t dmabuf_fd; /* to KFD */ ++}; ++ ++struct kfd_ioctl_ipc_export_handle_args { ++ uint64_t handle; /* to KFD */ ++ uint32_t share_handle[4]; /* from KFD */ ++ uint32_t gpu_id; /* to KFD */ ++ uint32_t pad; ++}; ++ ++struct kfd_ioctl_ipc_import_handle_args { ++ uint64_t handle; /* from KFD */ ++ uint64_t va_addr; /* to KFD */ ++ uint64_t mmap_offset; /* from KFD */ ++ uint32_t share_handle[4]; /* to KFD */ ++ uint32_t gpu_id; /* to KFD */ ++ uint32_t pad; ++}; ++ ++struct kfd_ioctl_get_tile_config_args { ++ /* to KFD: pointer to tile array */ ++ uint64_t tile_config_ptr; ++ /* to KFD: pointer to macro tile array */ ++ uint64_t macro_tile_config_ptr; ++ /* to KFD: array size allocated by user mode ++ * from KFD: array size filled by kernel ++ */ ++ uint32_t num_tile_configs; ++ /* to KFD: array size allocated by user mode ++ * from KFD: array size filled by kernel ++ */ ++ uint32_t num_macro_tile_configs; ++ ++ uint32_t gpu_id; /* to KFD */ ++ uint32_t gb_addr_config; /* from KFD */ ++ uint32_t num_banks; /* from KFD */ ++ uint32_t num_ranks; /* from KFD */ ++ /* struct size can be extended later if needed ++ * without breaking ABI compatibility ++ */ ++}; ++ ++struct kfd_memory_range { ++ uint64_t va_addr; ++ uint64_t size; ++}; ++ ++/* flags definitions ++ * BIT0: 0: read operation, 1: write operation. 
++ * This also identifies if the src or dst array belongs to remote process ++ */ ++#define KFD_CROSS_MEMORY_RW_BIT (1 << 0) ++#define KFD_SET_CROSS_MEMORY_READ(flags) (flags &= ~KFD_CROSS_MEMORY_RW_BIT) ++#define KFD_SET_CROSS_MEMORY_WRITE(flags) (flags |= KFD_CROSS_MEMORY_RW_BIT) ++#define KFD_IS_CROSS_MEMORY_WRITE(flags) (flags & KFD_CROSS_MEMORY_RW_BIT) ++ ++struct kfd_ioctl_cross_memory_copy_args { ++ /* to KFD: Process ID of the remote process */ ++ uint32_t pid; ++ /* to KFD: See above definition */ ++ uint32_t flags; ++ /* to KFD: Source GPU VM range */ ++ uint64_t src_mem_range_array; ++ /* to KFD: Size of above array */ ++ uint64_t src_mem_array_size; ++ /* to KFD: Destination GPU VM range */ ++ uint64_t dst_mem_range_array; ++ /* to KFD: Size of above array */ ++ uint64_t dst_mem_array_size; ++ /* from KFD: Total amount of bytes copied */ ++ uint64_t bytes_copied; + }; + + +@@ -287,7 +468,56 @@ struct kfd_ioctl_wait_events_args { + #define AMDKFD_IOC_DBG_WAVE_CONTROL \ + AMDKFD_IOW(0x10, struct kfd_ioctl_dbg_wave_control_args) + ++#define AMDKFD_IOC_ALLOC_MEMORY_OF_GPU \ ++ AMDKFD_IOWR(0x11, struct kfd_ioctl_alloc_memory_of_gpu_args) ++ ++#define AMDKFD_IOC_FREE_MEMORY_OF_GPU \ ++ AMDKFD_IOWR(0x12, struct kfd_ioctl_free_memory_of_gpu_args) ++ ++#define AMDKFD_IOC_MAP_MEMORY_TO_GPU \ ++ AMDKFD_IOWR(0x13, struct kfd_ioctl_map_memory_to_gpu_args) ++ ++#define AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU \ ++ AMDKFD_IOWR(0x14, struct kfd_ioctl_unmap_memory_from_gpu_args) ++ ++#define AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH \ ++ AMDKFD_IOWR(0x15, struct kfd_ioctl_alloc_memory_of_scratch_args) ++ ++#define AMDKFD_IOC_SET_CU_MASK \ ++ AMDKFD_IOW(0x16, struct kfd_ioctl_set_cu_mask_args) ++ ++#define AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE \ ++ AMDKFD_IOW(0x17, \ ++ struct kfd_ioctl_set_process_dgpu_aperture_args) ++ ++#define AMDKFD_IOC_SET_TRAP_HANDLER \ ++ AMDKFD_IOW(0x18, struct kfd_ioctl_set_trap_handler_args) ++ ++#define AMDKFD_IOC_GET_PROCESS_APERTURES_NEW \ ++ AMDKFD_IOWR(0x19, struct kfd_ioctl_get_process_apertures_new_args) ++ ++#define AMDKFD_IOC_GET_DMABUF_INFO \ ++ AMDKFD_IOWR(0x1A, struct kfd_ioctl_get_dmabuf_info_args) ++ ++#define AMDKFD_IOC_IMPORT_DMABUF \ ++ AMDKFD_IOWR(0x1B, struct kfd_ioctl_import_dmabuf_args) ++ ++#define AMDKFD_IOC_GET_TILE_CONFIG \ ++ AMDKFD_IOWR(0x1C, struct kfd_ioctl_get_tile_config_args) ++ ++#define AMDKFD_IOC_IPC_IMPORT_HANDLE \ ++ AMDKFD_IOWR(0x1D, struct kfd_ioctl_ipc_import_handle_args) ++ ++#define AMDKFD_IOC_IPC_EXPORT_HANDLE \ ++ AMDKFD_IOWR(0x1E, struct kfd_ioctl_ipc_export_handle_args) ++ ++#define AMDKFD_IOC_CROSS_MEMORY_COPY \ ++ AMDKFD_IOWR(0x1F, struct kfd_ioctl_cross_memory_copy_args) ++ ++#define AMDKFD_IOC_GET_QUEUE_WAVE_STATE \ ++ AMDKFD_IOWR(0x20, struct kfd_ioctl_get_queue_wave_state_args) ++ + #define AMDKFD_COMMAND_START 0x01 +-#define AMDKFD_COMMAND_END 0x11 ++#define AMDKFD_COMMAND_END 0x21 + + #endif +diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h +old mode 100644 +new mode 100755 +index 87c2c84..1256851 +--- a/include/uapi/linux/pci_regs.h ++++ b/include/uapi/linux/pci_regs.h +@@ -624,7 +624,9 @@ + #define PCI_EXP_DEVCAP2 36 /* Device Capabilities 2 */ + #define PCI_EXP_DEVCAP2_ARI 0x00000020 /* Alternative Routing-ID */ + #define PCI_EXP_DEVCAP2_ATOMIC_ROUTE 0x00000040 /* Atomic Op routing */ ++#define PCI_EXP_DEVCAP2_ATOMIC_COMP32 0x00000080 /* 32b AtomicOp completion */ + #define PCI_EXP_DEVCAP2_ATOMIC_COMP64 0x00000100 /* Atomic 64-bit compare */ ++#define PCI_EXP_DEVCAP2_ATOMIC_COMP128 0x00000200 /* 128b 
AtomicOp completion*/ + #define PCI_EXP_DEVCAP2_LTR 0x00000800 /* Latency tolerance reporting */ + #define PCI_EXP_DEVCAP2_OBFF_MASK 0x000c0000 /* OBFF support mechanism */ + #define PCI_EXP_DEVCAP2_OBFF_MSG 0x00040000 /* New message signaling */ +@@ -634,6 +636,7 @@ + #define PCI_EXP_DEVCTL2_ARI 0x0020 /* Alternative Routing-ID */ + #define PCI_EXP_DEVCTL2_ATOMIC_REQ 0x0040 /* Set Atomic requests */ + #define PCI_EXP_DEVCTL2_ATOMIC_EGRESS_BLOCK 0x0080 /* Block atomic egress */ ++#define PCI_EXP_DEVCTL2_ATOMIC_BLOCK 0x0080 /* Block AtomicOp on egress */ + #define PCI_EXP_DEVCTL2_IDO_REQ_EN 0x0100 /* Allow IDO for requests */ + #define PCI_EXP_DEVCTL2_IDO_CMP_EN 0x0200 /* Allow IDO for completions */ + #define PCI_EXP_DEVCTL2_LTR_EN 0x0400 /* Enable LTR mechanism */ +diff --git a/kernel/fork.c b/kernel/fork.c +index a19ee25..70d8d5b 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -1082,6 +1082,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) + + return mm; + } ++EXPORT_SYMBOL_GPL(mm_access); + + static void complete_vfork_done(struct task_struct *tsk) + { +-- +2.7.4 +
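
The kfd_process.c hunks at the top of this patch drop the pre-3.9 compatibility branches and settle on a statically defined SRCU domain guarding an RCU hash table. The following is a minimal, self-contained sketch of that locking pattern; the structure and names are generic stand-ins, not KFD's actual types:

#include <linux/hashtable.h>
#include <linux/srcu.h>

#define EXAMPLE_HASH_BITS 5	/* 2^5 buckets */

struct example_entry {
	struct hlist_node hnode;
	unsigned int pasid;
};

static DEFINE_HASHTABLE(example_table, EXAMPLE_HASH_BITS);
DEFINE_STATIC_SRCU(example_srcu);

/* Lookup runs under the SRCU read lock, so writers can wait with
 * synchronize_srcu() instead of blocking sleepable readers. */
static struct example_entry *example_lookup(unsigned int pasid)
{
	struct example_entry *e, *found = NULL;
	unsigned int bkt;
	int idx = srcu_read_lock(&example_srcu);

	hash_for_each_rcu(example_table, bkt, e, hnode) {
		if (e->pasid == pasid) {
			found = e;
			break;
		}
	}

	srcu_read_unlock(&example_srcu, idx);
	return found;
}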
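
The reserved-doorbell rule documented in kgd2kfd_shared_resources is plain integer arithmetic. This sketch uses the example values from the comment (mask 0x1f8, val 0x0f0, reserving doorbells 0x0f0-0x0f7 and 0x2f0-0x2f7); real values are programmed by amdgpu at init time:

#include <stdint.h>
#include <stdio.h>

/* Example values from the kgd_kfd_interface.h comment, not a real device. */
static const unsigned int reserved_doorbell_mask = 0x1f8;
static const unsigned int reserved_doorbell_val = 0x0f0;

static int doorbell_is_reserved(unsigned int d)
{
	return (d & reserved_doorbell_mask) == reserved_doorbell_val;
}

int main(void)
{
	printf("0x0f3: %d\n", doorbell_is_reserved(0x0f3)); /* prints 1 */
	printf("0x2f7: %d\n", doorbell_is_reserved(0x2f7)); /* prints 1 */
	printf("0x100: %d\n", doorbell_is_reserved(0x100)); /* prints 0 */
	return 0;
}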
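
pci_enable_atomic_ops_to_root(), added to drivers/pci/pci.c above, is meant to be called from an endpoint driver's probe path; amdkfd calls it for ASICs that require PCIe AtomicOps. A hedged sketch of such a caller follows, where example_probe() is a hypothetical driver function, not code from this patch:

#include <linux/pci.h>

static int example_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int ret;

	/* Fail probing if AtomicOp requests cannot reach the root port. */
	ret = pci_enable_atomic_ops_to_root(pdev);
	if (ret < 0) {
		dev_warn(&pdev->dev, "PCIe AtomicOps not routed to root port\n");
		return ret;
	}

	/* AtomicOp Requester Enable is now set in Device Control 2. */
	return 0;
}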
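
A peer-to-peer (RDMA) driver would consume the new include/drm/amd_rdma.h roughly as follows. This is a sketch against the header's declared signatures only; example_map_gpu_range() and example_free_cb() are hypothetical names:

#include <drm/amd_rdma.h>
#include <linux/errno.h>

static const struct amd_rdma_interface *rdma_ops;

/* Invoked by the AMD driver when the GPU range goes away; any DMA
 * state that referenced info->pages must be torn down here. */
static void example_free_cb(void *client_priv)
{
}

static int example_map_gpu_range(uint64_t va, uint64_t len, struct pid *pid)
{
	struct amd_p2p_info *info;
	int ret;

	ret = amdkfd_query_rdma_interface(&rdma_ops);
	if (ret < 0)
		return ret;

	if (!rdma_ops->is_gpu_address(va, pid))
		return -EINVAL;

	ret = rdma_ops->get_pages(va, len, pid, &info,
				  example_free_cb, NULL);
	if (ret < 0)
		return ret;

	/* info->pages now holds DMA/bus addresses usable for P2P DMA. */
	return rdma_ops->put_pages(&info);
}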
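
On the user-space side, the reworked include/uapi/linux/kfd_ioctl.h is driven through /dev/kfd. The sketch below queries the interface version and round-trips a pinned VRAM allocation; the gpu_id and virtual address are placeholders (real gpu_ids come from the KFD topology in sysfs), and error handling is minimal:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "kfd_ioctl.h"	/* the header as extended by this patch */

int main(void)
{
	struct kfd_ioctl_get_version_args ver = {0};
	struct kfd_ioctl_alloc_memory_of_gpu_args alloc = {0};
	int fd = open("/dev/kfd", O_RDWR);

	if (fd < 0)
		return 1;

	if (ioctl(fd, AMDKFD_IOC_GET_VERSION, &ver) == 0)
		printf("KFD ioctl version %u.%u\n",
		       ver.major_version, ver.minor_version);

	/* Pinned 2 MiB VRAM allocation; gpu_id 0x1234 and the VA are
	 * placeholders, not values defined by this patch. */
	alloc.va_addr = 0x200000000ull;
	alloc.size = 2ull << 20;
	alloc.gpu_id = 0x1234;
	alloc.flags = KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
		      KFD_IOC_ALLOC_MEM_FLAGS_NONPAGED;

	if (ioctl(fd, AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, &alloc) == 0) {
		struct kfd_ioctl_free_memory_of_gpu_args f = {
			.handle = alloc.handle,
		};
		ioctl(fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &f);
	}

	close(fd);
	return 0;
}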
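
The cross-memory-copy ioctl encodes the copy direction in bit 0 of flags through the KFD_SET_CROSS_MEMORY_* helpers; per the header comment, the bit also selects which range array belongs to the remote process. A hedged user-space sketch, assuming the write direction means the destination ranges are remote:

#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include "kfd_ioctl.h"

/* Hypothetical helper: push one local GPU VM range into a remote
 * process's GPU VM range. All values are caller-supplied placeholders. */
static int example_push_to_remote(int kfd_fd, pid_t remote_pid,
				  uint64_t local_va, uint64_t remote_va,
				  uint64_t size)
{
	struct kfd_memory_range src = { .va_addr = local_va, .size = size };
	struct kfd_memory_range dst = { .va_addr = remote_va, .size = size };
	struct kfd_ioctl_cross_memory_copy_args args = {0};
	int ret;

	args.pid = remote_pid;
	KFD_SET_CROSS_MEMORY_WRITE(args.flags);	/* dst ranges are remote */
	args.src_mem_range_array = (uint64_t)(uintptr_t)&src;
	args.src_mem_array_size = 1;
	args.dst_mem_range_array = (uint64_t)(uintptr_t)&dst;
	args.dst_mem_array_size = 1;

	ret = ioctl(kfd_fd, AMDKFD_IOC_CROSS_MEMORY_COPY, &args);
	if (ret == 0 && args.bytes_copied != size)
		return -1;	/* partial copy */
	return ret;
}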