Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1353-compilation-fix-for-amdkfd-porting.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1353-compilation-fix-for-amdkfd-porting.patch | 8695
1 file changed, 0 insertions, 8695 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1353-compilation-fix-for-amdkfd-porting.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1353-compilation-fix-for-amdkfd-porting.patch
deleted file mode 100644
index a27db153..00000000
--- a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1353-compilation-fix-for-amdkfd-porting.patch
+++ /dev/null
@@ -1,8695 +0,0 @@
-From 817ccd6f0987f83ddbf989602f0fbf320157f0a9 Mon Sep 17 00:00:00 2001
-From: Chaudhary Amit Kumar <chaudharyamit.kumar@amd.com>
-Date: Thu, 18 Oct 2018 12:42:04 +0530
-Subject: [PATCH 1353/4131] compilation fix for amdkfd porting
-
-Signed-off-by: Sanjay R Mehta <sanju.mehta@amd.com>
-Signed-off-by: Chaudhary Amit Kumar <chaudharyamit.kumar@amd.com>
----
- drivers/gpu/drm/amd/amdgpu/Makefile | 8 +-
- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 +
- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 346 ++-
- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 185 +-
- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c | 196 ++
- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 537 ++++-
- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 590 ++++-
- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h | 62 +
- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 1227 ++++++++++
- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 2578 +++++++++++++++++++++
- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 -
- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 3 +
- drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 1 +
- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 +
- drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c | 46 +-
- drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h | 2 +
- drivers/gpu/drm/amd/amdgpu/soc15d.h | 1 +
- drivers/gpu/drm/amd/amdgpu/vid.h | 2 +
- drivers/gpu/drm/amd/amdkfd/Makefile | 2 -
- drivers/gpu/drm/amd/amdkfd/backport/backport.h | 7 -
- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 -
- drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 4 -
- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 12 -
- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 28 -
- drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 10 -
- drivers/gpu/drm/amd/amdkfd/kfd_ipc.c | 8 -
- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 4 -
- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 4 -
- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 4 -
- drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 2 -
- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 6 +-
- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 52 +-
- drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 22 -
- drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 6 -
- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 4 -
- drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 231 +-
- drivers/gpu/drm/amd/include/v9_structs.h | 48 +-
- drivers/gpu/drm/amd/include/vi_structs.h | 2 +
- drivers/pci/pci.c | 81 +
- include/drm/amd_rdma.h | 70 +
- include/linux/pci.h | 1 +
- include/uapi/linux/kfd_ioctl.h | 442 +++-
- include/uapi/linux/pci_regs.h | 3 +
- kernel/fork.c | 1 +
- 44 files changed, 6315 insertions(+), 537 deletions(-)
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/Makefile
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu.h
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
- create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
- create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h
- create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
- create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/soc15d.h
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/vid.h
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdkfd/Makefile
- mode change 100644 => 100755 drivers/gpu/drm/amd/amdkfd/kfd_priv.h
- mode change 100644 => 100755 drivers/gpu/drm/amd/include/kgd_kfd_interface.h
- mode change 100644 => 100755 drivers/gpu/drm/amd/include/v9_structs.h
- mode change 100644 => 100755 drivers/gpu/drm/amd/include/vi_structs.h
- mode change 100644 => 100755 drivers/pci/pci.c
- create mode 100644 include/drm/amd_rdma.h
- mode change 100644 => 100755 include/linux/pci.h
- mode change 100644 => 100755 include/uapi/linux/pci_regs.h
-
-diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
-old mode 100644
-new mode 100755
-index 57b8d5f..6b373d0
---- a/drivers/gpu/drm/amd/amdgpu/Makefile
-+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
-@@ -32,12 +32,11 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
- amdgpu_prime.o amdgpu_vm.o amdgpu_ib.o amdgpu_pll.o \
- amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
- amdgpu_gtt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o amdgpu_atomfirmware.o \
-- amdgpu_queue_mgr.o amdgpu_vf_error.o amdgpu_sem.o
-+ amdgpu_queue_mgr.o amdgpu_vf_error.o amdgpu_sem.o amdgpu_amdkfd_fence.o
-
- # add asic specific block
- amdgpu-$(CONFIG_DRM_AMDGPU_CIK)+= cik.o cik_ih.o kv_smc.o kv_dpm.o \
- ci_smc.o ci_dpm.o dce_v8_0.o gfx_v7_0.o cik_sdma.o uvd_v4_2.o vce_v2_0.o \
-- amdgpu_amdkfd_gfx_v7.o
-
- amdgpu-$(CONFIG_DRM_AMDGPU_SI)+= si.o gmc_v6_0.o gfx_v6_0.o si_ih.o si_dma.o dce_v6_0.o si_dpm.o si_smc.o
-
-@@ -109,7 +108,10 @@ amdgpu-y += \
- # add amdkfd interfaces
- amdgpu-y += \
- amdgpu_amdkfd.o \
-- amdgpu_amdkfd_gfx_v8.o
-+ amdgpu_amdkfd_gfx_v7.o \
-+ amdgpu_amdkfd_gfx_v8.o \
-+ amdgpu_amdkfd_gfx_v9.o \
-+ amdgpu_amdkfd_gpuvm.o
-
- # add cgs
- amdgpu-y += amdgpu_cgs.o
-diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
-old mode 100644
-new mode 100755
-index fe23de8..bcf95e7
---- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
-+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
-@@ -184,6 +184,7 @@ struct amdgpu_cs_parser;
- struct amdgpu_job;
- struct amdgpu_irq_src;
- struct amdgpu_fpriv;
-+struct kfd_vm_fault_info;
- struct amdgpu_bo_va_mapping;
-
- enum amdgpu_cp_irq {
-@@ -403,6 +404,7 @@ struct amdgpu_gem_object {
- struct amdgpu_bo *bo;
- };
-
-+struct kgd_mem;
- #define gem_to_amdgpu_bo(gobj) container_of((gobj), struct amdgpu_gem_object, base)->bo
-
- void amdgpu_gem_object_free(struct drm_gem_object *obj);
-@@ -543,6 +545,9 @@ struct amdgpu_mc {
- u64 private_aperture_end;
- /* protects concurrent invalidation */
- spinlock_t invalidate_lock;
-+
-+ struct kfd_vm_fault_info *vm_fault_info;
-+ atomic_t vm_fault_info_updated;
- };
-
- /*
-@@ -961,6 +966,7 @@ struct amdgpu_gfx_config {
- };
-
- struct amdgpu_cu_info {
-+ uint32_t simd_per_cu;
- uint32_t max_waves_per_simd;
- uint32_t wave_front_size;
- uint32_t max_scratch_slots_per_cu;
-@@ -1649,6 +1655,7 @@ struct amdgpu_device {
- /* record hw reset is performed */
- bool has_hw_reset;
- u8 reset_magic[AMDGPU_RESET_MAGIC_NUM];
-+ spinlock_t tlb_invalidation_lock;
-
- /* record last mm index being written through WREG32*/
- unsigned long last_mm_index;
-diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
-old mode 100644
-new mode 100755
-index 7ec1915..ec8141f
---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
-+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
-@@ -20,23 +20,29 @@
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-+#undef pr_fmt
-+#define pr_fmt(fmt) "kfd2kgd: " fmt
-+
- #include "amdgpu_amdkfd.h"
--#include "amd_shared.h"
-+#include <linux/dma-buf.h>
- #include <drm/drmP.h>
- #include "amdgpu.h"
- #include "amdgpu_gfx.h"
- #include <linux/module.h>
-
--const struct kfd2kgd_calls *kfd2kgd;
-+#define AMDKFD_SKIP_UNCOMPILED_CODE 1
-+
- const struct kgd2kfd_calls *kgd2kfd;
--bool (*kgd2kfd_init_p)(unsigned, const struct kgd2kfd_calls**);
-+bool (*kgd2kfd_init_p)(unsigned int, const struct kgd2kfd_calls**);
-+
-+unsigned int global_compute_vmid_bitmap = 0xFF00;
-
- int amdgpu_amdkfd_init(void)
- {
- int ret;
-
- #if defined(CONFIG_HSA_AMD_MODULE)
-- int (*kgd2kfd_init_p)(unsigned, const struct kgd2kfd_calls**);
-+ int (*kgd2kfd_init_p)(unsigned int, const struct kgd2kfd_calls**);
-
- kgd2kfd_init_p = symbol_request(kgd2kfd_init);
-
-@@ -57,56 +63,68 @@ int amdgpu_amdkfd_init(void)
- #else
- ret = -ENOENT;
- #endif
--
-+ amdgpu_amdkfd_gpuvm_init_mem_limits();
- return ret;
- }
-
--bool amdgpu_amdkfd_load_interface(struct amdgpu_device *adev)
-+void amdgpu_amdkfd_fini(void)
- {
-+ if (kgd2kfd) {
-+ kgd2kfd->exit();
-+ symbol_put(kgd2kfd_init);
-+ }
-+}
-+
-+void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev)
-+{
-+ const struct kfd2kgd_calls *kfd2kgd;
-+
-+ if (!kgd2kfd)
-+ return;
-+
- switch (adev->asic_type) {
- #ifdef CONFIG_DRM_AMDGPU_CIK
- case CHIP_KAVERI:
-+ case CHIP_HAWAII:
- kfd2kgd = amdgpu_amdkfd_gfx_7_get_functions();
- break;
- #endif
- case CHIP_CARRIZO:
-+ case CHIP_TONGA:
-+ case CHIP_FIJI:
-+ case CHIP_POLARIS10:
-+ case CHIP_POLARIS11:
- kfd2kgd = amdgpu_amdkfd_gfx_8_0_get_functions();
- break;
-+ case CHIP_VEGA10:
-+ case CHIP_RAVEN:
-+ kfd2kgd = amdgpu_amdkfd_gfx_9_0_get_functions();
-+ break;
- default:
-- return false;
-- }
--
-- return true;
--}
--
--void amdgpu_amdkfd_fini(void)
--{
-- if (kgd2kfd) {
-- kgd2kfd->exit();
-- symbol_put(kgd2kfd_init);
-+ dev_info(adev->dev, "kfd not supported on this ASIC\n");
-+ return;
- }
--}
-
--void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev)
--{
-- if (kgd2kfd)
-- adev->kfd = kgd2kfd->probe((struct kgd_dev *)adev,
-- adev->pdev, kfd2kgd);
-+ adev->kfd = kgd2kfd->probe((struct kgd_dev *)adev,
-+ adev->pdev, kfd2kgd);
- }
-
- void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
- {
- int i;
- int last_valid_bit;
-+
- if (adev->kfd) {
- struct kgd2kfd_shared_resources gpu_resources = {
-- .compute_vmid_bitmap = 0xFF00,
-+ .compute_vmid_bitmap = global_compute_vmid_bitmap,
- .num_pipe_per_mec = adev->gfx.mec.num_pipe_per_mec,
-- .num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe
-+ .num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe,
-+ .gpuvm_size = (uint64_t)amdgpu_vm_size << 30
- };
-
- /* this is going to have a few of the MSBs set that we need to
-- * clear */
-+ * clear
-+ */
- bitmap_complement(gpu_resources.queue_bitmap,
- adev->gfx.mec.queue_bitmap,
- KGD_MAX_QUEUES);
-@@ -120,7 +138,8 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
- gpu_resources.queue_bitmap);
-
- /* According to linux/bitmap.h we shouldn't use bitmap_clear if
-- * nbits is not compile time constant */
-+ * nbits is not compile time constant
-+ */
- last_valid_bit = 1 /* only first MEC can have compute queues */
- * adev->gfx.mec.num_pipe_per_mec
- * adev->gfx.mec.num_queue_per_pipe;
-@@ -131,6 +150,28 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
- &gpu_resources.doorbell_physical_address,
- &gpu_resources.doorbell_aperture_size,
- &gpu_resources.doorbell_start_offset);
-+ if (adev->asic_type >= CHIP_VEGA10) {
-+ /* On SOC15 the BIF is involved in routing
-+ * doorbells using the low 12 bits of the
-+ * address. Communicate the assignments to
-+ * KFD. KFD uses two doorbell pages per
-+ * process in case of 64-bit doorbells so we
-+ * can use each doorbell assignment twice.
-+ */
-+ gpu_resources.sdma_doorbell[0][0] =
-+ AMDGPU_DOORBELL64_sDMA_ENGINE0;
-+ gpu_resources.sdma_doorbell[0][1] =
-+ AMDGPU_DOORBELL64_sDMA_ENGINE0 + 0x200;
-+ gpu_resources.sdma_doorbell[1][0] =
-+ AMDGPU_DOORBELL64_sDMA_ENGINE1;
-+ gpu_resources.sdma_doorbell[1][1] =
-+ AMDGPU_DOORBELL64_sDMA_ENGINE1 + 0x200;
-+ /* Doorbells 0x0f0-0ff and 0x2f0-2ff are reserved for
-+ * SDMA, IH and VCN. So don't use them for the CP.
-+ */
-+ gpu_resources.reserved_doorbell_mask = 0x1f0;
-+ gpu_resources.reserved_doorbell_val = 0x0f0;
-+ }
-
- kgd2kfd->device_init(adev->kfd, &gpu_resources);
- }
-@@ -167,24 +208,81 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev)
- return r;
- }
-
-+int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
-+ uint32_t vmid, uint64_t gpu_addr,
-+ uint32_t *ib_cmd, uint32_t ib_len)
-+{
-+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-+ struct amdgpu_job *job;
-+ struct amdgpu_ib *ib;
-+ struct amdgpu_ring *ring;
-+ struct dma_fence *f = NULL;
-+ int ret;
-+
-+ switch (engine) {
-+ case KGD_ENGINE_MEC1:
-+ ring = &adev->gfx.compute_ring[0];
-+ break;
-+ case KGD_ENGINE_SDMA1:
-+ ring = &adev->sdma.instance[0].ring;
-+ break;
-+ case KGD_ENGINE_SDMA2:
-+ ring = &adev->sdma.instance[1].ring;
-+ break;
-+ default:
-+ pr_err("Invalid engine in IB submission: %d\n", engine);
-+ ret = -EINVAL;
-+ goto err;
-+ }
-+
-+ ret = amdgpu_job_alloc(adev, 1, &job, NULL);
-+ if (ret)
-+ goto err;
-+
-+ ib = &job->ibs[0];
-+ memset(ib, 0, sizeof(struct amdgpu_ib));
-+
-+ ib->gpu_addr = gpu_addr;
-+ ib->ptr = ib_cmd;
-+ ib->length_dw = ib_len;
-+ /* This works for NO_HWS. TODO: need to handle without knowing VMID */
-+ job->vm_id = vmid;
-+
-+ ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);
-+ if (ret) {
-+ DRM_ERROR("amdgpu: failed to schedule IB.\n");
-+ goto err_ib_sched;
-+ }
-+
-+ ret = dma_fence_wait(f, false);
-+
-+err_ib_sched:
-+ dma_fence_put(f);
-+ amdgpu_job_free(job);
-+err:
-+ return ret;
-+}
-+
-+u32 pool_to_domain(enum kgd_memory_pool p)
-+{
-+ switch (p) {
-+ case KGD_POOL_FRAMEBUFFER: return AMDGPU_GEM_DOMAIN_VRAM;
-+ default: return AMDGPU_GEM_DOMAIN_GTT;
-+ }
-+}
-+
- int alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
- void **mem_obj, uint64_t *gpu_addr,
- void **cpu_ptr)
- {
- struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-- struct kgd_mem **mem = (struct kgd_mem **) mem_obj;
-+ struct amdgpu_bo *bo = NULL;
- int r;
--
-- BUG_ON(kgd == NULL);
-- BUG_ON(gpu_addr == NULL);
-- BUG_ON(cpu_ptr == NULL);
--
-- *mem = kmalloc(sizeof(struct kgd_mem), GFP_KERNEL);
-- if ((*mem) == NULL)
-- return -ENOMEM;
-+ uint64_t gpu_addr_tmp = 0;
-+ void *cpu_ptr_tmp = NULL;
-
- r = amdgpu_bo_create(adev, size, PAGE_SIZE, true, AMDGPU_GEM_DOMAIN_GTT,
-- AMDGPU_GEM_CREATE_CPU_GTT_USWC, NULL, NULL, &(*mem)->bo);
-+ AMDGPU_GEM_CREATE_CPU_GTT_USWC, NULL, NULL, 0, &bo);
- if (r) {
- dev_err(adev->dev,
- "failed to allocate BO for amdkfd (%d)\n", r);
-@@ -192,64 +290,87 @@ int alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
- }
-
- /* map the buffer */
-- r = amdgpu_bo_reserve((*mem)->bo, true);
-+ r = amdgpu_bo_reserve(bo, true);
- if (r) {
- dev_err(adev->dev, "(%d) failed to reserve bo for amdkfd\n", r);
- goto allocate_mem_reserve_bo_failed;
- }
-
-- r = amdgpu_bo_pin((*mem)->bo, AMDGPU_GEM_DOMAIN_GTT,
-- &(*mem)->gpu_addr);
-+ r = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT,
-+ &gpu_addr_tmp);
- if (r) {
- dev_err(adev->dev, "(%d) failed to pin bo for amdkfd\n", r);
- goto allocate_mem_pin_bo_failed;
- }
-- *gpu_addr = (*mem)->gpu_addr;
-
-- r = amdgpu_bo_kmap((*mem)->bo, &(*mem)->cpu_ptr);
-+ r = amdgpu_bo_kmap(bo, &cpu_ptr_tmp);
- if (r) {
- dev_err(adev->dev,
- "(%d) failed to map bo to kernel for amdkfd\n", r);
- goto allocate_mem_kmap_bo_failed;
- }
-- *cpu_ptr = (*mem)->cpu_ptr;
-
-- amdgpu_bo_unreserve((*mem)->bo);
-+ *mem_obj = bo;
-+ *gpu_addr = gpu_addr_tmp;
-+ *cpu_ptr = cpu_ptr_tmp;
-+
-+ amdgpu_bo_unreserve(bo);
-
- return 0;
-
- allocate_mem_kmap_bo_failed:
-- amdgpu_bo_unpin((*mem)->bo);
-+ amdgpu_bo_unpin(bo);
- allocate_mem_pin_bo_failed:
-- amdgpu_bo_unreserve((*mem)->bo);
-+ amdgpu_bo_unreserve(bo);
- allocate_mem_reserve_bo_failed:
-- amdgpu_bo_unref(&(*mem)->bo);
-+ amdgpu_bo_unref(&bo);
-
- return r;
- }
-
- void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj)
- {
-- struct kgd_mem *mem = (struct kgd_mem *) mem_obj;
-+ struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj;
-
-- BUG_ON(mem == NULL);
--
-- amdgpu_bo_reserve(mem->bo, true);
-- amdgpu_bo_kunmap(mem->bo);
-- amdgpu_bo_unpin(mem->bo);
-- amdgpu_bo_unreserve(mem->bo);
-- amdgpu_bo_unref(&(mem->bo));
-- kfree(mem);
-+ amdgpu_bo_reserve(bo, true);
-+ amdgpu_bo_kunmap(bo);
-+ amdgpu_bo_unpin(bo);
-+ amdgpu_bo_unreserve(bo);
-+ amdgpu_bo_unref(&(bo));
- }
-
--uint64_t get_vmem_size(struct kgd_dev *kgd)
-+void get_local_mem_info(struct kgd_dev *kgd,
-+ struct kfd_local_mem_info *mem_info)
- {
-- struct amdgpu_device *adev =
-- (struct amdgpu_device *)kgd;
-+ uint64_t address_mask;
-+ resource_size_t aper_limit;
-+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-
-- BUG_ON(kgd == NULL);
-+ address_mask = adev->dev->dma_mask ? ~*adev->dev->dma_mask :
-+ ~((1ULL << 32) - 1);
-+ aper_limit = adev->mc.aper_base + adev->mc.aper_size;
-+
-+ memset(mem_info, 0, sizeof(*mem_info));
-+ if (!(adev->mc.aper_base & address_mask ||
-+ aper_limit & address_mask)) {
-+ mem_info->local_mem_size_public = adev->mc.visible_vram_size;
-+ mem_info->local_mem_size_private = adev->mc.real_vram_size -
-+ adev->mc.visible_vram_size;
-+ } else {
-+ mem_info->local_mem_size_public = 0;
-+ mem_info->local_mem_size_private = adev->mc.real_vram_size;
-+ }
-+ mem_info->vram_width = adev->mc.vram_width;
-
-- return adev->mc.real_vram_size;
-+ pr_debug("Address base: 0x%llx limit 0x%llx public 0x%llx private 0x%llx\n",
-+ adev->mc.aper_base, aper_limit,
-+ mem_info->local_mem_size_public,
-+ mem_info->local_mem_size_private);
-+
-+ if (amdgpu_sriov_vf(adev))
-+ mem_info->mem_clk_max = adev->clock.default_mclk / 100;
-+ else
-+ mem_info->mem_clk_max = amdgpu_dpm_get_mclk(adev, false) / 100;
- }
-
- uint64_t get_gpu_clock_counter(struct kgd_dev *kgd)
-@@ -271,3 +392,106 @@ uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd)
-
- return amdgpu_dpm_get_sclk(adev, false) / 100;
- }
-+
-+void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info)
-+{
-+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-+ struct amdgpu_cu_info acu_info = adev->gfx.cu_info;
-+
-+ memset(cu_info, 0, sizeof(*cu_info));
-+ if (sizeof(cu_info->cu_bitmap) != sizeof(acu_info.bitmap))
-+ return;
-+
-+ cu_info->cu_active_number = acu_info.number;
-+ cu_info->cu_ao_mask = acu_info.ao_cu_mask;
-+ memcpy(&cu_info->cu_bitmap[0], &acu_info.bitmap[0],
-+ sizeof(acu_info.bitmap));
-+ cu_info->num_shader_engines = adev->gfx.config.max_shader_engines;
-+ cu_info->num_shader_arrays_per_engine = adev->gfx.config.max_sh_per_se;
-+ cu_info->num_cu_per_sh = adev->gfx.config.max_cu_per_sh;
-+ cu_info->simd_per_cu = acu_info.simd_per_cu;
-+ cu_info->max_waves_per_simd = acu_info.max_waves_per_simd;
-+ cu_info->wave_front_size = acu_info.wave_front_size;
-+ cu_info->max_scratch_slots_per_cu = acu_info.max_scratch_slots_per_cu;
-+ cu_info->lds_size = acu_info.lds_size;
-+}
-+
-+int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd,
-+ struct kgd_dev **dma_buf_kgd,
-+ uint64_t *bo_size, void *metadata_buffer,
-+ size_t buffer_size, uint32_t *metadata_size,
-+ uint32_t *flags)
-+{
-+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-+ struct dma_buf *dma_buf;
-+ struct drm_gem_object *obj;
-+ struct amdgpu_bo *bo;
-+ uint64_t metadata_flags;
-+ int r = -EINVAL;
-+
-+ dma_buf = dma_buf_get(dma_buf_fd);
-+ if (IS_ERR(dma_buf))
-+ return PTR_ERR(dma_buf);
-+
-+ if (dma_buf->ops != &drm_gem_prime_dmabuf_ops)
-+ /* Can't handle non-graphics buffers */
-+ goto out_put;
-+
-+ obj = dma_buf->priv;
-+ if (obj->dev->driver != adev->ddev->driver)
-+ /* Can't handle buffers from different drivers */
-+ goto out_put;
-+
-+ adev = obj->dev->dev_private;
-+ bo = gem_to_amdgpu_bo(obj);
-+ if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM |
-+ AMDGPU_GEM_DOMAIN_GTT |
-+ AMDGPU_GEM_DOMAIN_DGMA)))
-+ /* Only VRAM, GTT and DGMA BOs are supported */
-+ goto out_put;
-+
-+ r = 0;
-+ if (dma_buf_kgd)
-+ *dma_buf_kgd = (struct kgd_dev *)adev;
-+ if (bo_size)
-+ *bo_size = amdgpu_bo_size(bo);
-+ if (metadata_size)
-+ *metadata_size = bo->metadata_size;
-+ if (metadata_buffer)
-+ r = amdgpu_bo_get_metadata(bo, metadata_buffer, buffer_size,
-+ metadata_size, &metadata_flags);
-+ if (flags) {
-+ /* If the preferred domain is DGMA, set flags to VRAM because
-+ * KFD doesn't support allocating DGMA memory
-+ */
-+ *flags = (bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM |
-+ AMDGPU_GEM_DOMAIN_DGMA)) ?
-+ ALLOC_MEM_FLAGS_VRAM : ALLOC_MEM_FLAGS_GTT;
-+
-+ if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)
-+ *flags |= ALLOC_MEM_FLAGS_PUBLIC;
-+ }
-+
-+out_put:
-+ dma_buf_put(dma_buf);
-+ return r;
-+}
-+
-+uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd)
-+{
-+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-+ uint64_t usage =
-+ amdgpu_vram_mgr_usage(&adev->mman.bdev.man[TTM_PL_VRAM]);
-+ return usage;
-+}
-+
-+bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev,
-+ u32 vmid)
-+{
-+ if (adev->kfd) {
-+ if ((1 << vmid) & global_compute_vmid_bitmap)
-+ return true;
-+ }
-+
-+ return false;
-+}
-diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
-old mode 100644
-new mode 100755
-index 6d3a10b..b259ba7
---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
-+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
-@@ -27,20 +27,109 @@
-
- #include <linux/types.h>
- #include <linux/mm.h>
-+#include <linux/workqueue.h>
-+#include <linux/mmu_context.h>
- #include <kgd_kfd_interface.h>
-+#include "amdgpu.h"
-+
-+extern const struct kgd2kfd_calls *kgd2kfd;
-
- struct amdgpu_device;
-
-+struct kfd_bo_va_list {
-+ struct list_head bo_list;
-+ struct amdgpu_bo_va *bo_va;
-+ void *kgd_dev;
-+ bool is_mapped;
-+ bool map_fail;
-+ uint64_t va;
-+ uint64_t pte_flags;
-+};
-+
- struct kgd_mem {
-+ struct mutex lock;
- struct amdgpu_bo *bo;
-- uint64_t gpu_addr;
-- void *cpu_ptr;
-+ struct list_head bo_va_list;
-+ /* protected by amdkfd_process_info.lock */
-+ struct ttm_validate_buffer validate_list;
-+ struct ttm_validate_buffer resv_list;
-+ uint32_t domain;
-+ unsigned int mapped_to_gpu_memory;
-+ void *kptr;
-+ uint64_t va;
-+
-+ uint32_t mapping_flags;
-+
-+ atomic_t invalid;
-+ struct amdkfd_process_info *process_info;
-+ struct page **user_pages;
-+
-+ struct amdgpu_sync sync;
-+
-+ /* flags bitfield */
-+ bool coherent : 1;
-+ bool no_substitute : 1;
-+ bool aql_queue : 1;
-+};
-+
-+/* KFD Memory Eviction */
-+struct amdgpu_amdkfd_fence {
-+ struct dma_fence base;
-+ void *mm;
-+ spinlock_t lock;
-+ char timeline_name[TASK_COMM_LEN];
-+};
-+
-+struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context,
-+ void *mm);
-+bool amd_kfd_fence_check_mm(struct dma_fence *f, void *mm);
-+struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f);
-+
-+struct amdkfd_process_info {
-+ /* List head of all VMs that belong to a KFD process */
-+ struct list_head vm_list_head;
-+ /* List head for all KFD BOs that belong to a KFD process. */
-+ struct list_head kfd_bo_list;
-+ /* List of userptr BOs that are valid or invalid */
-+ struct list_head userptr_valid_list;
-+ struct list_head userptr_inval_list;
-+ /* Lock to protect kfd_bo_list */
-+ struct mutex lock;
-+
-+ /* Number of VMs */
-+ unsigned int n_vms;
-+ /* Eviction Fence */
-+ struct amdgpu_amdkfd_fence *eviction_fence;
-+
-+ /* MMU-notifier related fields */
-+ atomic_t evicted_bos;
-+ struct delayed_work work;
-+ struct pid *pid;
-+};
-+
-+/* struct amdkfd_vm -
-+ * For Memory Eviction KGD requires a mechanism to keep track of all KFD BOs
-+ * belonging to a KFD process. All the VMs belonging to the same process point
-+ * to the same amdkfd_process_info.
-+ */
-+struct amdkfd_vm {
-+ /* Keep base as the first parameter for pointer compatibility between
-+ * amdkfd_vm and amdgpu_vm.
-+ */
-+ struct amdgpu_vm base;
-+
-+ /* List node in amdkfd_process_info.vm_list_head*/
-+ struct list_head vm_list_node;
-+
-+ struct amdgpu_device *adev;
-+ /* Points to the KFD process VM info*/
-+ struct amdkfd_process_info *process_info;
- };
-
-+
- int amdgpu_amdkfd_init(void);
- void amdgpu_amdkfd_fini(void);
-
--bool amdgpu_amdkfd_load_interface(struct amdgpu_device *adev);
-
- void amdgpu_amdkfd_suspend(struct amdgpu_device *adev);
- int amdgpu_amdkfd_resume(struct amdgpu_device *adev);
-@@ -50,17 +139,105 @@ void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev);
- void amdgpu_amdkfd_device_init(struct amdgpu_device *adev);
- void amdgpu_amdkfd_device_fini(struct amdgpu_device *adev);
-
-+int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, struct mm_struct *mm);
-+int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
-+ uint32_t vmid, uint64_t gpu_addr,
-+ uint32_t *ib_cmd, uint32_t ib_len);
-+int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
-+ struct dma_fence **ef);
- struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void);
- struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void);
-+struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void);
-+int amdgpu_amdkfd_copy_mem_to_mem(struct kgd_dev *kgd, struct kgd_mem *src_mem,
-+ uint64_t src_offset, struct kgd_mem *dst_mem,
-+ uint64_t dest_offset, uint64_t size, struct dma_fence **f,
-+ uint64_t *actual_size);
-+
-+bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev,
-+ u32 vmid);
-
- /* Shared API */
-+int map_bo(struct amdgpu_device *rdev, uint64_t va, void *vm,
-+ struct amdgpu_bo *bo, struct amdgpu_bo_va **bo_va);
- int alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
- void **mem_obj, uint64_t *gpu_addr,
- void **cpu_ptr);
- void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj);
--uint64_t get_vmem_size(struct kgd_dev *kgd);
-+void get_local_mem_info(struct kgd_dev *kgd,
-+ struct kfd_local_mem_info *mem_info);
- uint64_t get_gpu_clock_counter(struct kgd_dev *kgd);
-
- uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd);
-+void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info);
-+int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd,
-+ struct kgd_dev **dmabuf_kgd,
-+ uint64_t *bo_size, void *metadata_buffer,
-+ size_t buffer_size, uint32_t *metadata_size,
-+ uint32_t *flags);
-+uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd);
-+
-+#define read_user_wptr(mmptr, wptr, dst) \
-+ ({ \
-+ bool valid = false; \
-+ if ((mmptr) && (wptr)) { \
-+ if ((mmptr) == current->mm) { \
-+ valid = !get_user((dst), (wptr)); \
-+ } else if (current->mm == NULL) { \
-+ use_mm(mmptr); \
-+ valid = !get_user((dst), (wptr)); \
-+ unuse_mm(mmptr); \
-+ } \
-+ } \
-+ valid; \
-+ })
-+
-+/* GPUVM API */
-+int amdgpu_amdkfd_gpuvm_sync_memory(
-+ struct kgd_dev *kgd, struct kgd_mem *mem, bool intr);
-+int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
-+ struct kgd_dev *kgd, uint64_t va, uint64_t size,
-+ void *vm, struct kgd_mem **mem,
-+ uint64_t *offset, uint32_t flags);
-+int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
-+ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm);
-+int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
-+ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm);
-+int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
-+ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm);
-
-+int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm,
-+ void **process_info,
-+ struct dma_fence **ef);
-+void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm);
-+
-+uint32_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm);
-+
-+int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd,
-+ struct kfd_vm_fault_info *info);
-+
-+int amdgpu_amdkfd_gpuvm_mmap_bo(
-+ struct kgd_dev *kgd, struct vm_area_struct *vma);
-+
-+int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd,
-+ struct kgd_mem *mem, void **kptr);
-+
-+int amdgpu_amdkfd_gpuvm_pin_get_sg_table(struct kgd_dev *kgd,
-+ struct kgd_mem *mem, uint64_t offset,
-+ uint64_t size, struct sg_table **ret_sg);
-+void amdgpu_amdkfd_gpuvm_unpin_put_sg_table(
-+ struct kgd_mem *mem, struct sg_table *sg);
-+int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd,
-+ struct dma_buf *dmabuf,
-+ uint64_t va, void *vm,
-+ struct kgd_mem **mem, uint64_t *size,
-+ uint64_t *mmap_offset);
-+int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_dev *kgd, void *vm,
-+ struct kgd_mem *mem,
-+ struct dma_buf **dmabuf);
-+int amdgpu_amdkfd_gpuvm_evict_mem(struct kgd_mem *mem, struct mm_struct *mm);
-+int amdgpu_amdkfd_gpuvm_restore_mem(struct kgd_mem *mem, struct mm_struct *mm);
-+
-+void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
-+void amdgpu_amdkfd_unreserve_system_memory_limit(struct amdgpu_bo *bo);
- #endif /* AMDGPU_AMDKFD_H_INCLUDED */
-+
-diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
-new file mode 100644
-index 0000000..3961937
---- /dev/null
-+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
-@@ -0,0 +1,196 @@
-+/*
-+ * Copyright 2016 Advanced Micro Devices, Inc.
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a
-+ * copy of this software and associated documentation files (the "Software"),
-+ * to deal in the Software without restriction, including without limitation
-+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
-+ * and/or sell copies of the Software, and to permit persons to whom the
-+ * Software is furnished to do so, subject to the following conditions:
-+ *
-+ * The above copyright notice and this permission notice shall be included in
-+ * all copies or substantial portions of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
-+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-+ * OTHER DEALINGS IN THE SOFTWARE.
-+ */
-+
-+#include <linux/spinlock.h>
-+#include <linux/atomic.h>
-+#include <linux/stacktrace.h>
-+#include <linux/sched.h>
-+#include <linux/slab.h>
-+#include "amdgpu_amdkfd.h"
-+
-+const struct dma_fence_ops amd_kfd_fence_ops;
-+static atomic_t fence_seq = ATOMIC_INIT(0);
-+
-+static int amd_kfd_fence_signal(struct dma_fence *f);
-+
-+/* Eviction Fence
-+ * Fence helper functions to deal with KFD memory eviction.
-+ * Big Idea - Since KFD submissions are done by user queues, a BO cannot be
-+ * evicted unless all the user queues for that process are evicted.
-+ *
-+ * All the BOs in a process share an eviction fence. When process X wants
-+ * to map VRAM memory but TTM can't find enough space, TTM will attempt to
-+ * evict BOs from its LRU list. TTM checks if the BO is valuable to evict
-+ * by calling ttm_bo_driver->eviction_valuable().
-+ *
-+ * ttm_bo_driver->eviction_valuable() - will return false if the BO belongs
-+ * to process X. Otherwise, it will return true to indicate BO can be
-+ * evicted by TTM.
-+ *
-+ * If ttm_bo_driver->eviction_valuable returns true, then TTM will continue
-+ * the eviction process for that BO by calling ttm_bo_evict --> amdgpu_bo_move
-+ * --> amdgpu_copy_buffer(). This sets up job in GPU scheduler.
-+ *
-+ * GPU Scheduler (amd_sched_main) - sets up a cb (fence_add_callback) to
-+ * notify when the BO is free to move. fence_add_callback --> enable_signaling
-+ * --> amdgpu_amdkfd_fence.enable_signaling
-+ *
-+ * amdgpu_amdkfd_fence.enable_signaling - Start a work item that will quiesce
-+ * user queues and signal fence. The work item will also start another delayed
-+ * work item to restore BOs
-+ */
-+
-+struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context,
-+ void *mm)
-+{
-+ struct amdgpu_amdkfd_fence *fence = NULL;
-+
-+ fence = kzalloc(sizeof(*fence), GFP_KERNEL);
-+ if (fence == NULL)
-+ return NULL;
-+
-+ /* mm_struct mm is used as void pointer to identify the parent
-+ * KFD process. Don't dereference it. Fence and any threads using
-+ * mm is guaranteed to be released before process termination.
-+ */
-+ fence->mm = mm;
-+ get_task_comm(fence->timeline_name, current);
-+ spin_lock_init(&fence->lock);
-+
-+ dma_fence_init(&fence->base, &amd_kfd_fence_ops, &fence->lock,
-+ context, atomic_inc_return(&fence_seq));
-+
-+ return fence;
-+}
-+
-+struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f)
-+{
-+ struct amdgpu_amdkfd_fence *fence;
-+
-+ if (!f)
-+ return NULL;
-+
-+ fence = container_of(f, struct amdgpu_amdkfd_fence, base);
-+ if (fence && f->ops == &amd_kfd_fence_ops)
-+ return fence;
-+
-+ return NULL;
-+}
-+
-+static const char *amd_kfd_fence_get_driver_name(struct dma_fence *f)
-+{
-+ return "amdgpu_amdkfd_fence";
-+}
-+
-+static const char *amd_kfd_fence_get_timeline_name(struct dma_fence *f)
-+{
-+ struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
-+
-+ return fence->timeline_name;
-+}
-+
-+/**
-+ * amd_kfd_fence_enable_signaling - This gets called when TTM wants to evict
-+ * a KFD BO and schedules a job to move the BO.
-+ * If fence is already signaled return true.
-+ * If fence is not signaled schedule an evict KFD process work item.
-+ */
-+static bool amd_kfd_fence_enable_signaling(struct dma_fence *f)
-+{
-+ struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
-+
-+ if (!fence)
-+ return false;
-+
-+ if (dma_fence_is_signaled(f))
-+ return true;
-+
-+ if (!kgd2kfd->schedule_evict_and_restore_process(
-+ (struct mm_struct *)fence->mm, f))
-+ return true;
-+
-+ return false;
-+}
-+
-+static int amd_kfd_fence_signal(struct dma_fence *f)
-+{
-+ unsigned long flags;
-+ int ret;
-+
-+ spin_lock_irqsave(f->lock, flags);
-+ /* Set enabled bit so cb will be called */
-+ set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &f->flags);
-+ ret = dma_fence_signal_locked(f);
-+ spin_unlock_irqrestore(f->lock, flags);
-+
-+ return ret;
-+}
-+
-+/**
-+ * amd_kfd_fence_release - callback that fence can be freed
-+ *
-+ * @fence: fence
-+ *
-+ * This function is called when the reference count becomes zero.
-+ * It just RCU schedules freeing up the fence.
-+*/
-+static void amd_kfd_fence_release(struct dma_fence *f)
-+{
-+ struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
-+ /* Unconditionally signal the fence. The process is getting
-+ * terminated.
-+ */
-+ if (WARN_ON(!fence))
-+ return; /* Not an amdgpu_amdkfd_fence */
-+
-+ amd_kfd_fence_signal(f);
-+ kfree_rcu(f, rcu);
-+}
-+
-+/**
-+ * amd_kfd_fence_check_mm - Check if @mm is same as that of the fence @f
-+ * if same return TRUE else return FALSE.
-+ *
-+ * @f: [IN] fence
-+ * @mm: [IN] mm that needs to be verified
-+*/
-+bool amd_kfd_fence_check_mm(struct dma_fence *f, void *mm)
-+{
-+ struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
-+
-+ if (!fence)
-+ return false;
-+ else if (fence->mm == mm)
-+ return true;
-+
-+ return false;
-+}
-+
-+const struct dma_fence_ops amd_kfd_fence_ops = {
-+ .get_driver_name = amd_kfd_fence_get_driver_name,
-+ .get_timeline_name = amd_kfd_fence_get_timeline_name,
-+ .enable_signaling = amd_kfd_fence_enable_signaling,
-+ .signaled = NULL,
-+ .wait = dma_fence_default_wait,
-+ .release = amd_kfd_fence_release,
-+};
-+
-diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
-old mode 100644
-new mode 100755
-index 5748504..6964ece
---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
-+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
-@@ -20,6 +20,9 @@
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-+#undef pr_fmt
-+#define pr_fmt(fmt) "kfd2kgd: " fmt
-+
- #include <linux/fdtable.h>
- #include <linux/uaccess.h>
- #include <linux/firmware.h>
-@@ -39,6 +42,14 @@
- #include "gmc/gmc_7_1_sh_mask.h"
- #include "cik_structs.h"
-
-+#define AMDKFD_SKIP_UNCOMPILED_CODE 1
-+
-+enum hqd_dequeue_request_type {
-+ NO_ACTION = 0,
-+ DRAIN_PIPE,
-+ RESET_WAVES
-+};
-+
- enum {
- MAX_TRAPID = 8, /* 3 bits in the bitfield. */
- MAX_WATCH_ADDRESSES = 4
-@@ -55,8 +66,8 @@ enum {
- enum {
- ADDRESS_WATCH_REG_CNTL_ATC_BIT = 0x10000000UL,
- ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK = 0x00FFFFFF,
-- ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENSION = 0x03000000,
-- /* extend the mask to 26 bits to match the low address field */
-+ ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENTION = 0x03000000,
-+ /* extend the mask to 26 bits in order to match the low address field */
- ADDRESS_WATCH_REG_ADDLOW_SHIFT = 6,
- ADDRESS_WATCH_REG_ADDHIGH_MASK = 0xFFFF
- };
-@@ -81,30 +92,42 @@ union TCP_WATCH_CNTL_BITS {
- float f32All;
- };
-
-+static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm,
-+ int fd, uint32_t handle, struct kgd_mem **mem);
-+
-+static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type);
-+
- /*
- * Register access functions
- */
-
- static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
-- uint32_t sh_mem_config, uint32_t sh_mem_ape1_base,
-- uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases);
--
-+ uint32_t sh_mem_config, uint32_t sh_mem_ape1_base,
-+ uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases);
- static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
-- unsigned int vmid);
--
-+ unsigned int vmid);
- static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id,
-- uint32_t hpd_size, uint64_t hpd_gpu_addr);
-+ uint32_t hpd_size, uint64_t hpd_gpu_addr);
- static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id);
- static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
-- uint32_t queue_id, uint32_t __user *wptr);
--static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd);
-+ uint32_t queue_id, uint32_t __user *wptr,
-+ uint32_t wptr_shift, uint32_t wptr_mask,
-+ struct mm_struct *mm);
-+static int kgd_hqd_dump(struct kgd_dev *kgd,
-+ uint32_t pipe_id, uint32_t queue_id,
-+ uint32_t (**dump)[2], uint32_t *n_regs);
-+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
-+ uint32_t __user *wptr, struct mm_struct *mm);
-+static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
-+ uint32_t engine_id, uint32_t queue_id,
-+ uint32_t (**dump)[2], uint32_t *n_regs);
- static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
-- uint32_t pipe_id, uint32_t queue_id);
--
--static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type,
-+ uint32_t pipe_id, uint32_t queue_id);
-+static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd);
-+static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
-+ enum kfd_preempt_type reset_type,
- unsigned int utimeout, uint32_t pipe_id,
- uint32_t queue_id);
--static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd);
- static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
- unsigned int utimeout);
- static int kgd_address_watch_disable(struct kgd_dev *kgd);
-@@ -124,21 +147,60 @@ static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, uint8_t vmid);
- static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd,
- uint8_t vmid);
- static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid);
-+static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid);
-+static void set_num_of_requests(struct kgd_dev *dev, uint8_t num_of_req);
-+static int alloc_memory_of_scratch(struct kgd_dev *kgd,
-+ uint64_t va, uint32_t vmid);
-+static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable,
-+ uint8_t element_size, uint8_t index_stride, uint8_t mtype);
-+static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
-+ uint32_t page_table_base);
-+static uint32_t read_vmid_from_vmfault_reg(struct kgd_dev *kgd);
-+
-+/* Because of REG_GET_FIELD() being used, we put this function in the
-+ * asic specific file.
-+ */
-+static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd,
-+ struct tile_config *config)
-+{
-+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-
--static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type);
-+ config->gb_addr_config = adev->gfx.config.gb_addr_config;
-+ config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg,
-+ MC_ARB_RAMCFG, NOOFBANK);
-+ config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg,
-+ MC_ARB_RAMCFG, NOOFRANKS);
-+
-+ config->tile_config_ptr = adev->gfx.config.tile_mode_array;
-+ config->num_tile_configs =
-+ ARRAY_SIZE(adev->gfx.config.tile_mode_array);
-+ config->macro_tile_config_ptr =
-+ adev->gfx.config.macrotile_mode_array;
-+ config->num_macro_tile_configs =
-+ ARRAY_SIZE(adev->gfx.config.macrotile_mode_array);
-+
-+
-+ return 0;
-+}
-
- static const struct kfd2kgd_calls kfd2kgd = {
- .init_gtt_mem_allocation = alloc_gtt_mem,
- .free_gtt_mem = free_gtt_mem,
-- .get_vmem_size = get_vmem_size,
-+ .get_local_mem_info = get_local_mem_info,
- .get_gpu_clock_counter = get_gpu_clock_counter,
- .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz,
-+ .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm,
-+ .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm,
-+ .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir,
-+ .open_graphic_handle = open_graphic_handle,
- .program_sh_mem_settings = kgd_program_sh_mem_settings,
- .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
- .init_pipeline = kgd_init_pipeline,
- .init_interrupts = kgd_init_interrupts,
- .hqd_load = kgd_hqd_load,
- .hqd_sdma_load = kgd_hqd_sdma_load,
-+ .hqd_dump = kgd_hqd_dump,
-+ .hqd_sdma_dump = kgd_hqd_sdma_dump,
- .hqd_is_occupied = kgd_hqd_is_occupied,
- .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
- .hqd_destroy = kgd_hqd_destroy,
-@@ -147,17 +209,50 @@ static const struct kfd2kgd_calls kfd2kgd = {
- .address_watch_execute = kgd_address_watch_execute,
- .wave_control_execute = kgd_wave_control_execute,
- .address_watch_get_offset = kgd_address_watch_get_offset,
-- .get_atc_vmid_pasid_mapping_pasid = get_atc_vmid_pasid_mapping_pasid,
-- .get_atc_vmid_pasid_mapping_valid = get_atc_vmid_pasid_mapping_valid,
-+ .get_atc_vmid_pasid_mapping_pasid =
-+ get_atc_vmid_pasid_mapping_pasid,
-+ .get_atc_vmid_pasid_mapping_valid =
-+ get_atc_vmid_pasid_mapping_valid,
-+ .read_vmid_from_vmfault_reg = read_vmid_from_vmfault_reg,
- .write_vmid_invalidate_request = write_vmid_invalidate_request,
-- .get_fw_version = get_fw_version
-+ .invalidate_tlbs = invalidate_tlbs,
-+ .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory,
-+ .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu,
-+ .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu,
-+ .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu,
-+ .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu,
-+ .get_fw_version = get_fw_version,
-+ .set_num_of_requests = set_num_of_requests,
-+ .get_cu_info = get_cu_info,
-+ .alloc_memory_of_scratch = alloc_memory_of_scratch,
-+ .write_config_static_mem = write_config_static_mem,
-+ .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo,
-+ .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel,
-+ .set_vm_context_page_table_base = set_vm_context_page_table_base,
-+ .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table,
-+ .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table,
-+ .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info,
-+ .import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf,
-+ .export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf,
-+ .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info,
-+ .submit_ib = amdgpu_amdkfd_submit_ib,
-+ .get_tile_config = amdgpu_amdkfd_get_tile_config,
-+ .restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos,
-+ .copy_mem_to_mem = amdgpu_amdkfd_copy_mem_to_mem,
-+ .get_vram_usage = amdgpu_amdkfd_get_vram_usage
- };
-
--struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void)
-+struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions()
- {
- return (struct kfd2kgd_calls *)&kfd2kgd;
- }
-
-+static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm,
-+ int fd, uint32_t handle, struct kgd_mem **mem)
-+{
-+ return 0;
-+}
-+
- static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd)
- {
- return (struct amdgpu_device *)kgd;
-@@ -186,7 +281,7 @@ static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id,
- {
- struct amdgpu_device *adev = get_amdgpu_device(kgd);
-
-- uint32_t mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
-+ uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
- uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
-
- lock_srbm(kgd, mec, pipe, queue_id, 0);
-@@ -222,12 +317,12 @@ static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
-
- /*
- * We have to assume that there is no outstanding mapping.
-- * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because
-- * a mapping is in progress or because a mapping finished and the
-- * SW cleared it. So the protocol is to always wait & clear.
-+ * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because a
-+ * mapping is in progress or because a mapping finished and the SW
-+ * cleared it. So the protocol is to always wait & clear.
- */
-- uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid |
-- ATC_VMID0_PASID_MAPPING__VALID_MASK;
-+ uint32_t pasid_mapping = (pasid == 0) ? 0 :
-+ (uint32_t)pasid | ATC_VMID0_PASID_MAPPING__VALID_MASK;
-
- WREG32(mmATC_VMID0_PASID_MAPPING + vmid, pasid_mapping);
-
-@@ -273,8 +368,7 @@ static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m)
-
- retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET +
- m->sdma_queue_id * KFD_CIK_SDMA_QUEUE_OFFSET;
--
-- pr_debug("kfd: sdma base address: 0x%x\n", retval);
-+ pr_debug("sdma base address: 0x%x\n", retval);
-
- return retval;
- }
-@@ -290,26 +384,91 @@ static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd)
- }
-
- static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
-- uint32_t queue_id, uint32_t __user *wptr)
-+ uint32_t queue_id, uint32_t __user *wptr,
-+ uint32_t wptr_shift, uint32_t wptr_mask,
-+ struct mm_struct *mm)
- {
- struct amdgpu_device *adev = get_amdgpu_device(kgd);
-- uint32_t wptr_shadow, is_wptr_shadow_valid;
- struct cik_mqd *m;
-+ uint32_t *mqd_hqd;
-+ uint32_t reg, wptr_val, data;
-+ bool valid_wptr = false;
-
- m = get_mqd(mqd);
-
-- is_wptr_shadow_valid = !get_user(wptr_shadow, wptr);
-- if (is_wptr_shadow_valid)
-- m->cp_hqd_pq_wptr = wptr_shadow;
-+ acquire_queue(kgd, pipe_id, queue_id);
-+
-+ /* HQD registers extend from CP_MQD_BASE_ADDR to CP_MQD_CONTROL. */
-+ mqd_hqd = &m->cp_mqd_base_addr_lo;
-+
-+ for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_MQD_CONTROL; reg++)
-+ WREG32(reg, mqd_hqd[reg - mmCP_MQD_BASE_ADDR]);
-+
-+ /* Copy userspace write pointer value to register.
-+ * Activate doorbell logic to monitor subsequent changes.
-+ */
-+ data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control,
-+ CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
-+ WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, data);
-+
-+ /* read_user_ptr may take the mm->mmap_sem.
-+ * release srbm_mutex to avoid circular dependency between
-+ * srbm_mutex->mm_sem->reservation_ww_class_mutex->srbm_mutex.
-+ */
-+ release_queue(kgd);
-+ valid_wptr = read_user_wptr(mm, wptr, wptr_val);
-
- acquire_queue(kgd, pipe_id, queue_id);
-- gfx_v7_0_mqd_commit(adev, m);
-+ if (valid_wptr)
-+ WREG32(mmCP_HQD_PQ_WPTR, (wptr_val << wptr_shift) & wptr_mask);
-+
-+ data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);
-+ WREG32(mmCP_HQD_ACTIVE, data);
-+
-+
- release_queue(kgd);
-
- return 0;
- }
-
--static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd)
-+static int kgd_hqd_dump(struct kgd_dev *kgd,
-+ uint32_t pipe_id, uint32_t queue_id,
-+ uint32_t (**dump)[2], uint32_t *n_regs)
-+{
-+ struct amdgpu_device *adev = get_amdgpu_device(kgd);
-+ uint32_t i = 0, reg;
-+#define HQD_N_REGS (35+4)
-+#define DUMP_REG(addr) do { \
-+ if (WARN_ON_ONCE(i >= HQD_N_REGS)) \
-+ break; \
-+ (*dump)[i][0] = (addr) << 2; \
-+ (*dump)[i++][1] = RREG32(addr); \
-+ } while (0)
-+
-+ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
-+ if (*dump == NULL)
-+ return -ENOMEM;
-+
-+ acquire_queue(kgd, pipe_id, queue_id);
-+
-+ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE0);
-+ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE1);
-+ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE2);
-+ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE3);
-+
-+ for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_MQD_CONTROL; reg++)
-+ DUMP_REG(reg);
-+
-+ release_queue(kgd);
-+
-+ WARN_ON_ONCE(i != HQD_N_REGS);
-+ *n_regs = i;
-+
-+ return 0;
-+}
-+
-+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
-+ uint32_t __user *wptr, struct mm_struct *mm)
- {
- struct amdgpu_device *adev = get_amdgpu_device(kgd);
- struct cik_sdma_rlc_registers *m;
-@@ -320,17 +479,17 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd)
- m = get_sdma_mqd(mqd);
- sdma_base_addr = get_sdma_base_addr(m);
-
-- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
-- m->sdma_rlc_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));
-+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
-+ m->sdma_rlc_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));
-
-- end_jiffies = msecs_to_jiffies(2000) + jiffies;
- while (true) {
-- data = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
-- if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
-- break;
-- if (time_after(jiffies, end_jiffies))
-- return -ETIME;
-- usleep_range(500, 1000);
-+ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
-+ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
-+ break;
-+ if (timeout == 0)
-+ return -ETIME;
-+ msleep(10);
-+ timeout -= 10;
- }
- if (m->sdma_engine_id) {
- data = RREG32(mmSDMA1_GFX_CONTEXT_CNTL);
-@@ -344,25 +503,59 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd)
- WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data);
- }
-
-- WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL,
-- m->sdma_rlc_doorbell);
-- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0);
-- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0);
-- WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR,
-- m->sdma_rlc_virtual_addr);
-- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdma_rlc_rb_base);
-+ data = REG_SET_FIELD(m->sdma_rlc_doorbell, SDMA0_RLC0_DOORBELL,
-+ ENABLE, 1);
-+ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data);
-+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdma_rlc_rb_rptr);
-+ if (read_user_wptr(mm, wptr, data))
-+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, data);
-+ else
-+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
-+ m->sdma_rlc_rb_rptr);
-+
-+ WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR,
-+ m->sdma_rlc_virtual_addr);
-+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdma_rlc_rb_base);
-+
- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI,
- m->sdma_rlc_rb_base_hi);
- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO,
- m->sdma_rlc_rb_rptr_addr_lo);
- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
- m->sdma_rlc_rb_rptr_addr_hi);
-- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
-- m->sdma_rlc_rb_cntl);
--
-+ data = REG_SET_FIELD(m->sdma_rlc_rb_cntl, SDMA0_RLC0_RB_CNTL,
-+ RB_ENABLE, 1);
-+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data);
- return 0;
- }
-
-+static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
-+ uint32_t engine_id, uint32_t queue_id,
-+ uint32_t (**dump)[2], uint32_t *n_regs)
-+{
-+ struct amdgpu_device *adev = get_amdgpu_device(kgd);
-+ uint32_t sdma_offset = engine_id * SDMA1_REGISTER_OFFSET +
-+ queue_id * KFD_CIK_SDMA_QUEUE_OFFSET;
-+ uint32_t i = 0, reg;
-+#undef HQD_N_REGS
-+#define HQD_N_REGS (19+4)
-+
-+ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
-+ if (*dump == NULL)
-+ return -ENOMEM;
-+
-+ for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
-+ DUMP_REG(sdma_offset + reg);
-+ for (reg = mmSDMA0_RLC0_VIRTUAL_ADDR; reg <= mmSDMA0_RLC0_WATERMARK;
-+ reg++)
-+ DUMP_REG(sdma_offset + reg);
-+
-+ WARN_ON_ONCE(i != HQD_N_REGS);
-+ *n_regs = i;
-+
-+ return 0;
-+}
-+
- static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
- uint32_t pipe_id, uint32_t queue_id)
- {
-@@ -403,30 +596,99 @@ static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd)
- return false;
- }
-
--static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type,
-+static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
-+ enum kfd_preempt_type reset_type,
- unsigned int utimeout, uint32_t pipe_id,
- uint32_t queue_id)
- {
- struct amdgpu_device *adev = get_amdgpu_device(kgd);
- uint32_t temp;
-- int timeout = utimeout;
-+ enum hqd_dequeue_request_type type;
-+ unsigned long flags, end_jiffies;
-+ int retry;
-
- acquire_queue(kgd, pipe_id, queue_id);
- WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, 0);
-
-- WREG32(mmCP_HQD_DEQUEUE_REQUEST, reset_type);
-+ switch (reset_type) {
-+ case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:
-+ type = DRAIN_PIPE;
-+ break;
-+ case KFD_PREEMPT_TYPE_WAVEFRONT_RESET:
-+ type = RESET_WAVES;
-+ break;
-+ default:
-+ type = DRAIN_PIPE;
-+ break;
-+ }
-+
-+ /* Workaround: If IQ timer is active and the wait time is close to or
-+ * equal to 0, dequeueing is not safe. Wait until either the wait time
-+ * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is
-+ * cleared before continuing. Also, ensure wait times are set to at
-+ * least 0x3.
-+ */ -+ local_irq_save(flags); -+ preempt_disable(); -+ retry = 5000; /* wait for 500 usecs at maximum */ -+ while (true) { -+ temp = RREG32(mmCP_HQD_IQ_TIMER); -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) { -+ pr_debug("HW is processing IQ\n"); -+ goto loop; -+ } -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) { -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE) -+ == 3) /* SEM-rearm is safe */ -+ break; -+ /* Wait time 3 is safe for CP, but our MMIO read/write -+ * time is close to 1 microsecond, so check for 10 to -+ * leave more buffer room -+ */ -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME) -+ >= 10) -+ break; -+ pr_debug("IQ timer is active\n"); -+ } else -+ break; -+loop: -+ if (!retry) { -+ pr_err("CP HQD IQ timer status time out\n"); -+ break; -+ } -+ ndelay(100); -+ --retry; -+ } -+ retry = 1000; -+ while (true) { -+ temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST); -+ if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK)) -+ break; -+ pr_debug("Dequeue request is pending\n"); - -+ if (!retry) { -+ pr_err("CP HQD dequeue request time out\n"); -+ break; -+ } -+ ndelay(100); -+ --retry; -+ } -+ local_irq_restore(flags); -+ preempt_enable(); -+ -+ WREG32(mmCP_HQD_DEQUEUE_REQUEST, type); -+ -+ end_jiffies = (utimeout * HZ / 1000) + jiffies; - while (true) { - temp = RREG32(mmCP_HQD_ACTIVE); -- if (temp & CP_HQD_ACTIVE__ACTIVE_MASK) -+ if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) - break; -- if (timeout <= 0) { -- pr_err("kfd: cp queue preemption time out.\n"); -+ if (time_after(jiffies, end_jiffies)) { -+ pr_err("cp queue preemption time out\n"); - release_queue(kgd); - return -ETIME; - } -- msleep(20); -- timeout -= 20; -+ usleep_range(500, 1000); - } - - release_queue(kgd); -@@ -440,7 +702,7 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - struct cik_sdma_rlc_registers *m; - uint32_t sdma_base_addr; - uint32_t temp; -- int timeout = utimeout; -+ unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; - - m = get_sdma_mqd(mqd); - sdma_base_addr = get_sdma_base_addr(m); -@@ -451,12 +713,11 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - - while (true) { - temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); -- if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT) -+ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) - break; -- if (timeout <= 0) -+ if (time_after(jiffies, end_jiffies)) - return -ETIME; -- msleep(20); -- timeout -= 20; -+ usleep_range(500, 1000); - } - - WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); -@@ -464,6 +725,8 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | - SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); - -+ m->sdma_rlc_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); -+ - return 0; - } - -@@ -481,8 +744,9 @@ static int kgd_address_watch_disable(struct kgd_dev *kgd) - - /* Turning off this address until we set all the registers */ - for (i = 0; i < MAX_WATCH_ADDRESSES; i++) -- WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX + -- ADDRESS_WATCH_REG_CNTL], cntl.u32All); -+ WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX -+ + ADDRESS_WATCH_REG_CNTL], -+ cntl.u32All); - - return 0; - } -@@ -500,20 +764,24 @@ static int kgd_address_watch_execute(struct kgd_dev *kgd, - - /* Turning off this watch point until we set all the registers */ - cntl.bitfields.valid = 0; -- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + -- ADDRESS_WATCH_REG_CNTL], cntl.u32All); -+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX -+ + 
ADDRESS_WATCH_REG_CNTL], -+ cntl.u32All); - -- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + -- ADDRESS_WATCH_REG_ADDR_HI], addr_hi); -+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX -+ + ADDRESS_WATCH_REG_ADDR_HI], -+ addr_hi); - -- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + -- ADDRESS_WATCH_REG_ADDR_LO], addr_lo); -+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX -+ + ADDRESS_WATCH_REG_ADDR_LO], -+ addr_lo); - - /* Enable the watch point */ - cntl.bitfields.valid = 1; - -- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + -- ADDRESS_WATCH_REG_CNTL], cntl.u32All); -+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX -+ + ADDRESS_WATCH_REG_CNTL], -+ cntl.u32All); - - return 0; - } -@@ -567,7 +835,7 @@ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - - reg = RREG32(mmATC_VMID0_PASID_MAPPING + vmid); -- return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; -+ return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; - } - - static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) -@@ -577,52 +845,90 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) - WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); - } - -+static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid) -+{ -+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -+ int vmid; -+ -+ for (vmid = 0; vmid < 16; vmid++) { -+ if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) -+ continue; -+ if (RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & -+ ATC_VMID0_PASID_MAPPING__VALID_MASK) { -+ if ((RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & -+ ATC_VMID0_PASID_MAPPING__PASID_MASK) == pasid) { -+ WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); -+ break; -+ } -+ } -+ } -+ -+ return 0; -+} -+ -+static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, -+ uint8_t element_size, uint8_t index_stride, uint8_t mtype) -+{ -+ uint32_t reg; -+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -+ -+ reg = swizzle_enable << SH_STATIC_MEM_CONFIG__SWIZZLE_ENABLE__SHIFT | -+ element_size << SH_STATIC_MEM_CONFIG__ELEMENT_SIZE__SHIFT | -+ index_stride << SH_STATIC_MEM_CONFIG__INDEX_STRIDE__SHIFT | -+ mtype << SH_STATIC_MEM_CONFIG__PRIVATE_MTYPE__SHIFT; -+ -+ WREG32(mmSH_STATIC_MEM_CONFIG, reg); -+ return 0; -+} -+static int alloc_memory_of_scratch(struct kgd_dev *kgd, -+ uint64_t va, uint32_t vmid) -+{ -+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -+ -+ lock_srbm(kgd, 0, 0, 0, vmid); -+ WREG32(mmSH_HIDDEN_PRIVATE_BASE_VMID, va); -+ unlock_srbm(kgd); -+ -+ return 0; -+} -+ -+ - static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) - { - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - const union amdgpu_firmware_header *hdr; - -- BUG_ON(kgd == NULL); -- - switch (type) { - case KGD_ENGINE_PFP: -- hdr = (const union amdgpu_firmware_header *) -- adev->gfx.pfp_fw->data; -+ hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data; - break; - - case KGD_ENGINE_ME: -- hdr = (const union amdgpu_firmware_header *) -- adev->gfx.me_fw->data; -+ hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data; - break; - - case KGD_ENGINE_CE: -- hdr = (const union amdgpu_firmware_header *) -- adev->gfx.ce_fw->data; -+ hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data; - break; - - case KGD_ENGINE_MEC1: -- hdr = (const union amdgpu_firmware_header *) -- adev->gfx.mec_fw->data; -+ hdr = (const union 
amdgpu_firmware_header *)adev->gfx.mec_fw->data; - break; - - case KGD_ENGINE_MEC2: -- hdr = (const union amdgpu_firmware_header *) -- adev->gfx.mec2_fw->data; -+ hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data; - break; - - case KGD_ENGINE_RLC: -- hdr = (const union amdgpu_firmware_header *) -- adev->gfx.rlc_fw->data; -+ hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data; - break; - - case KGD_ENGINE_SDMA1: -- hdr = (const union amdgpu_firmware_header *) -- adev->sdma.instance[0].fw->data; -+ hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data; - break; - - case KGD_ENGINE_SDMA2: -- hdr = (const union amdgpu_firmware_header *) -- adev->sdma.instance[1].fw->data; -+ hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data; - break; - - default: -@@ -636,3 +942,42 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) - return hdr->common.ucode_version; - } - -+static void set_num_of_requests(struct kgd_dev *dev, uint8_t num_of_req) -+{ -+ uint32_t value; -+ struct amdgpu_device *adev = get_amdgpu_device(dev); -+ -+ value = RREG32(mmATC_ATS_DEBUG); -+ value &= ~ATC_ATS_DEBUG__NUM_REQUESTS_AT_ERR_MASK; -+ value |= (num_of_req << ATC_ATS_DEBUG__NUM_REQUESTS_AT_ERR__SHIFT); -+ -+ WREG32(mmATC_ATS_DEBUG, value); -+} -+ -+static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, -+ uint32_t page_table_base) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ /* TODO: Don't use hardcoded VMIDs */ -+ if (vmid < 8 || vmid > 15) { -+ pr_err("trying to set page table base for wrong VMID\n"); -+ return; -+ } -+ WREG32(mmVM_CONTEXT8_PAGE_TABLE_BASE_ADDR + vmid - 8, page_table_base); -+} -+ -+ /** -+ * read_vmid_from_vmfault_reg - read vmid from register -+ * -+ * adev: amdgpu_device pointer -+ * @vmid: vmid pointer -+ * read vmid from register (CIK). -+ */ -+static uint32_t read_vmid_from_vmfault_reg(struct kgd_dev *kgd) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ -+ uint32_t status = RREG32(mmVM_CONTEXT1_PROTECTION_FAULT_STATUS); -+ -+ return REG_GET_FIELD(status, VM_CONTEXT1_PROTECTION_FAULT_STATUS, VMID); -+} -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c -old mode 100644 -new mode 100755 -index c5044d5..2ff10e9 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c -@@ -20,6 +20,9 @@ - * OTHER DEALINGS IN THE SOFTWARE. 
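The first hunk of the gfx_v8 file below adds a pr_fmt prefix so every pr_err()/pr_debug() in the file is tagged with its subsystem. This is the stock kernel idiom; a small self-contained sketch (the message text is illustrative):

/* printk.h supplies a default pr_fmt, so #undef it first; pr_fmt is
 * expanded at each pr_*() call site, giving every message in the
 * file the "kfd2kgd: " prefix.
 */
#undef pr_fmt
#define pr_fmt(fmt) "kfd2kgd: " fmt

#include <linux/printk.h>

static void report_timeout(void)
{
	/* logs "kfd2kgd: cp queue preemption time out" */
	pr_err("cp queue preemption time out\n");
}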
- */ - -+#undef pr_fmt -+#define pr_fmt(fmt) "kfd2kgd: " fmt -+ - #include <linux/module.h> - #include <linux/fdtable.h> - #include <linux/uaccess.h> -@@ -28,7 +31,7 @@ - #include "amdgpu.h" - #include "amdgpu_amdkfd.h" - #include "amdgpu_ucode.h" --#include "gfx_v8_0.h" -+#include "amdgpu_amdkfd_gfx_v8.h" - #include "gca/gfx_8_0_sh_mask.h" - #include "gca/gfx_8_0_d.h" - #include "gca/gfx_8_0_enum.h" -@@ -39,7 +42,31 @@ - #include "vi_structs.h" - #include "vid.h" - --struct cik_sdma_rlc_registers; -+enum hqd_dequeue_request_type { -+ NO_ACTION = 0, -+ DRAIN_PIPE, -+ RESET_WAVES, -+ SAVE_WAVES -+}; -+ -+static const uint32_t watchRegs[MAX_WATCH_ADDRESSES * ADDRESS_WATCH_REG_MAX] = { -+ mmTCP_WATCH0_ADDR_H, mmTCP_WATCH0_ADDR_L, mmTCP_WATCH0_CNTL, -+ mmTCP_WATCH1_ADDR_H, mmTCP_WATCH1_ADDR_L, mmTCP_WATCH1_CNTL, -+ mmTCP_WATCH2_ADDR_H, mmTCP_WATCH2_ADDR_L, mmTCP_WATCH2_CNTL, -+ mmTCP_WATCH3_ADDR_H, mmTCP_WATCH3_ADDR_L, mmTCP_WATCH3_CNTL -+}; -+ -+ -+struct vi_sdma_mqd; -+ -+static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, -+ void *vm, struct kgd_mem **mem); -+static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem); -+ -+static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, -+ int fd, uint32_t handle, struct kgd_mem **mem); -+ -+static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); - - /* - * Register access functions -@@ -55,17 +82,26 @@ static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, - uint32_t hpd_size, uint64_t hpd_gpu_addr); - static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); - static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, -- uint32_t queue_id, uint32_t __user *wptr); --static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd); -+ uint32_t queue_id, uint32_t __user *wptr, -+ uint32_t wptr_shift, uint32_t wptr_mask, -+ struct mm_struct *mm); -+static int kgd_hqd_dump(struct kgd_dev *kgd, -+ uint32_t pipe_id, uint32_t queue_id, -+ uint32_t (**dump)[2], uint32_t *n_regs); -+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, -+ uint32_t __user *wptr, struct mm_struct *mm); -+static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, -+ uint32_t engine_id, uint32_t queue_id, -+ uint32_t (**dump)[2], uint32_t *n_regs); - static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, - uint32_t pipe_id, uint32_t queue_id); - static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); --static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, -+static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, -+ enum kfd_preempt_type reset_type, - unsigned int utimeout, uint32_t pipe_id, - uint32_t queue_id); - static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - unsigned int utimeout); --static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); - static int kgd_address_watch_disable(struct kgd_dev *kgd); - static int kgd_address_watch_execute(struct kgd_dev *kgd, - unsigned int watch_point_id, -@@ -84,20 +120,61 @@ static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, - static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, - uint8_t vmid); - static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); --static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); -+static void set_num_of_requests(struct kgd_dev *kgd, -+ uint8_t num_of_requests); -+static int alloc_memory_of_scratch(struct kgd_dev *kgd, 
-+ uint64_t va, uint32_t vmid); -+static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, -+ uint8_t element_size, uint8_t index_stride, uint8_t mtype); -+static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, -+ uint32_t page_table_base); -+static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid); -+ -+/* Because of REG_GET_FIELD() being used, we put this function in the -+ * asic specific file. -+ */ -+static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, -+ struct tile_config *config) -+{ -+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd; -+ -+ config->gb_addr_config = adev->gfx.config.gb_addr_config; -+ config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, -+ MC_ARB_RAMCFG, NOOFBANK); -+ config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, -+ MC_ARB_RAMCFG, NOOFRANKS); -+ -+ config->tile_config_ptr = adev->gfx.config.tile_mode_array; -+ config->num_tile_configs = -+ ARRAY_SIZE(adev->gfx.config.tile_mode_array); -+ config->macro_tile_config_ptr = -+ adev->gfx.config.macrotile_mode_array; -+ config->num_macro_tile_configs = -+ ARRAY_SIZE(adev->gfx.config.macrotile_mode_array); -+ -+ return 0; -+} - - static const struct kfd2kgd_calls kfd2kgd = { - .init_gtt_mem_allocation = alloc_gtt_mem, - .free_gtt_mem = free_gtt_mem, -- .get_vmem_size = get_vmem_size, -+ .get_local_mem_info = get_local_mem_info, - .get_gpu_clock_counter = get_gpu_clock_counter, - .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, -+ .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm, -+ .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm, -+ .create_process_gpumem = create_process_gpumem, -+ .destroy_process_gpumem = destroy_process_gpumem, -+ .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir, -+ .open_graphic_handle = open_graphic_handle, - .program_sh_mem_settings = kgd_program_sh_mem_settings, - .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, - .init_pipeline = kgd_init_pipeline, - .init_interrupts = kgd_init_interrupts, - .hqd_load = kgd_hqd_load, - .hqd_sdma_load = kgd_hqd_sdma_load, -+ .hqd_dump = kgd_hqd_dump, -+ .hqd_sdma_dump = kgd_hqd_sdma_dump, - .hqd_is_occupied = kgd_hqd_is_occupied, - .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, - .hqd_destroy = kgd_hqd_destroy, -@@ -111,14 +188,56 @@ static const struct kfd2kgd_calls kfd2kgd = { - .get_atc_vmid_pasid_mapping_valid = - get_atc_vmid_pasid_mapping_valid, - .write_vmid_invalidate_request = write_vmid_invalidate_request, -- .get_fw_version = get_fw_version -+ .invalidate_tlbs = invalidate_tlbs, -+ .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory, -+ .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, -+ .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, -+ .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, -+ .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu, -+ .get_fw_version = get_fw_version, -+ .set_num_of_requests = set_num_of_requests, -+ .get_cu_info = get_cu_info, -+ .alloc_memory_of_scratch = alloc_memory_of_scratch, -+ .write_config_static_mem = write_config_static_mem, -+ .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo, -+ .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel, -+ .set_vm_context_page_table_base = set_vm_context_page_table_base, -+ .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table, -+ .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table, -+ .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info, -+ .import_dmabuf = 
amdgpu_amdkfd_gpuvm_import_dmabuf, -+ .export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf, -+ .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info, -+ .submit_ib = amdgpu_amdkfd_submit_ib, -+ .get_tile_config = amdgpu_amdkfd_get_tile_config, -+ .restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos, -+ .copy_mem_to_mem = amdgpu_amdkfd_copy_mem_to_mem, -+ .get_vram_usage = amdgpu_amdkfd_get_vram_usage - }; - --struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void) -+struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions() - { - return (struct kfd2kgd_calls *)&kfd2kgd; - } - -+static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, -+ void *vm, struct kgd_mem **mem) -+{ -+ return 0; -+} -+ -+/* Destroys the GPU allocation and frees the kgd_mem structure */ -+static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem) -+{ -+ -+} -+ -+static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, -+ int fd, uint32_t handle, struct kgd_mem **mem) -+{ -+ return 0; -+} -+ - static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) - { - return (struct amdgpu_device *)kgd; -@@ -147,7 +266,7 @@ static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, - { - struct amdgpu_device *adev = get_amdgpu_device(kgd); - -- uint32_t mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; -+ uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; - uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); - - lock_srbm(kgd, mec, pipe, queue_id, 0); -@@ -216,21 +335,28 @@ static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) - uint32_t mec; - uint32_t pipe; - -- mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; -+ mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; - pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); - - lock_srbm(kgd, mec, pipe, 0, 0); - -- WREG32(mmCPC_INT_CNTL, CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK); -+ WREG32(mmCPC_INT_CNTL, CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | -+ CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); - - unlock_srbm(kgd); - - return 0; - } - --static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m) -+static inline uint32_t get_sdma_base_addr(struct vi_sdma_mqd *m) - { -- return 0; -+ uint32_t retval; -+ -+ retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET + -+ m->sdma_queue_id * KFD_VI_SDMA_QUEUE_OFFSET; -+ pr_debug("sdma base address: 0x%x\n", retval); -+ -+ return retval; - } - - static inline struct vi_mqd *get_mqd(void *mqd) -@@ -238,9 +364,9 @@ static inline struct vi_mqd *get_mqd(void *mqd) - return (struct vi_mqd *)mqd; - } - --static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) -+static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) - { -- return (struct cik_sdma_rlc_registers *)mqd; -+ return (struct vi_sdma_mqd *)mqd; - } - - static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, -@@ -252,16 +378,18 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - struct vi_mqd *m; - uint32_t *mqd_hqd; - uint32_t reg, wptr_val, data; -+ bool valid_wptr = false; - - m = get_mqd(mqd); - - acquire_queue(kgd, pipe_id, queue_id); -- /*HIQ is set during driver init period with vmid set to 0. For SRIOV -- * world switching support let the RLC know about the HIQ. -- * -- * Workaround: This causes reboots on CZ. Disable this on CZ, which -- * doesn't support SRIOV anyway. -- */ -+ -+ /* HIQ is set during driver init period with vmid set to 0. 
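Every per-queue register access in these files is bracketed by acquire_queue()/release_queue(), which take adev->srbm_mutex and select the MEC/pipe/queue in SRBM before any CP_HQD_* register is touched. A sketch of the bracketing, modeled on kgd_hqd_is_occupied() further down; the helper name is hypothetical:

/* Sketch: read a per-queue register safely. acquire_queue() locks
 * srbm_mutex and programs SRBM to address this MEC/pipe/queue;
 * release_queue() restores the selection and drops the lock.
 */
static uint32_t read_hqd_active(struct kgd_dev *kgd, uint32_t pipe_id,
				uint32_t queue_id)
{
	uint32_t active;

	acquire_queue(kgd, pipe_id, queue_id);
	active = RREG32(mmCP_HQD_ACTIVE);	/* queue-relative register */
	release_queue(kgd);

	return active;
}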
For SRIOV -+ * world switching support let the RLC know about the HIQ. -+ * -+ * Workaround: This causes reboots on CZ. Disable this on CZ, which -+ * doesn't support SRIOV anyway. -+ */ - if (m->cp_hqd_vmid == 0 && - adev->asic_type != CHIP_CARRIZO) { - uint32_t value, mec, pipe; -@@ -304,7 +432,14 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); - WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, data); - -- if (read_user_wptr(mm, wptr, wptr_val)) -+ /* read_user_ptr may take the mm->mmap_sem. -+ * release srbm_mutex to avoid circular dependency between -+ * srbm_mutex->mm_sem->reservation_ww_class_mutex->srbm_mutex. -+ */ -+ release_queue(kgd); -+ valid_wptr = read_user_wptr(mm, wptr, wptr_val); -+ acquire_queue(kgd, pipe_id, queue_id); -+ if (valid_wptr) - WREG32(mmCP_HQD_PQ_WPTR, (wptr_val << wptr_shift) & wptr_mask); - - data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); -@@ -315,8 +450,138 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - return 0; - } - --static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) -+static int kgd_hqd_dump(struct kgd_dev *kgd, -+ uint32_t pipe_id, uint32_t queue_id, -+ uint32_t (**dump)[2], uint32_t *n_regs) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ uint32_t i = 0, reg; -+#define HQD_N_REGS (54+4) -+#define DUMP_REG(addr) do { \ -+ if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ -+ break; \ -+ (*dump)[i][0] = (addr) << 2; \ -+ (*dump)[i++][1] = RREG32(addr); \ -+ } while (0) -+ -+ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); -+ if (*dump == NULL) -+ return -ENOMEM; -+ -+ acquire_queue(kgd, pipe_id, queue_id); -+ -+ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE0); -+ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE1); -+ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE2); -+ DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE3); -+ -+ for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_HQD_EOP_DONES; reg++) -+ DUMP_REG(reg); -+ -+ release_queue(kgd); -+ -+ WARN_ON_ONCE(i != HQD_N_REGS); -+ *n_regs = i; -+ -+ return 0; -+} -+ -+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, -+ uint32_t __user *wptr, struct mm_struct *mm) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ struct vi_sdma_mqd *m; -+ uint32_t sdma_base_addr; -+ uint32_t temp, timeout = 2000; -+ uint32_t data; -+ -+ m = get_sdma_mqd(mqd); -+ sdma_base_addr = get_sdma_base_addr(m); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, -+ m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); -+ -+ while (true) { -+ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); -+ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) -+ break; -+ if (timeout == 0) -+ return -ETIME; -+ msleep(10); -+ timeout -= 10; -+ } -+ if (m->sdma_engine_id) { -+ data = RREG32(mmSDMA1_GFX_CONTEXT_CNTL); -+ data = REG_SET_FIELD(data, SDMA1_GFX_CONTEXT_CNTL, -+ RESUME_CTX, 0); -+ WREG32(mmSDMA1_GFX_CONTEXT_CNTL, data); -+ } else { -+ data = RREG32(mmSDMA0_GFX_CONTEXT_CNTL); -+ data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, -+ RESUME_CTX, 0); -+ WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data); -+ } -+ -+ data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL, -+ ENABLE, 1); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr); -+ -+ if (read_user_wptr(mm, wptr, data)) -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, data); -+ else -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, -+ m->sdmax_rlcx_rb_rptr); -+ -+ 
WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, -+ m->sdmax_rlcx_virtual_addr); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, -+ m->sdmax_rlcx_rb_base_hi); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, -+ m->sdmax_rlcx_rb_rptr_addr_lo); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, -+ m->sdmax_rlcx_rb_rptr_addr_hi); -+ -+ data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL, -+ RB_ENABLE, 1); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data); -+ -+ return 0; -+} -+ -+static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, -+ uint32_t engine_id, uint32_t queue_id, -+ uint32_t (**dump)[2], uint32_t *n_regs) - { -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ uint32_t sdma_offset = engine_id * SDMA1_REGISTER_OFFSET + -+ queue_id * KFD_VI_SDMA_QUEUE_OFFSET; -+ uint32_t i = 0, reg; -+#undef HQD_N_REGS -+#define HQD_N_REGS (19+4+2+3+7) -+ -+ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); -+ if (*dump == NULL) -+ return -ENOMEM; -+ -+ for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) -+ DUMP_REG(sdma_offset + reg); -+ for (reg = mmSDMA0_RLC0_VIRTUAL_ADDR; reg <= mmSDMA0_RLC0_WATERMARK; -+ reg++) -+ DUMP_REG(sdma_offset + reg); -+ for (reg = mmSDMA0_RLC0_CSA_ADDR_LO; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; -+ reg++) -+ DUMP_REG(sdma_offset + reg); -+ for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; reg <= mmSDMA0_RLC0_DUMMY_REG; -+ reg++) -+ DUMP_REG(sdma_offset + reg); -+ for (reg = mmSDMA0_RLC0_MIDCMD_DATA0; reg <= mmSDMA0_RLC0_MIDCMD_CNTL; -+ reg++) -+ DUMP_REG(sdma_offset + reg); -+ -+ WARN_ON_ONCE(i != HQD_N_REGS); -+ *n_regs = i; -+ - return 0; - } - -@@ -345,7 +610,7 @@ static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, - static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) - { - struct amdgpu_device *adev = get_amdgpu_device(kgd); -- struct cik_sdma_rlc_registers *m; -+ struct vi_sdma_mqd *m; - uint32_t sdma_base_addr; - uint32_t sdma_rlc_rb_cntl; - -@@ -360,29 +625,102 @@ static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) - return false; - } - --static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, -+static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, -+ enum kfd_preempt_type reset_type, - unsigned int utimeout, uint32_t pipe_id, - uint32_t queue_id) - { - struct amdgpu_device *adev = get_amdgpu_device(kgd); - uint32_t temp; -- int timeout = utimeout; -+ enum hqd_dequeue_request_type type; -+ unsigned long flags, end_jiffies; -+ int retry; -+ struct vi_mqd *m = get_mqd(mqd); - - acquire_queue(kgd, pipe_id, queue_id); - -- WREG32(mmCP_HQD_DEQUEUE_REQUEST, reset_type); -+ if (m->cp_hqd_vmid == 0) -+ WREG32_FIELD(RLC_CP_SCHEDULERS, scheduler1, 0); - -+ switch (reset_type) { -+ case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: -+ type = DRAIN_PIPE; -+ break; -+ case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: -+ type = RESET_WAVES; -+ break; -+ default: -+ type = DRAIN_PIPE; -+ break; -+ } -+ -+ /* Workaround: If IQ timer is active and the wait time is close to or -+ * equal to 0, dequeueing is not safe. Wait until either the wait time -+ * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is -+ * cleared before continuing. Also, ensure wait times are set to at -+ * least 0x3. 
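The new hqd_dump/hqd_sdma_dump callbacks hand back a kmalloc'd array of {byte offset, value} pairs; DUMP_REG stores the dword offset shifted left by 2, so each entry carries a byte address. A sketch of a consumer, assuming the caller owns and frees the buffer as the KFD debugfs code does:

#include <linux/printk.h>
#include <linux/slab.h>

/* Sketch: walk a dump produced by the hqd_sdma_dump() above.
 * dump[i][0] is the register's byte address (dword offset << 2),
 * dump[i][1] the value read through RREG32().
 */
static void print_hqd_dump(uint32_t (*dump)[2], uint32_t n_regs)
{
	uint32_t i;

	for (i = 0; i < n_regs; i++)
		pr_info("reg 0x%05x = 0x%08x\n", dump[i][0], dump[i][1]);

	kfree(dump);	/* buffer was kmalloc'd by the dump callback */
}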
-+ */ -+ local_irq_save(flags); -+ preempt_disable(); -+ retry = 5000; /* wait for 500 usecs at maximum */ -+ while (true) { -+ temp = RREG32(mmCP_HQD_IQ_TIMER); -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) { -+ pr_debug("HW is processing IQ\n"); -+ goto loop; -+ } -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) { -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE) -+ == 3) /* SEM-rearm is safe */ -+ break; -+ /* Wait time 3 is safe for CP, but our MMIO read/write -+ * time is close to 1 microsecond, so check for 10 to -+ * leave more buffer room -+ */ -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME) -+ >= 10) -+ break; -+ pr_debug("IQ timer is active\n"); -+ } else -+ break; -+loop: -+ if (!retry) { -+ pr_err("CP HQD IQ timer status time out\n"); -+ break; -+ } -+ ndelay(100); -+ --retry; -+ } -+ retry = 1000; -+ while (true) { -+ temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST); -+ if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK)) -+ break; -+ pr_debug("Dequeue request is pending\n"); -+ -+ if (!retry) { -+ pr_err("CP HQD dequeue request time out\n"); -+ break; -+ } -+ ndelay(100); -+ --retry; -+ } -+ local_irq_restore(flags); -+ preempt_enable(); -+ -+ WREG32(mmCP_HQD_DEQUEUE_REQUEST, type); -+ -+ end_jiffies = (utimeout * HZ / 1000) + jiffies; - while (true) { - temp = RREG32(mmCP_HQD_ACTIVE); -- if (temp & CP_HQD_ACTIVE__ACTIVE_MASK) -+ if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) - break; -- if (timeout <= 0) { -- pr_err("kfd: cp queue preemption time out.\n"); -+ if (time_after(jiffies, end_jiffies)) { -+ pr_err("cp queue preemption time out.\n"); - release_queue(kgd); - return -ETIME; - } -- msleep(20); -- timeout -= 20; -+ usleep_range(500, 1000); - } - - release_queue(kgd); -@@ -393,10 +731,10 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - unsigned int utimeout) - { - struct amdgpu_device *adev = get_amdgpu_device(kgd); -- struct cik_sdma_rlc_registers *m; -+ struct vi_sdma_mqd *m; - uint32_t sdma_base_addr; - uint32_t temp; -- int timeout = utimeout; -+ unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; - - m = get_sdma_mqd(mqd); - sdma_base_addr = get_sdma_base_addr(m); -@@ -407,18 +745,19 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - - while (true) { - temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); -- if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT) -+ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) - break; -- if (timeout <= 0) -+ if (time_after(jiffies, end_jiffies)) - return -ETIME; -- msleep(20); -- timeout -= 20; -+ usleep_range(500, 1000); - } - - WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, 0); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, -+ RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | -+ SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); -+ -+ m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); - - return 0; - } -@@ -440,7 +779,7 @@ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - - reg = RREG32(mmATC_VMID0_PASID_MAPPING + vmid); -- return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; -+ return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; - } - - static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) -@@ -450,8 +789,83 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) - 
WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid);
- }
- 
-+/*
-+ * FIXME: Polaris test failed with this packet, FIJI works fine
-+ * Per the CP spec, invalidation with a specified pasid in the
-+ * packet is not officially supported, so disable it for V8
-+ *
-+ */
-+#ifdef V8_SUPPORT_IT_OFFICIAL
-+static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid)
-+{
-+ signed long r;
-+ struct dma_fence *f;
-+ struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
-+
-+ mutex_lock(&adev->gfx.kiq.ring_mutex);
-+ amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs packet */
-+ amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0));
-+ amdgpu_ring_write(ring,
-+ PACKET3_INVALIDATE_TLBS_DST_SEL(1) |
-+ PACKET3_INVALIDATE_TLBS_PASID(pasid));
-+ amdgpu_fence_emit(ring, &f);
-+ amdgpu_ring_commit(ring);
-+ mutex_unlock(&adev->gfx.kiq.ring_mutex);
-+
-+ r = dma_fence_wait(f, false);
-+ if (r)
-+ DRM_ERROR("wait for kiq fence error: %ld.\n", r);
-+ dma_fence_put(f);
-+
-+ return r;
-+}
-+#endif
-+static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid)
-+{
-+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
-+ int vmid;
-+
-+#ifdef V8_SUPPORT_IT_OFFICIAL
-+ struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
-+
-+ if (ring->ready)
-+ return invalidate_tlbs_with_kiq(adev, pasid);
-+#endif
-+
-+ for (vmid = 0; vmid < 16; vmid++) {
-+ if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid))
-+ continue;
-+ if (RREG32(mmATC_VMID0_PASID_MAPPING + vmid) &
-+ ATC_VMID0_PASID_MAPPING__VALID_MASK) {
-+ if ((RREG32(mmATC_VMID0_PASID_MAPPING + vmid) &
-+ ATC_VMID0_PASID_MAPPING__PASID_MASK) == pasid) {
-+ WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid);
-+ break;
-+ }
-+ }
-+ }
-+
-+ return 0;
-+}
-+
- static int kgd_address_watch_disable(struct kgd_dev *kgd)
- {
-+ struct amdgpu_device *adev = get_amdgpu_device(kgd);
-+ union TCP_WATCH_CNTL_BITS cntl;
-+ unsigned int i;
-+
-+ cntl.u32All = 0;
-+
-+ cntl.bitfields.valid = 0;
-+ cntl.bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK;
-+ cntl.bitfields.atc = 1;
-+
-+ /* Turning off this address until we set all the registers */
-+ for (i = 0; i < MAX_WATCH_ADDRESSES; i++)
-+ WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX
-+ + ADDRESS_WATCH_REG_CNTL],
-+ cntl.u32All);
-+
- return 0;
- }
- 
-@@ -461,6 +875,32 @@ static int kgd_address_watch_execute(struct kgd_dev *kgd,
- uint32_t addr_hi,
- uint32_t addr_lo)
- {
-+ struct amdgpu_device *adev = get_amdgpu_device(kgd);
-+ union TCP_WATCH_CNTL_BITS cntl;
-+
-+ cntl.u32All = cntl_val;
-+
-+ /* Turning off this watch point until we set all the registers */
-+ cntl.bitfields.valid = 0;
-+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX
-+ + ADDRESS_WATCH_REG_CNTL],
-+ cntl.u32All);
-+
-+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX
-+ + ADDRESS_WATCH_REG_ADDR_HI],
-+ addr_hi);
-+
-+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX
-+ + ADDRESS_WATCH_REG_ADDR_LO],
-+ addr_lo);
-+
-+ /* Enable the watch point */
-+ cntl.bitfields.valid = 1;
-+
-+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX
-+ + ADDRESS_WATCH_REG_CNTL],
-+ cntl.u32All);
-+
- return 0;
- }
- 
-@@ -493,6 +933,32 @@ static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd,
- unsigned int watch_point_id,
- unsigned int reg_offset)
- {
-+ return watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + reg_offset];
-+}
-+
-+static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable,
-+ uint8_t element_size, uint8_t index_stride, uint8_t mtype)
-+{
-+ uint32_t reg;
-+ struct amdgpu_device *adev 
= (struct amdgpu_device *) kgd; -+ -+ reg = swizzle_enable << SH_STATIC_MEM_CONFIG__SWIZZLE_ENABLE__SHIFT | -+ element_size << SH_STATIC_MEM_CONFIG__ELEMENT_SIZE__SHIFT | -+ index_stride << SH_STATIC_MEM_CONFIG__INDEX_STRIDE__SHIFT | -+ mtype << SH_STATIC_MEM_CONFIG__PRIVATE_MTYPE__SHIFT; -+ -+ WREG32(mmSH_STATIC_MEM_CONFIG, reg); -+ return 0; -+} -+static int alloc_memory_of_scratch(struct kgd_dev *kgd, -+ uint64_t va, uint32_t vmid) -+{ -+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -+ -+ lock_srbm(kgd, 0, 0, 0, vmid); -+ WREG32(mmSH_HIDDEN_PRIVATE_BASE_VMID, va); -+ unlock_srbm(kgd); -+ - return 0; - } - -@@ -501,47 +967,45 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - const union amdgpu_firmware_header *hdr; - -- BUG_ON(kgd == NULL); -- - switch (type) { - case KGD_ENGINE_PFP: - hdr = (const union amdgpu_firmware_header *) -- adev->gfx.pfp_fw->data; -+ adev->gfx.pfp_fw->data; - break; - - case KGD_ENGINE_ME: - hdr = (const union amdgpu_firmware_header *) -- adev->gfx.me_fw->data; -+ adev->gfx.me_fw->data; - break; - - case KGD_ENGINE_CE: - hdr = (const union amdgpu_firmware_header *) -- adev->gfx.ce_fw->data; -+ adev->gfx.ce_fw->data; - break; - - case KGD_ENGINE_MEC1: - hdr = (const union amdgpu_firmware_header *) -- adev->gfx.mec_fw->data; -+ adev->gfx.mec_fw->data; - break; - - case KGD_ENGINE_MEC2: - hdr = (const union amdgpu_firmware_header *) -- adev->gfx.mec2_fw->data; -+ adev->gfx.mec2_fw->data; - break; - - case KGD_ENGINE_RLC: - hdr = (const union amdgpu_firmware_header *) -- adev->gfx.rlc_fw->data; -+ adev->gfx.rlc_fw->data; - break; - - case KGD_ENGINE_SDMA1: - hdr = (const union amdgpu_firmware_header *) -- adev->sdma.instance[0].fw->data; -+ adev->sdma.instance[0].fw->data; - break; - - case KGD_ENGINE_SDMA2: - hdr = (const union amdgpu_firmware_header *) -- adev->sdma.instance[1].fw->data; -+ adev->sdma.instance[1].fw->data; - break; - - default: -@@ -554,3 +1018,21 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) - /* Only 12 bit in use*/ - return hdr->common.ucode_version; - } -+ -+static void set_num_of_requests(struct kgd_dev *kgd, -+ uint8_t num_of_requests) -+{ -+ pr_debug("This is a stub\n"); -+} -+ -+static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, -+ uint32_t page_table_base) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ /* TODO: Don't use hardcoded VMIDs */ -+ if (vmid < 8 || vmid > 15) { -+ pr_err("trying to set page table base for wrong VMID\n"); -+ return; -+ } -+ WREG32(mmVM_CONTEXT8_PAGE_TABLE_BASE_ADDR + vmid - 8, page_table_base); -+} -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h -new file mode 100644 -index 0000000..3c94919 ---- /dev/null -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h -@@ -0,0 +1,62 @@ -+/* -+ * Copyright 2015 Advanced Micro Devices, Inc. 
-+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. -+ */ -+ -+#ifndef AMDGPU_AMDKFD_GFX_V8_H_INCLUDED -+#define AMDGPU_AMDKFD_GFX_V8_H_INCLUDED -+ -+#include <linux/types.h> -+ -+enum { -+ MAX_TRAPID = 8, /* 3 bits in the bitfield. */ -+ MAX_WATCH_ADDRESSES = 4 -+}; -+ -+enum { -+ ADDRESS_WATCH_REG_ADDR_HI = 0, -+ ADDRESS_WATCH_REG_ADDR_LO, -+ ADDRESS_WATCH_REG_CNTL, -+ ADDRESS_WATCH_REG_MAX -+}; -+ -+/* not defined in the VI reg file */ -+enum { -+ ADDRESS_WATCH_REG_CNTL_ATC_BIT = 0x10000000UL, -+ ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK = 0x00FFFFFF, -+ ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENSION = 0x03000000, -+ /* extend the mask to 26 bits in order to match the low address field */ -+ ADDRESS_WATCH_REG_ADDLOW_SHIFT = 6, -+ ADDRESS_WATCH_REG_ADDHIGH_MASK = 0xFFFF -+}; -+ -+union TCP_WATCH_CNTL_BITS { -+ struct { -+ uint32_t mask:24; -+ uint32_t vmid:4; -+ uint32_t atc:1; -+ uint32_t mode:2; -+ uint32_t valid:1; -+ } bitfields, bits; -+ uint32_t u32All; -+ signed int i32All; -+ float f32All; -+}; -+#endif -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c -new file mode 100644 -index 0000000..edbae19 ---- /dev/null -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c -@@ -0,0 +1,1227 @@ -+/* -+ * Copyright 2014 Advanced Micro Devices, Inc. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. 
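The TCP_WATCH_CNTL_BITS union in the gfx_v8 header above overlays named bitfields on the raw 32-bit register image, so one field can be flipped and the whole word written back. A sketch of the disarm/program/re-arm sequence it enables, as used by the v8 kgd_address_watch_execute(); the helper name is hypothetical and the address programming is elided:

/* Sketch: toggle only the valid bit while preserving the caller's
 * cntl_val; see kgd_address_watch_execute() for the full sequence.
 */
static void arm_watch_cntl(struct amdgpu_device *adev, uint32_t cntl_reg,
			   uint32_t cntl_val)
{
	union TCP_WATCH_CNTL_BITS cntl;

	cntl.u32All = cntl_val;

	cntl.bitfields.valid = 0;	/* disarm while reprogramming */
	WREG32(cntl_reg, cntl.u32All);

	/* ...write ADDR_HI/ADDR_LO for the watch point here... */

	cntl.bitfields.valid = 1;	/* re-arm with the same settings */
	WREG32(cntl_reg, cntl.u32All);
}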
-+ */ -+#undef pr_fmt -+#define pr_fmt(fmt) "kfd2kgd: " fmt -+ -+#include <linux/module.h> -+#include <linux/fdtable.h> -+#include <linux/uaccess.h> -+#include <linux/firmware.h> -+#include <drm/drmP.h> -+#include "amdgpu.h" -+#include "amdgpu_amdkfd.h" -+#include "amdgpu_ucode.h" -+#include "amdgpu_amdkfd_gfx_v8.h" -+#include "vega10/soc15ip.h" -+#include "vega10/GC/gc_9_0_offset.h" -+#include "vega10/GC/gc_9_0_sh_mask.h" -+#include "vega10/vega10_enum.h" -+#include "vega10/SDMA0/sdma0_4_0_offset.h" -+#include "vega10/SDMA0/sdma0_4_0_sh_mask.h" -+#include "vega10/SDMA1/sdma1_4_0_offset.h" -+#include "vega10/SDMA1/sdma1_4_0_sh_mask.h" -+#include "vega10/ATHUB/athub_1_0_offset.h" -+#include "vega10/ATHUB/athub_1_0_sh_mask.h" -+#include "vega10/OSSSYS/osssys_4_0_offset.h" -+#include "vega10/OSSSYS/osssys_4_0_sh_mask.h" -+#include "soc15_common.h" -+#include "v9_structs.h" -+#include "soc15.h" -+#include "soc15d.h" -+ -+/* HACK: MMHUB and GC both have VM-related register with the same -+ * names but different offsets. Define the MMHUB register we need here -+ * with a prefix. A proper solution would be to move the functions -+ * programming these registers into gfx_v9_0.c and mmhub_v1_0.c -+ * respectively. -+ */ -+#define mmMMHUB_VM_INVALIDATE_ENG16_REQ 0x06f3 -+#define mmMMHUB_VM_INVALIDATE_ENG16_REQ_BASE_IDX 0 -+ -+#define mmMMHUB_VM_INVALIDATE_ENG16_ACK 0x0705 -+#define mmMMHUB_VM_INVALIDATE_ENG16_ACK_BASE_IDX 0 -+ -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32 0x072b -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32_BASE_IDX 0 -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32 0x072c -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32_BASE_IDX 0 -+ -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32 0x074b -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32_BASE_IDX 0 -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32 0x074c -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32_BASE_IDX 0 -+ -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32 0x076b -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32_BASE_IDX 0 -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32 0x076c -+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32_BASE_IDX 0 -+ -+#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32 0x0727 -+#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32_BASE_IDX 0 -+#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32 0x0728 -+#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32_BASE_IDX 0 -+ -+enum hqd_dequeue_request_type { -+ NO_ACTION = 0, -+ DRAIN_PIPE, -+ RESET_WAVES, -+ SAVE_WAVES -+}; -+ -+static const uint32_t watchRegs[MAX_WATCH_ADDRESSES * ADDRESS_WATCH_REG_MAX] = { -+ mmTCP_WATCH0_ADDR_H, mmTCP_WATCH0_ADDR_L, mmTCP_WATCH0_CNTL, -+ mmTCP_WATCH1_ADDR_H, mmTCP_WATCH1_ADDR_L, mmTCP_WATCH1_CNTL, -+ mmTCP_WATCH2_ADDR_H, mmTCP_WATCH2_ADDR_L, mmTCP_WATCH2_CNTL, -+ mmTCP_WATCH3_ADDR_H, mmTCP_WATCH3_ADDR_L, mmTCP_WATCH3_CNTL -+}; -+ -+ -+static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, -+ void *vm, struct kgd_mem **mem); -+static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem); -+ -+static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, -+ int fd, uint32_t handle, struct kgd_mem **mem); -+ -+static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); -+ -+/* -+ * Register access functions -+ */ -+ -+static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, -+ uint32_t sh_mem_config, -+ uint32_t sh_mem_ape1_base, uint32_t 
sh_mem_ape1_limit, -+ uint32_t sh_mem_bases); -+static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, -+ unsigned int vmid); -+static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, -+ uint32_t hpd_size, uint64_t hpd_gpu_addr); -+static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); -+static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, -+ uint32_t queue_id, uint32_t __user *wptr, -+ uint32_t wptr_shift, uint32_t wptr_mask, -+ struct mm_struct *mm); -+static int kgd_hqd_dump(struct kgd_dev *kgd, -+ uint32_t pipe_id, uint32_t queue_id, -+ uint32_t (**dump)[2], uint32_t *n_regs); -+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, -+ uint32_t __user *wptr, struct mm_struct *mm); -+static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, -+ uint32_t engine_id, uint32_t queue_id, -+ uint32_t (**dump)[2], uint32_t *n_regs); -+static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, -+ uint32_t pipe_id, uint32_t queue_id); -+static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); -+static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, -+ enum kfd_preempt_type reset_type, -+ unsigned int utimeout, uint32_t pipe_id, -+ uint32_t queue_id); -+static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, -+ unsigned int utimeout); -+static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); -+static uint32_t get_watch_base_addr(void); -+static int kgd_address_watch_disable(struct kgd_dev *kgd); -+static int kgd_address_watch_execute(struct kgd_dev *kgd, -+ unsigned int watch_point_id, -+ uint32_t cntl_val, -+ uint32_t addr_hi, -+ uint32_t addr_lo); -+static int kgd_wave_control_execute(struct kgd_dev *kgd, -+ uint32_t gfx_index_val, -+ uint32_t sq_cmd); -+static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, -+ unsigned int watch_point_id, -+ unsigned int reg_offset); -+ -+static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, -+ uint8_t vmid); -+static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, -+ uint8_t vmid); -+static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); -+static void set_num_of_requests(struct kgd_dev *kgd, -+ uint8_t num_of_requests); -+static int alloc_memory_of_scratch(struct kgd_dev *kgd, -+ uint64_t va, uint32_t vmid); -+static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, -+ uint8_t element_size, uint8_t index_stride, uint8_t mtype); -+static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, -+ uint32_t page_table_base); -+static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid); -+ -+/* Because of REG_GET_FIELD() being used, we put this function in the -+ * asic specific file. -+ */ -+static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, -+ struct tile_config *config) -+{ -+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd; -+ -+ config->gb_addr_config = adev->gfx.config.gb_addr_config; -+#if 0 -+/* TODO - confirm REG_GET_FIELD x2, should be OK as is... 
but -+ * MC_ARB_RAMCFG register doesn't exist on Vega10 - initial amdgpu -+ * changes commented out related code, doing the same here for now but -+ * need to sync with Ken et al -+ */ -+ config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, -+ MC_ARB_RAMCFG, NOOFBANK); -+ config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, -+ MC_ARB_RAMCFG, NOOFRANKS); -+#endif -+ -+ config->tile_config_ptr = adev->gfx.config.tile_mode_array; -+ config->num_tile_configs = -+ ARRAY_SIZE(adev->gfx.config.tile_mode_array); -+ config->macro_tile_config_ptr = -+ adev->gfx.config.macrotile_mode_array; -+ config->num_macro_tile_configs = -+ ARRAY_SIZE(adev->gfx.config.macrotile_mode_array); -+ -+ return 0; -+} -+ -+static const struct kfd2kgd_calls kfd2kgd = { -+ .init_gtt_mem_allocation = alloc_gtt_mem, -+ .free_gtt_mem = free_gtt_mem, -+ .get_local_mem_info = get_local_mem_info, -+ .get_gpu_clock_counter = get_gpu_clock_counter, -+ .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, -+ .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm, -+ .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm, -+ .create_process_gpumem = create_process_gpumem, -+ .destroy_process_gpumem = destroy_process_gpumem, -+ .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir, -+ .open_graphic_handle = open_graphic_handle, -+ .program_sh_mem_settings = kgd_program_sh_mem_settings, -+ .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, -+ .init_pipeline = kgd_init_pipeline, -+ .init_interrupts = kgd_init_interrupts, -+ .hqd_load = kgd_hqd_load, -+ .hqd_sdma_load = kgd_hqd_sdma_load, -+ .hqd_dump = kgd_hqd_dump, -+ .hqd_sdma_dump = kgd_hqd_sdma_dump, -+ .hqd_is_occupied = kgd_hqd_is_occupied, -+ .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, -+ .hqd_destroy = kgd_hqd_destroy, -+ .hqd_sdma_destroy = kgd_hqd_sdma_destroy, -+ .address_watch_disable = kgd_address_watch_disable, -+ .address_watch_execute = kgd_address_watch_execute, -+ .wave_control_execute = kgd_wave_control_execute, -+ .address_watch_get_offset = kgd_address_watch_get_offset, -+ .get_atc_vmid_pasid_mapping_pasid = -+ get_atc_vmid_pasid_mapping_pasid, -+ .get_atc_vmid_pasid_mapping_valid = -+ get_atc_vmid_pasid_mapping_valid, -+ .write_vmid_invalidate_request = write_vmid_invalidate_request, -+ .invalidate_tlbs = invalidate_tlbs, -+ .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory, -+ .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, -+ .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, -+ .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, -+ .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu, -+ .get_fw_version = get_fw_version, -+ .set_num_of_requests = set_num_of_requests, -+ .get_cu_info = get_cu_info, -+ .alloc_memory_of_scratch = alloc_memory_of_scratch, -+ .write_config_static_mem = write_config_static_mem, -+ .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo, -+ .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel, -+ .set_vm_context_page_table_base = set_vm_context_page_table_base, -+ .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table, -+ .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table, -+ .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info, -+ .import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf, -+ .export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf, -+ .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info, -+ .submit_ib = amdgpu_amdkfd_submit_ib, -+ .get_tile_config = amdgpu_amdkfd_get_tile_config, -+ .restore_process_bos = 
amdgpu_amdkfd_gpuvm_restore_process_bos, -+ .copy_mem_to_mem = amdgpu_amdkfd_copy_mem_to_mem, -+ .get_vram_usage = amdgpu_amdkfd_get_vram_usage -+}; -+ -+struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions() -+{ -+ return (struct kfd2kgd_calls *)&kfd2kgd; -+} -+ -+static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, -+ void *vm, struct kgd_mem **mem) -+{ -+ return 0; -+} -+ -+/* Destroys the GPU allocation and frees the kgd_mem structure */ -+static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem) -+{ -+ -+} -+ -+static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, -+ int fd, uint32_t handle, struct kgd_mem **mem) -+{ -+ return 0; -+} -+ -+static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) -+{ -+ return (struct amdgpu_device *)kgd; -+} -+ -+static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe, -+ uint32_t queue, uint32_t vmid) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ -+ mutex_lock(&adev->srbm_mutex); -+ soc15_grbm_select(adev, mec, pipe, queue, vmid); -+} -+ -+static void unlock_srbm(struct kgd_dev *kgd) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ -+ soc15_grbm_select(adev, 0, 0, 0, 0); -+ mutex_unlock(&adev->srbm_mutex); -+} -+ -+static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, -+ uint32_t queue_id) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ -+ uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; -+ uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); -+ -+ lock_srbm(kgd, mec, pipe, queue_id, 0); -+} -+ -+static uint32_t get_queue_mask(struct amdgpu_device *adev, -+ uint32_t pipe_id, uint32_t queue_id) -+{ -+ unsigned int bit = (pipe_id * adev->gfx.mec.num_pipe_per_mec + -+ queue_id) & 31; -+ -+ return ((uint32_t)1) << bit; -+} -+ -+static void release_queue(struct kgd_dev *kgd) -+{ -+ unlock_srbm(kgd); -+} -+ -+static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, -+ uint32_t sh_mem_config, -+ uint32_t sh_mem_ape1_base, -+ uint32_t sh_mem_ape1_limit, -+ uint32_t sh_mem_bases) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ -+ lock_srbm(kgd, 0, 0, 0, vmid); -+ -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases); -+ /* APE1 no longer exists on GFX9 */ -+ -+ unlock_srbm(kgd); -+} -+ -+static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, -+ unsigned int vmid) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ -+ /* -+ * We have to assume that there is no outstanding mapping. -+ * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because -+ * a mapping is in progress or because a mapping finished -+ * and the SW cleared it. -+ * So the protocol is to always wait & clear. -+ */ -+ uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | -+ ATC_VMID0_PASID_MAPPING__VALID_MASK; -+ -+ /* -+ * need to do this twice, once for gfx and once for mmhub -+ * for ATC add 16 to VMID for mmhub, for IH different registers. -+ * ATC_VMID0..15 registers are separate from ATC_VMID16..31. 
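On GFX9 the mapping has to land in both copies, and each write is confirmed by the same handshake: spin until the per-VMID bit of ATC_VMID_PASID_MAPPING_UPDATE_STATUS is set, then clear it by writing it back. A sketch of that handshake as a hypothetical helper (bit vmid for the GC/ATC 0..15 copy, vmid + 16 for the MMHUB copy, matching the code below):

/* Sketch: wait for the hardware to latch a VMID<->PASID mapping,
 * then acknowledge it, exactly as set_pasid_vmid_mapping() does
 * twice below.
 */
static void wait_and_ack_mapping(struct amdgpu_device *adev, unsigned int bit)
{
	while (!(RREG32(SOC15_REG_OFFSET(ATHUB, 0,
			mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) &
		 (1U << bit)))
		cpu_relax();	/* update is posted asynchronously */

	WREG32(SOC15_REG_OFFSET(ATHUB, 0,
		mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), 1U << bit);
}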
-+ */ -+ -+ WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid, -+ pasid_mapping); -+ -+ while (!(RREG32(SOC15_REG_OFFSET( -+ ATHUB, 0, -+ mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & -+ (1U << vmid))) -+ cpu_relax(); -+ -+ WREG32(SOC15_REG_OFFSET(ATHUB, 0, -+ mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), -+ 1U << vmid); -+ -+ /* Mapping vmid to pasid also for IH block */ -+ WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid, -+ pasid_mapping); -+ -+ WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid, -+ pasid_mapping); -+ -+ while (!(RREG32(SOC15_REG_OFFSET( -+ ATHUB, 0, -+ mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & -+ (1U << (vmid + 16)))) -+ cpu_relax(); -+ -+ WREG32(SOC15_REG_OFFSET(ATHUB, 0, -+ mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), -+ 1U << (vmid + 16)); -+ -+ /* Mapping vmid to pasid also for IH block */ -+ WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid, -+ pasid_mapping); -+ return 0; -+} -+ -+static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, -+ uint32_t hpd_size, uint64_t hpd_gpu_addr) -+{ -+ /* amdgpu owns the per-pipe state */ -+ return 0; -+} -+ -+/* TODO - RING0 form of field is obsolete, seems to date back to SI -+ * but still works -+ */ -+ -+static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ uint32_t mec; -+ uint32_t pipe; -+ -+ mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; -+ pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); -+ -+ lock_srbm(kgd, mec, pipe, 0, 0); -+ -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmCPC_INT_CNTL), -+ CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | -+ CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); -+ -+ unlock_srbm(kgd); -+ -+ return 0; -+} -+ -+static uint32_t get_sdma_base_addr(unsigned int engine_id, -+ unsigned int queue_id) -+{ -+ static const uint32_t base[2] = { -+ SOC15_REG_OFFSET(SDMA0, 0, -+ mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL, -+ SOC15_REG_OFFSET(SDMA1, 0, -+ mmSDMA1_RLC0_RB_CNTL) - mmSDMA1_RLC0_RB_CNTL -+ }; -+ uint32_t retval; -+ -+ retval = base[engine_id] + queue_id * (mmSDMA0_RLC1_RB_CNTL - -+ mmSDMA0_RLC0_RB_CNTL); -+ -+ pr_debug("sdma base address: 0x%x\n", retval); -+ -+ return retval; -+} -+ -+static uint32_t get_watch_base_addr(void) -+{ -+ uint32_t retval = SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) - -+ mmTCP_WATCH0_ADDR_H; -+ -+ pr_debug("kfd: reg watch base address: 0x%x\n", retval); -+ -+ return retval; -+} -+ -+static inline struct v9_mqd *get_mqd(void *mqd) -+{ -+ return (struct v9_mqd *)mqd; -+} -+ -+static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) -+{ -+ return (struct v9_sdma_mqd *)mqd; -+} -+ -+static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, -+ uint32_t queue_id, uint32_t __user *wptr, -+ uint32_t wptr_shift, uint32_t wptr_mask, -+ struct mm_struct *mm) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ struct v9_mqd *m; -+ uint32_t *mqd_hqd; -+ uint32_t reg, hqd_base, data; -+ -+ m = get_mqd(mqd); -+ -+ acquire_queue(kgd, pipe_id, queue_id); -+ -+ /* HIQ is set during driver init period with vmid set to 0*/ -+ if (m->cp_hqd_vmid == 0) { -+ uint32_t value, mec, pipe; -+ -+ mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; -+ pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); -+ -+ pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n", -+ mec, pipe, queue_id); -+ value = RREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS)); -+ value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1, -+ ((mec << 5) | 
(pipe << 3) | queue_id | 0x80)); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS), value); -+ } -+ -+ /* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */ -+ mqd_hqd = &m->cp_mqd_base_addr_lo; -+ hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); -+ -+ for (reg = hqd_base; -+ reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) -+ WREG32(reg, mqd_hqd[reg - hqd_base]); -+ -+ -+ /* Activate doorbell logic before triggering WPTR poll. */ -+ data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control, -+ CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data); -+ -+ if (wptr) { -+ /* Don't read wptr with get_user because the user -+ * context may not be accessible (if this function -+ * runs in a work queue). Instead trigger a one-shot -+ * polling read from memory in the CP. This assumes -+ * that wptr is GPU-accessible in the queue's VMID via -+ * ATC or SVM. WPTR==RPTR before starting the poll so -+ * the CP starts fetching new commands from the right -+ * place. -+ * -+ * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit -+ * tricky. Assume that the queue didn't overflow. The -+ * number of valid bits in the 32-bit RPTR depends on -+ * the queue size. The remaining bits are taken from -+ * the saved 64-bit WPTR. If the WPTR wrapped, add the -+ * queue size. -+ */ -+ uint32_t queue_size = -+ 2 << REG_GET_FIELD(m->cp_hqd_pq_control, -+ CP_HQD_PQ_CONTROL, QUEUE_SIZE); -+ uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1); -+ -+ if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr) -+ guessed_wptr += queue_size; -+ guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1); -+ guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32; -+ -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO), -+ lower_32_bits(guessed_wptr)); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI), -+ upper_32_bits(guessed_wptr)); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR), -+ lower_32_bits((uint64_t)wptr)); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI), -+ upper_32_bits((uint64_t)wptr)); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1), -+ get_queue_mask(adev, pipe_id, queue_id)); -+ } -+ -+ /* Start the EOP fetcher */ -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR), -+ REG_SET_FIELD(m->cp_hqd_eop_rptr, -+ CP_HQD_EOP_RPTR, INIT_FETCHER, 1)); -+ -+ data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data); -+ -+ release_queue(kgd); -+ -+ return 0; -+} -+ -+static int kgd_hqd_dump(struct kgd_dev *kgd, -+ uint32_t pipe_id, uint32_t queue_id, -+ uint32_t (**dump)[2], uint32_t *n_regs) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ uint32_t i = 0, reg; -+#define HQD_N_REGS 56 -+#define DUMP_REG(addr) do { \ -+ if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ -+ break; \ -+ (*dump)[i][0] = (addr) << 2; \ -+ (*dump)[i++][1] = RREG32(addr); \ -+ } while (0) -+ -+ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); -+ if (*dump == NULL) -+ return -ENOMEM; -+ -+ acquire_queue(kgd, pipe_id, queue_id); -+ -+ for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); -+ reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) -+ DUMP_REG(reg); -+ -+ release_queue(kgd); -+ -+ WARN_ON_ONCE(i != HQD_N_REGS); -+ *n_regs = i; -+ -+ return 0; -+} -+ -+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, -+ uint32_t __user *wptr, struct mm_struct *mm) -+{ -+ struct amdgpu_device *adev = 
get_amdgpu_device(kgd); -+ struct v9_sdma_mqd *m; -+ uint32_t sdma_base_addr, sdmax_gfx_context_cntl; -+ uint32_t temp, timeout = 2000; -+ uint32_t data; -+ uint64_t data64; -+ uint64_t __user *wptr64 = (uint64_t __user *)wptr; -+ -+ m = get_sdma_mqd(mqd); -+ sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id, -+ m->sdma_queue_id); -+ sdmax_gfx_context_cntl = m->sdma_engine_id ? -+ SOC15_REG_OFFSET(SDMA1, 0, mmSDMA1_GFX_CONTEXT_CNTL) : -+ SOC15_REG_OFFSET(SDMA0, 0, mmSDMA0_GFX_CONTEXT_CNTL); -+ -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, -+ m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); -+ -+ while (true) { -+ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); -+ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) -+ break; -+ if (timeout == 0) -+ return -ETIME; -+ msleep(10); -+ timeout -= 10; -+ } -+ data = RREG32(sdmax_gfx_context_cntl); -+ data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, -+ RESUME_CTX, 0); -+ WREG32(sdmax_gfx_context_cntl, data); -+ -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL_OFFSET, -+ m->sdmax_rlcx_doorbell_offset); -+ -+ data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL, -+ ENABLE, 1); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI, -+ m->sdmax_rlcx_rb_rptr_hi); -+ -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1); -+ if (read_user_wptr(mm, wptr64, data64)) { -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, -+ lower_32_bits(data64)); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI, -+ upper_32_bits(data64)); -+ } else { -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, -+ m->sdmax_rlcx_rb_rptr); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI, -+ m->sdmax_rlcx_rb_rptr_hi); -+ } -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0); -+ -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, -+ m->sdmax_rlcx_rb_base_hi); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, -+ m->sdmax_rlcx_rb_rptr_addr_lo); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, -+ m->sdmax_rlcx_rb_rptr_addr_hi); -+ -+ data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL, -+ RB_ENABLE, 1); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data); -+ -+ return 0; -+} -+ -+static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, -+ uint32_t engine_id, uint32_t queue_id, -+ uint32_t (**dump)[2], uint32_t *n_regs) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ uint32_t sdma_base_addr = get_sdma_base_addr(engine_id, queue_id); -+ uint32_t i = 0, reg; -+#undef HQD_N_REGS -+#define HQD_N_REGS (19+6+7+10) -+ -+ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); -+ if (*dump == NULL) -+ return -ENOMEM; -+ -+ for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) -+ DUMP_REG(sdma_base_addr + reg); -+ for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++) -+ DUMP_REG(sdma_base_addr + reg); -+ for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; -+ reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++) -+ DUMP_REG(sdma_base_addr + reg); -+ for (reg = mmSDMA0_RLC0_MIDCMD_DATA0; -+ reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++) -+ DUMP_REG(sdma_base_addr + reg); -+ -+ WARN_ON_ONCE(i != HQD_N_REGS); -+ *n_regs = i; -+ -+ return 0; -+} -+ -+static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, -+ uint32_t pipe_id, uint32_t queue_id) -+{ -+ struct amdgpu_device *adev = 
get_amdgpu_device(kgd); -+ uint32_t act; -+ bool retval = false; -+ uint32_t low, high; -+ -+ acquire_queue(kgd, pipe_id, queue_id); -+ act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); -+ if (act) { -+ low = lower_32_bits(queue_address >> 8); -+ high = upper_32_bits(queue_address >> 8); -+ -+ if (low == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE)) && -+ high == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE_HI))) -+ retval = true; -+ } -+ release_queue(kgd); -+ return retval; -+} -+ -+static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ struct v9_sdma_mqd *m; -+ uint32_t sdma_base_addr; -+ uint32_t sdma_rlc_rb_cntl; -+ -+ m = get_sdma_mqd(mqd); -+ sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id, -+ m->sdma_queue_id); -+ -+ sdma_rlc_rb_cntl = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL); -+ -+ if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK) -+ return true; -+ -+ return false; -+} -+ -+static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, -+ enum kfd_preempt_type reset_type, -+ unsigned int utimeout, uint32_t pipe_id, -+ uint32_t queue_id) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ enum hqd_dequeue_request_type type; -+ unsigned long end_jiffies; -+ uint32_t temp; -+ struct v9_mqd *m = get_mqd(mqd); -+ -+#if 0 -+ unsigned long flags; -+ int retry; -+#endif -+ -+ acquire_queue(kgd, pipe_id, queue_id); -+ -+ if (m->cp_hqd_vmid == 0) -+ WREG32_FIELD15(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0); -+ -+ switch (reset_type) { -+ case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: -+ type = DRAIN_PIPE; -+ break; -+ case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: -+ type = RESET_WAVES; -+ break; -+ default: -+ type = DRAIN_PIPE; -+ break; -+ } -+ -+#if 0 /* Is this still needed? */ -+ /* Workaround: If IQ timer is active and the wait time is close to or -+ * equal to 0, dequeueing is not safe. Wait until either the wait time -+ * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is -+ * cleared before continuing. Also, ensure wait times are set to at -+ * least 0x3. 
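-+ * As a rough bound on the polling below: retry is 5000 iterations of
-+ * ndelay(100), i.e. at most about 5000 * 100 ns = 500 usecs of
-+ * busy-waiting before the loop gives up.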
-+ */ -+ local_irq_save(flags); -+ preempt_disable(); -+ retry = 5000; /* wait for 500 usecs at maximum */ -+ while (true) { -+ temp = RREG32(mmCP_HQD_IQ_TIMER); -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) { -+ pr_debug("HW is processing IQ\n"); -+ goto loop; -+ } -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) { -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE) -+ == 3) /* SEM-rearm is safe */ -+ break; -+ /* Wait time 3 is safe for CP, but our MMIO read/write -+ * time is close to 1 microsecond, so check for 10 to -+ * leave more buffer room -+ */ -+ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME) -+ >= 10) -+ break; -+ pr_debug("IQ timer is active\n"); -+ } else -+ break; -+loop: -+ if (!retry) { -+ pr_err("CP HQD IQ timer status time out\n"); -+ break; -+ } -+ ndelay(100); -+ --retry; -+ } -+ retry = 1000; -+ while (true) { -+ temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST); -+ if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK)) -+ break; -+ pr_debug("Dequeue request is pending\n"); -+ -+ if (!retry) { -+ pr_err("CP HQD dequeue request time out\n"); -+ break; -+ } -+ ndelay(100); -+ --retry; -+ } -+ local_irq_restore(flags); -+ preempt_enable(); -+#endif -+ -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type); -+ -+ end_jiffies = (utimeout * HZ / 1000) + jiffies; -+ while (true) { -+ temp = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); -+ if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) -+ break; -+ if (time_after(jiffies, end_jiffies)) { -+ pr_err("cp queue preemption time out.\n"); -+ release_queue(kgd); -+ return -ETIME; -+ } -+ usleep_range(500, 1000); -+ } -+ -+ release_queue(kgd); -+ return 0; -+} -+ -+static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, -+ unsigned int utimeout) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ struct v9_sdma_mqd *m; -+ uint32_t sdma_base_addr; -+ uint32_t temp; -+ unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; -+ -+ m = get_sdma_mqd(mqd); -+ sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id, -+ m->sdma_queue_id); -+ -+ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL); -+ temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK; -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, temp); -+ -+ while (true) { -+ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); -+ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) -+ break; -+ if (time_after(jiffies, end_jiffies)) -+ return -ETIME; -+ usleep_range(500, 1000); -+ } -+ -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, -+ RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | -+ SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); -+ -+ m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); -+ m->sdmax_rlcx_rb_rptr_hi = -+ RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI); -+ -+ return 0; -+} -+ -+static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, -+ uint8_t vmid) -+{ -+ uint32_t reg; -+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -+ -+ reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) -+ + vmid); -+ return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; -+} -+ -+static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, -+ uint8_t vmid) -+{ -+ uint32_t reg; -+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -+ -+ reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) -+ + vmid); -+ return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; -+} -+ -+static void write_vmid_invalidate_request(struct kgd_dev 
*kgd, uint8_t vmid)
-+{
-+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
-+ uint32_t req = (1 << vmid) |
-+ (1 << VM_INVALIDATE_ENG16_REQ__FLUSH_TYPE__SHIFT) | /* light */
-+ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PTES_MASK |
-+ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE0_MASK |
-+ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE1_MASK |
-+ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE2_MASK |
-+ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L1_PTES_MASK;
-+
-+ spin_lock(&adev->tlb_invalidation_lock);
-+
-+ /* Use lightweight invalidation.
-+ *
-+ * TODO 1: agree on the right set of invalidation registers for
-+ * KFD use. Use the last one for now. Invalidate both GC and
-+ * MMHUB.
-+ *
-+ * TODO 2: support range-based invalidation, requires kfd2kgd
-+ * interface change
-+ */
-+ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_LO32),
-+ 0xffffffff);
-+ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_HI32),
-+ 0x0000001f);
-+
-+ WREG32(SOC15_REG_OFFSET(MMHUB, 0,
-+ mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32),
-+ 0xffffffff);
-+ WREG32(SOC15_REG_OFFSET(MMHUB, 0,
-+ mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32),
-+ 0x0000001f);
-+
-+ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_REQ), req);
-+
-+ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_INVALIDATE_ENG16_REQ),
-+ req);
-+
-+ while (!(RREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ACK)) &
-+ (1 << vmid)))
-+ cpu_relax();
-+
-+ while (!(RREG32(SOC15_REG_OFFSET(MMHUB, 0,
-+ mmMMHUB_VM_INVALIDATE_ENG16_ACK)) &
-+ (1 << vmid)))
-+ cpu_relax();
-+
-+ spin_unlock(&adev->tlb_invalidation_lock);
-+
-+}
-+
-+static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid)
-+{
-+ signed long r;
-+ struct dma_fence *f;
-+ struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
-+
-+ mutex_lock(&adev->gfx.kiq.ring_mutex);
-+ amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs package */
-+ amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0));
-+ amdgpu_ring_write(ring,
-+ PACKET3_INVALIDATE_TLBS_DST_SEL(1) |
-+ PACKET3_INVALIDATE_TLBS_ALL_HUB(1) |
-+ PACKET3_INVALIDATE_TLBS_PASID(pasid) |
-+ PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(2));
-+ amdgpu_fence_emit(ring, &f);
-+ amdgpu_ring_commit(ring);
-+ mutex_unlock(&adev->gfx.kiq.ring_mutex);
-+
-+ r = dma_fence_wait(f, false);
-+ if (r)
-+ DRM_ERROR("wait for kiq fence error: %ld.\n", r);
-+ dma_fence_put(f);
-+
-+ return r;
-+}
-+
-+static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid)
-+{
-+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
-+ int vmid;
-+ struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
-+
-+ if (ring->ready)
-+ return invalidate_tlbs_with_kiq(adev, pasid);
-+
-+ for (vmid = 0; vmid < 16; vmid++) {
-+ if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid))
-+ continue;
-+ if (get_atc_vmid_pasid_mapping_valid(kgd, vmid)) {
-+ if (get_atc_vmid_pasid_mapping_pasid(kgd, vmid)
-+ == pasid) {
-+ write_vmid_invalidate_request(kgd, vmid);
-+ break;
-+ }
-+ }
-+ }
-+
-+ return 0;
-+}
-+
-+static int kgd_address_watch_disable(struct kgd_dev *kgd)
-+{
-+ struct amdgpu_device *adev = get_amdgpu_device(kgd);
-+ union TCP_WATCH_CNTL_BITS cntl;
-+ unsigned int i;
-+ uint32_t watch_base_addr;
-+
-+ cntl.u32All = 0;
-+
-+ cntl.bitfields.valid = 0;
-+ cntl.bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK;
-+ cntl.bitfields.atc = 1;
-+
-+ watch_base_addr = get_watch_base_addr();
-+ /* Turning off this address until we set all the registers */
-+ for (i = 0; i < MAX_WATCH_ADDRESSES; i++)
-+ WREG32(watch_base_addr +
-+ watchRegs[i *
ADDRESS_WATCH_REG_MAX + -+ ADDRESS_WATCH_REG_CNTL], -+ cntl.u32All); -+ -+ return 0; -+} -+ -+static int kgd_address_watch_execute(struct kgd_dev *kgd, -+ unsigned int watch_point_id, -+ uint32_t cntl_val, -+ uint32_t addr_hi, -+ uint32_t addr_lo) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ union TCP_WATCH_CNTL_BITS cntl; -+ uint32_t watch_base_addr; -+ -+ watch_base_addr = get_watch_base_addr(); -+ cntl.u32All = cntl_val; -+ -+ /* Turning off this watch point until we set all the registers */ -+ cntl.bitfields.valid = 0; -+ WREG32(watch_base_addr + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], -+ cntl.u32All); -+ -+ WREG32(watch_base_addr + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_HI], -+ addr_hi); -+ -+ WREG32(watch_base_addr + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_LO], -+ addr_lo); -+ -+ /* Enable the watch point */ -+ cntl.bitfields.valid = 1; -+ -+ WREG32(watch_base_addr + -+ watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + -+ ADDRESS_WATCH_REG_CNTL], -+ cntl.u32All); -+ -+ return 0; -+} -+ -+static int kgd_wave_control_execute(struct kgd_dev *kgd, -+ uint32_t gfx_index_val, -+ uint32_t sq_cmd) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ uint32_t data = 0; -+ -+ mutex_lock(&adev->grbm_idx_mutex); -+ -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), gfx_index_val); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CMD), sq_cmd); -+ -+ data = REG_SET_FIELD(data, GRBM_GFX_INDEX, -+ INSTANCE_BROADCAST_WRITES, 1); -+ data = REG_SET_FIELD(data, GRBM_GFX_INDEX, -+ SH_BROADCAST_WRITES, 1); -+ data = REG_SET_FIELD(data, GRBM_GFX_INDEX, -+ SE_BROADCAST_WRITES, 1); -+ -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data); -+ mutex_unlock(&adev->grbm_idx_mutex); -+ -+ return 0; -+} -+ -+static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, -+ unsigned int watch_point_id, -+ unsigned int reg_offset) -+{ -+ return get_watch_base_addr() + -+ watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + reg_offset]; -+} -+ -+static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, -+ uint8_t element_size, uint8_t index_stride, uint8_t mtype) -+{ -+ /* No longer needed on GFXv9. These values are now hard-coded, -+ * except for the MTYPE which comes from the page table. -+ */ -+ -+ return 0; -+} -+static int alloc_memory_of_scratch(struct kgd_dev *kgd, -+ uint64_t va, uint32_t vmid) -+{ -+ /* No longer needed on GFXv9. The scratch base address is -+ * passed to the shader by the CP. It's the user mode driver's -+ * responsibility. -+ */ -+ -+ return 0; -+} -+ -+/* FIXME: Does this need to be ASIC-specific code? 
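Probably not, since the code below only reads the common header fields shared by every engine firmware, so a single generic implementation would likely suffice.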
*/ -+static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) -+{ -+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -+ const union amdgpu_firmware_header *hdr; -+ -+ switch (type) { -+ case KGD_ENGINE_PFP: -+ hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data; -+ break; -+ -+ case KGD_ENGINE_ME: -+ hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data; -+ break; -+ -+ case KGD_ENGINE_CE: -+ hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data; -+ break; -+ -+ case KGD_ENGINE_MEC1: -+ hdr = (const union amdgpu_firmware_header *)adev->gfx.mec_fw->data; -+ break; -+ -+ case KGD_ENGINE_MEC2: -+ hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data; -+ break; -+ -+ case KGD_ENGINE_RLC: -+ hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data; -+ break; -+ -+ case KGD_ENGINE_SDMA1: -+ hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data; -+ break; -+ -+ case KGD_ENGINE_SDMA2: -+ hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data; -+ break; -+ -+ default: -+ return 0; -+ } -+ -+ if (hdr == NULL) -+ return 0; -+ -+ /* Only 12 bit in use*/ -+ return hdr->common.ucode_version; -+} -+ -+static void set_num_of_requests(struct kgd_dev *kgd, -+ uint8_t num_of_requests) -+{ -+ pr_debug("This is a stub\n"); -+} -+ -+static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, -+ uint32_t page_table_base) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ uint64_t base = (uint64_t)page_table_base << PAGE_SHIFT | -+ AMDGPU_PTE_VALID; -+ -+ /* TODO: Don't use hardcoded VMIDs */ -+ if (vmid < 8 || vmid > 15) { -+ pr_err("trying to set page table base for wrong VMID %u\n", -+ vmid); -+ return; -+ } -+ -+ /* TODO: take advantage of per-process address space size. For -+ * now, all processes share the same address space size, like -+ * on GFX8 and older. 
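-+ * Note the (vmid*2) stride in the register offsets below: each VM
-+ * context owns a LO32/HI32 pair of registers, and the same base and
-+ * 0..max_pfn-1 range is programmed into both the MMHUB and GC copies.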
-+ */ -+ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0); -+ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0); -+ -+ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2), -+ lower_32_bits(adev->vm_manager.max_pfn - 1)); -+ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2), -+ upper_32_bits(adev->vm_manager.max_pfn - 1)); -+ -+ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base)); -+ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base)); -+ -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0); -+ -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2), -+ lower_32_bits(adev->vm_manager.max_pfn - 1)); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2), -+ upper_32_bits(adev->vm_manager.max_pfn - 1)); -+ -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base)); -+ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base)); -+} -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c -new file mode 100644 -index 0000000..7df892d ---- /dev/null -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c -@@ -0,0 +1,2578 @@ -+/* -+ * Copyright 2014 Advanced Micro Devices, Inc. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. -+ */ -+ -+#undef pr_fmt -+#define pr_fmt(fmt) "kfd2kgd: " fmt -+ -+#include <linux/module.h> -+#include <linux/fdtable.h> -+#include <linux/uaccess.h> -+#include <linux/firmware.h> -+#include <linux/list.h> -+#include <linux/sched/mm.h> -+#include <drm/drmP.h> -+#include <linux/dma-buf.h> -+#include <linux/pagemap.h> -+#include "amdgpu_amdkfd.h" -+#include "amdgpu_ucode.h" -+#include "gca/gfx_8_0_sh_mask.h" -+#include "gca/gfx_8_0_d.h" -+#include "gca/gfx_8_0_enum.h" -+#include "oss/oss_3_0_sh_mask.h" -+#include "oss/oss_3_0_d.h" -+#include "gmc/gmc_8_1_sh_mask.h" -+#include "gmc/gmc_8_1_d.h" -+ -+/* Special VM and GART address alignment needed for VI pre-Fiji due to -+ * a HW bug. 
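-+ * (0x8000 below corresponds to a 32KB alignment.)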
-+ */
-+#define VI_BO_SIZE_ALIGN (0x8000)
-+
-+/* BO flag to indicate a KFD userptr BO */
-+#define AMDGPU_AMDKFD_USERPTR_BO (1ULL << 63)
-+
-+/* Impose limit on how much memory KFD can use */
-+struct kfd_mem_usage_limit {
-+ uint64_t max_system_mem_limit;
-+ uint64_t max_userptr_mem_limit;
-+ int64_t system_mem_used;
-+ int64_t userptr_mem_used;
-+ spinlock_t mem_limit_lock;
-+};
-+
-+static struct kfd_mem_usage_limit kfd_mem_limit;
-+
-+/* Struct used for amdgpu_amdkfd_bo_validate */
-+struct amdgpu_vm_parser {
-+ uint32_t domain;
-+ bool wait;
-+};
-+
-+static const char * const domain_bit_to_string[] = {
-+ "CPU",
-+ "GTT",
-+ "VRAM",
-+ "GDS",
-+ "GWS",
-+ "OA"
-+};
-+
-+#define domain_string(domain) domain_bit_to_string[ffs(domain)-1]
-+
-+static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work);
-+
-+
-+static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd)
-+{
-+ return (struct amdgpu_device *)kgd;
-+}
-+
-+static bool check_if_add_bo_to_vm(struct amdgpu_vm *avm,
-+ struct kgd_mem *mem)
-+{
-+ struct kfd_bo_va_list *entry;
-+
-+ list_for_each_entry(entry, &mem->bo_va_list, bo_list)
-+ if (entry->bo_va->base.vm == avm)
-+ return false;
-+
-+ return true;
-+}
-+
-+/* Set memory usage limits. Currently, the limits are:
-+ * System (kernel) memory - 15/16th System RAM
-+ * Userptr memory - 15/16th System RAM
-+ */
-+void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
-+{
-+ struct sysinfo si;
-+ uint64_t mem;
-+
-+ si_meminfo(&si);
-+ mem = si.totalram - si.totalhigh;
-+ mem *= si.mem_unit;
-+
-+ spin_lock_init(&kfd_mem_limit.mem_limit_lock);
-+ kfd_mem_limit.max_system_mem_limit = mem - (mem >> 4); /* 15/16 */
-+ kfd_mem_limit.max_userptr_mem_limit = mem - (mem >> 4); /* 15/16 */
-+ pr_debug("Kernel memory limit %lluM, userptr limit %lluM\n",
-+ (kfd_mem_limit.max_system_mem_limit >> 20),
-+ (kfd_mem_limit.max_userptr_mem_limit >> 20));
-+}
-+
-+static int amdgpu_amdkfd_reserve_system_mem_limit(struct amdgpu_device *adev,
-+ uint64_t size, u32 domain)
-+{
-+ size_t acc_size;
-+ int ret = 0;
-+
-+ acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size,
-+ sizeof(struct amdgpu_bo));
-+
-+ spin_lock(&kfd_mem_limit.mem_limit_lock);
-+ if (domain == AMDGPU_GEM_DOMAIN_GTT) {
-+ if (kfd_mem_limit.system_mem_used + (acc_size + size) >
-+ kfd_mem_limit.max_system_mem_limit) {
-+ ret = -ENOMEM;
-+ goto err_no_mem;
-+ }
-+ kfd_mem_limit.system_mem_used += (acc_size + size);
-+ } else if (domain == AMDGPU_GEM_DOMAIN_CPU) {
-+ if ((kfd_mem_limit.system_mem_used + acc_size >
-+ kfd_mem_limit.max_system_mem_limit) ||
-+ (kfd_mem_limit.userptr_mem_used + (size + acc_size) >
-+ kfd_mem_limit.max_userptr_mem_limit)) {
-+ ret = -ENOMEM;
-+ goto err_no_mem;
-+ }
-+ kfd_mem_limit.system_mem_used += acc_size;
-+ kfd_mem_limit.userptr_mem_used += size;
-+ }
-+err_no_mem:
-+ spin_unlock(&kfd_mem_limit.mem_limit_lock);
-+ return ret;
-+}
-+
-+static void unreserve_system_mem_limit(struct amdgpu_device *adev,
-+ uint64_t size, u32 domain)
-+{
-+ size_t acc_size;
-+
-+ acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size,
-+ sizeof(struct amdgpu_bo));
-+
-+ spin_lock(&kfd_mem_limit.mem_limit_lock);
-+ if (domain == AMDGPU_GEM_DOMAIN_GTT) {
-+ kfd_mem_limit.system_mem_used -= (acc_size + size);
-+ } else if (domain == AMDGPU_GEM_DOMAIN_CPU) {
-+ kfd_mem_limit.system_mem_used -= acc_size;
-+ kfd_mem_limit.userptr_mem_used -= size;
-+ }
-+ WARN_ONCE(kfd_mem_limit.system_mem_used < 0,
-+ "kfd system memory accounting unbalanced");
-+ WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0,
-+ "kfd
userptr memory accounting unbalanced");
-+
-+ spin_unlock(&kfd_mem_limit.mem_limit_lock);
-+}
-+
-+void amdgpu_amdkfd_unreserve_system_memory_limit(struct amdgpu_bo *bo)
-+{
-+ spin_lock(&kfd_mem_limit.mem_limit_lock);
-+
-+ if (bo->flags & AMDGPU_AMDKFD_USERPTR_BO) {
-+ kfd_mem_limit.system_mem_used -= bo->tbo.acc_size;
-+ kfd_mem_limit.userptr_mem_used -= amdgpu_bo_size(bo);
-+ } else if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT) {
-+ kfd_mem_limit.system_mem_used -=
-+ (bo->tbo.acc_size + amdgpu_bo_size(bo));
-+ }
-+ WARN_ONCE(kfd_mem_limit.system_mem_used < 0,
-+ "kfd system memory accounting unbalanced");
-+ WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0,
-+ "kfd userptr memory accounting unbalanced");
-+
-+ spin_unlock(&kfd_mem_limit.mem_limit_lock);
-+}
-+
-+
-+/* amdgpu_amdkfd_remove_eviction_fence - Removes eviction fence(s) from BO's
-+ * reservation object.
-+ *
-+ * @bo: [IN] Remove eviction fence(s) from this BO
-+ * @ef: [IN] If ef is specified, then this eviction fence is removed if it
-+ * is present in the shared list.
-+ * @ef_list: [OUT] Returns list of eviction fences. These fences are removed
-+ * from BO's reservation object shared list.
-+ * @ef_count: [OUT] Number of fences in ef_list.
-+ *
-+ * NOTE: If called with ef_list, then amdgpu_amdkfd_add_eviction_fence must be
-+ * called to restore the eviction fences and to avoid a memory leak. This is
-+ * useful for shared BOs.
-+ * NOTE: Must be called with BO reserved i.e. bo->tbo.resv->lock held.
-+ */
-+static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo,
-+ struct amdgpu_amdkfd_fence *ef,
-+ struct amdgpu_amdkfd_fence ***ef_list,
-+ unsigned int *ef_count)
-+{
-+ struct reservation_object_list *fobj;
-+ struct reservation_object *resv;
-+ unsigned int i = 0, j = 0, k = 0, shared_count;
-+ unsigned int count = 0;
-+ struct amdgpu_amdkfd_fence **fence_list;
-+
-+ if (!ef && !ef_list)
-+ return -EINVAL;
-+
-+ if (ef_list) {
-+ *ef_list = NULL;
-+ *ef_count = 0;
-+ }
-+
-+ resv = bo->tbo.resv;
-+ fobj = reservation_object_get_list(resv);
-+
-+ if (!fobj)
-+ return 0;
-+
-+ preempt_disable();
-+ write_seqcount_begin(&resv->seq);
-+
-+ /* Go through all the shared fences in the reservation object. If
-+ * ef is specified and it exists in the list, remove it and reduce the
-+ * count. If ef is not specified, then get the count of eviction fences
-+ * present.
-+ */
-+ shared_count = fobj->shared_count;
-+ for (i = 0; i < shared_count; ++i) {
-+ struct dma_fence *f;
-+
-+ f = rcu_dereference_protected(fobj->shared[i],
-+ reservation_object_held(resv));
-+
-+ if (ef) {
-+ if (f->context == ef->base.context) {
-+ dma_fence_put(f);
-+ fobj->shared_count--;
-+ } else
-+ RCU_INIT_POINTER(fobj->shared[j++], f);
-+
-+ } else if (to_amdgpu_amdkfd_fence(f))
-+ count++;
-+ }
-+ write_seqcount_end(&resv->seq);
-+ preempt_enable();
-+
-+ if (ef || !count)
-+ return 0;
-+
-+ /* Alloc memory for count number of eviction fence pointers.
Fill the
-+ * ef_list array and ef_count
-+ */
-+
-+ fence_list = kcalloc(count, sizeof(struct amdgpu_amdkfd_fence *),
-+ GFP_KERNEL);
-+ if (!fence_list)
-+ return -ENOMEM;
-+
-+ preempt_disable();
-+ write_seqcount_begin(&resv->seq);
-+
-+ j = 0;
-+ for (i = 0; i < shared_count; ++i) {
-+ struct dma_fence *f;
-+ struct amdgpu_amdkfd_fence *efence;
-+
-+ f = rcu_dereference_protected(fobj->shared[i],
-+ reservation_object_held(resv));
-+
-+ efence = to_amdgpu_amdkfd_fence(f);
-+ if (efence) {
-+ fence_list[k++] = efence;
-+ fobj->shared_count--;
-+ } else
-+ RCU_INIT_POINTER(fobj->shared[j++], f);
-+ }
-+
-+ write_seqcount_end(&resv->seq);
-+ preempt_enable();
-+
-+ *ef_list = fence_list;
-+ *ef_count = k;
-+
-+ return 0;
-+}
-+
-+/* amdgpu_amdkfd_add_eviction_fence - Adds eviction fence(s) back into BO's
-+ * reservation object.
-+ *
-+ * @bo: [IN] Add eviction fences to this BO
-+ * @ef_list: [IN] List of eviction fences to be added
-+ * @ef_count: [IN] Number of fences in ef_list.
-+ *
-+ * NOTE: Must call amdgpu_amdkfd_remove_eviction_fence before calling this
-+ * function.
-+ */
-+static void amdgpu_amdkfd_add_eviction_fence(struct amdgpu_bo *bo,
-+ struct amdgpu_amdkfd_fence **ef_list,
-+ unsigned int ef_count)
-+{
-+ int i;
-+
-+ if (!ef_list || !ef_count)
-+ return;
-+
-+ for (i = 0; i < ef_count; i++) {
-+ amdgpu_bo_fence(bo, &ef_list[i]->base, true);
-+ /* Re-adding the fence takes an additional reference. Drop that
-+ * reference.
-+ */
-+ dma_fence_put(&ef_list[i]->base);
-+ }
-+
-+ kfree(ef_list);
-+}
-+
-+static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain,
-+ bool wait)
-+{
-+ int ret;
-+
-+ if (WARN(amdgpu_ttm_tt_get_usermm(bo->tbo.ttm),
-+ "Called with userptr BO"))
-+ return -EINVAL;
-+
-+ amdgpu_ttm_placement_from_domain(bo, domain);
-+
-+ ret = ttm_bo_validate(&bo->tbo, &bo->placement, false, false);
-+ if (ret)
-+ goto validate_fail;
-+ if (wait) {
-+ struct amdgpu_amdkfd_fence **ef_list;
-+ unsigned int ef_count;
-+
-+ ret = amdgpu_amdkfd_remove_eviction_fence(bo, NULL, &ef_list,
-+ &ef_count);
-+ if (ret)
-+ goto validate_fail;
-+
-+ ttm_bo_wait(&bo->tbo, false, false);
-+ amdgpu_amdkfd_add_eviction_fence(bo, ef_list, ef_count);
-+ }
-+
-+validate_fail:
-+ return ret;
-+}
-+
-+static int amdgpu_amdkfd_validate(void *param, struct amdgpu_bo *bo)
-+{
-+ struct amdgpu_vm_parser *p = param;
-+
-+ return amdgpu_amdkfd_bo_validate(bo, p->domain, p->wait);
-+}
-+
-+/* vm_validate_pt_pd_bos - Validate page table and directory BOs
-+ *
-+ * Also updates page directory entries so we don't need to do this
-+ * again later until the page directory is validated again (e.g. after
-+ * an eviction or allocating new page tables).
-+ */
-+static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm)
-+{
-+ struct amdgpu_bo *pd = vm->root.base.bo;
-+ struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev);
-+ struct amdgpu_vm_parser param;
-+ int ret;
-+
-+ param.domain = AMDGPU_GEM_DOMAIN_VRAM;
-+ param.wait = false;
-+
-+ ret = amdgpu_vm_validate_pt_bos(adev, vm, amdgpu_amdkfd_validate,
-+ &param);
-+ if (ret) {
-+ pr_err("amdgpu: failed to validate PT BOs\n");
-+ return ret;
-+ }
-+
-+ ret = amdgpu_amdkfd_validate(&param, pd);
-+ if (ret) {
-+ pr_err("amdgpu: failed to validate PD\n");
-+ return ret;
-+ }
-+
-+ ret = amdgpu_vm_update_directories(adev, vm);
-+ if (ret != 0)
-+ return ret;
-+
-+ return 0;
-+}
-+
-+/* add_bo_to_vm - Add a BO to a VM
-+ *
-+ * Everything that needs to be done only once when a BO is first added
-+ * to a VM.
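(Each such addition is tracked by its own kfd_bo_va_list entry on mem->bo_va_list.)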
It can later be mapped and unmapped many times without
-+ * repeating these steps.
-+ *
-+ * 1. Allocate and initialize BO VA entry data structure
-+ * 2. Add BO to the VM
-+ * 3. Determine ASIC-specific PTE flags
-+ * 4. Alloc page tables and directories if needed
-+ * 4a. Validate new page tables and directories and update directories
-+ */
-+static int add_bo_to_vm(struct amdgpu_device *adev, struct kgd_mem *mem,
-+ struct amdgpu_vm *avm, bool is_aql,
-+ struct kfd_bo_va_list **p_bo_va_entry)
-+{
-+ int ret;
-+ struct kfd_bo_va_list *bo_va_entry;
-+ struct amdkfd_vm *kvm = container_of(avm,
-+ struct amdkfd_vm, base);
-+ struct amdgpu_bo *pd = avm->root.base.bo;
-+ struct amdgpu_bo *bo = mem->bo;
-+ uint64_t va = mem->va;
-+ struct list_head *list_bo_va = &mem->bo_va_list;
-+ unsigned long bo_size = bo->tbo.mem.size;
-+
-+ if (!va) {
-+ pr_err("Invalid VA when adding BO to VM\n");
-+ return -EINVAL;
-+ }
-+
-+ if (is_aql)
-+ va += bo_size;
-+
-+ bo_va_entry = kzalloc(sizeof(*bo_va_entry), GFP_KERNEL);
-+ if (!bo_va_entry)
-+ return -ENOMEM;
-+
-+ pr_debug("\t add VA 0x%llx - 0x%llx to vm %p\n", va,
-+ va + bo_size, avm);
-+
-+ /* Add BO to VM internal data structures */
-+ bo_va_entry->bo_va = amdgpu_vm_bo_add(adev, avm, bo);
-+ if (bo_va_entry->bo_va == NULL) {
-+ ret = -EINVAL;
-+ pr_err("Failed to add BO object to VM. ret == %d\n",
-+ ret);
-+ goto err_vmadd;
-+ }
-+
-+ bo_va_entry->va = va;
-+ bo_va_entry->pte_flags = amdgpu_vm_get_pte_flags(adev,
-+ mem->mapping_flags);
-+ bo_va_entry->kgd_dev = (void *)adev;
-+ list_add(&bo_va_entry->bo_list, list_bo_va);
-+
-+ if (p_bo_va_entry)
-+ *p_bo_va_entry = bo_va_entry;
-+
-+ /* Allocate new page tables if needed and validate
-+ * them. Clearing the new page tables and validating them needs
-+ * to wait on move fences. We don't want that to trigger the
-+ * eviction fence, so remove it temporarily.
-+ */
-+ amdgpu_amdkfd_remove_eviction_fence(pd,
-+ kvm->process_info->eviction_fence,
-+ NULL, NULL);
-+
-+ ret = amdgpu_vm_alloc_pts(adev, avm, va, amdgpu_bo_size(bo));
-+ if (ret) {
-+ pr_err("Failed to allocate pts, err=%d\n", ret);
-+ goto err_alloc_pts;
-+ }
-+
-+ ret = vm_validate_pt_pd_bos(avm);
-+ if (ret != 0) {
-+ pr_err("validate_pt_pd_bos() failed\n");
-+ goto err_alloc_pts;
-+ }
-+
-+ /* Add the eviction fence back */
-+ amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true);
-+
-+ return 0;
-+
-+err_alloc_pts:
-+ amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true);
-+ amdgpu_vm_bo_rmv(adev, bo_va_entry->bo_va);
-+ list_del(&bo_va_entry->bo_list);
-+err_vmadd:
-+ kfree(bo_va_entry);
-+ return ret;
-+}
-+
-+static void remove_bo_from_vm(struct amdgpu_device *adev,
-+ struct kfd_bo_va_list *entry, unsigned long size)
-+{
-+ pr_debug("\t remove VA 0x%llx - 0x%llx in entry %p\n",
-+ entry->va,
-+ entry->va + size, entry);
-+ amdgpu_vm_bo_rmv(adev, entry->bo_va);
-+ list_del(&entry->bo_list);
-+ kfree(entry);
-+}
-+
-+static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem,
-+ struct amdkfd_process_info *process_info,
-+ bool userptr)
-+{
-+ struct ttm_validate_buffer *entry = &mem->validate_list;
-+ struct amdgpu_bo *bo = mem->bo;
-+
-+ INIT_LIST_HEAD(&entry->head);
-+ entry->shared = true;
-+ entry->bo = &bo->tbo;
-+ mutex_lock(&process_info->lock);
-+ if (userptr)
-+ list_add_tail(&entry->head, &process_info->userptr_valid_list);
-+ else
-+ list_add_tail(&entry->head, &process_info->kfd_bo_list);
-+ mutex_unlock(&process_info->lock);
-+}
-+
-+/* Initializes user pages.
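(This is called once per userptr allocation, from __alloc_memory_of_gpu().)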
It registers the MMU notifier and validates -+ * the userptr BO in the GTT domain. -+ * -+ * The BO must already be on the userptr_valid_list. Otherwise an -+ * eviction and restore may happen that leaves the new BO unmapped -+ * with the user mode queues running. -+ * -+ * Takes the process_info->lock to protect against concurrent restore -+ * workers. -+ * -+ * Returns 0 for success, negative errno for errors. -+ */ -+static int init_user_pages(struct kgd_mem *mem, struct mm_struct *mm, -+ uint64_t user_addr) -+{ -+ struct amdkfd_process_info *process_info = mem->process_info; -+ struct amdgpu_bo *bo = mem->bo; -+ int ret = 0; -+ -+ mutex_lock(&process_info->lock); -+ -+ ret = amdgpu_ttm_tt_set_userptr(bo->tbo.ttm, user_addr, 0); -+ if (ret) { -+ pr_err("%s: Failed to set userptr: %d\n", __func__, ret); -+ goto out; -+ } -+ -+ ret = amdgpu_mn_register(bo, user_addr); -+ if (ret) { -+ pr_err("%s: Failed to register MMU notifier: %d\n", -+ __func__, ret); -+ goto out; -+ } -+ -+ /* If no restore worker is running concurrently, user_pages -+ * should not be allocated -+ */ -+ WARN(mem->user_pages, "Leaking user_pages array"); -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) -+ mem->user_pages = drm_calloc_large(bo->tbo.ttm->num_pages, -+ sizeof(struct page *)); -+#else -+ mem->user_pages = kvmalloc_array(bo->tbo.ttm->num_pages, -+ sizeof(struct page *), -+ GFP_KERNEL | __GFP_ZERO); -+#endif -+ if (!mem->user_pages) { -+ pr_err("%s: Failed to allocate pages array\n", __func__); -+ ret = -ENOMEM; -+ goto unregister_out; -+ } -+ -+ ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, mem->user_pages); -+ if (ret) { -+ pr_err("%s: Failed to get user pages: %d\n", __func__, ret); -+ goto free_out; -+ } -+ -+ amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, mem->user_pages); -+ -+ ret = amdgpu_bo_reserve(bo, true); -+ if (ret) { -+ pr_err("%s: Failed to reserve BO\n", __func__); -+ goto release_out; -+ } -+ amdgpu_ttm_placement_from_domain(bo, mem->domain); -+ ret = ttm_bo_validate(&bo->tbo, &bo->placement, -+ true, false); -+ if (ret) -+ pr_err("%s: failed to validate BO\n", __func__); -+ amdgpu_bo_unreserve(bo); -+ -+release_out: -+ if (ret) -+ release_pages(mem->user_pages, bo->tbo.ttm->num_pages, 0); -+free_out: -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) -+ drm_free_large(mem->user_pages); -+#else -+ kvfree(mem->user_pages); -+#endif -+ mem->user_pages = NULL; -+unregister_out: -+ if (ret) -+ amdgpu_mn_unregister(bo); -+out: -+ mutex_unlock(&process_info->lock); -+ return ret; -+} -+ -+static int __map_bo_to_kernel(struct amdgpu_bo *bo, u32 domain, void **kptr) -+{ -+ int ret; -+ -+ ret = amdgpu_bo_reserve(bo, true); -+ if (ret) { -+ pr_err("Failed to reserve bo. ret %d\n", ret); -+ return ret; -+ } -+ -+ ret = amdgpu_bo_pin(bo, domain, NULL); -+ if (ret) { -+ pr_err("Failed to pin bo. ret %d\n", ret); -+ goto pin_failed; -+ } -+ -+ ret = amdgpu_bo_kmap(bo, kptr); -+ if (ret) { -+ pr_err("Failed to map bo to kernel. 
ret %d\n", ret); -+ goto kmap_failed; -+ } -+ -+ amdgpu_bo_unreserve(bo); -+ -+ return ret; -+ -+kmap_failed: -+ amdgpu_bo_unpin(bo); -+pin_failed: -+ amdgpu_bo_unreserve(bo); -+ -+ return ret; -+} -+ -+static int __alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va, -+ uint64_t size, void *vm, struct kgd_mem **mem, -+ uint64_t *offset, u32 domain, u64 flags, -+ struct sg_table *sg, bool aql_queue, -+ bool readonly, bool execute, bool coherent, bool no_sub, -+ bool userptr) -+{ -+ struct amdgpu_device *adev; -+ int ret; -+ struct amdgpu_bo *bo; -+ uint64_t user_addr = 0; -+ int byte_align; -+ u32 alloc_domain; -+ uint32_t mapping_flags; -+ struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm; -+ -+ if (aql_queue) -+ size = size >> 1; -+ if (userptr) { -+ if (!offset || !*offset) -+ return -EINVAL; -+ user_addr = *offset; -+ } -+ -+ adev = get_amdgpu_device(kgd); -+ byte_align = (adev->family == AMDGPU_FAMILY_VI && -+ adev->asic_type != CHIP_FIJI && -+ adev->asic_type != CHIP_POLARIS10 && -+ adev->asic_type != CHIP_POLARIS11) ? -+ VI_BO_SIZE_ALIGN : 1; -+ -+ *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); -+ if (*mem == NULL) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ INIT_LIST_HEAD(&(*mem)->bo_va_list); -+ mutex_init(&(*mem)->lock); -+ (*mem)->coherent = coherent; -+ (*mem)->no_substitute = no_sub; -+ (*mem)->aql_queue = aql_queue; -+ -+ mapping_flags = AMDGPU_VM_PAGE_READABLE; -+ if (!readonly) -+ mapping_flags |= AMDGPU_VM_PAGE_WRITEABLE; -+ if (execute) -+ mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE; -+ if (coherent) -+ mapping_flags |= AMDGPU_VM_MTYPE_UC; -+ else -+ mapping_flags |= AMDGPU_VM_MTYPE_NC; -+ -+ (*mem)->mapping_flags = mapping_flags; -+ -+ alloc_domain = userptr ? AMDGPU_GEM_DOMAIN_CPU : domain; -+ -+ amdgpu_sync_create(&(*mem)->sync); -+ -+ ret = amdgpu_amdkfd_reserve_system_mem_limit(adev, size, alloc_domain); -+ if (ret) { -+ pr_err("Insufficient system memory\n"); -+ goto err_bo_create; -+ } -+ -+ pr_debug("\t create BO VA 0x%llx size 0x%llx domain %s\n", -+ va, size, domain_string(alloc_domain)); -+ -+ /* Allocate buffer object. Userptr objects need to start out -+ * in the CPU domain, get moved to GTT when pinned. -+ */ -+ ret = amdgpu_bo_create(adev, size, byte_align, false, -+ alloc_domain, -+ flags, sg, NULL, 0, &bo); -+ if (ret != 0) { -+ pr_err("Failed to create BO on domain %s. ret %d\n", -+ domain_string(alloc_domain), ret); -+ unreserve_system_mem_limit(adev, size, alloc_domain); -+ goto err_bo_create; -+ } -+ bo->kfd_bo = *mem; -+ (*mem)->bo = bo; -+ if (userptr) -+ bo->flags |= AMDGPU_AMDKFD_USERPTR_BO; -+ -+ (*mem)->va = va; -+ (*mem)->domain = domain; -+ (*mem)->mapped_to_gpu_memory = 0; -+ (*mem)->process_info = kfd_vm->process_info; -+ add_kgd_mem_to_kfd_bo_list(*mem, kfd_vm->process_info, userptr); -+ -+ if (userptr) { -+ ret = init_user_pages(*mem, current->mm, user_addr); -+ if (ret) { -+ mutex_lock(&kfd_vm->process_info->lock); -+ list_del(&(*mem)->validate_list.head); -+ mutex_unlock(&kfd_vm->process_info->lock); -+ goto allocate_init_user_pages_failed; -+ } -+ } -+ -+ if (offset) -+ *offset = amdgpu_bo_mmap_offset(bo); -+ -+ return 0; -+ -+allocate_init_user_pages_failed: -+ amdgpu_bo_unref(&bo); -+err_bo_create: -+ kfree(*mem); -+err: -+ return ret; -+} -+ -+/* Reserving a BO and its page table BOs must happen atomically to -+ * avoid deadlocks. When updating userptrs we need to temporarily -+ * back-off the reservation and then reacquire it. Track all the -+ * reservation info in a context structure. 
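The ww_acquire ticket kept in that structure is what makes the back-off and reacquire safe.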
Buffers can be mapped to
-+ * multiple VMs simultaneously (buffers being restored on multiple
-+ * GPUs).
-+ */
-+struct bo_vm_reservation_context {
-+ struct amdgpu_bo_list_entry kfd_bo;
-+ unsigned int n_vms;
-+ struct amdgpu_bo_list_entry *vm_pd;
-+ struct ww_acquire_ctx ticket;
-+ struct list_head list, duplicates;
-+ struct amdgpu_sync *sync;
-+ bool reserved;
-+};
-+
-+/**
-+ * reserve_bo_and_vm - reserve a BO and a VM unconditionally.
-+ * @mem: KFD BO structure.
-+ * @vm: the VM to reserve.
-+ * @ctx: the struct that will be used in unreserve_bo_and_vms().
-+ */
-+static int reserve_bo_and_vm(struct kgd_mem *mem,
-+ struct amdgpu_vm *vm,
-+ struct bo_vm_reservation_context *ctx)
-+{
-+ struct amdgpu_bo *bo = mem->bo;
-+ int ret;
-+
-+ WARN_ON(!vm);
-+
-+ ctx->reserved = false;
-+ ctx->n_vms = 1;
-+ ctx->sync = &mem->sync;
-+
-+ INIT_LIST_HEAD(&ctx->list);
-+ INIT_LIST_HEAD(&ctx->duplicates);
-+
-+ ctx->vm_pd = kzalloc(sizeof(struct amdgpu_bo_list_entry)
-+ * ctx->n_vms, GFP_KERNEL);
-+ if (ctx->vm_pd == NULL)
-+ return -ENOMEM;
-+
-+ ctx->kfd_bo.robj = bo;
-+ ctx->kfd_bo.priority = 0;
-+ ctx->kfd_bo.tv.bo = &bo->tbo;
-+ ctx->kfd_bo.tv.shared = true;
-+ ctx->kfd_bo.user_pages = NULL;
-+ list_add(&ctx->kfd_bo.tv.head, &ctx->list);
-+
-+ amdgpu_vm_get_pd_bo(vm, &ctx->list, &ctx->vm_pd[0]);
-+
-+ ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list,
-+ false, &ctx->duplicates);
-+ if (!ret)
-+ ctx->reserved = true;
-+ else
-+ pr_err("Failed to reserve buffers in ttm\n");
-+
-+ if (ret) {
-+ kfree(ctx->vm_pd);
-+ ctx->vm_pd = NULL;
-+ }
-+
-+ return ret;
-+}
-+
-+enum VA_TYPE {
-+ VA_NOT_MAPPED = 0,
-+ VA_MAPPED,
-+ VA_DO_NOT_CARE,
-+};
-+
-+/**
-+ * reserve_bo_and_cond_vms - reserve a BO and some VMs that the BO has been
-+ * added to, conditionally based on map_type.
-+ * @mem: KFD BO structure.
-+ * @vm: the VM to reserve. If NULL, then all VMs associated with the BO
-+ * are used. Otherwise, a single VM associated with the BO.
-+ * @map_type: the mapping status that will be used to filter the VMs.
-+ * @ctx: the struct that will be used in unreserve_bo_and_vms().
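-+ *
-+ * Returns 0 on success, -ENOMEM if the vm_pd array cannot be
-+ * allocated, or the error code from ttm_eu_reserve_buffers().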
-+ */ -+static int reserve_bo_and_cond_vms(struct kgd_mem *mem, -+ struct amdgpu_vm *vm, enum VA_TYPE map_type, -+ struct bo_vm_reservation_context *ctx) -+{ -+ struct amdgpu_bo *bo = mem->bo; -+ struct kfd_bo_va_list *entry; -+ unsigned int i; -+ int ret; -+ -+ ctx->reserved = false; -+ ctx->n_vms = 0; -+ ctx->vm_pd = NULL; -+ ctx->sync = &mem->sync; -+ -+ INIT_LIST_HEAD(&ctx->list); -+ INIT_LIST_HEAD(&ctx->duplicates); -+ -+ list_for_each_entry(entry, &mem->bo_va_list, bo_list) { -+ if ((vm && vm != entry->bo_va->base.vm) || -+ (entry->is_mapped != map_type -+ && map_type != VA_DO_NOT_CARE)) -+ continue; -+ -+ ctx->n_vms++; -+ } -+ -+ if (ctx->n_vms != 0) { -+ ctx->vm_pd = kzalloc(sizeof(struct amdgpu_bo_list_entry) -+ * ctx->n_vms, GFP_KERNEL); -+ if (ctx->vm_pd == NULL) -+ return -ENOMEM; -+ } -+ -+ ctx->kfd_bo.robj = bo; -+ ctx->kfd_bo.priority = 0; -+ ctx->kfd_bo.tv.bo = &bo->tbo; -+ ctx->kfd_bo.tv.shared = true; -+ ctx->kfd_bo.user_pages = NULL; -+ list_add(&ctx->kfd_bo.tv.head, &ctx->list); -+ -+ i = 0; -+ list_for_each_entry(entry, &mem->bo_va_list, bo_list) { -+ if ((vm && vm != entry->bo_va->base.vm) || -+ (entry->is_mapped != map_type -+ && map_type != VA_DO_NOT_CARE)) -+ continue; -+ -+ amdgpu_vm_get_pd_bo(entry->bo_va->base.vm, &ctx->list, -+ &ctx->vm_pd[i]); -+ i++; -+ } -+ -+ ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list, -+ false, &ctx->duplicates); -+ if (!ret) -+ ctx->reserved = true; -+ else -+ pr_err("Failed to reserve buffers in ttm.\n"); -+ -+ if (ret) { -+ kfree(ctx->vm_pd); -+ ctx->vm_pd = NULL; -+ } -+ -+ return ret; -+} -+ -+static int unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx, -+ bool wait, bool intr) -+{ -+ int ret = 0; -+ -+ if (wait) -+ ret = amdgpu_sync_wait(ctx->sync, intr); -+ -+ if (ctx->reserved) -+ ttm_eu_backoff_reservation(&ctx->ticket, &ctx->list); -+ kfree(ctx->vm_pd); -+ -+ ctx->sync = NULL; -+ -+ ctx->reserved = false; -+ ctx->vm_pd = NULL; -+ -+ return ret; -+} -+ -+static int unmap_bo_from_gpuvm(struct amdgpu_device *adev, -+ struct kfd_bo_va_list *entry, -+ struct amdgpu_sync *sync) -+{ -+ struct amdgpu_bo_va *bo_va = entry->bo_va; -+ struct amdgpu_vm *vm = bo_va->base.vm; -+ struct amdkfd_vm *kvm = container_of(vm, struct amdkfd_vm, base); -+ struct amdgpu_bo *pd = vm->root.base.bo; -+ -+ /* Remove eviction fence from PD (and thereby from PTs too as they -+ * share the resv. object. Otherwise during PT update job (see -+ * amdgpu_vm_bo_update_mapping), eviction fence will get added to -+ * job->sync object -+ */ -+ amdgpu_amdkfd_remove_eviction_fence(pd, -+ kvm->process_info->eviction_fence, -+ NULL, NULL); -+ amdgpu_vm_bo_unmap(adev, bo_va, entry->va); -+ -+ amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update); -+ -+ /* Add the eviction fence back */ -+ amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true); -+ -+ amdgpu_sync_fence(adev, sync, bo_va->last_pt_update); -+ -+ /* Sync objects can't handle multiple GPUs (contexts) updating -+ * sync->last_vm_update. Fortunately we don't need it for -+ * KFD's purposes, so we can just drop that fence. 
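-+ * (update_gpuvm_pte() below does the same after its page table update.)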
-+ */ -+ if (sync->last_vm_update) { -+ dma_fence_put(sync->last_vm_update); -+ sync->last_vm_update = NULL; -+ } -+ -+ return 0; -+} -+ -+static int update_gpuvm_pte(struct amdgpu_device *adev, -+ struct kfd_bo_va_list *entry, -+ struct amdgpu_sync *sync) -+{ -+ int ret; -+ struct amdgpu_vm *vm; -+ struct amdgpu_bo_va *bo_va; -+ struct amdgpu_bo *bo; -+ -+ bo_va = entry->bo_va; -+ vm = bo_va->base.vm; -+ bo = bo_va->base.bo; -+ -+ /* Update the page tables */ -+ ret = amdgpu_vm_bo_update(adev, bo_va, false); -+ if (ret != 0) { -+ pr_err("amdgpu_vm_bo_update failed\n"); -+ return ret; -+ } -+ -+ amdgpu_sync_fence(adev, sync, bo_va->last_pt_update); -+ -+ /* Sync objects can't handle multiple GPUs (contexts) updating -+ * sync->last_vm_update. Fortunately we don't need it for -+ * KFD's purposes, so we can just drop that fence. -+ */ -+ if (sync->last_vm_update) { -+ dma_fence_put(sync->last_vm_update); -+ sync->last_vm_update = NULL; -+ } -+ -+ return 0; -+} -+ -+static int map_bo_to_gpuvm(struct amdgpu_device *adev, -+ struct kfd_bo_va_list *entry, struct amdgpu_sync *sync, -+ bool no_update_pte) -+{ -+ int ret; -+ -+ /* Set virtual address for the allocation */ -+ ret = amdgpu_vm_bo_map(adev, entry->bo_va, entry->va, 0, -+ amdgpu_bo_size(entry->bo_va->base.bo), entry->pte_flags); -+ if (ret != 0) { -+ pr_err("Failed to map VA 0x%llx in vm. ret %d\n", -+ entry->va, ret); -+ return ret; -+ } -+ -+ if (no_update_pte) -+ return 0; -+ -+ ret = update_gpuvm_pte(adev, entry, sync); -+ if (ret != 0) { -+ pr_err("update_gpuvm_pte() failed\n"); -+ goto update_gpuvm_pte_failed; -+ } -+ -+ return 0; -+ -+update_gpuvm_pte_failed: -+ unmap_bo_from_gpuvm(adev, entry, sync); -+ return ret; -+} -+ -+static struct sg_table *create_doorbell_sg(uint64_t addr, uint32_t size) -+{ -+ struct sg_table *sg = kmalloc(sizeof(*sg), GFP_KERNEL); -+ -+ if (!sg) -+ return NULL; -+ if (sg_alloc_table(sg, 1, GFP_KERNEL)) { -+ kfree(sg); -+ return NULL; -+ } -+ sg->sgl->dma_address = addr; -+ sg->sgl->length = size; -+#ifdef CONFIG_NEED_SG_DMA_LENGTH -+ sg->sgl->dma_length = size; -+#endif -+ return sg; -+} -+ -+int amdgpu_amdkfd_gpuvm_sync_memory( -+ struct kgd_dev *kgd, struct kgd_mem *mem, bool intr) -+{ -+ int ret = 0; -+ struct amdgpu_sync sync; -+ struct amdgpu_device *adev; -+ -+ adev = get_amdgpu_device(kgd); -+ amdgpu_sync_create(&sync); -+ -+ mutex_lock(&mem->lock); -+ amdgpu_sync_clone(adev, &mem->sync, &sync); -+ mutex_unlock(&mem->lock); -+ -+ ret = amdgpu_sync_wait(&sync, intr); -+ amdgpu_sync_free(&sync); -+ return ret; -+} -+ -+#define BOOL_TO_STR(b) (b == true) ? "true" : "false" -+ -+int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( -+ struct kgd_dev *kgd, uint64_t va, uint64_t size, -+ void *vm, struct kgd_mem **mem, -+ uint64_t *offset, uint32_t flags) -+{ -+ bool aql_queue, public, readonly, execute, coherent, no_sub, userptr; -+ u64 alloc_flag; -+ uint32_t domain; -+ uint64_t *temp_offset; -+ struct sg_table *sg = NULL; -+ -+ if (!(flags & ALLOC_MEM_FLAGS_NONPAGED)) { -+ pr_err("current hw doesn't support paged memory\n"); -+ return -EINVAL; -+ } -+ -+ domain = 0; -+ alloc_flag = 0; -+ temp_offset = NULL; -+ -+ aql_queue = (flags & ALLOC_MEM_FLAGS_AQL_QUEUE_MEM) ? true : false; -+ public = (flags & ALLOC_MEM_FLAGS_PUBLIC) ? true : false; -+ readonly = (flags & ALLOC_MEM_FLAGS_READONLY) ? true : false; -+ execute = (flags & ALLOC_MEM_FLAGS_EXECUTE_ACCESS) ? true : false; -+ coherent = (flags & ALLOC_MEM_FLAGS_COHERENT) ? true : false; -+ no_sub = (flags & ALLOC_MEM_FLAGS_NO_SUBSTITUTE) ? 
true : false; -+ userptr = (flags & ALLOC_MEM_FLAGS_USERPTR) ? true : false; -+ -+ /* -+ * Check on which domain to allocate BO -+ */ -+ if (flags & ALLOC_MEM_FLAGS_VRAM) { -+ domain = AMDGPU_GEM_DOMAIN_VRAM; -+ alloc_flag = AMDGPU_GEM_CREATE_NO_CPU_ACCESS; -+ if (public) { -+ alloc_flag = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; -+ temp_offset = offset; -+ } -+ alloc_flag |= AMDGPU_GEM_CREATE_VRAM_CLEARED; -+ } else if (flags & (ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_USERPTR)) { -+ domain = AMDGPU_GEM_DOMAIN_GTT; -+ alloc_flag = 0; -+ temp_offset = offset; -+ } else if (flags & ALLOC_MEM_FLAGS_DOORBELL) { -+ domain = AMDGPU_GEM_DOMAIN_GTT; -+ alloc_flag = 0; -+ temp_offset = offset; -+ if (size > UINT_MAX) -+ return -EINVAL; -+ sg = create_doorbell_sg(*offset, size); -+ if (!sg) -+ return -ENOMEM; -+ } -+ -+ if (offset && !userptr) -+ *offset = 0; -+ -+ pr_debug("Allocate VA 0x%llx - 0x%llx domain %s aql %s\n", -+ va, va + size, domain_string(domain), -+ BOOL_TO_STR(aql_queue)); -+ -+ pr_debug("\t alloc_flag 0x%llx public %s readonly %s execute %s coherent %s no_sub %s\n", -+ alloc_flag, BOOL_TO_STR(public), -+ BOOL_TO_STR(readonly), BOOL_TO_STR(execute), -+ BOOL_TO_STR(coherent), BOOL_TO_STR(no_sub)); -+ -+ return __alloc_memory_of_gpu(kgd, va, size, vm, mem, -+ temp_offset, domain, -+ alloc_flag, sg, -+ aql_queue, readonly, execute, -+ coherent, no_sub, userptr); -+} -+ -+int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( -+ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) -+{ -+ struct amdgpu_device *adev; -+ struct kfd_bo_va_list *entry, *tmp; -+ struct bo_vm_reservation_context ctx; -+ int ret = 0; -+ struct ttm_validate_buffer *bo_list_entry; -+ struct amdkfd_process_info *process_info; -+ unsigned long bo_size; -+ -+ adev = get_amdgpu_device(kgd); -+ process_info = ((struct amdkfd_vm *)vm)->process_info; -+ -+ bo_size = mem->bo->tbo.mem.size; -+ -+ mutex_lock(&mem->lock); -+ -+ if (mem->mapped_to_gpu_memory > 0) { -+ pr_err("BO VA 0x%llx size 0x%lx is already mapped to vm %p.\n", -+ mem->va, bo_size, vm); -+ mutex_unlock(&mem->lock); -+ return -EBUSY; -+ } -+ -+ mutex_unlock(&mem->lock); -+ /* lock is not needed after this, since mem is unused and will -+ * be freed anyway -+ */ -+ -+ /* No more MMU notifiers */ -+ amdgpu_mn_unregister(mem->bo); -+ -+ /* Make sure restore workers don't access the BO any more */ -+ bo_list_entry = &mem->validate_list; -+ mutex_lock(&process_info->lock); -+ list_del(&bo_list_entry->head); -+ mutex_unlock(&process_info->lock); -+ -+ /* Free user pages if necessary */ -+ if (mem->user_pages) { -+ pr_debug("%s: Freeing user_pages array\n", __func__); -+ if (mem->user_pages[0]) -+ release_pages(mem->user_pages, -+ mem->bo->tbo.ttm->num_pages, 0); -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) -+ drm_free_large(mem->user_pages); -+#else -+ kvfree(mem->user_pages); -+#endif -+ } -+ -+ ret = reserve_bo_and_cond_vms(mem, NULL, VA_DO_NOT_CARE, &ctx); -+ if (unlikely(ret != 0)) -+ return ret; -+ -+ /* The eviction fence should be removed by the last unmap. 
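-+ * Passing ef with a NULL ef_list below removes just that fence,
-+ * without collecting the remaining ones.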
-+ * TODO: Log an error condition if the bo still has the eviction fence
-+ * attached
-+ */
-+ amdgpu_amdkfd_remove_eviction_fence(mem->bo,
-+ process_info->eviction_fence,
-+ NULL, NULL);
-+ pr_debug("Release VA 0x%llx - 0x%llx\n", mem->va,
-+ mem->va + bo_size * (1 + mem->aql_queue));
-+
-+ /* Remove from VM internal data structures */
-+ list_for_each_entry_safe(entry, tmp, &mem->bo_va_list, bo_list) {
-+ remove_bo_from_vm((struct amdgpu_device *)entry->kgd_dev,
-+ entry, bo_size);
-+ }
-+
-+ ret = unreserve_bo_and_vms(&ctx, false, false);
-+
-+ /* Free the sync object */
-+ amdgpu_sync_free(&mem->sync);
-+
-+ /* If the SG is not NULL, it's one we created for a doorbell
-+ * BO. We need to free it.
-+ */
-+ if (mem->bo->tbo.sg) {
-+ sg_free_table(mem->bo->tbo.sg);
-+ kfree(mem->bo->tbo.sg);
-+ }
-+
-+ /* Free the BO */
-+ amdgpu_bo_unref(&mem->bo);
-+ kfree(mem);
-+
-+ return ret;
-+}
-+
-+int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
-+ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm)
-+{
-+ struct amdgpu_device *adev;
-+ int ret;
-+ struct amdgpu_bo *bo;
-+ uint32_t domain;
-+ struct kfd_bo_va_list *entry;
-+ struct bo_vm_reservation_context ctx;
-+ struct kfd_bo_va_list *bo_va_entry = NULL;
-+ struct kfd_bo_va_list *bo_va_entry_aql = NULL;
-+ struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm;
-+ unsigned long bo_size;
-+ bool is_invalid_userptr;
-+
-+ adev = get_amdgpu_device(kgd);
-+
-+ /* Make sure restore is not running concurrently. Since we
-+ * don't map invalid userptr BOs, we rely on the next restore
-+ * worker to do the mapping
-+ */
-+ mutex_lock(&mem->process_info->lock);
-+
-+ /* Lock mmap-sem. If we find an invalid userptr BO, we can be
-+ * sure that the MMU notifier is no longer running
-+ * concurrently and the queues are actually stopped
-+ */
-+ down_read(&current->mm->mmap_sem);
-+ is_invalid_userptr = atomic_read(&mem->invalid);
-+ up_read(&current->mm->mmap_sem);
-+
-+ mutex_lock(&mem->lock);
-+
-+ bo = mem->bo;
-+
-+ if (!bo) {
-+ pr_err("Invalid BO when mapping memory to GPU\n");
-+ return -EINVAL;
-+ }
-+
-+ domain = mem->domain;
-+ bo_size = bo->tbo.mem.size;
-+
-+ pr_debug("Map VA 0x%llx - 0x%llx to vm %p domain %s\n",
-+ mem->va,
-+ mem->va + bo_size * (1 + mem->aql_queue),
-+ vm, domain_string(domain));
-+
-+ ret = reserve_bo_and_vm(mem, vm, &ctx);
-+ if (unlikely(ret != 0))
-+ goto bo_reserve_failed;
-+
-+ /* Userptr can be marked as "not invalid", but not actually be
-+ * validated yet (still in the system domain). In that case
-+ * the queues are still stopped and we can leave mapping for
-+ * the next restore worker
-+ */
-+ if (bo->tbo.mem.mem_type == TTM_PL_SYSTEM)
-+ is_invalid_userptr = true;
-+
-+ if (check_if_add_bo_to_vm((struct amdgpu_vm *)vm, mem)) {
-+ ret = add_bo_to_vm(adev, mem, (struct amdgpu_vm *)vm, false,
-+ &bo_va_entry);
-+ if (ret != 0)
-+ goto add_bo_to_vm_failed;
-+ if (mem->aql_queue) {
-+ ret = add_bo_to_vm(adev, mem, (struct amdgpu_vm *)vm,
-+ true, &bo_va_entry_aql);
-+ if (ret != 0)
-+ goto add_bo_to_vm_failed_aql;
-+ }
-+ }
-+
-+ if (mem->mapped_to_gpu_memory == 0 &&
-+ !amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) {
-+ /* Validate BO only once. The eviction fence gets added to BO
-+ * the first time it is mapped. Validate will wait for all
-+ * background evictions to complete.
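-+ * Userptr BOs are skipped here; those are validated by
-+ * init_user_pages() and by the restore worker instead.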
-+ */ -+ ret = amdgpu_amdkfd_bo_validate(bo, domain, true); -+ if (ret) { -+ pr_debug("Validate failed\n"); -+ goto map_bo_to_gpuvm_failed; -+ } -+ } -+ -+ list_for_each_entry(entry, &mem->bo_va_list, bo_list) { -+ if (entry->bo_va->base.vm == vm && !entry->is_mapped) { -+ pr_debug("\t map VA 0x%llx - 0x%llx in entry %p\n", -+ entry->va, entry->va + bo_size, -+ entry); -+ -+ ret = map_bo_to_gpuvm(adev, entry, ctx.sync, -+ is_invalid_userptr); -+ if (ret != 0) { -+ pr_err("Failed to map radeon bo to gpuvm\n"); -+ goto map_bo_to_gpuvm_failed; -+ } -+ entry->is_mapped = true; -+ mem->mapped_to_gpu_memory++; -+ pr_debug("\t INC mapping count %d\n", -+ mem->mapped_to_gpu_memory); -+ } -+ } -+ -+ if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm) == NULL) -+ amdgpu_bo_fence(bo, -+ &kfd_vm->process_info->eviction_fence->base, -+ true); -+ ret = unreserve_bo_and_vms(&ctx, false, false); -+ -+ mutex_unlock(&mem->process_info->lock); -+ mutex_unlock(&mem->lock); -+ return ret; -+ -+map_bo_to_gpuvm_failed: -+ if (bo_va_entry_aql) -+ remove_bo_from_vm(adev, bo_va_entry_aql, bo_size); -+add_bo_to_vm_failed_aql: -+ if (bo_va_entry) -+ remove_bo_from_vm(adev, bo_va_entry, bo_size); -+add_bo_to_vm_failed: -+ unreserve_bo_and_vms(&ctx, false, false); -+bo_reserve_failed: -+ mutex_unlock(&mem->process_info->lock); -+ mutex_unlock(&mem->lock); -+ return ret; -+} -+ -+static u64 get_vm_pd_gpu_offset(void *vm) -+{ -+ struct amdgpu_vm *avm = (struct amdgpu_vm *) vm; -+ struct amdgpu_device *adev = -+ amdgpu_ttm_adev(avm->root.base.bo->tbo.bdev); -+ u64 offset; -+ -+ BUG_ON(avm == NULL); -+ -+ amdgpu_bo_reserve(avm->root.base.bo, false); -+ -+ offset = amdgpu_bo_gpu_offset(avm->root.base.bo); -+ -+ amdgpu_bo_unreserve(avm->root.base.bo); -+ -+ /* On some ASICs the FB doesn't start at 0. Adjust FB offset -+ * to an actual MC address. 
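-+ * (Only applied when the ASIC provides a get_vm_pde callback.)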
-+ */ -+ if (adev->gart.gart_funcs->get_vm_pde) -+ offset = amdgpu_gart_get_vm_pde(adev, offset); -+ -+ return offset; -+} -+ -+int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm, -+ void **process_info, -+ struct dma_fence **ef) -+{ -+ int ret; -+ struct amdkfd_vm *new_vm; -+ struct amdkfd_process_info *info; -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ -+ new_vm = kzalloc(sizeof(*new_vm), GFP_KERNEL); -+ if (new_vm == NULL) -+ return -ENOMEM; -+ -+ /* Initialize the VM context, allocate the page directory and zero it */ -+ ret = amdgpu_vm_init(adev, &new_vm->base, AMDGPU_VM_CONTEXT_COMPUTE); -+ if (ret != 0) { -+ pr_err("Failed init vm ret %d\n", ret); -+ /* Undo everything related to the new VM context */ -+ goto vm_init_fail; -+ } -+ new_vm->adev = adev; -+ -+ if (!*process_info) { -+ info = kzalloc(sizeof(*info), GFP_KERNEL); -+ if (!info) { -+ pr_err("Failed to create amdkfd_process_info\n"); -+ ret = -ENOMEM; -+ goto alloc_process_info_fail; -+ } -+ -+ mutex_init(&info->lock); -+ INIT_LIST_HEAD(&info->vm_list_head); -+ INIT_LIST_HEAD(&info->kfd_bo_list); -+ INIT_LIST_HEAD(&info->userptr_valid_list); -+ INIT_LIST_HEAD(&info->userptr_inval_list); -+ -+ info->eviction_fence = -+ amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1), -+ current->mm); -+ if (info->eviction_fence == NULL) { -+ pr_err("Failed to create eviction fence\n"); -+ ret = -ENOMEM; -+ goto create_evict_fence_fail; -+ } -+ -+ info->pid = get_task_pid(current->group_leader, -+ PIDTYPE_PID); -+ atomic_set(&info->evicted_bos, 0); -+ INIT_DELAYED_WORK(&info->work, -+ amdgpu_amdkfd_restore_userptr_worker); -+ -+ *process_info = info; -+ *ef = dma_fence_get(&info->eviction_fence->base); -+ } -+ -+ new_vm->process_info = *process_info; -+ -+ mutex_lock(&new_vm->process_info->lock); -+ list_add_tail(&new_vm->vm_list_node, -+ &(new_vm->process_info->vm_list_head)); -+ new_vm->process_info->n_vms++; -+ mutex_unlock(&new_vm->process_info->lock); -+ -+ *vm = (void *) new_vm; -+ -+ pr_debug("Created process vm %p\n", *vm); -+ -+ return ret; -+ -+create_evict_fence_fail: -+ kfree(info); -+alloc_process_info_fail: -+ amdgpu_vm_fini(adev, &new_vm->base); -+vm_init_fail: -+ kfree(new_vm); -+ return ret; -+} -+ -+void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm) -+{ -+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -+ struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *) vm; -+ struct amdgpu_vm *avm = &kfd_vm->base; -+ struct amdgpu_bo *pd; -+ struct amdkfd_process_info *process_info; -+ -+ if (WARN_ON(!kgd || !vm)) -+ return; -+ -+ pr_debug("Destroying process vm %p\n", vm); -+ /* Release eviction fence from PD */ -+ pd = avm->root.base.bo; -+ amdgpu_bo_reserve(pd, false); -+ amdgpu_bo_fence(pd, NULL, false); -+ amdgpu_bo_unreserve(pd); -+ -+ process_info = kfd_vm->process_info; -+ -+ mutex_lock(&process_info->lock); -+ process_info->n_vms--; -+ list_del(&kfd_vm->vm_list_node); -+ mutex_unlock(&process_info->lock); -+ -+ /* Release per-process resources */ -+ if (!process_info->n_vms) { -+ WARN_ON(!list_empty(&process_info->kfd_bo_list)); -+ WARN_ON(!list_empty(&process_info->userptr_valid_list)); -+ WARN_ON(!list_empty(&process_info->userptr_inval_list)); -+ -+ dma_fence_put(&process_info->eviction_fence->base); -+ cancel_delayed_work_sync(&process_info->work); -+ put_pid(process_info->pid); -+ kfree(process_info); -+ } -+ -+ /* Release the VM context */ -+ amdgpu_vm_fini(adev, avm); -+ kfree(vm); -+} -+ -+uint32_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm) -+{ -+ return
get_vm_pd_gpu_offset(vm) >> AMDGPU_GPU_PAGE_SHIFT; -+} -+ -+int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd, -+ struct kfd_vm_fault_info *mem) -+{ -+ struct amdgpu_device *adev; -+ -+ adev = (struct amdgpu_device *) kgd; -+ if (atomic_read(&adev->mc.vm_fault_info_updated) == 1) { -+ *mem = *adev->mc.vm_fault_info; -+ mb(); -+ atomic_set(&adev->mc.vm_fault_info_updated, 0); -+ } -+ return 0; -+} -+ -+static bool is_mem_on_local_device(struct kgd_dev *kgd, -+ struct list_head *bo_va_list, void *vm) -+{ -+ struct kfd_bo_va_list *entry; -+ -+ list_for_each_entry(entry, bo_va_list, bo_list) { -+ if (entry->kgd_dev == kgd && entry->bo_va->base.vm == vm) -+ return true; -+ } -+ -+ return false; -+} -+ -+int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( -+ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) -+{ -+ struct kfd_bo_va_list *entry; -+ struct amdgpu_device *adev; -+ unsigned int mapped_before; -+ int ret = 0; -+ struct bo_vm_reservation_context ctx; -+ struct amdkfd_process_info *process_info; -+ unsigned long bo_size; -+ -+ adev = (struct amdgpu_device *) kgd; -+ process_info = ((struct amdkfd_vm *)vm)->process_info; -+ -+ bo_size = mem->bo->tbo.mem.size; -+ -+ mutex_lock(&mem->lock); -+ -+ /* -+ * Make sure that this BO is mapped on KGD before unmapping it -+ */ -+ if (!is_mem_on_local_device(kgd, &mem->bo_va_list, vm)) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ if (mem->mapped_to_gpu_memory == 0) { -+ pr_debug("BO VA 0x%llx size 0x%lx is not mapped to vm %p\n", -+ mem->va, bo_size, vm); -+ ret = -EINVAL; -+ goto out; -+ } -+ mapped_before = mem->mapped_to_gpu_memory; -+ -+ ret = reserve_bo_and_cond_vms(mem, vm, VA_MAPPED, &ctx); -+ if (unlikely(ret != 0)) -+ goto out; -+ -+ pr_debug("Unmap VA 0x%llx - 0x%llx from vm %p\n", -+ mem->va, -+ mem->va + bo_size * (1 + mem->aql_queue), -+ vm); -+ -+ list_for_each_entry(entry, &mem->bo_va_list, bo_list) { -+ if (entry->bo_va->base.vm == vm && entry->is_mapped) { -+ pr_debug("\t unmap VA 0x%llx - 0x%llx from entry %p\n", -+ entry->va, -+ entry->va + bo_size, -+ entry); -+ -+ ret = unmap_bo_from_gpuvm(adev, entry, ctx.sync); -+ if (ret == 0) { -+ entry->is_mapped = false; -+ } else { -+ pr_err("failed to unmap VA 0x%llx\n", -+ mem->va); -+ goto unreserve_out; -+ } -+ -+ mem->mapped_to_gpu_memory--; -+ pr_debug("\t DEC mapping count %d\n", -+ mem->mapped_to_gpu_memory); -+ } -+ } -+ -+ /* If BO is unmapped from all VMs, unfence it. It can be evicted if -+ * required.
-+ */ -+ if (mem->mapped_to_gpu_memory == 0 && -+ !amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm)) -+ amdgpu_amdkfd_remove_eviction_fence(mem->bo, -+ process_info->eviction_fence, -+ NULL, NULL); -+ -+ if (mapped_before == mem->mapped_to_gpu_memory) { -+ pr_debug("BO VA 0x%llx size 0x%lx is not mapped to vm %p\n", -+ mem->va, bo_size, vm); -+ ret = -EINVAL; -+ } -+ -+unreserve_out: -+ unreserve_bo_and_vms(&ctx, false, false); -+out: -+ mutex_unlock(&mem->lock); -+ return ret; -+} -+ -+int amdgpu_amdkfd_gpuvm_mmap_bo(struct kgd_dev *kgd, struct vm_area_struct *vma) -+{ -+ struct amdgpu_device *adev; -+ -+ adev = get_amdgpu_device(kgd); -+ if (!adev) { -+ pr_err("Could not get amdgpu device in %s\n", __func__); -+ return -ENODEV; -+ } -+ -+ return amdgpu_bo_mmap(NULL, vma, &adev->mman.bdev); -+} -+ -+int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd, -+ struct kgd_mem *mem, void **kptr) -+{ -+ int ret; -+ struct amdgpu_bo *bo = mem->bo; -+ -+ if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) { -+ pr_err("userptr can't be mapped to kernel\n"); -+ return -EINVAL; -+ } -+ -+ /* delete kgd_mem from kfd_bo_list to avoid re-validating -+ * this BO in BO's restoring after eviction. -+ */ -+ mutex_lock(&mem->process_info->lock); -+ -+ list_del_init(&mem->validate_list.head); -+ -+ ret = __map_bo_to_kernel(bo, AMDGPU_GEM_DOMAIN_GTT, kptr); -+ if (!ret) -+ mem->kptr = *kptr; -+ -+ mutex_unlock(&mem->process_info->lock); -+ -+ return ret; -+} -+ -+static int pin_bo_wo_map(struct kgd_mem *mem) -+{ -+ struct amdgpu_bo *bo = mem->bo; -+ int ret = 0; -+ -+ ret = amdgpu_bo_reserve(bo, false); -+ if (unlikely(ret != 0)) -+ return ret; -+ -+ ret = amdgpu_bo_pin(bo, mem->domain, NULL); -+ amdgpu_bo_unreserve(bo); -+ -+ return ret; -+} -+ -+static void unpin_bo_wo_map(struct kgd_mem *mem) -+{ -+ struct amdgpu_bo *bo = mem->bo; -+ int ret = 0; -+ -+ ret = amdgpu_bo_reserve(bo, false); -+ if (unlikely(ret != 0)) -+ return; -+ -+ amdgpu_bo_unpin(bo); -+ amdgpu_bo_unreserve(bo); -+} -+ -+#define AMD_GPU_PAGE_SHIFT PAGE_SHIFT -+#define AMD_GPU_PAGE_SIZE (_AC(1, UL) << AMD_GPU_PAGE_SHIFT) -+ -+static int get_sg_table(struct amdgpu_device *adev, -+ struct kgd_mem *mem, uint64_t offset, -+ uint64_t size, struct sg_table **ret_sg) -+{ -+ struct amdgpu_bo *bo = mem->bo; -+ struct sg_table *sg = NULL; -+ unsigned long bus_addr; -+ unsigned int chunks; -+ unsigned int i; -+ struct scatterlist *s; -+ uint64_t offset_in_page; -+ unsigned int page_size; -+ int ret; -+ -+ sg = kmalloc(sizeof(*sg), GFP_KERNEL); -+ if (!sg) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM) -+ page_size = AMD_GPU_PAGE_SIZE; -+ else -+ page_size = PAGE_SIZE; -+ -+ -+ offset_in_page = offset & (page_size - 1); -+ chunks = (size + offset_in_page + page_size - 1) -+ / page_size; -+ -+ ret = sg_alloc_table(sg, chunks, GFP_KERNEL); -+ if (unlikely(ret)) -+ goto out; -+ -+ if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM) { -+ bus_addr = bo->tbo.offset + adev->mc.aper_base + offset; -+ -+ for_each_sg(sg->sgl, s, sg->orig_nents, i) { -+ uint64_t chunk_size, length; -+ -+ chunk_size = page_size - offset_in_page; -+ length = min(size, chunk_size); -+ -+ sg_set_page(s, NULL, length, offset_in_page); -+ s->dma_address = bus_addr; -+ s->dma_length = length; -+ -+ size -= length; -+ offset_in_page = 0; -+ bus_addr += length; -+ } -+ } else { -+ struct page **pages; -+ unsigned int cur_page; -+ -+ pages = bo->tbo.ttm->pages; -+ -+ cur_page = offset / page_size; -+ for_each_sg(sg->sgl, s, sg->orig_nents, i) { 
-+ uint64_t chunk_size, length; -+ -+ chunk_size = page_size - offset_in_page; -+ length = min(size, chunk_size); -+ -+ sg_set_page(s, pages[cur_page], length, offset_in_page); -+ s->dma_address = page_to_phys(pages[cur_page]); -+ s->dma_length = length; -+ -+ size -= length; -+ offset_in_page = 0; -+ cur_page++; -+ } -+ } -+ -+ *ret_sg = sg; -+ return 0; -+out: -+ kfree(sg); -+ *ret_sg = NULL; -+ return ret; -+} -+ -+int amdgpu_amdkfd_gpuvm_pin_get_sg_table(struct kgd_dev *kgd, -+ struct kgd_mem *mem, uint64_t offset, -+ uint64_t size, struct sg_table **ret_sg) -+{ -+ int ret; -+ struct amdgpu_device *adev; -+ -+ ret = pin_bo_wo_map(mem); -+ if (unlikely(ret != 0)) -+ return ret; -+ -+ adev = get_amdgpu_device(kgd); -+ -+ ret = get_sg_table(adev, mem, offset, size, ret_sg); -+ if (ret) -+ unpin_bo_wo_map(mem); -+ -+ return ret; -+} -+ -+void amdgpu_amdkfd_gpuvm_unpin_put_sg_table( -+ struct kgd_mem *mem, struct sg_table *sg) -+{ -+ sg_free_table(sg); -+ kfree(sg); -+ -+ unpin_bo_wo_map(mem); -+} -+ -+int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd, -+ struct dma_buf *dma_buf, -+ uint64_t va, void *vm, -+ struct kgd_mem **mem, uint64_t *size, -+ uint64_t *mmap_offset) -+{ -+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd; -+ struct drm_gem_object *obj; -+ struct amdgpu_bo *bo; -+ struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm; -+ -+ if (dma_buf->ops != &drm_gem_prime_dmabuf_ops) -+ /* Can't handle non-graphics buffers */ -+ return -EINVAL; -+ -+ obj = dma_buf->priv; -+ if (obj->dev->dev_private != adev) -+ /* Can't handle buffers from other devices */ -+ return -EINVAL; -+ -+ bo = gem_to_amdgpu_bo(obj); -+ if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | -+ AMDGPU_GEM_DOMAIN_GTT | -+ AMDGPU_GEM_DOMAIN_DGMA))) -+ /* Only VRAM and GTT BOs are supported */ -+ return -EINVAL; -+ -+ *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); -+ if (*mem == NULL) -+ return -ENOMEM; -+ -+ if (size) -+ *size = amdgpu_bo_size(bo); -+ -+ if (mmap_offset) -+ *mmap_offset = amdgpu_bo_mmap_offset(bo); -+ -+ INIT_LIST_HEAD(&(*mem)->bo_va_list); -+ mutex_init(&(*mem)->lock); -+ (*mem)->mapping_flags = -+ AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE | -+ AMDGPU_VM_PAGE_EXECUTABLE | AMDGPU_VM_MTYPE_NC; -+ -+ (*mem)->bo = amdgpu_bo_ref(bo); -+ (*mem)->va = va; -+ if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) -+ (*mem)->domain = AMDGPU_GEM_DOMAIN_VRAM; -+ else if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_GTT) -+ (*mem)->domain = AMDGPU_GEM_DOMAIN_GTT; -+ else -+ (*mem)->domain = AMDGPU_GEM_DOMAIN_DGMA; -+ (*mem)->mapped_to_gpu_memory = 0; -+ (*mem)->process_info = kfd_vm->process_info; -+ add_kgd_mem_to_kfd_bo_list(*mem, kfd_vm->process_info, false); -+ amdgpu_sync_create(&(*mem)->sync); -+ -+ return 0; -+} -+ -+int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_dev *kgd, void *vm, -+ struct kgd_mem *mem, -+ struct dma_buf **dmabuf) -+{ -+ struct amdgpu_device *adev = NULL; -+ struct amdgpu_bo *bo = NULL; -+ struct drm_gem_object *gobj = NULL; -+ -+ if (!dmabuf || !kgd || !vm || !mem) -+ return -EINVAL; -+ -+ adev = get_amdgpu_device(kgd); -+ bo = mem->bo; -+ -+ gobj = amdgpu_gem_prime_foreign_bo(adev, bo); -+ if (gobj == NULL) { -+ pr_err("Export BO failed. 
Unable to find/create GEM object\n"); -+ return -EINVAL; -+ } -+ -+ *dmabuf = amdgpu_gem_prime_export(adev->ddev, gobj, 0); -+ return 0; -+} -+ -+static int process_validate_vms(struct amdkfd_process_info *process_info) -+{ -+ struct amdkfd_vm *peer_vm; -+ int ret; -+ -+ list_for_each_entry(peer_vm, &process_info->vm_list_head, -+ vm_list_node) { -+ ret = vm_validate_pt_pd_bos(&peer_vm->base); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+/* Evict a userptr BO by stopping the queues if necessary -+ * -+ * Runs in MMU notifier, may be in RECLAIM_FS context. This means it -+ * cannot do any memory allocations, and cannot take any locks that -+ * are held elsewhere while allocating memory. Therefore this is as -+ * simple as possible, using atomic counters. -+ * -+ * It doesn't do anything to the BO itself. The real work happens in -+ * restore, where we get updated page addresses. This function only -+ * ensures that GPU access to the BO is stopped. -+ */ -+int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, -+ struct mm_struct *mm) -+{ -+ struct amdkfd_process_info *process_info = mem->process_info; -+ int invalid, evicted_bos; -+ int r = 0; -+ -+ invalid = atomic_inc_return(&mem->invalid); -+ evicted_bos = atomic_inc_return(&process_info->evicted_bos); -+ if (evicted_bos == 1) { -+ /* First eviction, stop the queues */ -+ r = kgd2kfd->quiesce_mm(NULL, mm); -+ if (r != 0) -+ pr_err("Failed to quiesce KFD\n"); -+ schedule_delayed_work(&process_info->work, 1); -+ } -+ -+ return r; -+} -+ -+/* Update invalid userptr BOs -+ * -+ * Moves invalidated (evicted) userptr BOs from userptr_valid_list to -+ * userptr_inval_list and updates user pages for all BOs that have -+ * been invalidated since their last update. -+ */ -+static int update_invalid_user_pages(struct amdkfd_process_info *process_info, -+ struct mm_struct *mm) -+{ -+ struct kgd_mem *mem, *tmp_mem; -+ struct amdgpu_bo *bo; -+ int invalid, ret; -+ -+ /* Move all invalidated BOs to the userptr_inval_list and -+ * release their user pages by migration to the CPU domain -+ */ -+ list_for_each_entry_safe(mem, tmp_mem, -+ &process_info->userptr_valid_list, -+ validate_list.head) { -+ if (!atomic_read(&mem->invalid)) -+ continue; /* BO is still valid */ -+ -+ bo = mem->bo; -+ -+ if (amdgpu_bo_reserve(bo, true)) -+ return -EAGAIN; -+ amdgpu_ttm_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU); -+ ret = ttm_bo_validate(&bo->tbo, &bo->placement, false, false); -+ amdgpu_bo_unreserve(bo); -+ if (ret) { -+ pr_err("%s: Failed to invalidate userptr BO\n", -+ __func__); -+ return -EAGAIN; -+ } -+ -+ list_move_tail(&mem->validate_list.head, -+ &process_info->userptr_inval_list); -+ } -+ -+ if (list_empty(&process_info->userptr_inval_list)) -+ return 0; /* All evicted userptr BOs were freed */ -+ -+ /* Go through userptr_inval_list and update any invalid user_pages */ -+ list_for_each_entry(mem, &process_info->userptr_inval_list, -+ validate_list.head) { -+ invalid = atomic_read(&mem->invalid); -+ if (!invalid) -+ /* BO hasn't been invalidated since the last -+ * revalidation attempt. Keep its BO list. 
-+ */ -+ continue; -+ -+ bo = mem->bo; -+ -+ if (!mem->user_pages) { -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) -+ mem->user_pages = -+ drm_calloc_large(bo->tbo.ttm->num_pages, -+ sizeof(struct page *)); -+#else -+ mem->user_pages = -+ kvmalloc_array(bo->tbo.ttm->num_pages, -+ sizeof(struct page *), -+ GFP_KERNEL | __GFP_ZERO); -+#endif -+ if (!mem->user_pages) { -+ pr_err("%s: Failed to allocate pages array\n", -+ __func__); -+ return -ENOMEM; -+ } -+ } else if (mem->user_pages[0]) { -+ release_pages(mem->user_pages, -+ bo->tbo.ttm->num_pages, 0); -+ } -+ -+ /* Get updated user pages */ -+ ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, -+ mem->user_pages); -+ if (ret) { -+ mem->user_pages[0] = NULL; -+ pr_info("%s: Failed to get user pages: %d\n", -+ __func__, ret); -+ /* Pretend it succeeded. It will fail later -+ * with a VM fault if the GPU tries to access -+ * it. Better than hanging indefinitely with -+ * stalled user mode queues. -+ */ -+ } -+ -+ /* Mark the BO as valid unless it was invalidated -+ * again concurrently -+ */ -+ if (atomic_cmpxchg(&mem->invalid, invalid, 0) != invalid) -+ return -EAGAIN; -+ } -+ return 0; -+} -+ -+/* Validate invalid userptr BOs -+ * -+ * Validates BOs on the userptr_inval_list, and moves them back to the -+ * userptr_valid_list. Also updates GPUVM page tables with new page -+ * addresses and waits for the page table updates to complete. -+ */ -+static int validate_invalid_user_pages(struct amdkfd_process_info *process_info) -+{ -+ struct amdgpu_bo_list_entry *pd_bo_list_entries; -+ struct list_head resv_list, duplicates; -+ struct ww_acquire_ctx ticket; -+ struct amdgpu_sync sync; -+ -+ struct amdkfd_vm *peer_vm; -+ struct kgd_mem *mem, *tmp_mem; -+ struct amdgpu_bo *bo; -+ int i, ret; -+ -+ pd_bo_list_entries = kcalloc(process_info->n_vms, -+ sizeof(struct amdgpu_bo_list_entry), -+ GFP_KERNEL); -+ if (!pd_bo_list_entries) { -+ pr_err("%s: Failed to allocate PD BO list entries\n", __func__); -+ return -ENOMEM; -+ } -+ -+ INIT_LIST_HEAD(&resv_list); -+ INIT_LIST_HEAD(&duplicates); -+ -+ /* Get all the page directory BOs that need to be reserved */ -+ i = 0; -+ list_for_each_entry(peer_vm, &process_info->vm_list_head, -+ vm_list_node) -+ amdgpu_vm_get_pd_bo(&peer_vm->base, &resv_list, -+ &pd_bo_list_entries[i++]); -+ /* Add the userptr_inval_list entries to resv_list */ -+ list_for_each_entry(mem, &process_info->userptr_inval_list, -+ validate_list.head) { -+ list_add_tail(&mem->resv_list.head, &resv_list); -+ mem->resv_list.bo = mem->validate_list.bo; -+ mem->resv_list.shared = mem->validate_list.shared; -+ } -+ -+ /* Reserve all BOs and page tables for validation */ -+ ret = ttm_eu_reserve_buffers(&ticket, &resv_list, false, &duplicates); -+ WARN(!list_empty(&duplicates), "Duplicates should be empty"); -+ if (ret) -+ goto out; -+ -+ amdgpu_sync_create(&sync); -+ -+ /* Avoid triggering eviction fences when unmapping invalid -+ * userptr BOs (waits for all fences, doesn't use -+ * FENCE_OWNER_VM) -+ */ -+ list_for_each_entry(peer_vm, &process_info->vm_list_head, -+ vm_list_node) -+ amdgpu_amdkfd_remove_eviction_fence(peer_vm->base.root.base.bo, -+ process_info->eviction_fence, -+ NULL, NULL); -+ -+ ret = process_validate_vms(process_info); -+ if (ret) -+ goto unreserve_out; -+ -+ /* Validate BOs and update GPUVM page tables */ -+ list_for_each_entry_safe(mem, tmp_mem, -+ &process_info->userptr_inval_list, -+ validate_list.head) { -+ struct kfd_bo_va_list *bo_va_entry; -+ -+ bo = mem->bo; -+ -+ /* Copy pages array and validate the BO if we 
got user pages */ -+ if (mem->user_pages[0]) { -+ amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, -+ mem->user_pages); -+ amdgpu_ttm_placement_from_domain(bo, mem->domain); -+ ret = ttm_bo_validate(&bo->tbo, &bo->placement, -+ false, false); -+ if (ret) { -+ pr_err("%s: failed to validate BO\n", __func__); -+ goto unreserve_out; -+ } -+ } -+ -+ /* Validate succeeded, now the BO owns the pages, free -+ * our copy of the pointer array. Put this BO back on -+ * the userptr_valid_list. If we need to revalidate -+ * it, we need to start from scratch. -+ */ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) -+ drm_free_large(mem->user_pages); -+#else -+ kvfree(mem->user_pages); -+#endif -+ mem->user_pages = NULL; -+ list_move_tail(&mem->validate_list.head, -+ &process_info->userptr_valid_list); -+ -+ /* Update mapping. If the BO was not validated -+ * (because we couldn't get user pages), this will -+ * clear the page table entries, which will result in -+ * VM faults if the GPU tries to access the invalid -+ * memory. -+ */ -+ list_for_each_entry(bo_va_entry, &mem->bo_va_list, bo_list) { -+ if (!bo_va_entry->is_mapped) -+ continue; -+ -+ ret = update_gpuvm_pte((struct amdgpu_device *) -+ bo_va_entry->kgd_dev, -+ bo_va_entry, &sync); -+ if (ret) { -+ pr_err("%s: update PTE failed\n", __func__); -+ /* make sure this gets validated again */ -+ atomic_inc(&mem->invalid); -+ goto unreserve_out; -+ } -+ } -+ } -+unreserve_out: -+ list_for_each_entry(peer_vm, &process_info->vm_list_head, -+ vm_list_node) -+ amdgpu_bo_fence(peer_vm->base.root.base.bo, -+ &process_info->eviction_fence->base, true); -+ ttm_eu_backoff_reservation(&ticket, &resv_list); -+ amdgpu_sync_wait(&sync, false); -+ amdgpu_sync_free(&sync); -+out: -+ kfree(pd_bo_list_entries); -+ -+ return ret; -+} -+ -+/* Worker callback to restore evicted userptr BOs -+ * -+ * Tries to update and validate all userptr BOs. If successful and no -+ * concurrent evictions happened, the queues are restarted. Otherwise, -+ * reschedule for another attempt later. -+ */ -+static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) -+{ -+ struct delayed_work *dwork = to_delayed_work(work); -+ struct amdkfd_process_info *process_info = -+ container_of(dwork, struct amdkfd_process_info, work); -+ struct task_struct *usertask; -+ struct mm_struct *mm; -+ int evicted_bos; -+ -+ evicted_bos = atomic_read(&process_info->evicted_bos); -+ if (!evicted_bos) -+ return; -+ -+ /* Reference task and mm in case of concurrent process termination */ -+ usertask = get_pid_task(process_info->pid, PIDTYPE_PID); -+ if (!usertask) -+ return; -+ mm = get_task_mm(usertask); -+ if (!mm) { -+ put_task_struct(usertask); -+ return; -+ } -+ -+ mutex_lock(&process_info->lock); -+ -+ if (update_invalid_user_pages(process_info, mm)) -+ goto unlock_out; -+ /* userptr_inval_list can be empty if all evicted userptr BOs -+ * have been freed. In that case there is nothing to validate -+ * and we can just restart the queues. -+ */ -+ if (!list_empty(&process_info->userptr_inval_list)) { -+ if (atomic_read(&process_info->evicted_bos) != evicted_bos) -+ goto unlock_out; /* Concurrent eviction, try again */ -+ -+ if (validate_invalid_user_pages(process_info)) -+ goto unlock_out; -+ } -+ /* Final check for concurrent eviction and atomic update. If -+ * another eviction happens after successful update, it will -+ * be a first eviction that calls quiesce_mm. The eviction -+ * reference counting inside KFD will handle this case.
-+ */ -+ if (atomic_cmpxchg(&process_info->evicted_bos, evicted_bos, 0) != -+ evicted_bos) -+ goto unlock_out; -+ evicted_bos = 0; -+ if (kgd2kfd->resume_mm(NULL, mm)) { -+ pr_err("%s: Failed to resume KFD\n", __func__); -+ /* No recovery from this failure. Probably the CP is -+ * hanging. No point trying again. -+ */ -+ } -+unlock_out: -+ mutex_unlock(&process_info->lock); -+ mmput(mm); -+ put_task_struct(usertask); -+ -+ /* If validation failed, reschedule another attempt */ -+ if (evicted_bos) -+ schedule_delayed_work(&process_info->work, 1); -+} -+ -+/** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given -+ * KFD process identified by process_info -+ * -+ * @process_info: amdkfd_process_info of the KFD process -+ * -+ * After memory eviction, restore thread calls this function. The function -+ * should be called when the process is still valid. BO restore involves: -+ * -+ * 1. Release old eviction fence and create new one -+ * 2. Get two copies of PD BO list from all the VMs. Keep one copy as pd_list. -+ * 3. Use the second PD list and kfd_bo_list to create a list (ctx.list) of -+ * BOs that need to be reserved. -+ * 4. Reserve all the BOs -+ * 5. Validate PD and PT BOs. -+ * 6. Validate all KFD BOs using kfd_bo_list, map them and add a new fence -+ * 7. Add fence to all PD and PT BOs. -+ * 8. Unreserve all BOs -+ */ -+ -+int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef) -+{ -+ struct amdgpu_bo_list_entry *pd_bo_list; -+ struct amdkfd_process_info *process_info = info; -+ struct amdkfd_vm *peer_vm; -+ struct kgd_mem *mem; -+ struct bo_vm_reservation_context ctx; -+ struct amdgpu_amdkfd_fence *new_fence; -+ int ret = 0, i; -+ struct list_head duplicate_save; -+ struct amdgpu_sync sync_obj; -+ -+ INIT_LIST_HEAD(&duplicate_save); -+ INIT_LIST_HEAD(&ctx.list); -+ INIT_LIST_HEAD(&ctx.duplicates); -+ -+ pd_bo_list = kcalloc(process_info->n_vms, -+ sizeof(struct amdgpu_bo_list_entry), -+ GFP_KERNEL); -+ if (pd_bo_list == NULL) -+ return -ENOMEM; -+ -+ i = 0; -+ mutex_lock(&process_info->lock); -+ list_for_each_entry(peer_vm, &process_info->vm_list_head, -+ vm_list_node) -+ amdgpu_vm_get_pd_bo(&peer_vm->base, &ctx.list, -+ &pd_bo_list[i++]); -+ -+ /* Reserve all BOs and page tables/directory. Add all BOs from -+ * kfd_bo_list to ctx.list -+ */ -+ list_for_each_entry(mem, &process_info->kfd_bo_list, -+ validate_list.head) { -+ -+ list_add_tail(&mem->resv_list.head, &ctx.list); -+ mem->resv_list.bo = mem->validate_list.bo; -+ mem->resv_list.shared = mem->validate_list.shared; -+ } -+ -+ ret = ttm_eu_reserve_buffers(&ctx.ticket, &ctx.list, -+ false, &duplicate_save); -+ if (ret) { -+ pr_debug("Memory eviction: TTM Reserve Failed. Try again\n"); -+ goto ttm_reserve_fail; -+ } -+ -+ amdgpu_sync_create(&sync_obj); -+ ctx.sync = &sync_obj; -+ -+ /* Validate PDs and PTs */ -+ ret = process_validate_vms(process_info); -+ if (ret) -+ goto validate_map_fail; -+ -+ /* Wait for PD/PTs validate to finish */ -+ /* FIXME: I think this isn't needed */ -+ list_for_each_entry(peer_vm, &process_info->vm_list_head, -+ vm_list_node) { -+ struct amdgpu_bo *bo = peer_vm->base.root.base.bo; -+ -+ ttm_bo_wait(&bo->tbo, false, false); -+ } -+ -+ /* Validate BOs and map them to GPUVM (update VM page tables).
*/ -+ list_for_each_entry(mem, &process_info->kfd_bo_list, -+ validate_list.head) { -+ -+ struct amdgpu_bo *bo = mem->bo; -+ uint32_t domain = mem->domain; -+ struct kfd_bo_va_list *bo_va_entry; -+ -+ ret = amdgpu_amdkfd_bo_validate(bo, domain, false); -+ if (ret) { -+ pr_debug("Memory eviction: Validate BOs failed. Try again\n"); -+ goto validate_map_fail; -+ } -+ -+ list_for_each_entry(bo_va_entry, &mem->bo_va_list, -+ bo_list) { -+ ret = update_gpuvm_pte((struct amdgpu_device *) -+ bo_va_entry->kgd_dev, -+ bo_va_entry, -+ ctx.sync); -+ if (ret) { -+ pr_debug("Memory eviction: update PTE failed. Try again\n"); -+ goto validate_map_fail; -+ } -+ } -+ } -+ -+ amdgpu_sync_wait(ctx.sync, false); -+ -+ /* Release old eviction fence and create new one, because fence only -+ * goes from unsignaled to signaled, fence cannot be reused. -+ * Use context and mm from the old fence. -+ */ -+ new_fence = amdgpu_amdkfd_fence_create( -+ process_info->eviction_fence->base.context, -+ process_info->eviction_fence->mm); -+ if (!new_fence) { -+ pr_err("Failed to create eviction fence\n"); -+ ret = -ENOMEM; -+ goto validate_map_fail; -+ } -+ dma_fence_put(&process_info->eviction_fence->base); -+ process_info->eviction_fence = new_fence; -+ *ef = dma_fence_get(&new_fence->base); -+ -+ /* Wait for validate to finish and attach new eviction fence */ -+ list_for_each_entry(mem, &process_info->kfd_bo_list, -+ validate_list.head) -+ ttm_bo_wait(&mem->bo->tbo, false, false); -+ list_for_each_entry(mem, &process_info->kfd_bo_list, -+ validate_list.head) -+ amdgpu_bo_fence(mem->bo, -+ &process_info->eviction_fence->base, true); -+ -+ /* Attach eviction fence to PD / PT BOs */ -+ list_for_each_entry(peer_vm, &process_info->vm_list_head, -+ vm_list_node) { -+ struct amdgpu_bo *bo = peer_vm->base.root.base.bo; -+ -+ amdgpu_bo_fence(bo, &process_info->eviction_fence->base, true); -+ } -+validate_map_fail: -+ ttm_eu_backoff_reservation(&ctx.ticket, &ctx.list); -+ amdgpu_sync_free(&sync_obj); -+ttm_reserve_fail: -+ mutex_unlock(&process_info->lock); -+evict_fence_fail: -+ kfree(pd_bo_list); -+ return ret; -+} -+ -+int amdgpu_amdkfd_copy_mem_to_mem(struct kgd_dev *kgd, struct kgd_mem *src_mem, -+ uint64_t src_offset, struct kgd_mem *dst_mem, -+ uint64_t dst_offset, uint64_t size, -+ struct dma_fence **f, uint64_t *actual_size) -+{ -+ struct amdgpu_device *adev = NULL; -+ struct ttm_mem_reg *src = NULL, *dst = NULL; -+ struct ttm_buffer_object *src_ttm_bo, *dst_ttm_bo; -+ struct drm_mm_node *src_mm, *dst_mm; -+ struct amdgpu_ring *ring; -+ struct ww_acquire_ctx ticket; -+ struct list_head list; -+ struct ttm_validate_buffer resv_list[2]; -+ uint64_t src_start, dst_start; -+ uint64_t src_left, dst_left, cur_copy_size, total_copy_size = 0; -+ struct dma_fence *fence = NULL; -+ int r; -+ -+ if (!kgd || !src_mem || !dst_mem) -+ return -EINVAL; -+ -+ if (actual_size) -+ *actual_size = 0; -+ -+ adev = get_amdgpu_device(kgd); -+ src_ttm_bo = &src_mem->bo->tbo; -+ dst_ttm_bo = &dst_mem->bo->tbo; -+ src = &src_ttm_bo->mem; -+ dst = &dst_ttm_bo->mem; -+ src_mm = (struct drm_mm_node *)src->mm_node; -+ dst_mm = (struct drm_mm_node *)dst->mm_node; -+ -+ ring = adev->mman.buffer_funcs_ring; -+ -+ INIT_LIST_HEAD(&list); -+ -+ resv_list[0].bo = src_ttm_bo; -+ resv_list[0].shared = true; -+ resv_list[1].bo = dst_ttm_bo; -+ resv_list[1].shared = true; -+ -+ list_add_tail(&resv_list[0].head, &list); -+ list_add_tail(&resv_list[1].head, &list); -+ -+ if (!ring->ready) { -+ pr_err("Trying to move memory with ring turned off.\n"); -+ return 
-EINVAL; -+ } -+ -+ r = ttm_eu_reserve_buffers(&ticket, &list, false, NULL); -+ if (r) { -+ pr_err("Copy buffer failed. Unable to reserve bo (%d)\n", r); -+ return r; -+ } -+ -+ switch (src->mem_type) { -+ case TTM_PL_TT: -+ r = amdgpu_ttm_bind(src_ttm_bo, src); -+ if (r) { -+ DRM_ERROR("Copy failed. Cannot bind to gart\n"); -+ goto copy_fail; -+ } -+ break; -+ case TTM_PL_VRAM: -+ /* VRAM could be scattered. Find the node in which the offset -+ * belongs to -+ */ -+ while (src_offset >= (src_mm->size << PAGE_SHIFT)) { -+ src_offset -= (src_mm->size << PAGE_SHIFT); -+ ++src_mm; -+ } -+ break; -+ default: -+ DRM_ERROR("Unknown placement %d\n", src->mem_type); -+ r = -EINVAL; -+ goto copy_fail; -+ } -+ src_start = src_mm->start << PAGE_SHIFT; -+ src_start += src_ttm_bo->bdev->man[src->mem_type].gpu_offset; -+ src_start += src_offset; -+ src_left = (src_mm->size << PAGE_SHIFT) - src_offset; -+ -+ switch (dst->mem_type) { -+ case TTM_PL_TT: -+ r = amdgpu_ttm_bind(dst_ttm_bo, dst); -+ if (r) { -+ DRM_ERROR("Copy failed. Cannot bind to gart\n"); -+ goto copy_fail; -+ } -+ break; -+ case TTM_PL_VRAM: -+ while (dst_offset >= (dst_mm->size << PAGE_SHIFT)) { -+ dst_offset -= (dst_mm->size << PAGE_SHIFT); -+ ++dst_mm; -+ } -+ break; -+ default: -+ DRM_ERROR("Unknown placement %d\n", dst->mem_type); -+ r = -EINVAL; -+ goto copy_fail; -+ } -+ dst_start = dst_mm->start << PAGE_SHIFT; -+ dst_start += dst_ttm_bo->bdev->man[dst->mem_type].gpu_offset; -+ dst_start += dst_offset; -+ dst_left = (dst_mm->size << PAGE_SHIFT) - dst_offset; -+ -+ do { -+ struct dma_fence *next; -+ -+ /* src_left/dst_left: amount of space left in the current node -+ * Copy minimum of (src_left, dst_left, amount of bytes left to -+ * copy) -+ */ -+ cur_copy_size = min3(src_left, dst_left, -+ (size - total_copy_size)); -+ -+ r = amdgpu_copy_buffer(ring, src_start, dst_start, -+ cur_copy_size, NULL, &next, false, false); -+ if (r) -+ break; -+ -+ /* Just keep the last fence */ -+ dma_fence_put(fence); -+ fence = next; -+ -+ total_copy_size += cur_copy_size; -+ /* Required amount of bytes copied. Done. */ -+ if (total_copy_size >= size) -+ break; -+ -+ /* If end of src or dst node is reached, move to next node */ -+ src_left -= cur_copy_size; -+ if (!src_left) { -+ ++src_mm; -+ src_start = src_mm->start << PAGE_SHIFT; -+ src_start += -+ src_ttm_bo->bdev->man[src->mem_type].gpu_offset; -+ src_left = src_mm->size << PAGE_SHIFT; -+ } else -+ src_start += cur_copy_size; -+ -+ dst_left -= cur_copy_size; -+ if (!dst_left) { -+ ++dst_mm; -+ dst_start = dst_mm->start << PAGE_SHIFT; -+ dst_start += -+ dst_ttm_bo->bdev->man[dst->mem_type].gpu_offset; -+ dst_left = dst_mm->size << PAGE_SHIFT; -+ } else -+ dst_start += cur_copy_size; -+ -+ } while (total_copy_size < size); -+ -+ /* Failure could occur after partial copy. 
So still fill in the amount -+ * copied and the fence. -+ */ -+ if (actual_size) -+ *actual_size = total_copy_size; -+ -+ if (fence) { -+ amdgpu_bo_fence(src_mem->bo, fence, true); -+ amdgpu_bo_fence(dst_mem->bo, fence, true); -+ } -+ -+ if (f) -+ *f = fence; -+ -+copy_fail: -+ ttm_eu_backoff_reservation(&ticket, &list); -+ return r; -+} -+ -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c -index 9c472c5..2be2e05 100644 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c -@@ -817,11 +817,7 @@ static struct drm_driver kms_driver = { - .driver_features = - DRIVER_USE_AGP | - DRIVER_HAVE_IRQ | DRIVER_IRQ_SHARED | DRIVER_GEM | --#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) - DRIVER_PRIME | DRIVER_RENDER | DRIVER_MODESET | DRIVER_SYNCOBJ, --#else -- DRIVER_PRIME | DRIVER_RENDER | DRIVER_MODESET, --#endif - .load = amdgpu_driver_load_kms, - .open = amdgpu_driver_open_kms, - .postclose = amdgpu_driver_postclose_kms, -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c -old mode 100644 -new mode 100755 -index 283dc1b..f421505 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c -@@ -36,6 +36,7 @@ - #include <drm/drm_cache.h> - #include "amdgpu.h" - #include "amdgpu_trace.h" -+#include "amdgpu_amdkfd.h" - - static void amdgpu_ttm_bo_destroy(struct ttm_buffer_object *tbo) - { -@@ -46,6 +47,8 @@ static void amdgpu_ttm_bo_destroy(struct ttm_buffer_object *tbo) - - if (bo->tbo.mem.mem_type == AMDGPU_PL_DGMA_IMPORT) - kfree(tbo->mem.bus.addr); -+ if (bo->kfd_bo) -+ amdgpu_amdkfd_unreserve_system_memory_limit(bo); - amdgpu_bo_kunmap(bo); - - if (bo->gem_base.import_attach) -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h -old mode 100644 -new mode 100755 -index 8a91658..f73dba5 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h -@@ -89,6 +89,7 @@ struct amdgpu_bo { - - struct ttm_bo_kmap_obj dma_buf_vmap; - struct amdgpu_mn *mn; -+ struct kgd_mem *kfd_bo; - - union { - struct list_head mn_list; -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h -old mode 100644 -new mode 100755 -index 322d2529..af8e544 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h -@@ -36,6 +36,7 @@ - /* some special values for the owner field */ - #define AMDGPU_FENCE_OWNER_UNDEFINED ((void*)0ul) - #define AMDGPU_FENCE_OWNER_VM ((void*)1ul) -+#define AMDGPU_FENCE_OWNER_KFD ((void *)2ul) - - #define AMDGPU_FENCE_FLAG_64BIT (1 << 0) - #define AMDGPU_FENCE_FLAG_INT (1 << 1) -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c -old mode 100644 -new mode 100755 -index c586f44..7ee8247 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c -@@ -31,6 +31,7 @@ - #include <drm/drmP.h> - #include "amdgpu.h" - #include "amdgpu_trace.h" -+#include "amdgpu_amdkfd.h" - - struct amdgpu_sync_entry { - struct hlist_node node; -@@ -84,11 +85,20 @@ static bool amdgpu_sync_same_dev(struct amdgpu_device *adev, - */ - static void *amdgpu_sync_get_owner(struct dma_fence *f) - { -- struct amd_sched_fence *s_fence = to_amd_sched_fence(f); -+ struct amd_sched_fence *s_fence; -+ struct amdgpu_amdkfd_fence *kfd_fence; -+ -+ if (f == NULL) -+ return AMDGPU_FENCE_OWNER_UNDEFINED; - -+ s_fence =
to_amd_sched_fence(f); - if (s_fence) - return s_fence->owner; - -+ kfd_fence = to_amdgpu_amdkfd_fence(f); -+ if (kfd_fence) -+ return AMDGPU_FENCE_OWNER_KFD; -+ - return AMDGPU_FENCE_OWNER_UNDEFINED; - } - -@@ -171,7 +181,8 @@ int amdgpu_sync_fence(struct amdgpu_device *adev, struct amdgpu_sync *sync, - * @resv: reservation object with embedded fence - * @shared: true if we should only sync to the exclusive fence - * -- * Sync to the fence -+ * Sync to the fence except if it is KFD eviction fence and owner is -+ * AMDGPU_FENCE_OWNER_VM. - */ - int amdgpu_sync_resv(struct amdgpu_device *adev, - struct amdgpu_sync *sync, -@@ -198,11 +209,15 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, - for (i = 0; i < flist->shared_count; ++i) { - f = rcu_dereference_protected(flist->shared[i], - reservation_object_held(resv)); -+ fence_owner = amdgpu_sync_get_owner(f); -+ if (fence_owner == AMDGPU_FENCE_OWNER_KFD && -+ owner != AMDGPU_FENCE_OWNER_UNDEFINED) -+ continue; -+ - if (amdgpu_sync_same_dev(adev, f)) { - /* VM updates are only interesting - * for other VM updates and moves. - */ -- fence_owner = amdgpu_sync_get_owner(f); - if ((owner != AMDGPU_FENCE_OWNER_UNDEFINED) && - (fence_owner != AMDGPU_FENCE_OWNER_UNDEFINED) && - ((owner == AMDGPU_FENCE_OWNER_VM) != -@@ -297,6 +312,31 @@ struct dma_fence *amdgpu_sync_get_fence(struct amdgpu_sync *sync) - return NULL; - } - -+int amdgpu_sync_clone(struct amdgpu_device *adev, -+ struct amdgpu_sync *source, -+ struct amdgpu_sync *clone) -+{ -+ struct amdgpu_sync_entry *e; -+ struct hlist_node *tmp; -+ struct dma_fence *f; -+ int i, r; -+ -+ hash_for_each_safe(source->fences, i, tmp, e, node) { -+ -+ f = e->fence; -+ if (!dma_fence_is_signaled(f)) { -+ r = amdgpu_sync_fence(adev, clone, f); -+ if (r) -+ return r; -+ } else { -+ hash_del(&e->node); -+ dma_fence_put(f); -+ kmem_cache_free(amdgpu_sync_slab, e); -+ } -+ } -+ return 0; -+} -+ - int amdgpu_sync_wait(struct amdgpu_sync *sync, bool intr) - { - struct amdgpu_sync_entry *e; -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h -old mode 100644 -new mode 100755 -index dc76879..8e29bc7 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h -@@ -49,6 +49,8 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, - struct dma_fence *amdgpu_sync_peek_fence(struct amdgpu_sync *sync, - struct amdgpu_ring *ring); - struct dma_fence *amdgpu_sync_get_fence(struct amdgpu_sync *sync); -+int amdgpu_sync_clone(struct amdgpu_device *adev, struct amdgpu_sync *source, -+ struct amdgpu_sync *clone); - int amdgpu_sync_wait(struct amdgpu_sync *sync, bool intr); - void amdgpu_sync_free(struct amdgpu_sync *sync); - int amdgpu_sync_init(void); -diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h -old mode 100644 -new mode 100755 -index 9f34fab..f22f7a8 ---- a/drivers/gpu/drm/amd/amdgpu/soc15d.h -+++ b/drivers/gpu/drm/amd/amdgpu/soc15d.h -@@ -272,6 +272,7 @@ - # define PACKET3_INVALIDATE_TLBS_DST_SEL(x) ((x) << 0) - # define PACKET3_INVALIDATE_TLBS_ALL_HUB(x) ((x) << 4) - # define PACKET3_INVALIDATE_TLBS_PASID(x) ((x) << 5) -+# define PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(x) ((x) << 29) - #define PACKET3_SET_RESOURCES 0xA0 - /* 1. header - * 2. 
CONTROL -diff --git a/drivers/gpu/drm/amd/amdgpu/vid.h b/drivers/gpu/drm/amd/amdgpu/vid.h -old mode 100644 -new mode 100755 -index 323e21c..d09592a ---- a/drivers/gpu/drm/amd/amdgpu/vid.h -+++ b/drivers/gpu/drm/amd/amdgpu/vid.h -@@ -27,6 +27,8 @@ - #define SDMA1_REGISTER_OFFSET 0x200 /* not a register */ - #define SDMA_MAX_INSTANCE 2 - -+#define KFD_VI_SDMA_QUEUE_OFFSET 0x80 /* not a register */ -+ - /* crtc instance offsets */ - #define CRTC0_REGISTER_OFFSET (0x1b9c - 0x1b9c) - #define CRTC1_REGISTER_OFFSET (0x1d9c - 0x1b9c) -diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile -old mode 100644 -new mode 100755 -index f55a0f8..dba08ec ---- a/drivers/gpu/drm/amd/amdkfd/Makefile -+++ b/drivers/gpu/drm/amd/amdkfd/Makefile -@@ -26,5 +26,3 @@ amdkfd-$(CONFIG_DEBUG_FS) += kfd_debugfs.o - - obj-$(CONFIG_HSA_AMD) += amdkfd.o - --AMDKFD_FULL_PATH = $(src) --include $(AMDKFD_FULL_PATH)/backport/Makefile -diff --git a/drivers/gpu/drm/amd/amdkfd/backport/backport.h b/drivers/gpu/drm/amd/amdkfd/backport/backport.h -index 8b13b98..e1f8c1d 100644 ---- a/drivers/gpu/drm/amd/amdkfd/backport/backport.h -+++ b/drivers/gpu/drm/amd/amdkfd/backport/backport.h -@@ -2,12 +2,5 @@ - #define AMDKFD_BACKPORT_H - - #include <linux/version.h> --#if defined(BUILD_AS_DKMS) --#include <kcl/kcl_amd_asic_type.h> --#endif --#include <kcl/kcl_compat.h> --#include <kcl/kcl_pci.h> --#include <kcl/kcl_mn.h> --#include <kcl/kcl_fence.h> - - #endif -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c -index b2795af..207a05e 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c -@@ -25,9 +25,7 @@ - #include <linux/err.h> - #include <linux/fs.h> - #include <linux/sched.h> --#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) - #include <linux/sched/mm.h> --#endif - #include <linux/slab.h> - #include <linux/uaccess.h> - #include <linux/compat.h> -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c -index 5f597a6..4e94081 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c -@@ -811,11 +811,7 @@ static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size, - */ - pgdat = NODE_DATA(numa_node_id); - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) -- mem_in_bytes += pgdat->node_zones[zone_type].present_pages; --#else - mem_in_bytes += pgdat->node_zones[zone_type].managed_pages; --#endif - mem_in_bytes <<= PAGE_SHIFT; - - sub_type_hdr->length_low = lower_32_bits(mem_in_bytes); -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c -index c6b447d..6b3a1fa 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c -@@ -326,11 +326,6 @@ static void kfd_gtt_sa_fini(struct kfd_dev *kfd); - - static int kfd_resume(struct kfd_dev *kfd); - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) --void kfd_init_processes_srcu(void); --void kfd_cleanup_processes_srcu(void); --#endif -- - static const struct kfd_device_info *lookup_device_info(unsigned short did) - { - size_t i; -@@ -633,10 +628,6 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, - - kfd_ib_mem_init(kfd); - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) -- kfd_init_processes_srcu(); --#endif -- - if (kfd_resume(kfd)) { - dev_err(kfd_device, "Error resuming kfd\n"); - goto kfd_resume_error; -@@ -678,9 +669,6 @@ void 
kgd2kfd_device_exit(struct kfd_dev *kfd) - { - if (kfd->init_complete) { - kgd2kfd_suspend(kfd); --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) -- kfd_cleanup_processes_srcu(); --#endif - kfd_cwsr_fini(kfd); - device_queue_manager_uninit(kfd->dqm); - kfd_interrupt_exit(kfd); -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c -index 8debe6e..7eacf42 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c -@@ -24,10 +24,8 @@ - #include <linux/slab.h> - #include <linux/types.h> - #include <linux/uaccess.h> --#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) - #include <linux/sched/mm.h> - #include <linux/sched/signal.h> --#endif - #include <linux/mman.h> - #include <linux/memory.h> - #include "kfd_priv.h" -@@ -269,13 +267,7 @@ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id) - { - struct kfd_event *ev; - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- struct hlist_node *node; -- -- hash_for_each_possible(p->events, ev, node, events, id) --#else - hash_for_each_possible(p->events, ev, events, id) --#endif - if (ev->event_id == id) - return ev; - -@@ -420,13 +412,7 @@ static void destroy_events(struct kfd_process *p) - struct hlist_node *tmp; - unsigned int hash_bkt; - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- struct hlist_node *node; -- -- hash_for_each_safe(p->events, hash_bkt, node, tmp, ev, events) --#else - hash_for_each_safe(p->events, hash_bkt, tmp, ev, events) --#endif - destroy_event(p, ev); - } - -@@ -972,16 +958,9 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, - int bkt; - bool send_signal = true; - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- struct hlist_node *node; -- ev_data = (struct kfd_hsa_memory_exception_data *) event_data; -- -- hash_for_each(p->events, bkt, node, ev, events) --#else - ev_data = (struct kfd_hsa_memory_exception_data *) event_data; - - hash_for_each(p->events, bkt, ev, events) --#endif - if (ev->type == type) { - send_signal = false; - dev_dbg(kfd_device, -@@ -1114,9 +1093,6 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, - int bkt; - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); - struct kfd_hsa_memory_exception_data memory_exception_data; --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- struct hlist_node *node; --#endif - - if (!p) - return; /* Presumably process exited. 
*/ -@@ -1136,11 +1112,7 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, - } - mutex_lock(&p->event_mutex); - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- hash_for_each(p->events, bkt, node, ev, events) { --#else - hash_for_each(p->events, bkt, ev, events) { --#endif - if (ev->type == KFD_EVENT_TYPE_MEMORY) { - ev->memory_exception_data = memory_exception_data; - set_event(ev); -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c -index 4f4392a..47dcf4a 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c -@@ -61,11 +61,7 @@ int kfd_interrupt_init(struct kfd_dev *kfd) - return r; - } - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) -- kfd->ih_wq = create_rt_workqueue("KFD IH"); --#else - kfd->ih_wq = alloc_workqueue("KFD IH", WQ_HIGHPRI, 1); --#endif - spin_lock_init(&kfd->interrupt_lock); - - INIT_WORK(&kfd->interrupt_work, interrupt_wq); -@@ -115,15 +111,9 @@ bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry) - count = kfifo_in(&kfd->ih_fifo, ih_ring_entry, - kfd->device_info->ih_ring_entry_size); - if (count != kfd->device_info->ih_ring_entry_size) { --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) -- dev_err(kfd_chardev(), -- "Interrupt ring overflow, dropping interrupt %d\n", -- count); --#else - dev_err_ratelimited(kfd_chardev(), - "Interrupt ring overflow, dropping interrupt %d\n", - count); --#endif - return false; - } - -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c -index c6be3ba..e67eb9f 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c -@@ -192,21 +192,13 @@ int kfd_ipc_import_handle(struct kfd_dev *dev, struct kfd_process *p, - { - int r; - struct kfd_ipc_obj *entry, *found = NULL; --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- struct hlist_node *tmp_node; --#endif - - mutex_lock(&kfd_ipc_handles.lock); - /* Convert the user provided handle to hash key and search only in that - * bucket - */ --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- hlist_for_each_entry(entry, tmp_node, -- &kfd_ipc_handles.handles[HANDLE_TO_KEY(share_handle)], node) { --#else - hlist_for_each_entry(entry, - &kfd_ipc_handles.handles[HANDLE_TO_KEY(share_handle)], node) { --#endif - if (!memcmp(entry->share_handle, share_handle, - sizeof(entry->share_handle))) { - found = entry; -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c -index 64bf653..5724d33 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c -@@ -465,19 +465,15 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, - - static int debugfs_show_mqd(struct seq_file *m, void *data) - { --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct cik_mqd), false); --#endif - return 0; - } - - static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) - { --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct cik_sdma_rlc_registers), false); --#endif - return 0; - } - -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c -index 0713cac..6c302d2 100644 ---- 
a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c -@@ -455,19 +455,15 @@ static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, - - static int debugfs_show_mqd(struct seq_file *m, void *data) - { --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct v9_mqd), false); --#endif - return 0; - } - - static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) - { --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct v9_sdma_mqd), false); --#endif - return 0; - } - -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c -index a5ba6f7..5c26e5a 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c -@@ -468,19 +468,15 @@ static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, - - static int debugfs_show_mqd(struct seq_file *m, void *data) - { --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct vi_mqd), false); --#endif - return 0; - } - - static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) - { --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct vi_sdma_mqd), false); --#endif - return 0; - } - -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c -index 9fcb6fb..7cca7b4 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c -@@ -410,10 +410,8 @@ int pm_debugfs_runlist(struct seq_file *m, void *data) - return 0; - } - --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - pm->ib_buffer_obj->cpu_ptr, pm->ib_size_bytes, false); --#endif - - return 0; - } -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h -old mode 100644 -new mode 100755 -index ebe311e..88fdfc9 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h -@@ -36,11 +36,7 @@ - #include <linux/interval_tree.h> - #include <linux/seq_file.h> - #include <linux/kref.h> --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) --#include <linux/kfifo-new.h> --#else - #include <linux/kfifo.h> --#endif - #include <kgd_kfd_interface.h> - - #include <drm/amd_rdma.h> -@@ -727,7 +723,7 @@ struct kfd_process { - size_t signal_event_count; - bool signal_event_limit_reached; - -- struct rb_root bo_interval_tree; -+ struct rb_root_cached bo_interval_tree; - - /* Information used for memory eviction */ - void *process_info; -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c -index b458995..c798fa3 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c -@@ -23,10 +23,8 @@ - #include <linux/mutex.h> - #include <linux/log2.h> - #include <linux/sched.h> --#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) - #include <linux/sched/mm.h> - #include <linux/sched/task.h> --#endif - #include <linux/slab.h> - #if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - #include <linux/amd-iommu.h> -@@ 
-50,20 +48,7 @@ struct mm_struct; - static DEFINE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE); - static DEFINE_MUTEX(kfd_processes_mutex); - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) --static struct srcu_struct kfd_processes_srcu; --void kfd_init_processes_srcu(void) --{ -- init_srcu_struct(&kfd_processes_srcu); --} -- --void kfd_cleanup_processes_srcu(void) --{ -- cleanup_srcu_struct(&kfd_processes_srcu); --} --#else - DEFINE_STATIC_SRCU(kfd_processes_srcu); --#endif - - static struct workqueue_struct *kfd_process_wq; - -@@ -81,11 +66,7 @@ static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep); - void kfd_process_create_wq(void) - { - if (!kfd_process_wq) --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) -- kfd_process_wq = create_workqueue("kfd_process_wq"); --#else - kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0); --#endif - } - - void kfd_process_destroy_wq(void) -@@ -273,15 +254,8 @@ static struct kfd_process *find_process_by_mm(const struct mm_struct *mm) - { - struct kfd_process *process; - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- struct hlist_node *node; -- -- hash_for_each_possible_rcu(kfd_processes_table, process, node, -- kfd_processes, (uintptr_t)mm) --#else - hash_for_each_possible_rcu(kfd_processes_table, process, - kfd_processes, (uintptr_t)mm) --#endif - if (process->mm == mm) - return process; - -@@ -586,7 +560,7 @@ static struct kfd_process *create_process(const struct task_struct *thread, - if (!process) - goto err_alloc_process; - -- process->bo_interval_tree = RB_ROOT; -+ process->bo_interval_tree = RB_ROOT_CACHED; - - process->pasid = kfd_pasid_alloc(); - if (process->pasid == 0) -@@ -1026,13 +1000,7 @@ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) - - int idx = srcu_read_lock(&kfd_processes_srcu); - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- struct hlist_node *node; -- -- hash_for_each_rcu(kfd_processes_table, temp, node, p, kfd_processes) { --#else - hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { --#endif - if (p->pasid == pasid) { - kref_get(&p->ref); - ret_p = p; -@@ -1051,13 +1019,7 @@ void kfd_suspend_all_processes(void) - unsigned int temp; - int idx = srcu_read_lock(&kfd_processes_srcu); - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- struct hlist_node *node; -- -- hash_for_each_rcu(kfd_processes_table, temp, node, p, kfd_processes) { --#else - hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { --#endif - if (cancel_delayed_work_sync(&p->eviction_work.dwork)) - dma_fence_put(p->eviction_work.quiesce_fence); - cancel_delayed_work_sync(&p->restore_work); -@@ -1077,13 +1039,7 @@ int kfd_resume_all_processes(void) - unsigned int temp; - int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu); - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- struct hlist_node *node; -- -- hash_for_each_rcu(kfd_processes_table, temp, node, p, kfd_processes) { --#else - hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { --#endif - if (!schedule_delayed_work(&p->restore_work, 0)) { - pr_err("Restore process %d failed during resume\n", - p->pasid); -@@ -1171,13 +1127,7 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data) - - int idx = srcu_read_lock(&kfd_processes_srcu); - --#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -- struct hlist_node *node; -- -- hash_for_each_rcu(kfd_processes_table, temp, node, p, kfd_processes) { --#else - hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { --#endif - 
seq_printf(m, "Process %d PASID %d:\n", - p->lead_thread->tgid, p->pasid); - -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c -index ffd8e0f..d08e3de 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c -@@ -122,9 +122,7 @@ static void kfd_release_topology_device(struct kfd_topology_device *dev) - struct kfd_mem_properties *mem; - struct kfd_cache_properties *cache; - struct kfd_iolink_properties *iolink; --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - struct kfd_perf_properties *perf; --#endif - - list_del(&dev->list); - -@@ -149,14 +147,12 @@ static void kfd_release_topology_device(struct kfd_topology_device *dev) - kfree(iolink); - } - --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - while (dev->perf_props.next != &dev->perf_props) { - perf = container_of(dev->perf_props.next, - struct kfd_perf_properties, list); - list_del(&perf->list); - kfree(perf); - } --#endif - - kfree(dev); - } -@@ -192,9 +188,7 @@ struct kfd_topology_device *kfd_create_topology_device( - INIT_LIST_HEAD(&dev->mem_props); - INIT_LIST_HEAD(&dev->cache_props); - INIT_LIST_HEAD(&dev->io_link_props); --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - INIT_LIST_HEAD(&dev->perf_props); --#endif - - list_add_tail(&dev->list, device_list); - -@@ -374,7 +368,6 @@ static struct kobj_type cache_type = { - .sysfs_ops = &cache_ops, - }; - --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - /****** Sysfs of Performance Counters ******/ - - struct kfd_perf_attr { -@@ -407,7 +400,6 @@ static struct kfd_perf_attr perf_attr_iommu[] = { - KFD_PERF_DESC(counter_ids, 0), - }; - /****************************************/ --#endif - - static ssize_t node_show(struct kobject *kobj, struct attribute *attr, - char *buffer) -@@ -546,9 +538,7 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) - struct kfd_iolink_properties *iolink; - struct kfd_cache_properties *cache; - struct kfd_mem_properties *mem; --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - struct kfd_perf_properties *perf; --#endif - - if (dev->kobj_iolink) { - list_for_each_entry(iolink, &dev->io_link_props, list) -@@ -590,7 +580,6 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) - dev->kobj_mem = NULL; - } - --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - if (dev->kobj_perf) { - list_for_each_entry(perf, &dev->perf_props, list) { - kfree(perf->attr_group); -@@ -600,7 +589,6 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) - kobject_put(dev->kobj_perf); - dev->kobj_perf = NULL; - } --#endif - - if (dev->kobj_node) { - sysfs_remove_file(dev->kobj_node, &dev->attr_gpuid); -@@ -618,11 +606,9 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, - struct kfd_iolink_properties *iolink; - struct kfd_cache_properties *cache; - struct kfd_mem_properties *mem; --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - struct kfd_perf_properties *perf; - uint32_t num_attrs; - struct attribute **attrs; --#endif - int ret; - uint32_t i; - -@@ -653,11 +639,9 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, - if (!dev->kobj_iolink) - return -ENOMEM; - --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - dev->kobj_perf = kobject_create_and_add("perf", dev->kobj_node); - if (!dev->kobj_perf) - return -ENOMEM; --#endif - - /* - * Creating sysfs files for node properties -@@ -749,7 +733,6 @@ static int kfd_build_sysfs_node_entry(struct 
kfd_topology_device *dev, - i++; - } - --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - /* All hardware blocks have the same number of attributes. */ - num_attrs = sizeof(perf_attr_iommu)/sizeof(struct kfd_perf_attr); - list_for_each_entry(perf, &dev->perf_props, list) { -@@ -775,7 +758,6 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, - if (ret < 0) - return ret; - } --#endif - - return 0; - } -@@ -942,7 +924,6 @@ static void find_system_memory(const struct dmi_header *dm, - } - } - --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - /* - * Performance counters information is not part of CRAT but we would like to - * put them in the sysfs under topology directory for Thunk to get the data. -@@ -966,7 +947,6 @@ static int kfd_add_perf_to_topology(struct kfd_topology_device *kdev) - - return 0; - } --#endif - - /* kfd_add_non_crat_information - Add information that is not currently - * defined in CRAT but is necessary for KFD topology -@@ -1074,11 +1054,9 @@ int kfd_topology_init(void) - } - } - --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - kdev = list_first_entry(&temp_topology_device_list, - struct kfd_topology_device, list); - kfd_add_perf_to_topology(kdev); --#endif - - down_write(&topology_lock); - kfd_topology_update_device_list(&temp_topology_device_list, -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h -index b59b32c..f22d420 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h -@@ -141,14 +141,12 @@ struct kfd_iolink_properties { - struct attribute attr; - }; - --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - struct kfd_perf_properties { - struct list_head list; - char block_name[16]; - uint32_t max_concurrent; - struct attribute_group *attr_group; - }; --#endif - - struct kfd_topology_device { - struct list_head list; -@@ -160,17 +158,13 @@ struct kfd_topology_device { - struct list_head cache_props; - uint32_t io_link_count; - struct list_head io_link_props; --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - struct list_head perf_props; --#endif - struct kfd_dev *gpu; - struct kobject *kobj_node; - struct kobject *kobj_mem; - struct kobject *kobj_cache; - struct kobject *kobj_iolink; --#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) - struct kobject *kobj_perf; --#endif - struct attribute attr_gpuid; - struct attribute attr_name; - struct attribute attr_props; -diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c -index 2780641..977b21b 100644 ---- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c -+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c -@@ -707,11 +707,7 @@ static int dm_display_resume(struct drm_device *ddev) - - err: - DRM_ERROR("Restoring old state failed with %i\n", ret); --#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) -- drm_atomic_state_free(state); --#else - drm_atomic_state_put(state); --#endif - - return ret; - } -diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h -old mode 100644 -new mode 100755 -index 36f3766..b6cf2d5 ---- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h -+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h -@@ -30,6 +30,7 @@ - - #include <linux/types.h> - #include <linux/bitmap.h> -+#include <linux/dma-buf.h> - - struct pci_dev; - -@@ -40,6 +41,46 @@ struct kfd_dev; - struct kgd_dev; - - struct kgd_mem; -+struct kfd_process_device; -+struct amdgpu_bo; -+ 
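[Editor's sketch] One note on the amdgpu_dm hunk above: struct drm_atomic_state became reference-counted in kernel 4.10, so the error path now drops its reference with drm_atomic_state_put() rather than calling the removed drm_atomic_state_free(). A hedged sketch of the resulting lifetime rule (hypothetical helper; state construction and locking elided):

    #include <drm/drmP.h>
    #include <drm/drm_atomic.h>

    static void example_commit_and_release(struct drm_atomic_state *state)
    {
            int ret = drm_atomic_commit(state);

            if (ret)
                    DRM_ERROR("Restoring old state failed with %i\n", ret);

            /* The caller still owns one reference; this put may free it. */
            drm_atomic_state_put(state);
    }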
-+enum kfd_preempt_type { -+ KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN = 0, -+ KFD_PREEMPT_TYPE_WAVEFRONT_RESET, -+}; -+ -+struct kfd_vm_fault_info { -+ uint64_t page_addr; -+ uint32_t vmid; -+ uint32_t mc_id; -+ uint32_t status; -+ bool prot_valid; -+ bool prot_read; -+ bool prot_write; -+ bool prot_exec; -+}; -+ -+struct kfd_cu_info { -+ uint32_t num_shader_engines; -+ uint32_t num_shader_arrays_per_engine; -+ uint32_t num_cu_per_sh; -+ uint32_t cu_active_number; -+ uint32_t cu_ao_mask; -+ uint32_t simd_per_cu; -+ uint32_t max_waves_per_simd; -+ uint32_t wave_front_size; -+ uint32_t max_scratch_slots_per_cu; -+ uint32_t lds_size; -+ uint32_t cu_bitmap[4][4]; -+}; -+ -+/* For getting GPU local memory information from KGD */ -+struct kfd_local_mem_info { -+ uint64_t local_mem_size_private; -+ uint64_t local_mem_size_public; -+ uint32_t vram_width; -+ uint32_t mem_clk_max; -+}; - - enum kgd_memory_pool { - KGD_POOL_SYSTEM_CACHEABLE = 1, -@@ -72,6 +113,21 @@ struct kgd2kfd_shared_resources { - /* Bit n == 1 means Queue n is available for KFD */ - DECLARE_BITMAP(queue_bitmap, KGD_MAX_QUEUES); - -+ /* Doorbell assignments (SOC15 and later chips only). Only -+ * specific doorbells are routed to each SDMA engine. Others -+ * are routed to IH and VCN. They are not usable by the CP. -+ * -+ * Any doorbell number D that satisfies the following condition -+ * is reserved: (D & reserved_doorbell_mask) == reserved_doorbell_val -+ * -+ * KFD currently uses 1024 (= 0x3ff) doorbells per process. If -+ * doorbells 0x0f0-0x0f7 and 0x2f-0x2f7 are reserved, that means -+ * mask would be set to 0x1f8 and val set to 0x0f0. -+ */ -+ unsigned int sdma_doorbell[2][2]; -+ unsigned int reserved_doorbell_mask; -+ unsigned int reserved_doorbell_val; -+ - /* Base address of doorbell aperture. */ - phys_addr_t doorbell_physical_address; - -@@ -80,8 +136,41 @@ struct kgd2kfd_shared_resources { - - /* Number of bytes at start of aperture reserved for KGD. */ - size_t doorbell_start_offset; -+ -+ /* GPUVM address space size in bytes */ -+ uint64_t gpuvm_size; - }; - -+struct tile_config { -+ uint32_t *tile_config_ptr; -+ uint32_t *macro_tile_config_ptr; -+ uint32_t num_tile_configs; -+ uint32_t num_macro_tile_configs; -+ -+ uint32_t gb_addr_config; -+ uint32_t num_banks; -+ uint32_t num_ranks; -+}; -+ -+/* -+ * Allocation flag domains currently only VRAM and GTT domain supported -+ */ -+#define ALLOC_MEM_FLAGS_VRAM (1 << 0) -+#define ALLOC_MEM_FLAGS_GTT (1 << 1) -+#define ALLOC_MEM_FLAGS_USERPTR (1 << 2) -+#define ALLOC_MEM_FLAGS_DOORBELL (1 << 3) -+ -+/* -+ * Allocation flags attributes/access options. -+ */ -+#define ALLOC_MEM_FLAGS_NONPAGED (1 << 31) -+#define ALLOC_MEM_FLAGS_READONLY (1 << 30) -+#define ALLOC_MEM_FLAGS_PUBLIC (1 << 29) -+#define ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28) -+#define ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27) -+#define ALLOC_MEM_FLAGS_EXECUTE_ACCESS (1 << 26) -+#define ALLOC_MEM_FLAGS_COHERENT (1 << 25) -+ - /** - * struct kfd2kgd_calls - * -@@ -90,7 +179,7 @@ struct kgd2kfd_shared_resources { - * - * @free_gtt_mem: Frees a buffer that was allocated on the gart aperture - * -- * @get_vmem_size: Retrieves (physical) size of VRAM -+ * @get_local_mem_info: Retrieves information about GPU local memory - * - * @get_gpu_clock_counter: Retrieves GPU clock counter - * -@@ -112,6 +201,12 @@ struct kgd2kfd_shared_resources { - * @hqd_sdma_load: Loads the SDMA mqd structure to a H/W SDMA hqd slot. - * used only for no HWS mode. - * -+ * @hqd_dump: Dumps CPC HQD registers to an array of address-value pairs. 
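[Editor's sketch] The reserved-doorbell scheme documented above reduces to a single mask test. With the quoted example values (mask 0x1f8, val 0x0f0), the reserved doorbells within a 1024-doorbell process are 0x0f0-0x0f7 and 0x2f0-0x2f7 (the second range above reads like a typo for 0x2f0-0x2f7), since bit 9 falls outside the mask. A small illustrative predicate:

    #include <linux/types.h>

    /* Doorbell D is reserved iff (D & reserved_doorbell_mask) ==
     * reserved_doorbell_val, exactly as documented above.
     */
    static inline bool doorbell_is_reserved(unsigned int d,
                                            unsigned int mask,
                                            unsigned int val)
    {
            return (d & mask) == val;
    }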
-+ * Array is allocated with kmalloc, needs to be freed with kfree by caller.
-+ *
-+ * @hqd_sdma_dump: Dumps SDMA HQD registers to an array of address-value pairs.
-+ * Array is allocated with kmalloc, needs to be freed with kfree by caller.
-+ *
- * @hqd_is_occupied: Checks if a hqd slot is occupied.
- *
- * @hqd_destroy: Destructs and preempts the queue assigned to that hqd slot.
-@@ -121,8 +216,34 @@ struct kgd2kfd_shared_resources {
- * @hqd_sdma_destroy: Destructs and preempts the SDMA queue assigned to that
- * SDMA hqd slot.
- *
-+ * @map_memory_to_gpu: Allocates and pins BO, PD and all related PTs
-+ *
-+ * @unmap_memory_to_gpu: Releases and unpins BO, PD and all related PTs
-+ *
- * @get_fw_version: Returns FW versions from the header
- *
-+ * @set_num_of_requests: Sets number of Peripheral Page Request (PPR) sent to
-+ * IOMMU when address translation failed
-+ *
-+ * @get_cu_info: Retrieves activated cu info
-+ *
-+ * @get_dmabuf_info: Returns information about a dmabuf if it was
-+ * created by the GPU driver
-+ *
-+ * @import_dmabuf: Imports a DMA buffer, creating a new kgd_mem object
-+ * Supports only DMA buffers created by GPU driver on the same GPU
-+ *
-+ * @export_dmabuf: Exports a KFD BO for sharing with other processes
-+ *
-+ * @submit_ib: Submits an IB to the engine specified by inserting the IB into
-+ * the corresponding ring (ring type).
-+ *
-+ * @restore_process_bos: Restores all BOs that belong to the process
-+ *
-+ * @copy_mem_to_mem: Copies size bytes from source BO to destination BO
-+ *
-+ * @get_vram_usage: Returns current VRAM usage
-+ *
- * This structure contains function pointers to services that the kgd driver
- * provides to the amdkfd driver.
- *
-@@ -134,11 +255,23 @@ struct kfd2kgd_calls {
-
- void (*free_gtt_mem)(struct kgd_dev *kgd, void *mem_obj);
-
-- uint64_t (*get_vmem_size)(struct kgd_dev *kgd);
-+ void(*get_local_mem_info)(struct kgd_dev *kgd,
-+ struct kfd_local_mem_info *mem_info);
- uint64_t (*get_gpu_clock_counter)(struct kgd_dev *kgd);
-
- uint32_t (*get_max_engine_clock_in_mhz)(struct kgd_dev *kgd);
-
-+ int (*create_process_vm)(struct kgd_dev *kgd, void **vm,
-+ void **process_info, struct dma_fence **ef);
-+ void (*destroy_process_vm)(struct kgd_dev *kgd, void *vm);
-+
-+ int (*create_process_gpumem)(struct kgd_dev *kgd, uint64_t va, size_t size, void *vm, struct kgd_mem **mem);
-+ void (*destroy_process_gpumem)(struct kgd_dev *kgd, struct kgd_mem *mem);
-+
-+ uint32_t (*get_process_page_dir)(void *vm);
-+
-+ int (*open_graphic_handle)(struct kgd_dev *kgd, uint64_t va, void *vm, int fd, uint32_t handle, struct kgd_mem **mem);
-+
- /* Register access functions */
- void (*program_sh_mem_settings)(struct kgd_dev *kgd, uint32_t vmid,
- uint32_t sh_mem_config, uint32_t sh_mem_ape1_base,
-@@ -151,16 +284,28 @@ struct kfd2kgd_calls {
- uint32_t hpd_size, uint64_t hpd_gpu_addr);
-
- int (*init_interrupts)(struct kgd_dev *kgd, uint32_t pipe_id);
-+
-
- int (*hqd_load)(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
-- uint32_t queue_id, uint32_t __user *wptr);
-+ uint32_t queue_id, uint32_t __user *wptr,
-+ uint32_t wptr_shift, uint32_t wptr_mask,
-+ struct mm_struct *mm);
-+
-+ int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd,
-+ uint32_t __user *wptr, struct mm_struct *mm);
-+
-+ int (*hqd_dump)(struct kgd_dev *kgd,
-+ uint32_t pipe_id, uint32_t queue_id,
-+ uint32_t (**dump)[2], uint32_t *n_regs);
-
-- int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd);
-+ int (*hqd_sdma_dump)(struct kgd_dev *kgd,
-+ uint32_t engine_id, uint32_t
queue_id, -+ uint32_t (**dump)[2], uint32_t *n_regs); - - bool (*hqd_is_occupied)(struct kgd_dev *kgd, uint64_t queue_address, - uint32_t pipe_id, uint32_t queue_id); - -- int (*hqd_destroy)(struct kgd_dev *kgd, uint32_t reset_type, -+ int (*hqd_destroy)(struct kgd_dev *kgd, void *mqd, uint32_t reset_type, - unsigned int timeout, uint32_t pipe_id, - uint32_t queue_id); - -@@ -168,7 +313,7 @@ struct kfd2kgd_calls { - - int (*hqd_sdma_destroy)(struct kgd_dev *kgd, void *mqd, - unsigned int timeout); -- -+ - int (*address_watch_disable)(struct kgd_dev *kgd); - int (*address_watch_execute)(struct kgd_dev *kgd, - unsigned int watch_point_id, -@@ -187,11 +332,72 @@ struct kfd2kgd_calls { - uint16_t (*get_atc_vmid_pasid_mapping_pasid)( - struct kgd_dev *kgd, - uint8_t vmid); -+ uint32_t (*read_vmid_from_vmfault_reg)(struct kgd_dev *kgd); - void (*write_vmid_invalidate_request)(struct kgd_dev *kgd, - uint8_t vmid); - -+ int (*invalidate_tlbs)(struct kgd_dev *kgd, uint16_t pasid); -+ -+ int (*sync_memory)(struct kgd_dev *kgd, struct kgd_mem *mem, bool intr); -+ -+ int (*alloc_memory_of_gpu)(struct kgd_dev *kgd, uint64_t va, -+ uint64_t size, void *vm, -+ struct kgd_mem **mem, uint64_t *offset, -+ uint32_t flags); -+ int (*free_memory_of_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem, -+ void *vm); -+ int (*map_memory_to_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem, -+ void *vm); -+ int (*unmap_memory_to_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem, -+ void *vm); -+ - uint16_t (*get_fw_version)(struct kgd_dev *kgd, - enum kgd_engine_type type); -+ -+ void (*set_num_of_requests)(struct kgd_dev *kgd, -+ uint8_t num_of_requests); -+ int (*alloc_memory_of_scratch)(struct kgd_dev *kgd, -+ uint64_t va, uint32_t vmid); -+ int (*write_config_static_mem)(struct kgd_dev *kgd, bool swizzle_enable, -+ uint8_t element_size, uint8_t index_stride, uint8_t mtype); -+ void (*get_cu_info)(struct kgd_dev *kgd, -+ struct kfd_cu_info *cu_info); -+ int (*mmap_bo)(struct kgd_dev *kgd, struct vm_area_struct *vma); -+ int (*map_gtt_bo_to_kernel)(struct kgd_dev *kgd, -+ struct kgd_mem *mem, void **kptr); -+ void (*set_vm_context_page_table_base)(struct kgd_dev *kgd, uint32_t vmid, -+ uint32_t page_table_base); -+ -+ int (*pin_get_sg_table_bo)(struct kgd_dev *kgd, -+ struct kgd_mem *mem, uint64_t offset, -+ uint64_t size, struct sg_table **ret_sg); -+ void (*unpin_put_sg_table_bo)(struct kgd_mem *mem, -+ struct sg_table *sg); -+ -+ int (*get_dmabuf_info)(struct kgd_dev *kgd, int dma_buf_fd, -+ struct kgd_dev **dma_buf_kgd, uint64_t *bo_size, -+ void *metadata_buffer, size_t buffer_size, -+ uint32_t *metadata_size, uint32_t *flags); -+ int (*import_dmabuf)(struct kgd_dev *kgd, struct dma_buf *dmabuf, -+ uint64_t va, void *vm, struct kgd_mem **mem, -+ uint64_t *size, uint64_t *mmap_offset); -+ int (*export_dmabuf)(struct kgd_dev *kgd, void *vm, struct kgd_mem *mem, -+ struct dma_buf **dmabuf); -+ -+ int (*get_vm_fault_info)(struct kgd_dev *kgd, -+ struct kfd_vm_fault_info *info); -+ int (*submit_ib)(struct kgd_dev *kgd, enum kgd_engine_type engine, -+ uint32_t vmid, uint64_t gpu_addr, -+ uint32_t *ib_cmd, uint32_t ib_len); -+ int (*get_tile_config)(struct kgd_dev *kgd, -+ struct tile_config *config); -+ -+ int (*restore_process_bos)(void *process_info, struct dma_fence **ef); -+ int (*copy_mem_to_mem)(struct kgd_dev *kgd, struct kgd_mem *src_mem, -+ uint64_t src_offset, struct kgd_mem *dst_mem, -+ uint64_t dest_offset, uint64_t size, -+ struct dma_fence **f, uint64_t *actual_size); -+ uint64_t (*get_vram_usage)(struct kgd_dev 
*kgd); - }; - - /** -@@ -210,6 +416,13 @@ struct kfd2kgd_calls { - * - * @resume: Notifies amdkfd about a resume action done to a kgd device - * -+ * @quiesce_mm: Quiesce all user queue access to specified MM address space -+ * -+ * @resume_mm: Resume user queue access to specified MM address space -+ * -+ * @schedule_evict_and_restore_process: Schedules work queue that will prepare -+ * for safe eviction of KFD BOs that belong to the specified process. -+ * - * This structure contains function callback pointers so the kgd driver - * will notify to the amdkfd about certain status changes. - * -@@ -224,9 +437,13 @@ struct kgd2kfd_calls { - void (*interrupt)(struct kfd_dev *kfd, const void *ih_ring_entry); - void (*suspend)(struct kfd_dev *kfd); - int (*resume)(struct kfd_dev *kfd); -+ int (*quiesce_mm)(struct kfd_dev *kfd, struct mm_struct *mm); -+ int (*resume_mm)(struct kfd_dev *kfd, struct mm_struct *mm); -+ int (*schedule_evict_and_restore_process)(struct mm_struct *mm, -+ struct dma_fence *fence); - }; - - int kgd2kfd_init(unsigned interface_version, - const struct kgd2kfd_calls **g2f); - --#endif /* KGD_KFD_INTERFACE_H_INCLUDED */ -+#endif /* KGD_KFD_INTERFACE_H_INCLUDED */ -diff --git a/drivers/gpu/drm/amd/include/v9_structs.h b/drivers/gpu/drm/amd/include/v9_structs.h -old mode 100644 -new mode 100755 -index 2fb25ab..ceaf493 ---- a/drivers/gpu/drm/amd/include/v9_structs.h -+++ b/drivers/gpu/drm/amd/include/v9_structs.h -@@ -29,10 +29,10 @@ struct v9_sdma_mqd { - uint32_t sdmax_rlcx_rb_base; - uint32_t sdmax_rlcx_rb_base_hi; - uint32_t sdmax_rlcx_rb_rptr; -+ uint32_t sdmax_rlcx_rb_rptr_hi; - uint32_t sdmax_rlcx_rb_wptr; -+ uint32_t sdmax_rlcx_rb_wptr_hi; - uint32_t sdmax_rlcx_rb_wptr_poll_cntl; -- uint32_t sdmax_rlcx_rb_wptr_poll_addr_hi; -- uint32_t sdmax_rlcx_rb_wptr_poll_addr_lo; - uint32_t sdmax_rlcx_rb_rptr_addr_hi; - uint32_t sdmax_rlcx_rb_rptr_addr_lo; - uint32_t sdmax_rlcx_ib_cntl; -@@ -44,29 +44,29 @@ struct v9_sdma_mqd { - uint32_t sdmax_rlcx_skip_cntl; - uint32_t sdmax_rlcx_context_status; - uint32_t sdmax_rlcx_doorbell; -- uint32_t sdmax_rlcx_virtual_addr; -- uint32_t sdmax_rlcx_ape1_cntl; -+ uint32_t sdmax_rlcx_status; - uint32_t sdmax_rlcx_doorbell_log; -- uint32_t reserved_22; -- uint32_t reserved_23; -- uint32_t reserved_24; -- uint32_t reserved_25; -- uint32_t reserved_26; -- uint32_t reserved_27; -- uint32_t reserved_28; -- uint32_t reserved_29; -- uint32_t reserved_30; -- uint32_t reserved_31; -- uint32_t reserved_32; -- uint32_t reserved_33; -- uint32_t reserved_34; -- uint32_t reserved_35; -- uint32_t reserved_36; -- uint32_t reserved_37; -- uint32_t reserved_38; -- uint32_t reserved_39; -- uint32_t reserved_40; -- uint32_t reserved_41; -+ uint32_t sdmax_rlcx_watermark; -+ uint32_t sdmax_rlcx_doorbell_offset; -+ uint32_t sdmax_rlcx_csa_addr_lo; -+ uint32_t sdmax_rlcx_csa_addr_hi; -+ uint32_t sdmax_rlcx_ib_sub_remain; -+ uint32_t sdmax_rlcx_preempt; -+ uint32_t sdmax_rlcx_dummy_reg; -+ uint32_t sdmax_rlcx_rb_wptr_poll_addr_hi; -+ uint32_t sdmax_rlcx_rb_wptr_poll_addr_lo; -+ uint32_t sdmax_rlcx_rb_aql_cntl; -+ uint32_t sdmax_rlcx_minor_ptr_update; -+ uint32_t sdmax_rlcx_midcmd_data0; -+ uint32_t sdmax_rlcx_midcmd_data1; -+ uint32_t sdmax_rlcx_midcmd_data2; -+ uint32_t sdmax_rlcx_midcmd_data3; -+ uint32_t sdmax_rlcx_midcmd_data4; -+ uint32_t sdmax_rlcx_midcmd_data5; -+ uint32_t sdmax_rlcx_midcmd_data6; -+ uint32_t sdmax_rlcx_midcmd_data7; -+ uint32_t sdmax_rlcx_midcmd_data8; -+ uint32_t sdmax_rlcx_midcmd_cntl; - uint32_t reserved_42; - uint32_t reserved_43; - 
uint32_t reserved_44; -diff --git a/drivers/gpu/drm/amd/include/vi_structs.h b/drivers/gpu/drm/amd/include/vi_structs.h -old mode 100644 -new mode 100755 -index 2023482..717fbae ---- a/drivers/gpu/drm/amd/include/vi_structs.h -+++ b/drivers/gpu/drm/amd/include/vi_structs.h -@@ -153,6 +153,8 @@ struct vi_sdma_mqd { - uint32_t reserved_125; - uint32_t reserved_126; - uint32_t reserved_127; -+ uint32_t sdma_engine_id; -+ uint32_t sdma_queue_id; - }; - - struct vi_mqd { -diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c -old mode 100644 -new mode 100755 -index 2292462..82d97f3 ---- a/drivers/pci/pci.c -+++ b/drivers/pci/pci.c -@@ -2983,6 +2983,87 @@ bool pci_acs_path_enabled(struct pci_dev *start, - } - - /** -+ * pci_enable_atomic_ops_to_root - enable AtomicOp requests to root port -+ * @dev: the PCI device -+ * -+ * Return 0 if the device is capable of generating AtomicOp requests, -+ * all upstream bridges support AtomicOp routing, egress blocking is disabled -+ * on all upstream ports, and the root port supports 32-bit, 64-bit and/or -+ * 128-bit AtomicOp completion, or negative otherwise. -+ */ -+int pci_enable_atomic_ops_to_root(struct pci_dev *dev) -+{ -+ struct pci_bus *bus = dev->bus; -+ -+ if (!pci_is_pcie(dev)) -+ return -EINVAL; -+ -+ switch (pci_pcie_type(dev)) { -+ /* -+ * PCIe 3.0, 6.15 specifies that endpoints and root ports are permitted -+ * to implement AtomicOp requester capabilities. -+ */ -+ case PCI_EXP_TYPE_ENDPOINT: -+ case PCI_EXP_TYPE_LEG_END: -+ case PCI_EXP_TYPE_RC_END: -+ break; -+ default: -+ return -EINVAL; -+ } -+ -+ while (bus->parent) { -+ struct pci_dev *bridge = bus->self; -+ u32 cap; -+ -+ pcie_capability_read_dword(bridge, PCI_EXP_DEVCAP2, &cap); -+ -+ switch (pci_pcie_type(bridge)) { -+ /* -+ * Upstream, downstream and root ports may implement AtomicOp -+ * routing capabilities. AtomicOp routing via a root port is -+ * not considered. -+ */ -+ case PCI_EXP_TYPE_UPSTREAM: -+ case PCI_EXP_TYPE_DOWNSTREAM: -+ if (!(cap & PCI_EXP_DEVCAP2_ATOMIC_ROUTE)) -+ return -EINVAL; -+ break; -+ -+ /* -+ * Root ports are permitted to implement AtomicOp completion -+ * capabilities. -+ */ -+ case PCI_EXP_TYPE_ROOT_PORT: -+ if (!(cap & (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | -+ PCI_EXP_DEVCAP2_ATOMIC_COMP64 | -+ PCI_EXP_DEVCAP2_ATOMIC_COMP128))) -+ return -EINVAL; -+ break; -+ } -+ -+ /* -+ * Upstream ports may block AtomicOps on egress. -+ */ -+ if (pci_pcie_type(bridge) == PCI_EXP_TYPE_UPSTREAM) { -+ u32 ctl2; -+ -+ pcie_capability_read_dword(bridge, PCI_EXP_DEVCTL2, -+ &ctl2); -+ if (ctl2 & PCI_EXP_DEVCTL2_ATOMIC_BLOCK) -+ return -EINVAL; -+ } -+ -+ bus = bus->parent; -+ } -+ -+ pcie_capability_set_word(dev, PCI_EXP_DEVCTL2, -+ PCI_EXP_DEVCTL2_ATOMIC_REQ); -+ -+ return 0; -+} -+EXPORT_SYMBOL(pci_enable_atomic_ops_to_root); -+ -+/** - * pci_swizzle_interrupt_pin - swizzle INTx for device behind bridge - * @dev: the PCI device - * @pin: the INTx pin (1=INTA, 2=INTB, 3=INTC, 4=INTD) -diff --git a/include/drm/amd_rdma.h b/include/drm/amd_rdma.h -new file mode 100644 -index 0000000..b0cab3c ---- /dev/null -+++ b/include/drm/amd_rdma.h -@@ -0,0 +1,70 @@ -+/* -+ * Copyright 2015 Advanced Micro Devices, Inc. 
-+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. -+ */ -+ -+/* @file This file defined kernel interfaces to communicate with amdkfd */ -+ -+#ifndef AMD_RDMA_H_ -+#define AMD_RDMA_H_ -+ -+ -+/** -+ * Structure describing information needed to P2P access from another device -+ * to specific location of GPU memory -+ */ -+struct amd_p2p_info { -+ uint64_t va; /**< Specify user virt. address -+ * which this page table -+ * described -+ */ -+ uint64_t size; /**< Specify total size of -+ * allocation -+ */ -+ struct pid *pid; /**< Specify process pid to which -+ * virtual address belongs -+ */ -+ struct sg_table *pages; /**< Specify DMA/Bus addresses */ -+ void *priv; /**< Pointer set by AMD kernel -+ * driver -+ */ -+}; -+ -+/** -+ * Structure providing function pointers to support rdma/p2p requirements. 
-+ * to specific location of GPU memory -+ */ -+struct amd_rdma_interface { -+ int (*get_pages)(uint64_t address, uint64_t length, struct pid *pid, -+ struct amd_p2p_info **amd_p2p_data, -+ void (*free_callback)(void *client_priv), -+ void *client_priv); -+ int (*put_pages)(struct amd_p2p_info **amd_p2p_data); -+ int (*is_gpu_address)(uint64_t address, struct pid *pid); -+ int (*get_page_size)(uint64_t address, uint64_t length, struct pid *pid, -+ unsigned long *page_size); -+}; -+ -+ -+int amdkfd_query_rdma_interface(const struct amd_rdma_interface **rdma); -+ -+ -+#endif /* AMD_RDMA_H_ */ -+ -diff --git a/include/linux/pci.h b/include/linux/pci.h -old mode 100644 -new mode 100755 -index b1abbcc..3df545d ---- a/include/linux/pci.h -+++ b/include/linux/pci.h -@@ -2072,6 +2072,7 @@ void pci_request_acs(void); - bool pci_acs_enabled(struct pci_dev *pdev, u16 acs_flags); - bool pci_acs_path_enabled(struct pci_dev *start, - struct pci_dev *end, u16 acs_flags); -+int pci_enable_atomic_ops_to_root(struct pci_dev *dev); - - #define PCI_VPD_LRDT 0x80 /* Large Resource Data Type */ - #define PCI_VPD_LRDT_ID(x) ((x) | PCI_VPD_LRDT) -diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h -index 5bb2b45..de5367c 100644 ---- a/include/uapi/linux/kfd_ioctl.h -+++ b/include/uapi/linux/kfd_ioctl.h -@@ -23,15 +23,15 @@ - #ifndef KFD_IOCTL_H_INCLUDED - #define KFD_IOCTL_H_INCLUDED - --#include <drm/drm.h> -+#include <linux/types.h> - #include <linux/ioctl.h> - - #define KFD_IOCTL_MAJOR_VERSION 1 --#define KFD_IOCTL_MINOR_VERSION 1 -+#define KFD_IOCTL_MINOR_VERSION 2 - - struct kfd_ioctl_get_version_args { -- __u32 major_version; /* from KFD */ -- __u32 minor_version; /* from KFD */ -+ uint32_t major_version; /* from KFD */ -+ uint32_t minor_version; /* from KFD */ - }; - - /* For kfd_ioctl_create_queue_args.queue_type. 
*/ -@@ -43,36 +43,51 @@ struct kfd_ioctl_get_version_args { - #define KFD_MAX_QUEUE_PRIORITY 15 - - struct kfd_ioctl_create_queue_args { -- __u64 ring_base_address; /* to KFD */ -- __u64 write_pointer_address; /* from KFD */ -- __u64 read_pointer_address; /* from KFD */ -- __u64 doorbell_offset; /* from KFD */ -- -- __u32 ring_size; /* to KFD */ -- __u32 gpu_id; /* to KFD */ -- __u32 queue_type; /* to KFD */ -- __u32 queue_percentage; /* to KFD */ -- __u32 queue_priority; /* to KFD */ -- __u32 queue_id; /* from KFD */ -- -- __u64 eop_buffer_address; /* to KFD */ -- __u64 eop_buffer_size; /* to KFD */ -- __u64 ctx_save_restore_address; /* to KFD */ -- __u64 ctx_save_restore_size; /* to KFD */ -+ uint64_t ring_base_address; /* to KFD */ -+ uint64_t write_pointer_address; /* from KFD */ -+ uint64_t read_pointer_address; /* from KFD */ -+ uint64_t doorbell_offset; /* from KFD */ -+ -+ uint32_t ring_size; /* to KFD */ -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t queue_type; /* to KFD */ -+ uint32_t queue_percentage; /* to KFD */ -+ uint32_t queue_priority; /* to KFD */ -+ uint32_t queue_id; /* from KFD */ -+ -+ uint64_t eop_buffer_address; /* to KFD */ -+ uint64_t eop_buffer_size; /* to KFD */ -+ uint64_t ctx_save_restore_address; /* to KFD */ -+ uint32_t ctx_save_restore_size; /* to KFD */ -+ uint32_t ctl_stack_size; /* to KFD */ - }; - - struct kfd_ioctl_destroy_queue_args { -- __u32 queue_id; /* to KFD */ -- __u32 pad; -+ uint32_t queue_id; /* to KFD */ -+ uint32_t pad; - }; - - struct kfd_ioctl_update_queue_args { -- __u64 ring_base_address; /* to KFD */ -+ uint64_t ring_base_address; /* to KFD */ -+ -+ uint32_t queue_id; /* to KFD */ -+ uint32_t ring_size; /* to KFD */ -+ uint32_t queue_percentage; /* to KFD */ -+ uint32_t queue_priority; /* to KFD */ -+}; - -- __u32 queue_id; /* to KFD */ -- __u32 ring_size; /* to KFD */ -- __u32 queue_percentage; /* to KFD */ -- __u32 queue_priority; /* to KFD */ -+struct kfd_ioctl_set_cu_mask_args { -+ uint32_t queue_id; /* to KFD */ -+ uint32_t num_cu_mask; /* to KFD */ -+ uint64_t cu_mask_ptr; /* to KFD */ -+}; -+ -+struct kfd_ioctl_get_queue_wave_state_args { -+ uint64_t ctl_stack_address; /* to KFD */ -+ uint32_t ctl_stack_used_size; /* from KFD */ -+ uint32_t save_area_used_size; /* from KFD */ -+ uint32_t queue_id; /* to KFD */ -+ uint32_t pad; - }; - - /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */ -@@ -80,13 +95,20 @@ struct kfd_ioctl_update_queue_args { - #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1 - - struct kfd_ioctl_set_memory_policy_args { -- __u64 alternate_aperture_base; /* to KFD */ -- __u64 alternate_aperture_size; /* to KFD */ -+ uint64_t alternate_aperture_base; /* to KFD */ -+ uint64_t alternate_aperture_size; /* to KFD */ -+ -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t default_policy; /* to KFD */ -+ uint32_t alternate_policy; /* to KFD */ -+ uint32_t pad; -+}; - -- __u32 gpu_id; /* to KFD */ -- __u32 default_policy; /* to KFD */ -- __u32 alternate_policy; /* to KFD */ -- __u32 pad; -+struct kfd_ioctl_set_trap_handler_args { -+ uint64_t tba_addr; -+ uint64_t tma_addr; -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t pad; - }; - - /* -@@ -97,35 +119,52 @@ struct kfd_ioctl_set_memory_policy_args { - */ - - struct kfd_ioctl_get_clock_counters_args { -- __u64 gpu_clock_counter; /* from KFD */ -- __u64 cpu_clock_counter; /* from KFD */ -- __u64 system_clock_counter; /* from KFD */ -- __u64 system_clock_freq; /* from KFD */ -+ uint64_t gpu_clock_counter; /* from KFD */ -+ uint64_t cpu_clock_counter; /* from KFD 
*/ -+ uint64_t system_clock_counter; /* from KFD */ -+ uint64_t system_clock_freq; /* from KFD */ - -- __u32 gpu_id; /* to KFD */ -- __u32 pad; -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t pad; - }; - - #define NUM_OF_SUPPORTED_GPUS 7 - - struct kfd_process_device_apertures { -- __u64 lds_base; /* from KFD */ -- __u64 lds_limit; /* from KFD */ -- __u64 scratch_base; /* from KFD */ -- __u64 scratch_limit; /* from KFD */ -- __u64 gpuvm_base; /* from KFD */ -- __u64 gpuvm_limit; /* from KFD */ -- __u32 gpu_id; /* from KFD */ -- __u32 pad; -+ uint64_t lds_base; /* from KFD */ -+ uint64_t lds_limit; /* from KFD */ -+ uint64_t scratch_base; /* from KFD */ -+ uint64_t scratch_limit; /* from KFD */ -+ uint64_t gpuvm_base; /* from KFD */ -+ uint64_t gpuvm_limit; /* from KFD */ -+ uint32_t gpu_id; /* from KFD */ -+ uint32_t pad; - }; - -+/* This IOCTL and the limited NUM_OF_SUPPORTED_GPUS is deprecated. Use -+ * kfd_ioctl_get_process_apertures_new instead, which supports -+ * arbitrary numbers of GPUs. -+ */ - struct kfd_ioctl_get_process_apertures_args { - struct kfd_process_device_apertures - process_apertures[NUM_OF_SUPPORTED_GPUS];/* from KFD */ - - /* from KFD, should be in the range [1 - NUM_OF_SUPPORTED_GPUS] */ -- __u32 num_of_nodes; -- __u32 pad; -+ uint32_t num_of_nodes; -+ uint32_t pad; -+}; -+ -+struct kfd_ioctl_get_process_apertures_new_args { -+ /* User allocated. Pointer to struct kfd_process_device_apertures -+ * filled in by Kernel -+ */ -+ uint64_t kfd_process_device_apertures_ptr; -+ /* to KFD - indicates amount of memory present in -+ * kfd_process_device_apertures_ptr -+ * from KFD - Number of entries filled by KFD. -+ */ -+ uint32_t num_of_nodes; -+ uint32_t pad; - }; - - #define MAX_ALLOWED_NUM_POINTS 100 -@@ -133,103 +172,245 @@ struct kfd_ioctl_get_process_apertures_args { - #define MAX_ALLOWED_WAC_BUFF_SIZE 128 - - struct kfd_ioctl_dbg_register_args { -- __u32 gpu_id; /* to KFD */ -- __u32 pad; -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t pad; - }; - - struct kfd_ioctl_dbg_unregister_args { -- __u32 gpu_id; /* to KFD */ -- __u32 pad; -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t pad; - }; - - struct kfd_ioctl_dbg_address_watch_args { -- __u64 content_ptr; /* a pointer to the actual content */ -- __u32 gpu_id; /* to KFD */ -- __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ -+ uint64_t content_ptr; /* a pointer to the actual content */ -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ - }; - - struct kfd_ioctl_dbg_wave_control_args { -- __u64 content_ptr; /* a pointer to the actual content */ -- __u32 gpu_id; /* to KFD */ -- __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ -+ uint64_t content_ptr; /* a pointer to the actual content */ -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ - }; - - /* Matching HSA_EVENTTYPE */ --#define KFD_IOC_EVENT_SIGNAL 0 --#define KFD_IOC_EVENT_NODECHANGE 1 --#define KFD_IOC_EVENT_DEVICESTATECHANGE 2 --#define KFD_IOC_EVENT_HW_EXCEPTION 3 --#define KFD_IOC_EVENT_SYSTEM_EVENT 4 --#define KFD_IOC_EVENT_DEBUG_EVENT 5 --#define KFD_IOC_EVENT_PROFILE_EVENT 6 --#define KFD_IOC_EVENT_QUEUE_EVENT 7 --#define KFD_IOC_EVENT_MEMORY 8 -- --#define KFD_IOC_WAIT_RESULT_COMPLETE 0 --#define KFD_IOC_WAIT_RESULT_TIMEOUT 1 --#define KFD_IOC_WAIT_RESULT_FAIL 2 -- --#define KFD_SIGNAL_EVENT_LIMIT 256 -+#define KFD_IOC_EVENT_SIGNAL 0 -+#define KFD_IOC_EVENT_NODECHANGE 1 -+#define KFD_IOC_EVENT_DEVICESTATECHANGE 2 -+#define 
KFD_IOC_EVENT_HW_EXCEPTION 3 -+#define KFD_IOC_EVENT_SYSTEM_EVENT 4 -+#define KFD_IOC_EVENT_DEBUG_EVENT 5 -+#define KFD_IOC_EVENT_PROFILE_EVENT 6 -+#define KFD_IOC_EVENT_QUEUE_EVENT 7 -+#define KFD_IOC_EVENT_MEMORY 8 -+ -+#define KFD_IOC_WAIT_RESULT_COMPLETE 0 -+#define KFD_IOC_WAIT_RESULT_TIMEOUT 1 -+#define KFD_IOC_WAIT_RESULT_FAIL 2 -+ -+#define KFD_SIGNAL_EVENT_LIMIT 4096 - - struct kfd_ioctl_create_event_args { -- __u64 event_page_offset; /* from KFD */ -- __u32 event_trigger_data; /* from KFD - signal events only */ -- __u32 event_type; /* to KFD */ -- __u32 auto_reset; /* to KFD */ -- __u32 node_id; /* to KFD - only valid for certain -+ uint64_t event_page_offset; /* from KFD */ -+ uint32_t event_trigger_data; /* from KFD - signal events only */ -+ uint32_t event_type; /* to KFD */ -+ uint32_t auto_reset; /* to KFD */ -+ uint32_t node_id; /* to KFD - only valid for certain - event types */ -- __u32 event_id; /* from KFD */ -- __u32 event_slot_index; /* from KFD */ -+ uint32_t event_id; /* from KFD */ -+ uint32_t event_slot_index; /* from KFD */ - }; - - struct kfd_ioctl_destroy_event_args { -- __u32 event_id; /* to KFD */ -- __u32 pad; -+ uint32_t event_id; /* to KFD */ -+ uint32_t pad; - }; - - struct kfd_ioctl_set_event_args { -- __u32 event_id; /* to KFD */ -- __u32 pad; -+ uint32_t event_id; /* to KFD */ -+ uint32_t pad; - }; - - struct kfd_ioctl_reset_event_args { -- __u32 event_id; /* to KFD */ -- __u32 pad; -+ uint32_t event_id; /* to KFD */ -+ uint32_t pad; - }; - - struct kfd_memory_exception_failure { -- __u32 NotPresent; /* Page not present or supervisor privilege */ -- __u32 ReadOnly; /* Write access to a read-only page */ -- __u32 NoExecute; /* Execute access to a page marked NX */ -- __u32 pad; -+ uint32_t NotPresent; /* Page not present or supervisor privilege */ -+ uint32_t ReadOnly; /* Write access to a read-only page */ -+ uint32_t NoExecute; /* Execute access to a page marked NX */ -+ uint32_t imprecise; /* Can't determine the exact fault address */ - }; - --/* memory exception data*/ -+/* memory exception data */ - struct kfd_hsa_memory_exception_data { - struct kfd_memory_exception_failure failure; -- __u64 va; -- __u32 gpu_id; -- __u32 pad; -+ uint64_t va; -+ uint32_t gpu_id; -+ uint32_t pad; - }; - --/* Event data*/ -+/* Event data */ - struct kfd_event_data { - union { - struct kfd_hsa_memory_exception_data memory_exception_data; - }; /* From KFD */ -- __u64 kfd_event_data_ext; /* pointer to an extension structure -- for future exception types */ -- __u32 event_id; /* to KFD */ -- __u32 pad; -+ uint64_t kfd_event_data_ext; /* pointer to an extension structure -+ for future exception types */ -+ uint32_t event_id; /* to KFD */ -+ uint32_t pad; - }; - - struct kfd_ioctl_wait_events_args { -- __u64 events_ptr; /* pointed to struct -+ uint64_t events_ptr; /* pointed to struct - kfd_event_data array, to KFD */ -- __u32 num_events; /* to KFD */ -- __u32 wait_for_all; /* to KFD */ -- __u32 timeout; /* to KFD */ -- __u32 wait_result; /* from KFD */ -+ uint32_t num_events; /* to KFD */ -+ uint32_t wait_for_all; /* to KFD */ -+ uint32_t timeout; /* to KFD */ -+ uint32_t wait_result; /* from KFD */ -+}; -+ -+struct kfd_ioctl_alloc_memory_of_scratch_args { -+ uint64_t va_addr; /* to KFD */ -+ uint64_t size; /* to KFD */ -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t pad; -+}; -+ -+/* Allocation flags: memory types */ -+#define KFD_IOC_ALLOC_MEM_FLAGS_VRAM (1 << 0) -+#define KFD_IOC_ALLOC_MEM_FLAGS_GTT (1 << 1) -+#define KFD_IOC_ALLOC_MEM_FLAGS_USERPTR (1 << 2) -+#define 
KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL (1 << 3) -+/* Allocation flags: attributes/access options */ -+#define KFD_IOC_ALLOC_MEM_FLAGS_NONPAGED (1 << 31) -+#define KFD_IOC_ALLOC_MEM_FLAGS_READONLY (1 << 30) -+#define KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC (1 << 29) -+#define KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28) -+#define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27) -+#define KFD_IOC_ALLOC_MEM_FLAGS_EXECUTE_ACCESS (1 << 26) -+#define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 25) -+ -+struct kfd_ioctl_alloc_memory_of_gpu_args { -+ uint64_t va_addr; /* to KFD */ -+ uint64_t size; /* to KFD */ -+ uint64_t handle; /* from KFD */ -+ uint64_t mmap_offset; /* to KFD (userptr), from KFD (mmap offset) */ -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t flags; -+}; -+ -+struct kfd_ioctl_free_memory_of_gpu_args { -+ uint64_t handle; /* to KFD */ -+}; -+ -+struct kfd_ioctl_map_memory_to_gpu_args { -+ uint64_t handle; /* to KFD */ -+ uint64_t device_ids_array_ptr; /* to KFD */ -+ uint32_t device_ids_array_size; /* to KFD */ -+ uint32_t pad; -+}; -+ -+struct kfd_ioctl_unmap_memory_from_gpu_args { -+ uint64_t handle; /* to KFD */ -+ uint64_t device_ids_array_ptr; /* to KFD */ -+ uint32_t device_ids_array_size; /* to KFD */ -+ uint32_t pad; -+}; -+ -+struct kfd_ioctl_set_process_dgpu_aperture_args { -+ uint64_t dgpu_base; -+ uint64_t dgpu_limit; -+ uint32_t gpu_id; -+ uint32_t pad; -+}; -+ -+struct kfd_ioctl_get_dmabuf_info_args { -+ uint64_t size; /* from KFD */ -+ uint64_t metadata_ptr; /* to KFD */ -+ uint32_t metadata_size; /* to KFD (space allocated by user) -+ * from KFD (actual metadata size) */ -+ uint32_t gpu_id; /* from KFD */ -+ uint32_t flags; /* from KFD (KFD_IOC_ALLOC_MEM_FLAGS) */ -+ uint32_t dmabuf_fd; /* to KFD */ -+}; -+ -+struct kfd_ioctl_import_dmabuf_args { -+ uint64_t va_addr; /* to KFD */ -+ uint64_t handle; /* from KFD */ -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t dmabuf_fd; /* to KFD */ -+}; -+ -+struct kfd_ioctl_ipc_export_handle_args { -+ uint64_t handle; /* to KFD */ -+ uint32_t share_handle[4]; /* from KFD */ -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t pad; -+}; -+ -+struct kfd_ioctl_ipc_import_handle_args { -+ uint64_t handle; /* from KFD */ -+ uint64_t va_addr; /* to KFD */ -+ uint64_t mmap_offset; /* from KFD */ -+ uint32_t share_handle[4]; /* to KFD */ -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t pad; -+}; -+ -+struct kfd_ioctl_get_tile_config_args { -+ /* to KFD: pointer to tile array */ -+ uint64_t tile_config_ptr; -+ /* to KFD: pointer to macro tile array */ -+ uint64_t macro_tile_config_ptr; -+ /* to KFD: array size allocated by user mode -+ * from KFD: array size filled by kernel -+ */ -+ uint32_t num_tile_configs; -+ /* to KFD: array size allocated by user mode -+ * from KFD: array size filled by kernel -+ */ -+ uint32_t num_macro_tile_configs; -+ -+ uint32_t gpu_id; /* to KFD */ -+ uint32_t gb_addr_config; /* from KFD */ -+ uint32_t num_banks; /* from KFD */ -+ uint32_t num_ranks; /* from KFD */ -+ /* struct size can be extended later if needed -+ * without breaking ABI compatibility -+ */ -+}; -+ -+struct kfd_memory_range { -+ uint64_t va_addr; -+ uint64_t size; -+}; -+ -+/* flags definitions -+ * BIT0: 0: read operation, 1: write operation. 
-+ * This also identifies if the src or dst array belongs to remote process -+ */ -+#define KFD_CROSS_MEMORY_RW_BIT (1 << 0) -+#define KFD_SET_CROSS_MEMORY_READ(flags) (flags &= ~KFD_CROSS_MEMORY_RW_BIT) -+#define KFD_SET_CROSS_MEMORY_WRITE(flags) (flags |= KFD_CROSS_MEMORY_RW_BIT) -+#define KFD_IS_CROSS_MEMORY_WRITE(flags) (flags & KFD_CROSS_MEMORY_RW_BIT) -+ -+struct kfd_ioctl_cross_memory_copy_args { -+ /* to KFD: Process ID of the remote process */ -+ uint32_t pid; -+ /* to KFD: See above definition */ -+ uint32_t flags; -+ /* to KFD: Source GPU VM range */ -+ uint64_t src_mem_range_array; -+ /* to KFD: Size of above array */ -+ uint64_t src_mem_array_size; -+ /* to KFD: Destination GPU VM range */ -+ uint64_t dst_mem_range_array; -+ /* to KFD: Size of above array */ -+ uint64_t dst_mem_array_size; -+ /* from KFD: Total amount of bytes copied */ -+ uint64_t bytes_copied; - }; - - -@@ -287,7 +468,56 @@ struct kfd_ioctl_wait_events_args { - #define AMDKFD_IOC_DBG_WAVE_CONTROL \ - AMDKFD_IOW(0x10, struct kfd_ioctl_dbg_wave_control_args) - -+#define AMDKFD_IOC_ALLOC_MEMORY_OF_GPU \ -+ AMDKFD_IOWR(0x11, struct kfd_ioctl_alloc_memory_of_gpu_args) -+ -+#define AMDKFD_IOC_FREE_MEMORY_OF_GPU \ -+ AMDKFD_IOWR(0x12, struct kfd_ioctl_free_memory_of_gpu_args) -+ -+#define AMDKFD_IOC_MAP_MEMORY_TO_GPU \ -+ AMDKFD_IOWR(0x13, struct kfd_ioctl_map_memory_to_gpu_args) -+ -+#define AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU \ -+ AMDKFD_IOWR(0x14, struct kfd_ioctl_unmap_memory_from_gpu_args) -+ -+#define AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH \ -+ AMDKFD_IOWR(0x15, struct kfd_ioctl_alloc_memory_of_scratch_args) -+ -+#define AMDKFD_IOC_SET_CU_MASK \ -+ AMDKFD_IOW(0x16, struct kfd_ioctl_set_cu_mask_args) -+ -+#define AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE \ -+ AMDKFD_IOW(0x17, \ -+ struct kfd_ioctl_set_process_dgpu_aperture_args) -+ -+#define AMDKFD_IOC_SET_TRAP_HANDLER \ -+ AMDKFD_IOW(0x18, struct kfd_ioctl_set_trap_handler_args) -+ -+#define AMDKFD_IOC_GET_PROCESS_APERTURES_NEW \ -+ AMDKFD_IOWR(0x19, struct kfd_ioctl_get_process_apertures_new_args) -+ -+#define AMDKFD_IOC_GET_DMABUF_INFO \ -+ AMDKFD_IOWR(0x1A, struct kfd_ioctl_get_dmabuf_info_args) -+ -+#define AMDKFD_IOC_IMPORT_DMABUF \ -+ AMDKFD_IOWR(0x1B, struct kfd_ioctl_import_dmabuf_args) -+ -+#define AMDKFD_IOC_GET_TILE_CONFIG \ -+ AMDKFD_IOWR(0x1C, struct kfd_ioctl_get_tile_config_args) -+ -+#define AMDKFD_IOC_IPC_IMPORT_HANDLE \ -+ AMDKFD_IOWR(0x1D, struct kfd_ioctl_ipc_import_handle_args) -+ -+#define AMDKFD_IOC_IPC_EXPORT_HANDLE \ -+ AMDKFD_IOWR(0x1E, struct kfd_ioctl_ipc_export_handle_args) -+ -+#define AMDKFD_IOC_CROSS_MEMORY_COPY \ -+ AMDKFD_IOWR(0x1F, struct kfd_ioctl_cross_memory_copy_args) -+ -+#define AMDKFD_IOC_GET_QUEUE_WAVE_STATE \ -+ AMDKFD_IOWR(0x20, struct kfd_ioctl_get_queue_wave_state_args) -+ - #define AMDKFD_COMMAND_START 0x01 --#define AMDKFD_COMMAND_END 0x11 -+#define AMDKFD_COMMAND_END 0x21 - - #endif -diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h -old mode 100644 -new mode 100755 -index 87c2c84..1256851 ---- a/include/uapi/linux/pci_regs.h -+++ b/include/uapi/linux/pci_regs.h -@@ -624,7 +624,9 @@ - #define PCI_EXP_DEVCAP2 36 /* Device Capabilities 2 */ - #define PCI_EXP_DEVCAP2_ARI 0x00000020 /* Alternative Routing-ID */ - #define PCI_EXP_DEVCAP2_ATOMIC_ROUTE 0x00000040 /* Atomic Op routing */ -+#define PCI_EXP_DEVCAP2_ATOMIC_COMP32 0x00000080 /* 32b AtomicOp completion */ - #define PCI_EXP_DEVCAP2_ATOMIC_COMP64 0x00000100 /* Atomic 64-bit compare */ -+#define PCI_EXP_DEVCAP2_ATOMIC_COMP128 0x00000200 /* 128b 
AtomicOp completion*/
- #define PCI_EXP_DEVCAP2_LTR 0x00000800 /* Latency tolerance reporting */
- #define PCI_EXP_DEVCAP2_OBFF_MASK 0x000c0000 /* OBFF support mechanism */
- #define PCI_EXP_DEVCAP2_OBFF_MSG 0x00040000 /* New message signaling */
-@@ -634,6 +636,7 @@
- #define PCI_EXP_DEVCTL2_ARI 0x0020 /* Alternative Routing-ID */
- #define PCI_EXP_DEVCTL2_ATOMIC_REQ 0x0040 /* Set Atomic requests */
- #define PCI_EXP_DEVCTL2_ATOMIC_EGRESS_BLOCK 0x0080 /* Block atomic egress */
-+#define PCI_EXP_DEVCTL2_ATOMIC_BLOCK 0x0080 /* Block AtomicOp on egress */
- #define PCI_EXP_DEVCTL2_IDO_REQ_EN 0x0100 /* Allow IDO for requests */
- #define PCI_EXP_DEVCTL2_IDO_CMP_EN 0x0200 /* Allow IDO for completions */
- #define PCI_EXP_DEVCTL2_LTR_EN 0x0400 /* Enable LTR mechanism */
-diff --git a/kernel/fork.c b/kernel/fork.c
-index a19ee25..70d8d5b 100644
---- a/kernel/fork.c
-+++ b/kernel/fork.c
-@@ -1082,6 +1082,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
-
- return mm;
- }
-+EXPORT_SYMBOL_GPL(mm_access);
-
- static void complete_vfork_done(struct task_struct *tsk)
- {
---
-2.7.4
-
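[Editor's sketch] The final hunk exports mm_access() with EXPORT_SYMBOL_GPL so a module (here, the KFD cross-memory-copy path) can attach to another process's address space under the usual ptrace permission checks. A hedged usage sketch with an illustrative helper name:

    #include <linux/err.h>
    #include <linux/ptrace.h>
    #include <linux/sched/mm.h>

    /* mm_access() returns NULL when the task has no mm, or an ERR_PTR()
     * when the caller fails the ptrace permission check.
     */
    static struct mm_struct *example_get_peer_mm(struct task_struct *task)
    {
            struct mm_struct *mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);

            if (IS_ERR_OR_NULL(mm))
                    return NULL;

            return mm;      /* release with mmput(mm) when done */
    }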
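[Editor's sketch] Likewise, the pci_enable_atomic_ops_to_root() helper added earlier in this patch walks every bridge up to the root port, checking the PCI_EXP_DEVCAP2/DEVCTL2 bits defined above, before setting PCI_EXP_DEVCTL2_ATOMIC_REQ on the endpoint. A hedged caller sketch using this patch's single-argument signature (later upstream versions take an additional capability mask):

    #include <linux/pci.h>

    /* Refuse to rely on PCIe AtomicOps when the path to the root port
     * cannot route or complete them; KFD uses this check to require
     * AtomicOps for dGPUs.
     */
    static int example_require_pcie_atomics(struct pci_dev *pdev)
    {
            int ret = pci_enable_atomic_ops_to_root(pdev);

            if (ret < 0)
                    dev_dbg(&pdev->dev, "PCIe AtomicOps unavailable: %d\n", ret);

            return ret;
    }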