Diffstat (limited to 'meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1370-port-in-all-files-amdkfd-source-files-snapshot-at.patch')
-rw-r--r-- | meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1370-port-in-all-files-amdkfd-source-files-snapshot-at.patch | 16828
1 files changed, 0 insertions, 16828 deletions
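The deleted file below is a vendor kernel patch whose subject ports AMD commit 9918a8f15a95, "drm/amdkfd: Assign SDMA engine in an alternative order when creating sdma queues". As a rough userspace sketch of what that ordering change means — the struct, constants, and bitmask allocator here are illustrative stand-ins, not the driver's actual code — consecutive SDMA queue allocations alternate between the two SDMA engines instead of filling one engine first:

#include <stdio.h>

#define NUM_SDMA_ENGINES  2  /* CIK/VI-class GPUs expose two SDMA engines */
#define QUEUES_PER_ENGINE 2  /* per-engine queue count, chosen for the demo */

struct sdma_queue {
	unsigned engine_id;
	unsigned queue_id;
};

/* Bit i of *mask set means SDMA queue slot i is free. */
static int allocate_sdma_queue(unsigned *mask, struct sdma_queue *q)
{
	unsigned bit;

	for (bit = 0; bit < NUM_SDMA_ENGINES * QUEUES_PER_ENGINE; bit++) {
		if (*mask & (1u << bit)) {
			*mask &= ~(1u << bit);
			/* "Alternative order": even slots map to engine 0,
			 * odd slots to engine 1, so queues spread across
			 * both engines from the first allocation on. */
			q->engine_id = bit % NUM_SDMA_ENGINES;
			q->queue_id  = bit / NUM_SDMA_ENGINES;
			return 0;
		}
	}
	return -1; /* no free SDMA queue slot */
}

int main(void)
{
	unsigned mask = 0xF; /* all four slots free */
	struct sdma_queue q;

	while (allocate_sdma_queue(&mask, &q) == 0)
		printf("engine %u queue %u\n", q.engine_id, q.queue_id);
	return 0;
}

Successive allocations print engine 0/1/0/1, so two processes each creating one SDMA queue no longer land on the same engine.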
diff --git a/meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1370-port-in-all-files-amdkfd-source-files-snapshot-at.patch b/meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1370-port-in-all-files-amdkfd-source-files-snapshot-at.patch
deleted file mode 100644
index c037b8f2..00000000
--- a/meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1370-port-in-all-files-amdkfd-source-files-snapshot-at.patch
+++ /dev/null
@@ -1,16828 +0,0 @@
-From 879030b8b91026fde404c0ab73293655d0684333 Mon Sep 17 00:00:00 2001
-From: Chaudhary Amit Kumar <chaudharyamit.kumar@amd.com>
-Date: Thu, 18 Oct 2018 18:30:48 +0530
-Subject: [PATCH 1370/4131] port in all files amdkfd source files snapshot at
-
-commit 9918a8f15a957dff68d8bb7d88a2e6485368b626
-Author: shaoyunl <Shaoyun.Liu@amd.com>
-Date:   Mon Mar 28 16:13:27 2016 -0400
-
-    drm/amdkfd: Assign SDMA engine in an alternative order when creating
-    sdma queues
-
-Change-Id: I705be5e2d78cfe8c4035eb9493432f466aefb007
-Signed-off-by: Chaudhary Amit Kumar <chaudharyamit.kumar@amd.com>
---
- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 271 +++-
- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 104 +-
- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 307 +++-
- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 241 ++-
- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 1619 ++++++++++++++++++++
- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 2 +-
- drivers/gpu/drm/amd/amdkfd/Kconfig | 1 +
- drivers/gpu/drm/amd/amdkfd/Makefile | 2 +-
- drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 43 +-
- drivers/gpu/drm/amd/amdkfd/cik_int.h | 22 +-
- drivers/gpu/drm/amd/amdkfd/cik_regs.h | 175 ++-
- .../gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h | 1377 +++++++++++++++++
- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1357 +++++++++++++---
- drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 1163 ++++++++++++++
- drivers/gpu/drm/amd/amdkfd/kfd_crat.h | 40 +-
- drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c | 972 ++++++------
- drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h | 66 +-
- drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c | 247 ++-
- drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h | 313 ++--
- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 283 +++-
- .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 497 +++++-
- .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 29 +-
- .../drm/amd/amdkfd/kfd_device_queue_manager_cik.c | 2 +
- .../drm/amd/amdkfd/kfd_device_queue_manager_vi.c | 106 ++
- drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 3 +-
- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 522 ++++---
- drivers/gpu/drm/amd/amdkfd/kfd_events.h | 3 +-
- drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 79 +-
- drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 6 +-
- drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 26 +-
- drivers/gpu/drm/amd/amdkfd/kfd_module.c | 30 +-
- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 3 +
- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h | 3 +-
- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 92 +-
- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 227 ++-
- drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 212 ++-
- drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h | 120 +-
- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 269 +++-
- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 542 ++++++-
- .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 102 +-
- drivers/gpu/drm/amd/amdkfd/kfd_rdma.c | 296 ++++
- drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 841 +++++-----
- drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 23 +-
- drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 144 +-
- 44 files changed, 10790 insertions(+), 1992 deletions(-)
- create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
- create mode 100644 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h
- create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_crat.c
- create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_rdma.c
-
-diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
-index ef56352..daeb85f 100755
---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
-+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
-@@ -21,12 +21,14 @@
- */
-
- #include "amdgpu_amdkfd.h"
--#include "amd_shared.h"
-+#include <linux/dma-buf.h>
- #include <drm/drmP.h>
- #include "amdgpu.h"
- #include "amdgpu_gfx.h"
- #include <linux/module.h>
-
-+#define AMDKFD_SKIP_UNCOMPILED_CODE 1
-+
- const struct kfd2kgd_calls *kfd2kgd;
- const struct kgd2kfd_calls *kgd2kfd;
- bool (*kgd2kfd_init_p)(unsigned, const struct kgd2kfd_calls**);
-@@ -64,12 +66,12 @@ int amdgpu_amdkfd_init(void)
- bool amdgpu_amdkfd_load_interface(struct amdgpu_device *adev)
- {
- switch (adev->asic_type) {
--#ifdef CONFIG_DRM_AMDGPU_CIK
- case CHIP_KAVERI:
- kfd2kgd = amdgpu_amdkfd_gfx_7_get_functions();
- break;
--#endif
- case CHIP_CARRIZO:
-+ case CHIP_TONGA:
-+ case CHIP_FIJI:
- kfd2kgd = amdgpu_amdkfd_gfx_8_0_get_functions();
- break;
- default:
-@@ -102,7 +104,8 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
- struct kgd2kfd_shared_resources gpu_resources = {
- .compute_vmid_bitmap = 0xFF00,
- .num_pipe_per_mec = adev->gfx.mec.num_pipe_per_mec,
-- .num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe
-+ .num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe,
-+ .gpuvm_size = (uint64_t)amdgpu_vm_size << 30
- };
-
- /* this is going to have a few of the MSBs set that we need to
-@@ -167,6 +170,115 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev)
- return r;
- }
-
-+int amdgpu_amdkfd_evict_mem(struct amdgpu_device *adev, struct kgd_mem *mem,
-+ struct mm_struct *mm)
-+{
-+ int r;
-+
-+ if (!adev->kfd)
-+ return -ENODEV;
-+
-+ mutex_lock(&mem->data2.lock);
-+
-+ if (mem->data2.evicted == 1 && delayed_work_pending(&mem->data2.work))
-+ /* Cancelling a scheduled restoration */
-+ cancel_delayed_work(&mem->data2.work);
-+
-+ if (++mem->data2.evicted > 1) {
-+ mutex_unlock(&mem->data2.lock);
-+ return 0;
-+ }
-+
-+ r = amdgpu_amdkfd_gpuvm_evict_mem(mem, mm);
-+
-+ if (r != 0)
-+ /* First eviction failed, setting count back to 0 will
-+ * make the corresponding restore fail gracefully */
-+ mem->data2.evicted = 0;
-+ else
-+ /* First eviction counts as 2. Eviction counter == 1
-+ * means that restoration is scheduled.
*/ -+ mem->data2.evicted = 2; -+ -+ mutex_unlock(&mem->data2.lock); -+ -+ return r; -+} -+ -+static void amdgdu_amdkfd_restore_mem_worker(struct work_struct *work) -+{ -+ struct delayed_work *dwork = to_delayed_work(work); -+ struct kgd_mem *mem = container_of(dwork, struct kgd_mem, data2.work); -+ struct amdgpu_device *adev; -+ struct mm_struct *mm; -+ -+ mutex_lock(&mem->data2.lock); -+ -+ adev = mem->data2.bo->adev; -+ mm = mem->data2.mm; -+ -+ /* Restoration may have been canceled by another eviction or -+ * could already be done by a restore scheduled earlier */ -+ if (mem->data2.evicted == 1) { -+ amdgpu_amdkfd_gpuvm_restore_mem(mem, mm); -+ mem->data2.evicted = 0; -+ } -+ -+ mutex_unlock(&mem->data2.lock); -+} -+ -+int amdgpu_amdkfd_schedule_restore_mem(struct amdgpu_device *adev, -+ struct kgd_mem *mem, -+ struct mm_struct *mm, -+ unsigned long delay) -+{ -+ int r = 0; -+ -+ if (!adev->kfd) -+ return -ENODEV; -+ -+ mutex_lock(&mem->data2.lock); -+ -+ if (mem->data2.evicted <= 1) { -+ /* Buffer is not evicted (== 0) or its restoration is -+ * already scheduled (== 1) */ -+ pr_err("Unbalanced restore of evicted buffer %p\n", mem); -+ mutex_unlock(&mem->data2.lock); -+ return -EFAULT; -+ } else if (--mem->data2.evicted > 1) { -+ mutex_unlock(&mem->data2.lock); -+ return 0; -+ } -+ -+ /* mem->data2.evicted is 1 after decrememting. Schedule -+ * restoration. */ -+ if (delayed_work_pending(&mem->data2.work)) -+ cancel_delayed_work(&mem->data2.work); -+ mem->data2.mm = mm; -+ INIT_DELAYED_WORK(&mem->data2.work, -+ amdgdu_amdkfd_restore_mem_worker); -+ schedule_delayed_work(&mem->data2.work, delay); -+ -+ mutex_unlock(&mem->data2.lock); -+ -+ return r; -+} -+ -+void amdgpu_amdkfd_cancel_restore_mem(struct amdgpu_device *adev, -+ struct kgd_mem *mem) -+{ -+ if (delayed_work_pending(&mem->data2.work)) -+ cancel_delayed_work_sync(&mem->data2.work); -+} -+ -+u32 pool_to_domain(enum kgd_memory_pool p) -+{ -+ switch (p) { -+ case KGD_POOL_FRAMEBUFFER: return AMDGPU_GEM_DOMAIN_VRAM; -+ default: return AMDGPU_GEM_DOMAIN_GTT; -+ } -+} -+ - int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, - void **mem_obj, uint64_t *gpu_addr, - void **cpu_ptr) -@@ -192,38 +304,38 @@ int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, - } - - /* map the buffer */ -- r = amdgpu_bo_reserve((*mem)->bo, true); -+ r = amdgpu_bo_reserve((*mem)->data1.bo, true); - if (r) { - dev_err(adev->dev, "(%d) failed to reserve bo for amdkfd\n", r); - goto allocate_mem_reserve_bo_failed; - } - -- r = amdgpu_bo_pin((*mem)->bo, AMDGPU_GEM_DOMAIN_GTT, -- &(*mem)->gpu_addr); -+ r = amdgpu_bo_pin((*mem)->data1.bo, AMDGPU_GEM_DOMAIN_GTT, -+ &(*mem)->data1.gpu_addr); - if (r) { - dev_err(adev->dev, "(%d) failed to pin bo for amdkfd\n", r); - goto allocate_mem_pin_bo_failed; - } -- *gpu_addr = (*mem)->gpu_addr; -+ *gpu_addr = (*mem)->data1.gpu_addr; - -- r = amdgpu_bo_kmap((*mem)->bo, &(*mem)->cpu_ptr); -+ r = amdgpu_bo_kmap((*mem)->data1.bo, &(*mem)->data1.cpu_ptr); - if (r) { - dev_err(adev->dev, - "(%d) failed to map bo to kernel for amdkfd\n", r); - goto allocate_mem_kmap_bo_failed; - } -- *cpu_ptr = (*mem)->cpu_ptr; -+ *cpu_ptr = (*mem)->data1.cpu_ptr; - -- amdgpu_bo_unreserve((*mem)->bo); -+ amdgpu_bo_unreserve((*mem)->data1.bo); - - return 0; - - allocate_mem_kmap_bo_failed: -- amdgpu_bo_unpin((*mem)->bo); -+ amdgpu_bo_unpin((*mem)->data1.bo); - allocate_mem_pin_bo_failed: -- amdgpu_bo_unreserve((*mem)->bo); -+ amdgpu_bo_unreserve((*mem)->data1.bo); - allocate_mem_reserve_bo_failed: -- amdgpu_bo_unref(&(*mem)->bo); -+ 
amdgpu_bo_unref(&(*mem)->data1.bo); - - return r; - } -@@ -234,22 +346,44 @@ void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj) - - BUG_ON(mem == NULL); - -- amdgpu_bo_reserve(mem->bo, true); -- amdgpu_bo_kunmap(mem->bo); -- amdgpu_bo_unpin(mem->bo); -- amdgpu_bo_unreserve(mem->bo); -- amdgpu_bo_unref(&(mem->bo)); -+ amdgpu_bo_reserve(mem->data1.bo, true); -+ amdgpu_bo_kunmap(mem->data1.bo); -+ amdgpu_bo_unpin(mem->data1.bo); -+ amdgpu_bo_unreserve(mem->data1.bo); -+ amdgpu_bo_unref(&(mem->data1.bo)); - kfree(mem); - } - --uint64_t get_vmem_size(struct kgd_dev *kgd) -+void get_local_mem_info(struct kgd_dev *kgd, -+ struct kfd_local_mem_info *mem_info) - { -- struct amdgpu_device *adev = -- (struct amdgpu_device *)kgd; -+ uint64_t address_mask; -+ resource_size_t aper_limit; -+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd; - - BUG_ON(kgd == NULL); - -- return adev->mc.real_vram_size; -+ address_mask = ~((1UL << 40) - 1); -+ aper_limit = adev->mc.aper_base + adev->mc.aper_size; -+ memset(mem_info, 0, sizeof(*mem_info)); -+ if (!(adev->mc.aper_base & address_mask || -+ aper_limit & address_mask)) { -+ mem_info->local_mem_size_public = adev->mc.visible_vram_size; -+ mem_info->local_mem_size_private = adev->mc.real_vram_size - -+ adev->mc.visible_vram_size; -+ mem_info->vram_width = adev->mc.vram_width; -+ } else { -+ pr_err("amdgpu: vram aperture is out of 40bit address base: 0x%llx limit 0x%llx\n", -+ adev->mc.aper_base, aper_limit); -+ } -+ -+ pr_debug("amdgpu: address base: 0x%llx limit 0x%llx public 0x%llx private 0x%llx\n", -+ adev->mc.aper_base, aper_limit, -+ mem_info->local_mem_size_public, -+ mem_info->local_mem_size_private); -+ -+ if (amdgpu_powerplay || adev->pm.funcs->get_mclk) -+ mem_info->mem_clk_max = amdgpu_dpm_get_mclk(adev, false) / 100; - } - - uint64_t get_gpu_clock_counter(struct kgd_dev *kgd) -@@ -265,5 +399,94 @@ uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd) - { - struct amdgpu_device *adev = (struct amdgpu_device *)kgd; - /* The sclk is in quantas of 10kHz */ -- return adev->pm.dpm.dyn_state.max_clock_voltage_on_ac.sclk / 100; -+ if (amdgpu_powerplay) -+ return amdgpu_dpm_get_sclk(adev, false) / 100; -+ else -+ return adev->pm.dpm.dyn_state.max_clock_voltage_on_ac.sclk / 100; -+} -+ -+void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info) -+{ -+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd; -+ struct amdgpu_cu_info acu_info; -+ -+ memset(cu_info, 0, sizeof(*cu_info)); -+ if (sizeof(cu_info->cu_bitmap) != sizeof(acu_info.bitmap)) -+ return; -+ -+ memset(&acu_info, 0, sizeof(acu_info)); -+ -+ cu_info->cu_active_number = acu_info.number; -+ cu_info->cu_ao_mask = acu_info.ao_cu_mask; -+ memcpy(&cu_info->cu_bitmap[0], &acu_info.bitmap[0], sizeof(acu_info.bitmap)); -+ cu_info->num_shader_engines = adev->gfx.config.max_shader_engines; -+ cu_info->num_shader_arrays_per_engine = adev->gfx.config.max_sh_per_se; -+ cu_info->num_cu_per_sh = adev->gfx.config.max_cu_per_sh; -+ cu_info->simd_per_cu = acu_info.simd_per_cu; -+ cu_info->max_waves_per_simd = acu_info.max_waves_per_simd; -+ cu_info->wave_front_size = acu_info.wave_front_size; -+ cu_info->max_scratch_slots_per_cu = acu_info.max_scratch_slots_per_cu; -+ cu_info->lds_size = acu_info.lds_size; -+} -+ -+int map_gtt_bo_to_kernel(struct kgd_dev *kgd, -+ struct kgd_mem *mem, void **kptr) -+{ -+ return 0; -+} -+ -+int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd, -+ struct kgd_dev **dma_buf_kgd, -+ uint64_t *bo_size, void *metadata_buffer, -+ size_t 
buffer_size, uint32_t *metadata_size, -+ uint32_t *flags) -+{ -+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd; -+ struct dma_buf *dma_buf; -+ struct drm_gem_object *obj; -+ struct amdgpu_bo *bo; -+ uint64_t metadata_flags; -+ int r = -EINVAL; -+ -+ dma_buf = dma_buf_get(dma_buf_fd); -+ if (IS_ERR(dma_buf)) -+ return PTR_ERR(dma_buf); -+ -+ if (dma_buf->ops != &drm_gem_prime_dmabuf_ops) -+ /* Can't handle non-graphics buffers */ -+ goto out_put; -+ -+ obj = dma_buf->priv; -+ if (obj->dev->driver != adev->ddev->driver) -+ /* Can't handle buffers from different drivers */ -+ goto out_put; -+ -+ adev = obj->dev->dev_private; -+ bo = gem_to_amdgpu_bo(obj); -+ if (!(bo->initial_domain & (AMDGPU_GEM_DOMAIN_VRAM | -+ AMDGPU_GEM_DOMAIN_GTT))) -+ /* Only VRAM and GTT BOs are supported */ -+ goto out_put; -+ -+ r = 0; -+ if (dma_buf_kgd) -+ *dma_buf_kgd = (struct kgd_dev *)adev; -+ if (bo_size) -+ *bo_size = amdgpu_bo_size(bo); -+ if (metadata_size) -+ *metadata_size = bo->metadata_size; -+ if (metadata_buffer) -+ r = amdgpu_bo_get_metadata(bo, metadata_buffer, buffer_size, -+ metadata_size, &metadata_flags); -+ if (flags) { -+ *flags = (bo->initial_domain & AMDGPU_GEM_DOMAIN_VRAM) ? -+ ALLOC_MEM_FLAGS_VRAM : ALLOC_MEM_FLAGS_GTT; -+ -+ if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) -+ *flags |= ALLOC_MEM_FLAGS_PUBLIC; -+ } -+ -+out_put: -+ dma_buf_put(dma_buf); -+ return r; - } -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h -index 8e8c10e..5fa506d 100755 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h -@@ -27,14 +27,46 @@ - - #include <linux/types.h> - #include <linux/mm.h> -+#include <linux/workqueue.h> - #include <kgd_kfd_interface.h> - -+extern const struct kgd2kfd_calls *kgd2kfd; -+ - struct amdgpu_device; - -+struct kfd_bo_va_list { -+ struct list_head bo_list; -+ struct amdgpu_bo_va *bo_va; -+ void *kgd_dev; -+ bool is_mapped; -+}; -+ - struct kgd_mem { -- struct amdgpu_bo *bo; -- uint64_t gpu_addr; -- void *cpu_ptr; -+ union { -+ struct { -+ struct amdgpu_bo *bo; -+ uint64_t gpu_addr; -+ void *cpu_ptr; -+ } data1; -+ struct { -+ struct mutex lock; -+ struct amdgpu_bo *bo; -+ struct list_head bo_va_list; -+ uint32_t domain; -+ unsigned int mapped_to_gpu_memory; -+ void *kptr; -+ uint64_t va; -+ unsigned evicted; /* eviction counter */ -+ struct delayed_work work; /* for restore evicted mem */ -+ struct mm_struct *mm; /* for restore */ -+ /* flags bitfield */ -+ bool readonly : 1; -+ bool execute : 1; -+ bool no_substitute : 1; -+ bool aql_queue : 1; -+ } data2; -+ }; -+ - }; - - -@@ -51,17 +83,81 @@ void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev); - void amdgpu_amdkfd_device_init(struct amdgpu_device *adev); - void amdgpu_amdkfd_device_fini(struct amdgpu_device *adev); - -+int amdgpu_amdkfd_evict_mem(struct amdgpu_device *adev, struct kgd_mem *mem, -+ struct mm_struct *mm); -+int amdgpu_amdkfd_schedule_restore_mem(struct amdgpu_device *adev, -+ struct kgd_mem *mem, -+ struct mm_struct *mm, -+ unsigned long delay); -+void amdgpu_amdkfd_cancel_restore_mem(struct amdgpu_device *adev, -+ struct kgd_mem *mem); -+ - struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void); - struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void); - - /* Shared API */ -+int map_bo(struct amdgpu_device *rdev, uint64_t va, void *vm, -+ struct amdgpu_bo *bo, struct amdgpu_bo_va **bo_va); - int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, - void **mem_obj, uint64_t 
*gpu_addr, - void **cpu_ptr); - void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj); --uint64_t get_vmem_size(struct kgd_dev *kgd); -+void get_local_mem_info(struct kgd_dev *kgd, -+ struct kfd_local_mem_info *mem_info); - uint64_t get_gpu_clock_counter(struct kgd_dev *kgd); - - uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd); -+void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info); -+int map_gtt_bo_to_kernel(struct kgd_dev *kgd, -+ struct kgd_mem *mem, void **kptr); -+int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd, -+ struct kgd_dev **dmabuf_kgd, -+ uint64_t *bo_size, void *metadata_buffer, -+ size_t buffer_size, uint32_t *metadata_size, -+ uint32_t *flags); -+ -+/* GPUVM API */ -+int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( -+ struct kgd_dev *kgd, uint64_t va, size_t size, -+ void *vm, struct kgd_mem **mem, -+ uint64_t *offset, void **kptr, -+ struct kfd_process_device *pdd, uint32_t flags); -+int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( -+ struct kgd_dev *kgd, struct kgd_mem *mem); -+int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( -+ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm); -+int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( -+ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm); -+ -+int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm); -+void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm); -+ -+uint32_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm); -+ -+int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd, -+ struct kfd_vm_fault_info *info); -+ -+int amdgpu_amdkfd_gpuvm_mmap_bo( -+ struct kgd_dev *kgd, struct vm_area_struct *vma); -+ -+int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd, -+ struct kgd_mem *mem, void **kptr); -+ -+struct kfd_process_device *amdgpu_amdkfd_gpuvm_get_pdd_from_buffer_object( -+ struct kgd_dev *kgd, struct kgd_mem *mem); -+int amdgpu_amdkfd_gpuvm_return_bo_size( -+ struct kgd_dev *kgd, struct kgd_mem *mem); -+ -+int amdgpu_amdkfd_gpuvm_pin_get_sg_table(struct kgd_dev *kgd, -+ struct kgd_mem *mem, uint64_t offset, -+ uint64_t size, struct sg_table **ret_sg); -+void amdgpu_amdkfd_gpuvm_unpin_put_sg_table( -+ struct kgd_mem *mem, struct sg_table *sg); -+int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd, int dma_buf_fd, -+ uint64_t va, void *vm, -+ struct kgd_mem **mem, uint64_t *size); -+int amdgpu_amdkfd_gpuvm_evict_mem(struct kgd_mem *mem, struct mm_struct *mm); -+int amdgpu_amdkfd_gpuvm_restore_mem(struct kgd_mem *mem, struct mm_struct *mm); - - #endif /* AMDGPU_AMDKFD_H_INCLUDED */ -+ -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c -index e283d31..873e2b7 100644 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c -@@ -38,6 +38,9 @@ - #include "gmc/gmc_7_1_sh_mask.h" - #include "cik_structs.h" - -+ -+#define AMDKFD_SKIP_UNCOMPILED_CODE 1 -+ - enum { - MAX_TRAPID = 8, /* 3 bits in the bitfield. */ - MAX_WATCH_ADDRESSES = 4 -@@ -54,8 +57,8 @@ enum { - enum { - ADDRESS_WATCH_REG_CNTL_ATC_BIT = 0x10000000UL, - ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK = 0x00FFFFFF, -- ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENSION = 0x03000000, -- /* extend the mask to 26 bits to match the low address field */ -+ ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENTION = 0x03000000, -+ /* extend the mask to 26 bits in order to match the low address field. 
*/ - ADDRESS_WATCH_REG_ADDLOW_SHIFT = 6, - ADDRESS_WATCH_REG_ADDHIGH_MASK = 0xFFFF - }; -@@ -80,30 +83,43 @@ union TCP_WATCH_CNTL_BITS { - float f32All; - }; - -+static int create_process_vm(struct kgd_dev *kgd, void **vm); -+static void destroy_process_vm(struct kgd_dev *kgd, void *vm); -+ -+static uint32_t get_process_page_dir(void *vm); -+ -+static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, int fd, uint32_t handle, struct kgd_mem **mem); -+static int map_memory_to_gpu(struct kgd_dev *kgd, struct kgd_mem *mem, -+ void *vm); -+static int unmap_memory_from_gpu(struct kgd_dev *kgd, struct kgd_mem *mem, -+ void *vm); -+static int alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va, size_t size, -+ void *vm, struct kgd_mem **mem, -+ uint64_t *offset, void **kptr, struct kfd_process_device *pdd, -+ uint32_t flags); -+static int free_memory_of_gpu(struct kgd_dev *kgd, struct kgd_mem *mem); -+ -+static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); -+ - /* - * Register access functions - */ - --static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, -- uint32_t sh_mem_config, uint32_t sh_mem_ape1_base, -- uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases); -- --static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, -- unsigned int vmid); -- --static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, -- uint32_t hpd_size, uint64_t hpd_gpu_addr); -+static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, uint32_t sh_mem_config, -+ uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases); -+static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, unsigned int vmid); -+static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, uint32_t hpd_size, uint64_t hpd_gpu_addr); - static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); - static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, -- uint32_t queue_id, uint32_t __user *wptr); -+ uint32_t queue_id, uint32_t __user *wptr, -+ uint32_t page_table_base); - static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd); - static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, -- uint32_t pipe_id, uint32_t queue_id); -- -+ uint32_t pipe_id, uint32_t queue_id); -+static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); - static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, - unsigned int utimeout, uint32_t pipe_id, - uint32_t queue_id); --static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); - static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - unsigned int utimeout); - static int kgd_address_watch_disable(struct kgd_dev *kgd); -@@ -123,15 +139,25 @@ static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, uint8_t vmid); - static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, - uint8_t vmid); - static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); -- --static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); -+static void set_num_of_requests(struct kgd_dev *dev, uint8_t num_of_req); -+static int alloc_memory_of_scratch(struct kgd_dev *kgd, -+ uint64_t va, uint32_t vmid); -+static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, -+ uint8_t element_size, uint8_t index_stride, uint8_t mtype); -+static int mmap_bo(struct kgd_dev *kgd, struct vm_area_struct *vma); -+static void 
set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, -+ uint32_t page_table_base); - - static const struct kfd2kgd_calls kfd2kgd = { - .init_gtt_mem_allocation = alloc_gtt_mem, - .free_gtt_mem = free_gtt_mem, -- .get_vmem_size = get_vmem_size, -+ .get_local_mem_info = get_local_mem_info, - .get_gpu_clock_counter = get_gpu_clock_counter, - .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, -+ .create_process_vm = create_process_vm, -+ .destroy_process_vm = destroy_process_vm, -+ .get_process_page_dir = get_process_page_dir, -+ .open_graphic_handle = open_graphic_handle, - .program_sh_mem_settings = kgd_program_sh_mem_settings, - .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, - .init_pipeline = kgd_init_pipeline, -@@ -149,14 +175,103 @@ static const struct kfd2kgd_calls kfd2kgd = { - .get_atc_vmid_pasid_mapping_pasid = get_atc_vmid_pasid_mapping_pasid, - .get_atc_vmid_pasid_mapping_valid = get_atc_vmid_pasid_mapping_valid, - .write_vmid_invalidate_request = write_vmid_invalidate_request, -- .get_fw_version = get_fw_version -+ .alloc_memory_of_gpu = alloc_memory_of_gpu, -+ .free_memory_of_gpu = free_memory_of_gpu, -+ .map_memory_to_gpu = map_memory_to_gpu, -+ .unmap_memory_to_gpu = unmap_memory_from_gpu, -+ .get_fw_version = get_fw_version, -+ .set_num_of_requests = set_num_of_requests, -+ .get_cu_info = get_cu_info, -+ .alloc_memory_of_scratch = alloc_memory_of_scratch, -+ .write_config_static_mem = write_config_static_mem, -+ .mmap_bo = mmap_bo, -+ .map_gtt_bo_to_kernel = map_gtt_bo_to_kernel, -+ .set_vm_context_page_table_base = set_vm_context_page_table_base, -+ .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info, -+ .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info - }; - --struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void) -+struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions() - { - return (struct kfd2kgd_calls *)&kfd2kgd; - } - -+/* -+ * Creates a VM context for HSA process -+ */ -+static int create_process_vm(struct kgd_dev *kgd, void **vm) -+{ -+ int ret; -+ struct amdgpu_vm *new_vm; -+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -+ -+ BUG_ON(kgd == NULL); -+ BUG_ON(vm == NULL); -+ -+ new_vm = kzalloc(sizeof(struct amdgpu_vm), GFP_KERNEL); -+ if (new_vm == NULL) -+ return -ENOMEM; -+ -+ /* Initialize the VM context, allocate the page directory and zero it */ -+ ret = amdgpu_vm_init(adev, new_vm); -+ if (ret != 0) { -+ /* Undo everything related to the new VM context */ -+ amdgpu_vm_fini(adev, new_vm); -+ kfree(new_vm); -+ new_vm = NULL; -+ } -+ -+ /* Pin the PD directory*/ -+ amdgpu_bo_reserve(new_vm->page_directory, true); -+ amdgpu_bo_pin(new_vm->page_directory, AMDGPU_GEM_DOMAIN_VRAM, NULL); -+ amdgpu_bo_unreserve(new_vm->page_directory); -+#if 0 -+ new_vm->pd_gpu_addr = amdgpu_bo_gpu_offset(new_vm->page_directory); -+#endif -+ *vm = (void *) new_vm; -+ -+ return ret; -+} -+ -+/* -+ * Destroys a VM context of HSA process -+ */ -+static void destroy_process_vm(struct kgd_dev *kgd, void *vm) -+{ -+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -+ struct amdgpu_vm *rvm = (struct amdgpu_vm *) vm; -+ -+ BUG_ON(kgd == NULL); -+ BUG_ON(vm == NULL); -+ -+ /* Unpin the PD directory*/ -+ amdgpu_bo_reserve(rvm->page_directory, true); -+ amdgpu_bo_unpin(rvm->page_directory); -+ amdgpu_bo_unreserve(rvm->page_directory); -+ -+ /* Release the VM context */ -+ amdgpu_vm_fini(adev, rvm); -+ kfree(vm); -+} -+ -+static uint32_t get_process_page_dir(void *vm) -+{ -+#if 0 -+ struct amdgpu_vm *rvm = (struct amdgpu_vm *) vm; 
-+ -+ BUG_ON(vm == NULL); -+ -+ return rvm->pd_gpu_addr >> AMDGPU_GPU_PAGE_SHIFT; -+#endif -+ return 0; -+} -+ -+static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, -+ int fd, uint32_t handle, struct kgd_mem **mem) -+{ -+ return 0; -+} -+ - static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) - { - return (struct amdgpu_device *)kgd; -@@ -221,12 +336,11 @@ static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, - - /* - * We have to assume that there is no outstanding mapping. -- * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because -- * a mapping is in progress or because a mapping finished and the -- * SW cleared it. So the protocol is to always wait & clear. -+ * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because a mapping -+ * is in progress or because a mapping finished and the SW cleared it. -+ * So the protocol is to always wait & clear. - */ -- uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | -- ATC_VMID0_PASID_MAPPING__VALID_MASK; -+ uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | ATC_VMID0_PASID_MAPPING__VALID_MASK; - - WREG32(mmATC_VMID0_PASID_MAPPING + vmid, pasid_mapping); - -@@ -253,7 +367,7 @@ static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) - uint32_t mec; - uint32_t pipe; - -- mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; -+ mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; - pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); - - lock_srbm(kgd, mec, pipe, 0, 0); -@@ -272,8 +386,7 @@ static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m) - - retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET + - m->sdma_queue_id * KFD_CIK_SDMA_QUEUE_OFFSET; -- -- pr_debug("kfd: sdma base address: 0x%x\n", retval); -+ pr_err("kfd: sdma base address: 0x%x\n", retval); - - return retval; - } -@@ -289,7 +402,8 @@ static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) - } - - static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, -- uint32_t queue_id, uint32_t __user *wptr) -+ uint32_t queue_id, uint32_t __user *wptr, -+ uint32_t page_table_base) - { - struct amdgpu_device *adev = get_amdgpu_device(kgd); - uint32_t wptr_shadow, is_wptr_shadow_valid; -@@ -363,24 +477,13 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) - m = get_sdma_mqd(mqd); - sdma_base_addr = get_sdma_base_addr(m); - -- WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, -- m->sdma_rlc_virtual_addr); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, -- m->sdma_rlc_rb_base); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, -- m->sdma_rlc_virtual_addr); -- -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, -- m->sdma_rlc_rb_base_hi); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, -- m->sdma_rlc_rb_rptr_addr_lo); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, -- m->sdma_rlc_rb_rptr_addr_hi); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, -- m->sdma_rlc_doorbell); -- -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, -- m->sdma_rlc_rb_cntl); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, m->sdma_rlc_virtual_addr); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdma_rlc_rb_base); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, m->sdma_rlc_rb_base_hi); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, m->sdma_rlc_rb_rptr_addr_lo); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, m->sdma_rlc_rb_rptr_addr_hi); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 
m->sdma_rlc_doorbell); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, m->sdma_rlc_rb_cntl); - - return 0; - } -@@ -440,10 +543,11 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, - - while (true) { - temp = RREG32(mmCP_HQD_ACTIVE); -- if (temp & CP_HQD_ACTIVE__ACTIVE_MASK) -+ if (temp & CP_HQD_ACTIVE__ACTIVE__SHIFT) - break; - if (timeout <= 0) { -- pr_err("kfd: cp queue preemption time out.\n"); -+ pr_err("kfd: cp queue preemption time out (%dms)\n", -+ temp); - release_queue(kgd); - return -ETIME; - } -@@ -503,8 +607,8 @@ static int kgd_address_watch_disable(struct kgd_dev *kgd) - - /* Turning off this address until we set all the registers */ - for (i = 0; i < MAX_WATCH_ADDRESSES; i++) -- WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX + -- ADDRESS_WATCH_REG_CNTL], cntl.u32All); -+ WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], -+ cntl.u32All); - - return 0; - } -@@ -522,20 +626,20 @@ static int kgd_address_watch_execute(struct kgd_dev *kgd, - - /* Turning off this watch point until we set all the registers */ - cntl.bitfields.valid = 0; -- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + -- ADDRESS_WATCH_REG_CNTL], cntl.u32All); -+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], -+ cntl.u32All); - -- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + -- ADDRESS_WATCH_REG_ADDR_HI], addr_hi); -+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_HI], -+ addr_hi); - -- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + -- ADDRESS_WATCH_REG_ADDR_LO], addr_lo); -+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_LO], -+ addr_lo); - - /* Enable the watch point */ - cntl.bitfields.valid = 1; - -- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + -- ADDRESS_WATCH_REG_CNTL], cntl.u32All); -+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], -+ cntl.u32All); - - return 0; - } -@@ -589,7 +693,7 @@ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - - reg = RREG32(mmATC_VMID0_PASID_MAPPING + vmid); -- return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; -+ return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; - } - - static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) -@@ -599,6 +703,56 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) - WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); - } - -+static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, -+ uint8_t element_size, uint8_t index_stride, uint8_t mtype) -+{ -+ uint32_t reg; -+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -+ -+ reg = swizzle_enable << SH_STATIC_MEM_CONFIG__SWIZZLE_ENABLE__SHIFT | -+ element_size << SH_STATIC_MEM_CONFIG__ELEMENT_SIZE__SHIFT | -+ index_stride << SH_STATIC_MEM_CONFIG__INDEX_STRIDE__SHIFT | -+ mtype << SH_STATIC_MEM_CONFIG__PRIVATE_MTYPE__SHIFT; -+ -+ WREG32(mmSH_STATIC_MEM_CONFIG, reg); -+ return 0; -+} -+static int alloc_memory_of_scratch(struct kgd_dev *kgd, -+ uint64_t va, uint32_t vmid) -+{ -+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -+ -+ lock_srbm(kgd, 0, 0, 0, vmid); -+ WREG32(mmSH_HIDDEN_PRIVATE_BASE_VMID, va); -+ unlock_srbm(kgd); -+ -+ return 0; -+} -+ -+ -+static int alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va, size_t size, -+ void *vm, struct kgd_mem **mem, uint64_t *offset, -+ void **kptr, struct kfd_process_device 
*pdd, uint32_t flags) -+{ -+ return -EFAULT; -+} -+ -+static int free_memory_of_gpu(struct kgd_dev *kgd, struct kgd_mem *mem) -+{ -+ return -EFAULT; -+} -+ -+static int map_memory_to_gpu(struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) -+{ -+ return -EFAULT; -+} -+ -+static int unmap_memory_from_gpu(struct kgd_dev *kgd, struct kgd_mem *mem, -+ void *vm) -+{ -+ return -EFAULT; -+} -+ - static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) - { - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -@@ -639,12 +793,12 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) - - case KGD_ENGINE_SDMA1: - hdr = (const union amdgpu_firmware_header *) -- adev->sdma.instance[0].fw->data; -+ adev->sdma[0].fw->data; - break; - - case KGD_ENGINE_SDMA2: - hdr = (const union amdgpu_firmware_header *) -- adev->sdma.instance[1].fw->data; -+ adev->sdma[1].fw->data; - break; - - default: -@@ -658,3 +812,32 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) - return hdr->common.ucode_version; - } - -+static void set_num_of_requests(struct kgd_dev *dev, uint8_t num_of_req) -+{ -+ uint32_t value; -+ struct amdgpu_device *adev = get_amdgpu_device(dev); -+ -+ value = RREG32(mmATC_ATS_DEBUG); -+ value &= ~ATC_ATS_DEBUG__NUM_REQUESTS_AT_ERR_MASK; -+ value |= (num_of_req << ATC_ATS_DEBUG__NUM_REQUESTS_AT_ERR__SHIFT); -+ -+ WREG32(mmATC_ATS_DEBUG, value); -+} -+ -+static int mmap_bo(struct kgd_dev *kgd, struct vm_area_struct *vma) -+{ -+ return 0; -+} -+ -+static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, -+ uint32_t page_table_base) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ /* TODO: Don't use hardcoded VMIDs */ -+ if (vmid < 8 || vmid > 15) { -+ pr_err("amdkfd: trying to set page table base for wrong VMID\n"); -+ return; -+ } -+ WREG32(mmVM_CONTEXT8_PAGE_TABLE_BASE_ADDR + vmid - 8, page_table_base); -+} -+ -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c -index e00fadd..aeca2b6 100644 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c -@@ -28,6 +28,7 @@ - #include "amdgpu.h" - #include "amdgpu_amdkfd.h" - #include "amdgpu_ucode.h" -+#include "amdgpu_amdkfd_gfx_v8.h" - #include "gca/gfx_8_0_sh_mask.h" - #include "gca/gfx_8_0_d.h" - #include "gca/gfx_8_0_enum.h" -@@ -38,7 +39,24 @@ - #include "vi_structs.h" - #include "vid.h" - --struct cik_sdma_rlc_registers; -+static const uint32_t watchRegs[MAX_WATCH_ADDRESSES * ADDRESS_WATCH_REG_MAX] = { -+ mmTCP_WATCH0_ADDR_H, mmTCP_WATCH0_ADDR_L, mmTCP_WATCH0_CNTL, -+ mmTCP_WATCH1_ADDR_H, mmTCP_WATCH1_ADDR_L, mmTCP_WATCH1_CNTL, -+ mmTCP_WATCH2_ADDR_H, mmTCP_WATCH2_ADDR_L, mmTCP_WATCH2_CNTL, -+ mmTCP_WATCH3_ADDR_H, mmTCP_WATCH3_ADDR_L, mmTCP_WATCH3_CNTL -+}; -+ -+ -+struct vi_sdma_mqd; -+ -+static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, -+ void *vm, struct kgd_mem **mem); -+static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem); -+ -+static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, -+ int fd, uint32_t handle, struct kgd_mem **mem); -+ -+static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); - - /* - * Register access functions -@@ -54,7 +72,8 @@ static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, - uint32_t hpd_size, uint64_t hpd_gpu_addr); - static int kgd_init_interrupts(struct kgd_dev *kgd, 
uint32_t pipe_id); - static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, -- uint32_t queue_id, uint32_t __user *wptr); -+ uint32_t queue_id, uint32_t __user *wptr, -+ uint32_t page_table_base); - static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd); - static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, - uint32_t pipe_id, uint32_t queue_id); -@@ -83,14 +102,27 @@ static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, - static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, - uint8_t vmid); - static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); --static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); -+static void set_num_of_requests(struct kgd_dev *kgd, -+ uint8_t num_of_requests); -+static int alloc_memory_of_scratch(struct kgd_dev *kgd, -+ uint64_t va, uint32_t vmid); -+static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, -+ uint8_t element_size, uint8_t index_stride, uint8_t mtype); -+static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, -+ uint32_t page_table_base); - - static const struct kfd2kgd_calls kfd2kgd = { - .init_gtt_mem_allocation = alloc_gtt_mem, - .free_gtt_mem = free_gtt_mem, -- .get_vmem_size = get_vmem_size, -+ .get_local_mem_info = get_local_mem_info, - .get_gpu_clock_counter = get_gpu_clock_counter, - .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, -+ .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm, -+ .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm, -+ .create_process_gpumem = create_process_gpumem, -+ .destroy_process_gpumem = destroy_process_gpumem, -+ .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir, -+ .open_graphic_handle = open_graphic_handle, - .program_sh_mem_settings = kgd_program_sh_mem_settings, - .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, - .init_pipeline = kgd_init_pipeline, -@@ -110,14 +142,52 @@ static const struct kfd2kgd_calls kfd2kgd = { - .get_atc_vmid_pasid_mapping_valid = - get_atc_vmid_pasid_mapping_valid, - .write_vmid_invalidate_request = write_vmid_invalidate_request, -- .get_fw_version = get_fw_version -+ .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, -+ .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, -+ .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, -+ .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu, -+ .get_fw_version = get_fw_version, -+ .set_num_of_requests = set_num_of_requests, -+ .get_cu_info = get_cu_info, -+ .set_num_of_requests = set_num_of_requests, -+ .alloc_memory_of_scratch = alloc_memory_of_scratch, -+ .write_config_static_mem = write_config_static_mem, -+ .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo, -+ .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel, -+ .set_vm_context_page_table_base = set_vm_context_page_table_base, -+ .get_pdd_from_buffer_object = -+ amdgpu_amdkfd_gpuvm_get_pdd_from_buffer_object, -+ .return_bo_size = amdgpu_amdkfd_gpuvm_return_bo_size, -+ .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table, -+ .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table, -+ .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info, -+ .import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf, -+ .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info - }; - --struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void) -+struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions() - { - return (struct 
kfd2kgd_calls *)&kfd2kgd; - } - -+static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, -+ void *vm, struct kgd_mem **mem) -+{ -+ return 0; -+} -+ -+/* Destroys the GPU allocation and frees the kgd_mem structure */ -+static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem) -+{ -+ -+} -+ -+static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, -+ int fd, uint32_t handle, struct kgd_mem **mem) -+{ -+ return 0; -+} -+ - static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) - { - return (struct amdgpu_device *)kgd; -@@ -227,9 +297,15 @@ static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) - return 0; - } - --static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m) -+static inline uint32_t get_sdma_base_addr(struct vi_sdma_mqd *m) - { -- return 0; -+ uint32_t retval; -+ -+ retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET + -+ m->sdma_queue_id * KFD_VI_SDMA_QUEUE_OFFSET; -+ pr_debug("kfd: sdma base address: 0x%x\n", retval); -+ -+ return retval; - } - - static inline struct vi_mqd *get_mqd(void *mqd) -@@ -237,13 +313,14 @@ static inline struct vi_mqd *get_mqd(void *mqd) - return (struct vi_mqd *)mqd; - } - --static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) -+static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) - { -- return (struct cik_sdma_rlc_registers *)mqd; -+ return (struct vi_sdma_mqd *)mqd; - } - - static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, -- uint32_t queue_id, uint32_t __user *wptr) -+ uint32_t queue_id, uint32_t __user *wptr, -+ uint32_t page_table_base) - { - struct vi_mqd *m; - uint32_t shadow_wptr, valid_wptr; -@@ -306,6 +383,49 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - - static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) - { -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ struct vi_sdma_mqd *m; -+ uint32_t sdma_base_addr; -+ uint32_t temp, timeout = 2000; -+ uint32_t data; -+ -+ -+ m = get_sdma_mqd(mqd); -+ sdma_base_addr = get_sdma_base_addr(m); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, -+ m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); -+ -+ while (true) { -+ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); -+ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) -+ break; -+ if (timeout == 0) -+ return -ETIME; -+ msleep(10); -+ timeout -= 10; -+ } -+ if (m->sdma_engine_id) { -+ data = RREG32(mmSDMA1_GFX_CONTEXT_CNTL); -+ data = REG_SET_FIELD(data, SDMA1_GFX_CONTEXT_CNTL, -+ RESUME_CTX, 0); -+ WREG32(mmSDMA1_GFX_CONTEXT_CNTL, data); -+ } else { -+ data = RREG32(mmSDMA0_GFX_CONTEXT_CNTL); -+ data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, -+ RESUME_CTX, 0); -+ WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data); -+ } -+ -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, m->sdmax_rlcx_doorbell); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, m->sdmax_rlcx_virtual_addr); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, m->sdmax_rlcx_rb_base_hi); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, m->sdmax_rlcx_rb_rptr_addr_lo); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, m->sdmax_rlcx_rb_rptr_addr_hi); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, m->sdmax_rlcx_rb_cntl); -+ - return 0; - } - -@@ -334,7 +454,7 @@ static bool 
kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, - static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) - { - struct amdgpu_device *adev = get_amdgpu_device(kgd); -- struct cik_sdma_rlc_registers *m; -+ struct vi_sdma_mqd *m; - uint32_t sdma_base_addr; - uint32_t sdma_rlc_rb_cntl; - -@@ -382,7 +502,7 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - unsigned int utimeout) - { - struct amdgpu_device *adev = get_amdgpu_device(kgd); -- struct cik_sdma_rlc_registers *m; -+ struct vi_sdma_mqd *m; - uint32_t sdma_base_addr; - uint32_t temp; - int timeout = utimeout; -@@ -396,7 +516,7 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - - while (true) { - temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); -- if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT) -+ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) - break; - if (timeout <= 0) - return -ETIME; -@@ -405,9 +525,9 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - } - - WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, 0); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, -+ RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | -+ SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); - - return 0; - } -@@ -429,7 +549,7 @@ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - - reg = RREG32(mmATC_VMID0_PASID_MAPPING + vmid); -- return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; -+ return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; - } - - static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) -@@ -441,6 +561,21 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) - - static int kgd_address_watch_disable(struct kgd_dev *kgd) - { -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ union TCP_WATCH_CNTL_BITS cntl; -+ unsigned int i; -+ -+ cntl.u32All = 0; -+ -+ cntl.bitfields.valid = 0; -+ cntl.bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK; -+ cntl.bitfields.atc = 1; -+ -+ /* Turning off this address until we set all the registers */ -+ for (i = 0; i < MAX_WATCH_ADDRESSES; i++) -+ WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], -+ cntl.u32All); -+ - return 0; - } - -@@ -450,6 +585,28 @@ static int kgd_address_watch_execute(struct kgd_dev *kgd, - uint32_t addr_hi, - uint32_t addr_lo) - { -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ union TCP_WATCH_CNTL_BITS cntl; -+ -+ cntl.u32All = cntl_val; -+ -+ /* Turning off this watch point until we set all the registers */ -+ cntl.bitfields.valid = 0; -+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], -+ cntl.u32All); -+ -+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_HI], -+ addr_hi); -+ -+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_LO], -+ addr_lo); -+ -+ /* Enable the watch point */ -+ cntl.bitfields.valid = 1; -+ -+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], -+ cntl.u32All); -+ - return 0; - } - -@@ -482,6 +639,32 @@ static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, - unsigned int watch_point_id, - unsigned int reg_offset) - { -+ return watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + reg_offset]; -+} -+ -+static int 
write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, -+ uint8_t element_size, uint8_t index_stride, uint8_t mtype) -+{ -+ uint32_t reg; -+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -+ -+ reg = swizzle_enable << SH_STATIC_MEM_CONFIG__SWIZZLE_ENABLE__SHIFT | -+ element_size << SH_STATIC_MEM_CONFIG__ELEMENT_SIZE__SHIFT | -+ index_stride << SH_STATIC_MEM_CONFIG__INDEX_STRIDE__SHIFT | -+ mtype << SH_STATIC_MEM_CONFIG__PRIVATE_MTYPE__SHIFT; -+ -+ WREG32(mmSH_STATIC_MEM_CONFIG, reg); -+ return 0; -+} -+static int alloc_memory_of_scratch(struct kgd_dev *kgd, -+ uint64_t va, uint32_t vmid) -+{ -+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -+ -+ lock_srbm(kgd, 0, 0, 0, vmid); -+ WREG32(mmSH_HIDDEN_PRIVATE_BASE_VMID, va); -+ unlock_srbm(kgd); -+ - return 0; - } - -@@ -525,12 +708,12 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) - - case KGD_ENGINE_SDMA1: - hdr = (const union amdgpu_firmware_header *) -- adev->sdma.instance[0].fw->data; -+ adev->sdma[0].fw->data; - break; - - case KGD_ENGINE_SDMA2: - hdr = (const union amdgpu_firmware_header *) -- adev->sdma.instance[1].fw->data; -+ adev->sdma[1].fw->data; - break; - - default: -@@ -543,3 +726,21 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) - /* Only 12 bit in use*/ - return hdr->common.ucode_version; - } -+ -+static void set_num_of_requests(struct kgd_dev *kgd, -+ uint8_t num_of_requests) -+{ -+ pr_debug("in %s this is a stub\n", __func__); -+} -+ -+static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, -+ uint32_t page_table_base) -+{ -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ /* TODO: Don't use hardcoded VMIDs */ -+ if (vmid < 8 || vmid > 15) { -+ pr_err("amdkfd: trying to set page table base for wrong VMID\n"); -+ return; -+ } -+ WREG32(mmVM_CONTEXT8_PAGE_TABLE_BASE_ADDR + vmid - 8, page_table_base); -+} -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c -new file mode 100644 -index 0000000..454c247 ---- /dev/null -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c -@@ -0,0 +1,1619 @@ -+/* -+ * Copyright 2014 Advanced Micro Devices, Inc. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. 
-+ */ -+ -+#include <linux/module.h> -+#include <linux/fdtable.h> -+#include <linux/uaccess.h> -+#include <linux/firmware.h> -+#include <linux/list.h> -+#include <drm/drmP.h> -+#include <linux/dma-buf.h> -+#include "amdgpu.h" -+#include "amdgpu_amdkfd.h" -+#include "amdgpu_ucode.h" -+#include "gca/gfx_8_0_sh_mask.h" -+#include "gca/gfx_8_0_d.h" -+#include "gca/gfx_8_0_enum.h" -+#include "oss/oss_3_0_sh_mask.h" -+#include "oss/oss_3_0_d.h" -+#include "gmc/gmc_8_1_sh_mask.h" -+#include "gmc/gmc_8_1_d.h" -+#include "vi_structs.h" -+#include "vid.h" -+ -+/* Special VM and GART address alignment needed for VI pre-Fiji due to -+ * a HW bug. */ -+#define VI_BO_SIZE_ALIGN (0x8000) -+ -+static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) -+{ -+ return (struct amdgpu_device *)kgd; -+} -+ -+struct kfd_process_device *amdgpu_amdkfd_gpuvm_get_pdd_from_buffer_object( -+ struct kgd_dev *kgd, struct kgd_mem *mem) -+{ -+ return mem->data2.bo->pdd; -+} -+ -+static bool check_if_add_bo_to_vm(struct amdgpu_vm *avm, -+ struct list_head *list_bo_va) -+{ -+ struct kfd_bo_va_list *entry; -+ -+ list_for_each_entry(entry, list_bo_va, bo_list) -+ if (entry->bo_va->vm == avm) -+ return false; -+ -+ return true; -+} -+ -+static int add_bo_to_vm(struct amdgpu_device *adev, uint64_t va, -+ struct amdgpu_vm *avm, struct amdgpu_bo *bo, -+ struct list_head *list_bo_va, -+ bool readonly, bool execute) -+{ -+ int ret; -+ struct kfd_bo_va_list *bo_va_entry; -+ uint32_t flags; -+ -+ bo_va_entry = kzalloc(sizeof(*bo_va_entry), GFP_KERNEL); -+ if (!bo_va_entry) -+ return -ENOMEM; -+ -+ BUG_ON(va == 0); -+ -+ pr_debug("amdkfd: adding bo_va to bo %p and va 0x%llx id 0x%x\n", -+ bo, va, adev->dev->id); -+ -+ amdgpu_bo_reserve(bo, true); -+ -+ /* Add BO to VM internal data structures*/ -+ bo_va_entry->bo_va = amdgpu_vm_bo_add(adev, avm, bo); -+ if (bo_va_entry->bo_va == NULL) { -+ ret = -EINVAL; -+ pr_err("amdkfd: Failed to add BO object to VM. ret == %d\n", -+ ret); -+ goto err_vmadd; -+ } -+ -+ flags = AMDGPU_PTE_READABLE | AMDGPU_PTE_WRITEABLE; -+ if (readonly) -+ flags = AMDGPU_PTE_READABLE; -+ if (execute) -+ flags |= AMDGPU_PTE_EXECUTABLE; -+ -+ /* Set virtual address for the allocation, allocate PTs, -+ * if needed, and zero them */ -+ ret = amdgpu_vm_bo_map(adev, bo_va_entry->bo_va, -+ va, 0, amdgpu_bo_size(bo), -+ flags | AMDGPU_PTE_VALID); -+ if (ret != 0) { -+ pr_err("amdkfd: Failed to set virtual address for BO. ret == %d (0x%llx)\n", -+ ret, va); -+ goto err_vmsetaddr; -+ } -+ -+ bo_va_entry->kgd_dev = (void *)adev; -+ bo_va_entry->is_mapped = false; -+ list_add(&bo_va_entry->bo_list, list_bo_va); -+ -+ return 0; -+ -+err_vmsetaddr: -+ amdgpu_bo_reserve(bo, true); -+ amdgpu_vm_bo_rmv(adev, bo_va_entry->bo_va); -+ /* This will put the bo_va_mapping on the vm->freed -+ * list. amdgpu_vm_clear_freed needs the PTs to be reserved so -+ * we don't call it here. That can wait until the next time -+ * the page tables are updated for a map or unmap. 
*/ -+ kfree(bo_va_entry); -+err_vmadd: -+ amdgpu_bo_unreserve(bo); -+ return ret; -+} -+ -+static void remove_bo_from_vm(struct amdgpu_device *adev, -+ struct amdgpu_bo *bo, struct amdgpu_bo_va *bo_va) -+{ -+ amdgpu_bo_reserve(bo, true); -+ amdgpu_vm_bo_rmv(adev, bo_va); -+ amdgpu_bo_unreserve(bo); -+} -+ -+ -+static int try_pin_bo(struct amdgpu_bo *bo, uint64_t *mc_address, bool resv, -+ uint32_t domain) -+{ -+ int ret = 0; -+ uint64_t temp; -+ -+ if (resv) { -+ ret = amdgpu_bo_reserve(bo, true); -+ if (ret != 0) -+ return ret; -+ } -+ -+ if (!amdgpu_ttm_tt_has_userptr(bo->tbo.ttm)) { -+ ret = amdgpu_bo_pin(bo, domain, &temp); -+ if (mc_address) -+ *mc_address = temp; -+ if (ret != 0) -+ goto error; -+ if (domain == AMDGPU_GEM_DOMAIN_GTT) { -+ ret = amdgpu_bo_kmap(bo, NULL); -+ if (ret != 0) { -+ pr_err("amdgpu: failed kmap GTT BO\n"); -+ goto error; -+ } -+ } -+ } else { -+ /* amdgpu_bo_pin doesn't support userptr. Therefore we -+ * can use the bo->pin_count for our version of -+ * pinning without conflict. */ -+ if (bo->pin_count == 0) { -+ amdgpu_ttm_placement_from_domain(bo, domain); -+ ret = ttm_bo_validate(&bo->tbo, &bo->placement, -+ true, false); -+ if (ret != 0) { -+ pr_err("amdgpu: failed to validate BO\n"); -+ goto error; -+ } -+ } -+ bo->pin_count++; -+ } -+ -+error: -+ if (resv) -+ amdgpu_bo_unreserve(bo); -+ -+ return ret; -+} -+ -+static int unpin_bo(struct amdgpu_bo *bo, bool resv) -+{ -+ int ret = 0; -+ -+ if (resv) { -+ ret = amdgpu_bo_reserve(bo, true); -+ if (ret != 0) -+ return ret; -+ } -+ -+ amdgpu_bo_kunmap(bo); -+ -+ if (!amdgpu_ttm_tt_has_userptr(bo->tbo.ttm)) { -+ ret = amdgpu_bo_unpin(bo); -+ if (ret != 0) -+ goto error; -+ } else if (--bo->pin_count == 0) { -+ amdgpu_ttm_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU); -+ ret = ttm_bo_validate(&bo->tbo, &bo->placement, true, false); -+ if (ret != 0) { -+ pr_err("amdgpu: failed to validate BO\n"); -+ goto error; -+ } -+ } -+ -+error: -+ if (resv) -+ amdgpu_bo_unreserve(bo); -+ -+ return ret; -+} -+ -+ -+static int try_pin_pts(struct amdgpu_bo_va *bo_va, bool resv) -+{ -+ int ret; -+ uint64_t pt_idx, start, last, failed; -+ struct amdgpu_vm *vm; -+ struct amdgpu_bo_va_mapping *mapping; -+ -+ vm = bo_va->vm; -+ list_for_each_entry(mapping, &bo_va->valids, list) { -+ start = mapping->it.start >> amdgpu_vm_block_size; -+ last = mapping->it.last >> amdgpu_vm_block_size; -+ -+ pr_debug("start PT index %llu last PT index %llu\n", start, last); -+ -+ /* walk over the address space and pin the page tables BOs*/ -+ for (pt_idx = start; pt_idx <= last; pt_idx++) { -+ ret = try_pin_bo(vm->page_tables[pt_idx].bo, NULL, resv, -+ AMDGPU_GEM_DOMAIN_VRAM); -+ if (ret != 0) { -+ failed = pt_idx; -+ goto err; -+ } -+ } -+ } -+ -+ list_for_each_entry(mapping, &bo_va->invalids, list) { -+ start = mapping->it.start >> amdgpu_vm_block_size; -+ last = mapping->it.last >> amdgpu_vm_block_size; -+ -+ pr_debug("start PT index %llu last PT index %llu\n", start, last); -+ -+ /* walk over the address space and pin the page tables BOs*/ -+ for (pt_idx = start; pt_idx <= last; pt_idx++) { -+ ret = try_pin_bo(vm->page_tables[pt_idx].bo, NULL, resv, -+ AMDGPU_GEM_DOMAIN_VRAM); -+ if (ret != 0) { -+ failed = pt_idx; -+ goto err; -+ } -+ } -+ } -+ -+ return 0; -+ -+err: -+ pr_err("amdgpu: Failed to pin BO's PTEs\n"); -+ /* Unpin all already pinned BOs*/ -+ if (failed > 0) { -+ for (pt_idx = start; pt_idx <= failed - 1; pt_idx++) -+ unpin_bo(vm->page_tables[pt_idx].bo, resv); -+ } -+ return ret; -+} -+ -+static void unpin_pts(struct amdgpu_bo_va 
*bo_va, struct amdgpu_vm *vm, -+ bool resv) -+{ -+ uint64_t pt_idx, start, last; -+ struct amdgpu_bo_va_mapping *mapping; -+ -+ list_for_each_entry(mapping, &bo_va->valids, list) { -+ start = mapping->it.start >> amdgpu_vm_block_size; -+ last = mapping->it.last >> amdgpu_vm_block_size; -+ -+ pr_debug("start PT index %llu last PT index %llu\n", start, last); -+ -+ /* walk over the address space and unpin the page tables BOs*/ -+ for (pt_idx = start; pt_idx <= last; pt_idx++) -+ unpin_bo(vm->page_tables[pt_idx].bo, resv); -+ } -+ -+ list_for_each_entry(mapping, &bo_va->invalids, list) { -+ start = mapping->it.start >> amdgpu_vm_block_size; -+ last = mapping->it.last >> amdgpu_vm_block_size; -+ -+ pr_debug("start PT index %llu last PT index %llu\n", start, last); -+ -+ /* walk over the address space and unpin the page tables BOs*/ -+ for (pt_idx = start; pt_idx <= last; pt_idx++) -+ unpin_bo(vm->page_tables[pt_idx].bo, resv); -+ } -+} -+ -+static int __alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va, -+ size_t size, void *vm, struct kgd_mem **mem, -+ uint64_t *offset, void **kptr, struct kfd_process_device *pdd, -+ u32 domain, u64 flags, bool aql_queue, -+ bool readonly, bool execute, bool no_sub, bool userptr) -+{ -+ struct amdgpu_device *adev; -+ int ret; -+ struct amdgpu_bo *bo; -+ uint64_t user_addr = 0; -+ int byte_align; -+ -+ BUG_ON(kgd == NULL); -+ BUG_ON(size == 0); -+ BUG_ON(mem == NULL); -+ BUG_ON(vm == NULL); -+ -+ if (aql_queue) -+ size = size >> 1; -+ if (userptr) { -+ if (!offset || !*offset) -+ return -EINVAL; -+ user_addr = *offset; -+ } -+ -+ adev = get_amdgpu_device(kgd); -+ byte_align = adev->asic_type != CHIP_FIJI ? VI_BO_SIZE_ALIGN : 1; -+ -+ *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); -+ if (*mem == NULL) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ INIT_LIST_HEAD(&(*mem)->data2.bo_va_list); -+ mutex_init(&(*mem)->data2.lock); -+ (*mem)->data2.readonly = readonly; -+ (*mem)->data2.execute = execute; -+ (*mem)->data2.no_substitute = no_sub; -+ (*mem)->data2.aql_queue = aql_queue; -+ -+ pr_debug("amdkfd: allocating GTT BO size %lu\n", size); -+ -+ /* Allocate buffer object. Userptr objects need to start out -+ * in the CPU domain, get moved to GTT when pinned. */ -+ ret = amdgpu_bo_create(adev, size, byte_align, false, -+ userptr ? AMDGPU_GEM_DOMAIN_CPU : domain, -+ flags, NULL, NULL, &bo); -+ if (ret != 0) { -+ pr_err("amdkfd: Failed to create BO object on GTT. 
ret == %d\n", -+ ret); -+ goto err_bo_create; -+ } -+ bo->kfd_bo = *mem; -+ bo->pdd = pdd; -+ (*mem)->data2.bo = bo; -+ -+ pr_debug("Created BO on GTT with size %zu bytes\n", size); -+ -+ if (userptr) { -+ ret = amdgpu_ttm_tt_set_userptr(bo->tbo.ttm, user_addr, -+ AMDGPU_GEM_USERPTR_ANONONLY); -+ if (ret) { -+ dev_err(adev->dev, -+ "(%d) failed to set userptr\n", ret); -+ goto allocate_mem_set_userptr_failed; -+ } -+ -+ ret = amdgpu_mn_register(bo, user_addr); -+ if (ret) { -+ dev_err(adev->dev, -+ "(%d) failed to register MMU notifier\n", ret); -+ goto allocate_mem_set_userptr_failed; -+ } -+ } -+ -+ ret = add_bo_to_vm(adev, va, vm, bo, &(*mem)->data2.bo_va_list, -+ (*mem)->data2.readonly, (*mem)->data2.execute); -+ if (ret != 0) -+ goto err_map; -+ -+ if (aql_queue) { -+ ret = add_bo_to_vm(adev, va + size, -+ vm, bo, &(*mem)->data2.bo_va_list, -+ (*mem)->data2.readonly, (*mem)->data2.execute); -+ if (ret != 0) -+ goto err_map; -+ } -+ -+ pr_debug("Set BO to VA %p\n", (void *) va); -+ -+ if (kptr) { -+ ret = amdgpu_bo_reserve(bo, true); -+ if (ret) { -+ dev_err(adev->dev, "(%d) failed to reserve bo for amdkfd\n", ret); -+ goto allocate_mem_reserve_bo_failed; -+ } -+ -+ ret = amdgpu_bo_pin(bo, domain, -+ NULL); -+ if (ret) { -+ dev_err(adev->dev, "(%d) failed to pin bo for amdkfd\n", ret); -+ goto allocate_mem_pin_bo_failed; -+ } -+ -+ ret = amdgpu_bo_kmap(bo, kptr); -+ if (ret) { -+ dev_err(adev->dev, -+ "(%d) failed to map bo to kernel for amdkfd\n", ret); -+ goto allocate_mem_kmap_bo_failed; -+ } -+ (*mem)->data2.kptr = *kptr; -+ -+ amdgpu_bo_unreserve(bo); -+ } -+ -+ (*mem)->data2.va = va; -+ (*mem)->data2.domain = domain; -+ (*mem)->data2.mapped_to_gpu_memory = 0; -+ -+ if (offset) -+ *offset = amdgpu_bo_mmap_offset(bo); -+ -+ return 0; -+ -+allocate_mem_kmap_bo_failed: -+ amdgpu_bo_unpin(bo); -+allocate_mem_pin_bo_failed: -+ amdgpu_bo_unreserve(bo); -+allocate_mem_reserve_bo_failed: -+err_map: -+ if (userptr) -+ amdgpu_mn_unregister(bo); -+allocate_mem_set_userptr_failed: -+ amdgpu_bo_unref(&bo); -+err_bo_create: -+ kfree(*mem); -+err: -+ return ret; -+} -+ -+/* Reserving a BO and its page table BOs must happen atomically to -+ * avoid deadlocks. When updating userptrs we need to temporarily -+ * back-off the reservation and then reacquire it. Track all the -+ * reservation info in a context structure. Buffers can be mapped to -+ * multiple VMs simultaneously (buffers being restored on multiple -+ * GPUs). 
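-+ * All BOs involved are collected on a single validation list (ctx->list) and reserved together under one ww_acquire ticket via ttm_eu_reserve_buffers(), which is what makes the combined reservation deadlock-safe.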
*/ -+struct bo_vm_reservation_context { -+ struct amdgpu_bo_list_entry kfd_bo; -+ unsigned n_vms; -+ struct amdgpu_bo_list_entry **vm_bos; -+ struct ww_acquire_ctx ticket; -+ struct list_head list, duplicates; -+ bool reserved; -+}; -+ -+static int reserve_bo_and_vms(struct amdgpu_device *adev, struct amdgpu_bo *bo, -+ struct list_head *bo_va_list, -+ struct amdgpu_vm *vm, bool is_mapped, -+ struct bo_vm_reservation_context *ctx) -+{ -+ struct kfd_bo_va_list *entry; -+ unsigned i; -+ int ret; -+ -+ INIT_LIST_HEAD(&ctx->list); -+ INIT_LIST_HEAD(&ctx->duplicates); -+ -+ ctx->kfd_bo.robj = bo; -+ ctx->kfd_bo.prefered_domains = bo->initial_domain; -+ ctx->kfd_bo.allowed_domains = bo->initial_domain; -+ ctx->kfd_bo.priority = 0; -+ ctx->kfd_bo.tv.bo = &bo->tbo; -+ ctx->kfd_bo.tv.shared = true; -+ ctx->kfd_bo.user_pages = NULL; -+ list_add(&ctx->kfd_bo.tv.head, &ctx->list); -+ -+ ctx->reserved = false; -+ -+ ctx->n_vms = 0; -+ list_for_each_entry(entry, bo_va_list, bo_list) { -+ if ((vm && vm != entry->bo_va->vm) || -+ entry->is_mapped != is_mapped) -+ continue; -+ ctx->n_vms++; -+ } -+ if (ctx->n_vms == 0) -+ ctx->vm_bos = NULL; -+ else { -+ ctx->vm_bos = kzalloc(sizeof(struct amdgpu_bo_list_entry *) -+ * ctx->n_vms, GFP_KERNEL); -+ if (ctx->vm_bos == NULL) -+ return -ENOMEM; -+ } -+ -+ i = 0; -+ list_for_each_entry(entry, bo_va_list, bo_list) { -+ if ((vm && vm != entry->bo_va->vm) || -+ entry->is_mapped != is_mapped) -+ continue; -+ -+ ctx->vm_bos[i] = amdgpu_vm_get_bos(adev, entry->bo_va->vm, -+ &ctx->list); -+ if (!ctx->vm_bos[i]) { -+ pr_err("amdkfd: Failed to get bos from vm\n"); -+ ret = -ENOMEM; -+ goto out; -+ } -+ i++; -+ } -+ -+ ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list, -+ false, &ctx->duplicates); -+ if (!ret) -+ ctx->reserved = true; -+ else -+ pr_err("amdkfd: Failed to reserve buffers in ttm\n"); -+ -+out: -+ if (ret) { -+ for (i = 0; i < ctx->n_vms; i++) { -+ if (ctx->vm_bos[i]) -+ drm_free_large(ctx->vm_bos[i]); -+ } -+ kfree(ctx->vm_bos); -+ ctx->vm_bos = NULL; -+ } -+ -+ return ret; -+} -+ -+static void unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx, -+ bool wait) -+{ -+ if (wait) { -+ struct ttm_validate_buffer *entry; -+ int ret; -+ -+ list_for_each_entry(entry, &ctx->list, head) { -+ ret = ttm_bo_wait(entry->bo, false, false, false); -+ if (ret != 0) -+ pr_err("amdkfd: Failed to wait for PT/PD update (err == %d)\n", -+ ret); -+ } -+ } -+ if (ctx->reserved) -+ ttm_eu_backoff_reservation(&ctx->ticket, &ctx->list); -+ if (ctx->vm_bos) { -+ unsigned i; -+ -+ for (i = 0; i < ctx->n_vms; i++) { -+ if (ctx->vm_bos[i]) -+ drm_free_large(ctx->vm_bos[i]); -+ } -+ kfree(ctx->vm_bos); -+ } -+ ctx->reserved = false; -+ ctx->vm_bos = NULL; -+} -+ -+/* Must be called with mem->data2.lock held and a BO/VM reservation -+ * context. Temporarily drops the lock and reservation for updating -+ * user pointers, to avoid circular lock dependencies between MM locks -+ * and buffer reservations. If user pages are invalidated while the -+ * lock and reservation are dropped, try again. */ -+static int update_user_pages(struct kgd_mem *mem, struct mm_struct *mm, -+ struct bo_vm_reservation_context *ctx) -+{ -+ struct amdgpu_bo *bo; -+ unsigned tries = 10; -+ int ret; -+ -+ bo = mem->data2.bo; -+ if (!amdgpu_ttm_tt_has_userptr(bo->tbo.ttm)) -+ return 0; -+ -+ if (bo->tbo.ttm->state != tt_bound) { -+ struct page **pages; -+ int invalidated; -+ -+ /* get user pages without locking the BO to avoid -+ * circular lock dependency with MMU notifier. 
Retry -+ * until we have the current version. */ -+ ttm_eu_backoff_reservation(&ctx->ticket, &ctx->list); -+ ctx->reserved = false; -+ pages = drm_calloc_large(bo->tbo.ttm->num_pages, -+ sizeof(struct page *)); -+ if (!pages) -+ return -ENOMEM; -+ -+ mutex_unlock(&mem->data2.lock); -+ -+ while (true) { -+ down_read(&mm->mmap_sem); -+ ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, pages); -+ up_read(&mm->mmap_sem); -+ -+ mutex_lock(&mem->data2.lock); -+ if (ret != 0) -+ return ret; -+ -+ BUG_ON(bo != mem->data2.bo); -+ -+ ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list, -+ false, &ctx->duplicates); -+ if (unlikely(ret != 0)) { -+ release_pages(pages, bo->tbo.ttm->num_pages, 0); -+ drm_free_large(pages); -+ return ret; -+ } -+ ctx->reserved = true; -+ if (!amdgpu_ttm_tt_userptr_invalidated(bo->tbo.ttm, -+ &invalidated) || -+ bo->tbo.ttm->state == tt_bound || -+ --tries == 0) -+ break; -+ -+ release_pages(pages, bo->tbo.ttm->num_pages, 0); -+ ttm_eu_backoff_reservation(&ctx->ticket, &ctx->list); -+ ctx->reserved = false; -+ mutex_unlock(&mem->data2.lock); -+ } -+ -+ /* If someone else already bound it, release our pages -+ * array, otherwise copy it into the ttm BO. */ -+ if (bo->tbo.ttm->state == tt_bound || tries == 0) -+ release_pages(pages, bo->tbo.ttm->num_pages, 0); -+ else -+ memcpy(bo->tbo.ttm->pages, pages, -+ sizeof(struct page *) * bo->tbo.ttm->num_pages); -+ drm_free_large(pages); -+ } -+ -+ if (tries == 0) { -+ pr_err("Gave up trying to update user pages\n"); -+ return -EDEADLK; -+ } -+ -+ return 0; -+} -+ -+static int map_bo_to_gpuvm(struct amdgpu_device *adev, struct amdgpu_bo *bo, -+ struct amdgpu_bo_va *bo_va) -+{ -+ struct amdgpu_vm_id *vm_id; -+ struct amdgpu_vm *vm; -+ int ret; -+ -+ /* Pin PTs */ -+ ret = try_pin_pts(bo_va, false); -+ if (ret != 0) { -+ pr_err("amdkfd: Failed to pin PTs\n"); -+ goto err_failed_to_pin_pts; -+ } -+ -+ /* Pin the PD directory*/ -+ vm = bo_va->vm; -+ vm_id = &vm->ids[7]; -+ ret = try_pin_bo(vm->page_directory, &vm_id->pd_gpu_addr, false, -+ AMDGPU_GEM_DOMAIN_VRAM); -+ if (ret != 0) { -+ pr_err("amdkfd: Failed to pin PD\n"); -+ goto err_failed_to_pin_pd; -+ } -+ -+ mutex_lock(&vm->mutex); -+ -+ /* Update the page directory */ -+ ret = amdgpu_vm_update_page_directory(adev, vm); -+ if (ret != 0) { -+ pr_err("amdkfd: Failed to radeon_vm_update_page_directory\n"); -+ goto err_failed_to_update_pd; -+ } -+ -+ /* -+ * The previously "released" BOs are really released and their VAs are -+ * removed from PT. 
This function is called here because it requires -+ * the radeon_vm::mutex to be locked and PT to be reserved -+ */ -+ ret = amdgpu_vm_clear_freed(adev, vm); -+ if (ret != 0) { -+ pr_err("amdkfd: Failed to radeon_vm_clear_freed\n"); -+ goto err_failed_vm_clear_freed; -+ } -+ -+ /* Update the page tables */ -+ ret = amdgpu_vm_bo_update(adev, bo_va, &bo->tbo.mem); -+ if (ret != 0) { -+ pr_err("amdkfd: Failed to radeon_vm_bo_update\n"); -+ goto err_failed_to_update_pts; -+ } -+ -+ ret = amdgpu_vm_clear_invalids(adev, vm, NULL); -+ if (ret != 0) { -+ pr_err("amdkfd: Failed to radeon_vm_clear_invalids\n"); -+ goto err_failed_to_vm_clear_invalids; -+ } -+ -+ mutex_unlock(&vm->mutex); -+ -+ return 0; -+ -+err_failed_to_vm_clear_invalids: -+ amdgpu_vm_bo_update(adev, bo_va, NULL); -+err_failed_to_update_pts: -+err_failed_vm_clear_freed: -+err_failed_to_update_pd: -+ mutex_unlock(&vm->mutex); -+ unpin_bo(vm->page_directory, false); -+err_failed_to_pin_pd: -+ unpin_pts(bo_va, vm, false); -+err_failed_to_pin_pts: -+ -+ return ret; -+} -+ -+#define BOOL_TO_STR(b) (b == true) ? "true" : "false" -+ -+int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( -+ struct kgd_dev *kgd, uint64_t va, size_t size, -+ void *vm, struct kgd_mem **mem, -+ uint64_t *offset, void **kptr, -+ struct kfd_process_device *pdd, uint32_t flags) -+{ -+ bool aql_queue, public, readonly, execute, no_sub, userptr; -+ u64 alloc_flag; -+ uint32_t domain; -+ uint64_t *temp_offset; -+ -+ if (!(flags & ALLOC_MEM_FLAGS_NONPAGED)) { -+ pr_err("amdgpu: current hw doesn't support paged memory\n"); -+ return -EINVAL; -+ } -+ -+ domain = 0; -+ alloc_flag = 0; -+ temp_offset = NULL; -+ -+ aql_queue = (flags & ALLOC_MEM_FLAGS_AQL_QUEUE_MEM) ? true : false; -+ public = (flags & ALLOC_MEM_FLAGS_PUBLIC) ? true : false; -+ readonly = (flags & ALLOC_MEM_FLAGS_READONLY) ? true : false; -+ execute = (flags & ALLOC_MEM_FLAGS_EXECUTE_ACCESS) ? true : false; -+ no_sub = (flags & ALLOC_MEM_FLAGS_NO_SUBSTITUTE) ? true : false; -+ userptr = (flags & ALLOC_MEM_FLAGS_USERPTR) ? true : false; -+ -+ if (userptr && kptr) { -+ pr_err("amdgpu: userptr can't be mapped to kernel\n"); -+ return -EINVAL; -+ } -+ -+ /* -+ * Check on which domain to allocate BO -+ */ -+ if (offset && !userptr) -+ *offset = 0; -+ if (flags & ALLOC_MEM_FLAGS_VRAM) { -+ domain = AMDGPU_GEM_DOMAIN_VRAM; -+ alloc_flag = AMDGPU_GEM_CREATE_NO_CPU_ACCESS; -+ if (public) { -+ alloc_flag = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; -+ temp_offset = offset; -+ } -+ } else if (flags & (ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_USERPTR)) { -+ domain = AMDGPU_GEM_DOMAIN_GTT; -+ alloc_flag = 0; -+ temp_offset = offset; -+ } -+ -+ pr_debug("amdgpu: allocating BO domain %d alloc_flag 0x%llu public %s readonly %s execute %s no substitue %s va 0x%llx\n", -+ domain, -+ alloc_flag, -+ BOOL_TO_STR(public), -+ BOOL_TO_STR(readonly), -+ BOOL_TO_STR(execute), -+ BOOL_TO_STR(no_sub), -+ va); -+ -+ return __alloc_memory_of_gpu(kgd, va, size, vm, mem, -+ temp_offset, kptr, pdd, domain, -+ alloc_flag, -+ aql_queue, readonly, execute, -+ no_sub, userptr); -+} -+ -+int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( -+ struct kgd_dev *kgd, struct kgd_mem *mem) -+{ -+ struct amdgpu_device *adev; -+ struct kfd_bo_va_list *entry, *tmp; -+ -+ BUG_ON(kgd == NULL); -+ BUG_ON(mem == NULL); -+ -+ adev = get_amdgpu_device(kgd); -+ -+ mutex_lock(&mem->data2.lock); -+ -+ if (mem->data2.mapped_to_gpu_memory > 0) { -+ pr_err("BO with size %lu bytes is mapped to GPU. 
Need to unmap it before release va 0x%llx\n", -+ mem->data2.bo->tbo.mem.size, mem->data2.va); -+ mutex_unlock(&mem->data2.lock); -+ return -EBUSY; -+ } -+ -+ mutex_unlock(&mem->data2.lock); -+ /* lock is not needed after this, since mem is unused and will -+ * be freed anyway */ -+ -+ amdgpu_mn_unregister(mem->data2.bo); -+ if (mem->data2.work.work.func) -+ cancel_delayed_work_sync(&mem->data2.work); -+ -+ /* Remove from VM internal data structures */ -+ list_for_each_entry_safe(entry, tmp, &mem->data2.bo_va_list, bo_list) { -+ pr_debug("Releasing BO with VA %p, size %lu bytes\n", -+ entry->bo_va, -+ mem->data2.bo->tbo.mem.size); -+ if (entry->bo_va->vm != NULL) -+ remove_bo_from_vm( -+ (struct amdgpu_device *)entry->kgd_dev, -+ mem->data2.bo, entry->bo_va); -+ list_del(&entry->bo_list); -+ kfree(entry); -+ } -+ -+ /* Free the BO*/ -+ amdgpu_bo_unref(&mem->data2.bo); -+ kfree(mem); -+ -+ return 0; -+} -+int amdgpu_amdkfd_gpuvm_return_bo_size(struct kgd_dev *kgd, struct kgd_mem *mem) -+{ -+ struct amdgpu_bo *bo; -+ -+ BUG_ON(mem == NULL); -+ -+ bo = mem->data2.bo; -+ return bo->tbo.mem.size; -+ -+} -+int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( -+ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) -+{ -+ struct amdgpu_device *adev; -+ int ret; -+ struct amdgpu_bo *bo; -+ uint32_t domain; -+ struct kfd_bo_va_list *entry; -+ struct bo_vm_reservation_context ctx; -+ -+ BUG_ON(kgd == NULL); -+ BUG_ON(mem == NULL); -+ -+ adev = get_amdgpu_device(kgd); -+ -+ mutex_lock(&mem->data2.lock); -+ -+ bo = mem->data2.bo; -+ -+ BUG_ON(bo == NULL); -+ -+ domain = mem->data2.domain; -+ -+ pr_debug("amdgpu: try to map VA 0x%llx domain %d\n", -+ mem->data2.va, domain); -+ -+ if (check_if_add_bo_to_vm((struct amdgpu_vm *)vm, -+ &mem->data2.bo_va_list)) { -+ pr_debug("amdkfd: add new BO_VA to list 0x%llx\n", -+ mem->data2.va); -+ ret = add_bo_to_vm(adev, mem->data2.va, (struct amdgpu_vm *)vm, -+ bo, &mem->data2.bo_va_list, -+ mem->data2.readonly, mem->data2.execute); -+ if (ret != 0) -+ goto add_bo_to_vm_failed; -+ if (mem->data2.aql_queue) { -+ ret = add_bo_to_vm(adev, -+ mem->data2.va + bo->tbo.mem.size, -+ (struct amdgpu_vm *)vm, -+ bo, &mem->data2.bo_va_list, -+ mem->data2.readonly, -+ mem->data2.execute); -+ if (ret != 0) -+ goto add_bo_to_vm_failed; -+ } -+ } -+ -+ if (!mem->data2.evicted) { -+ ret = reserve_bo_and_vms(adev, bo, &mem->data2.bo_va_list, -+ vm, false, &ctx); -+ if (unlikely(ret != 0)) -+ goto bo_reserve_failed; -+ -+ ret = update_user_pages(mem, current->mm, &ctx); -+ if (ret != 0) -+ goto update_user_pages_failed; -+ } -+ -+ list_for_each_entry(entry, &mem->data2.bo_va_list, bo_list) { -+ if (entry->bo_va->vm == vm && entry->is_mapped == false) { -+ if (mem->data2.evicted) { -+ /* If the BO is evicted, just mark the -+ * mapping as mapped and stop the GPU's -+ * queues until the BO is restored. 
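-+				 * quiesce_mm() below halts the process's queues; the real GPUVM mapping is created later, when amdgpu_amdkfd_gpuvm_restore_mem restores the BO.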
*/ -+ ret = kgd2kfd->quiesce_mm(adev->kfd, -+ current->mm); -+ if (ret != 0) -+ goto quiesce_failed; -+ entry->is_mapped = true; -+ mem->data2.mapped_to_gpu_memory++; -+ continue; -+ } -+ -+ pr_debug("amdkfd: Trying to map VA 0x%llx to vm %p\n", -+ mem->data2.va, vm); -+ /* -+ * We need to pin the allocated BO, PD and appropriate PTs and to -+ * create a mapping of virtual to MC address -+ */ -+ /* Pin BO*/ -+ ret = try_pin_bo(bo, NULL, false, domain); -+ if (ret != 0) { -+ pr_err("amdkfd: Failed to pin BO\n"); -+ goto pin_bo_failed; -+ } -+ -+ ret = map_bo_to_gpuvm(adev, bo, entry->bo_va); -+ if (ret != 0) { -+ pr_err("amdkfd: Failed to map radeon bo to gpuvm\n"); -+ goto map_bo_to_gpuvm_failed; -+ } -+ entry->is_mapped = true; -+ mem->data2.mapped_to_gpu_memory++; -+ pr_debug("amdgpu: INC mapping count %d\n", -+ mem->data2.mapped_to_gpu_memory); -+ } -+ } -+ -+ if (!mem->data2.evicted) -+ unreserve_bo_and_vms(&ctx, true); -+ mutex_unlock(&mem->data2.lock); -+ return 0; -+ -+map_bo_to_gpuvm_failed: -+ unpin_bo(bo, false); -+pin_bo_failed: -+quiesce_failed: -+update_user_pages_failed: -+ if (!mem->data2.evicted) -+ unreserve_bo_and_vms(&ctx, false); -+bo_reserve_failed: -+add_bo_to_vm_failed: -+ mutex_unlock(&mem->data2.lock); -+ return ret; -+} -+ -+int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm) -+{ -+ int ret; -+ struct amdgpu_vm *new_vm; -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ -+ BUG_ON(kgd == NULL); -+ BUG_ON(vm == NULL); -+ -+ new_vm = kzalloc(sizeof(struct amdgpu_vm), GFP_KERNEL); -+ if (new_vm == NULL) -+ return -ENOMEM; -+ -+ /* Initialize the VM context, allocate the page directory and zero it */ -+ ret = amdgpu_vm_init(adev, new_vm); -+ if (ret != 0) { -+ pr_err("amdgpu: failed init vm ret %d\n", ret); -+ /* Undo everything related to the new VM context */ -+ amdgpu_vm_fini(adev, new_vm); -+ kfree(new_vm); -+ new_vm = NULL; -+ } -+ -+ *vm = (void *) new_vm; -+ -+ /* -+ * The previously "released" BOs are really released and their VAs are -+ * removed from PT. 
This function is called here because it requires -+ * the radeon_vm::mutex to be locked and PT to be reserved -+ */ -+ ret = amdgpu_vm_clear_freed(adev, new_vm); -+ if (ret != 0) -+ pr_err("amdgpu: Failed to amdgpu_vm_clear_freed\n"); -+ -+ pr_debug("amdgpu: created process vm with address 0x%llx\n", -+ new_vm->ids[7].pd_gpu_addr); -+ -+ return ret; -+} -+ -+void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm) -+{ -+ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -+ struct amdgpu_vm *avm = (struct amdgpu_vm *) vm; -+ -+ BUG_ON(kgd == NULL); -+ BUG_ON(vm == NULL); -+ -+ pr_debug("Destroying process vm with address %p\n", vm); -+ -+ /* Release the VM context */ -+ amdgpu_vm_fini(adev, avm); -+ kfree(vm); -+} -+ -+uint32_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm) -+{ -+ struct amdgpu_vm *avm = (struct amdgpu_vm *) vm; -+ struct amdgpu_vm_id *vm_id; -+ -+ BUG_ON(avm == NULL); -+ -+ vm_id = &avm->ids[7]; -+ return vm_id->pd_gpu_addr >> AMDGPU_GPU_PAGE_SHIFT; -+} -+ -+int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd, -+ struct kfd_vm_fault_info *mem) -+{ -+ struct amdgpu_device *adev; -+ -+ BUG_ON(kgd == NULL); -+ adev = (struct amdgpu_device *) kgd; -+ if (atomic_read(&adev->mc.vm_fault_info_updated) == 1) { -+ *mem = *adev->mc.vm_fault_info; -+ mb(); -+ atomic_set(&adev->mc.vm_fault_info_updated, 0); -+ } -+ return 0; -+} -+ -+static int unmap_bo_from_gpuvm(struct amdgpu_device *adev, -+ struct amdgpu_bo_va *bo_va) -+{ -+ struct amdgpu_vm *vm; -+ int ret; -+ struct ttm_validate_buffer tv; -+ struct amdgpu_bo_list_entry *vm_bos; -+ struct ww_acquire_ctx ticket; -+ struct list_head list, duplicates; -+ -+ INIT_LIST_HEAD(&list); -+ INIT_LIST_HEAD(&duplicates); -+ -+ vm = bo_va->vm; -+ tv.bo = &bo_va->bo->tbo; -+ tv.shared = true; -+ list_add(&tv.head, &list); -+ -+ vm_bos = amdgpu_vm_get_bos(adev, vm, &list); -+ if (!vm_bos) { -+ pr_err("amdkfd: Failed to get bos from vm\n"); -+ ret = -ENOMEM; -+ goto err_failed_to_get_bos; -+ } -+ -+ ret = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates); -+ if (ret) { -+ pr_err("amdkfd: Failed to reserve buffers in ttm\n"); -+ goto err_failed_to_ttm_reserve; -+ } -+ -+ mutex_lock(&vm->mutex); -+ -+ /* -+ * The previously "released" BOs are really released and their VAs are -+ * removed from PT. 
This function is called here because it requires -+ * the amdgpu_vm::mutex to be locked and PT to be reserved -+ */ -+	amdgpu_vm_clear_freed(adev, vm); -+ -+	/* Update the page tables - Remove the mapping from bo_va */ -+	amdgpu_vm_bo_update(adev, bo_va, NULL); -+ -+	amdgpu_vm_clear_invalids(adev, vm, NULL); -+ -+	mutex_unlock(&vm->mutex); -+ -+	ttm_eu_backoff_reservation(&ticket, &list); -+	drm_free_large(vm_bos); -+ -+	return 0; -+err_failed_to_ttm_reserve: -+	drm_free_large(vm_bos); -+err_failed_to_get_bos: -+	return ret; -+} -+ -+static bool is_mem_on_local_device(struct kgd_dev *kgd, -+		struct list_head *bo_va_list, void *vm) -+{ -+	struct kfd_bo_va_list *entry; -+ -+	list_for_each_entry(entry, bo_va_list, bo_list) { -+		if (entry->kgd_dev == kgd && entry->bo_va->vm == vm) -+			return true; -+	} -+ -+	return false; -+} -+ -+int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( -+		struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) -+{ -+	struct kfd_bo_va_list *entry; -+	struct amdgpu_device *adev; -+	unsigned mapped_before; -+	int ret = 0; -+ -+	BUG_ON(kgd == NULL); -+	BUG_ON(mem == NULL); -+ -+	adev = (struct amdgpu_device *) kgd; -+ -+	mutex_lock(&mem->data2.lock); -+ -+	/* -+	 * Make sure that this BO is mapped on KGD before unmapping it -+	 */ -+	if (!is_mem_on_local_device(kgd, &mem->data2.bo_va_list, vm)) { -+		ret = -EINVAL; -+		goto out; -+	} -+ -+	if (mem->data2.mapped_to_gpu_memory == 0) { -+		pr_debug("BO size %lu bytes at va 0x%llx is not mapped\n", -+			 mem->data2.bo->tbo.mem.size, mem->data2.va); -+		ret = -EINVAL; -+		goto out; -+	} -+	mapped_before = mem->data2.mapped_to_gpu_memory; -+ -+	list_for_each_entry(entry, &mem->data2.bo_va_list, bo_list) { -+		if (entry->kgd_dev == kgd && -+			entry->bo_va->vm == vm && -+			entry->is_mapped) { -+			if (mem->data2.evicted) { -+				/* If the BO is evicted, just mark the -+				 * mapping as unmapped and allow the -+				 * GPU's queues to resume.
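-+				 * Eviction already removed the GPUVM mapping and unpinned the BO and its page tables, so only the bookkeeping (is_mapped, mapped_to_gpu_memory) changes here.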
*/ -+ ret = kgd2kfd->resume_mm(adev->kfd, -+ current->mm); -+ if (ret != 0) -+ goto out; -+ entry->is_mapped = false; -+ mem->data2.mapped_to_gpu_memory--; -+ continue; -+ } -+ -+ pr_debug("unmapping BO with VA 0x%llx, size %lu bytes from GPU memory\n", -+ mem->data2.va, -+ mem->data2.bo->tbo.mem.size); -+ /* Unpin the PD directory*/ -+ unpin_bo(entry->bo_va->vm->page_directory, true); -+ /* Unpin PTs */ -+ unpin_pts(entry->bo_va, entry->bo_va->vm, true); -+ -+ /* Unpin BO*/ -+ unpin_bo(mem->data2.bo, true); -+ ret = unmap_bo_from_gpuvm(adev, entry->bo_va); -+ if (ret == 0) { -+ entry->is_mapped = false; -+ } else { -+ pr_err("amdgpu: failed unmap va 0x%llx\n", -+ mem->data2.va); -+ goto out; -+ } -+ mem->data2.mapped_to_gpu_memory--; -+ pr_debug("amdgpu: DEC mapping count %d\n", -+ mem->data2.mapped_to_gpu_memory); -+ } -+ } -+ if (mapped_before == mem->data2.mapped_to_gpu_memory) { -+ pr_debug("BO size %lu bytes at va 0x%llx is not mapped on GPU %x:%x.%x\n", -+ mem->data2.bo->tbo.mem.size, mem->data2.va, -+ adev->pdev->bus->number, PCI_SLOT(adev->pdev->devfn), -+ PCI_FUNC(adev->pdev->devfn)); -+ ret = -EINVAL; -+ } -+ -+out: -+ mutex_unlock(&mem->data2.lock); -+ return ret; -+} -+ -+int amdgpu_amdkfd_gpuvm_mmap_bo(struct kgd_dev *kgd, struct vm_area_struct *vma) -+{ -+ struct amdgpu_device *adev; -+ -+ adev = get_amdgpu_device(kgd); -+ BUG_ON(!adev); -+ -+ return amdgpu_bo_mmap(NULL, vma, &adev->mman.bdev); -+} -+ -+int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd, -+ struct kgd_mem *mem, void **kptr) -+{ -+ int ret; -+ struct amdgpu_device *adev; -+ struct amdgpu_bo *bo; -+ -+ adev = get_amdgpu_device(kgd); -+ -+ mutex_lock(&mem->data2.lock); -+ -+ bo = mem->data2.bo; -+ /* map the buffer */ -+ ret = amdgpu_bo_reserve(bo, true); -+ if (ret) { -+ dev_err(adev->dev, "(%d) failed to reserve bo for amdkfd\n", ret); -+ mutex_unlock(&mem->data2.lock); -+ return ret; -+ } -+ -+ ret = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT, -+ NULL); -+ if (ret) { -+ dev_err(adev->dev, "(%d) failed to pin bo for amdkfd\n", ret); -+ amdgpu_bo_unreserve(bo); -+ mutex_unlock(&mem->data2.lock); -+ return ret; -+ } -+ -+ ret = amdgpu_bo_kmap(bo, kptr); -+ if (ret) { -+ dev_err(adev->dev, -+ "(%d) failed to map bo to kernel for amdkfd\n", ret); -+ amdgpu_bo_unpin(bo); -+ amdgpu_bo_unreserve(bo); -+ mutex_unlock(&mem->data2.lock); -+ return ret; -+ } -+ -+ mem->data2.kptr = *kptr; -+ -+ amdgpu_bo_unreserve(bo); -+ mutex_unlock(&mem->data2.lock); -+ -+ return 0; -+} -+ -+static int pin_bo_wo_map(struct kgd_mem *mem) -+{ -+ struct amdgpu_bo *bo = mem->data2.bo; -+ int ret = 0; -+ -+ ret = amdgpu_bo_reserve(bo, false); -+ if (unlikely(ret != 0)) -+ return ret; -+ -+ ret = amdgpu_bo_pin(bo, mem->data2.domain, NULL); -+ amdgpu_bo_unreserve(bo); -+ -+ return ret; -+} -+ -+static void unpin_bo_wo_map(struct kgd_mem *mem) -+{ -+ struct amdgpu_bo *bo = mem->data2.bo; -+ int ret = 0; -+ -+ ret = amdgpu_bo_reserve(bo, false); -+ if (unlikely(ret != 0)) -+ return; -+ -+ amdgpu_bo_unpin(bo); -+ amdgpu_bo_unreserve(bo); -+} -+ -+#define AMD_GPU_PAGE_SHIFT PAGE_SHIFT -+#define AMD_GPU_PAGE_SIZE (_AC(1, UL) << AMD_GPU_PAGE_SHIFT) -+ -+static int get_sg_table(struct amdgpu_device *adev, -+ struct kgd_mem *mem, uint64_t offset, -+ uint64_t size, struct sg_table **ret_sg) -+{ -+ struct amdgpu_bo *bo = mem->data2.bo; -+ struct sg_table *sg = NULL; -+ unsigned long bus_addr; -+ unsigned int chunks; -+ unsigned int i; -+ struct scatterlist *s; -+ uint64_t offset_in_page; -+ unsigned int page_size; -+ int ret; -+ -+ sg = 
kmalloc(sizeof(struct sg_table), GFP_KERNEL); -+ if (!sg) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ if (bo->initial_domain == AMDGPU_GEM_DOMAIN_VRAM) -+ page_size = AMD_GPU_PAGE_SIZE; -+ else -+ page_size = PAGE_SIZE; -+ -+ -+ offset_in_page = offset & (page_size - 1); -+ chunks = (size + offset_in_page + page_size - 1) -+ / page_size; -+ -+ ret = sg_alloc_table(sg, chunks, GFP_KERNEL); -+ if (unlikely(ret)) -+ goto out; -+ -+ if (bo->initial_domain == AMDGPU_GEM_DOMAIN_VRAM) { -+ bus_addr = bo->tbo.offset + adev->mc.aper_base + offset; -+ -+ for_each_sg(sg->sgl, s, sg->orig_nents, i) { -+ uint64_t chunk_size, length; -+ -+ chunk_size = page_size - offset_in_page; -+ length = min(size, chunk_size); -+ -+ sg_set_page(s, NULL, length, offset_in_page); -+ s->dma_address = bus_addr; -+ s->dma_length = length; -+ -+ size -= length; -+ offset_in_page = 0; -+ bus_addr += length; -+ } -+ } else { -+ struct page **pages; -+ unsigned int cur_page; -+ -+ pages = bo->tbo.ttm->pages; -+ -+ cur_page = offset / page_size; -+ for_each_sg(sg->sgl, s, sg->orig_nents, i) { -+ uint64_t chunk_size, length; -+ -+ chunk_size = page_size - offset_in_page; -+ length = min(size, chunk_size); -+ -+ sg_set_page(s, pages[cur_page], length, offset_in_page); -+ s->dma_address = page_to_phys(pages[cur_page]); -+ s->dma_length = length; -+ -+ size -= length; -+ offset_in_page = 0; -+ cur_page++; -+ } -+ } -+ -+ *ret_sg = sg; -+ return 0; -+out: -+ kfree(sg); -+ *ret_sg = NULL; -+ return ret; -+} -+ -+int amdgpu_amdkfd_gpuvm_pin_get_sg_table(struct kgd_dev *kgd, -+ struct kgd_mem *mem, uint64_t offset, -+ uint64_t size, struct sg_table **ret_sg) -+{ -+ int ret; -+ struct amdgpu_device *adev; -+ -+ ret = pin_bo_wo_map(mem); -+ if (unlikely(ret != 0)) -+ return ret; -+ -+ adev = get_amdgpu_device(kgd); -+ -+ ret = get_sg_table(adev, mem, offset, size, ret_sg); -+ if (ret) -+ unpin_bo_wo_map(mem); -+ -+ return ret; -+} -+ -+void amdgpu_amdkfd_gpuvm_unpin_put_sg_table( -+ struct kgd_mem *mem, struct sg_table *sg) -+{ -+ sg_free_table(sg); -+ kfree(sg); -+ -+ unpin_bo_wo_map(mem); -+} -+ -+int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd, int dma_buf_fd, -+ uint64_t va, void *vm, -+ struct kgd_mem **mem, uint64_t *size) -+{ -+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd; -+ struct dma_buf *dma_buf; -+ struct drm_gem_object *obj; -+ struct amdgpu_bo *bo; -+ int r = -EINVAL; -+ -+ dma_buf = dma_buf_get(dma_buf_fd); -+ if (IS_ERR(dma_buf)) -+ return PTR_ERR(dma_buf); -+ -+ if (dma_buf->ops != &drm_gem_prime_dmabuf_ops) -+ /* Can't handle non-graphics buffers */ -+ goto out_put; -+ -+ obj = dma_buf->priv; -+ if (obj->dev->dev_private != adev) -+ /* Can't handle buffers from other devices */ -+ goto out_put; -+ -+ bo = gem_to_amdgpu_bo(obj); -+ if (!(bo->initial_domain & (AMDGPU_GEM_DOMAIN_VRAM | -+ AMDGPU_GEM_DOMAIN_GTT))) -+ /* Only VRAM and GTT BOs are supported */ -+ goto out_put; -+ -+ if (size) -+ *size = amdgpu_bo_size(bo); -+ -+ *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); -+ if (*mem == NULL) { -+ r = -ENOMEM; -+ goto out_put; -+ } -+ -+ INIT_LIST_HEAD(&(*mem)->data2.bo_va_list); -+ mutex_init(&(*mem)->data2.lock); -+ (*mem)->data2.execute = true; /* executable by default */ -+ -+ (*mem)->data2.bo = amdgpu_bo_ref(bo); -+ (*mem)->data2.va = va; -+ (*mem)->data2.domain = (bo->initial_domain & AMDGPU_GEM_DOMAIN_VRAM) ? 
-+ AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT; -+ (*mem)->data2.mapped_to_gpu_memory = 0; -+ -+ r = add_bo_to_vm(adev, va, vm, bo, &(*mem)->data2.bo_va_list, -+ false, true); -+ -+ if (r) { -+ amdgpu_bo_unref(&bo); -+ kfree(*mem); -+ *mem = NULL; -+ } -+ -+out_put: -+ dma_buf_put(dma_buf); -+ return r; -+} -+ -+/* Runs out of process context. mem->data2.lock must be held. */ -+int amdgpu_amdkfd_gpuvm_evict_mem(struct kgd_mem *mem, struct mm_struct *mm) -+{ -+ struct kfd_bo_va_list *entry; -+ unsigned n_evicted; -+ int r = 0; -+ -+ pr_debug("Evicting buffer %p\n", mem); -+ -+ if (mem->data2.mapped_to_gpu_memory == 0) -+ return 0; -+ -+ /* Remove all GPU mappings of the buffer, but don't change any -+ * of the is_mapped flags so we can restore it later. The -+ * queues of the affected GPUs are quiesced first. Count the -+ * number of evicted mappings so we can roll back if something -+ * goes wrong. */ -+ n_evicted = 0; -+ list_for_each_entry(entry, &mem->data2.bo_va_list, bo_list) { -+ struct amdgpu_device *adev; -+ -+ if (!entry->is_mapped) -+ continue; -+ -+ adev = (struct amdgpu_device *)entry->kgd_dev; -+ -+ r = kgd2kfd->quiesce_mm(adev->kfd, mm); -+ if (r != 0) { -+ pr_err("failed to quiesce KFD\n"); -+ goto fail; -+ } -+ -+ r = unmap_bo_from_gpuvm(adev, entry->bo_va); -+ if (r != 0) { -+ pr_err("failed unmap va 0x%llx\n", -+ mem->data2.va); -+ kgd2kfd->resume_mm(adev->kfd, mm); -+ goto fail; -+ } -+ -+ /* Unpin the PD directory*/ -+ unpin_bo(entry->bo_va->vm->page_directory, true); -+ /* Unpin PTs */ -+ unpin_pts(entry->bo_va, entry->bo_va->vm, true); -+ -+ /* Unpin BO*/ -+ unpin_bo(mem->data2.bo, true); -+ -+ n_evicted++; -+ } -+ -+ return 0; -+ -+fail: -+ /* To avoid hangs and keep state consistent, roll back partial -+ * eviction by restoring queues and marking mappings as -+ * unmapped. Access to now unmapped buffers will fault. */ -+ list_for_each_entry(entry, &mem->data2.bo_va_list, bo_list) { -+ struct amdgpu_device *adev; -+ -+ if (n_evicted == 0) -+ break; -+ if (!entry->is_mapped) -+ continue; -+ -+ entry->is_mapped = false; -+ -+ adev = (struct amdgpu_device *)entry->kgd_dev; -+ if (kgd2kfd->resume_mm(adev->kfd, mm)) -+ pr_err("Failed to resume KFD\n"); -+ -+ n_evicted--; -+ } -+ -+ return r; -+} -+ -+/* Runs out of process context. mem->data2.lock must be held. */ -+int amdgpu_amdkfd_gpuvm_restore_mem(struct kgd_mem *mem, struct mm_struct *mm) -+{ -+ struct bo_vm_reservation_context ctx; -+ struct kfd_bo_va_list *entry; -+ uint32_t domain; -+ int r, ret = 0; -+ bool have_pages = false; -+ -+ pr_debug("Restoring buffer %p\n", mem); -+ -+ if (mem->data2.mapped_to_gpu_memory == 0) -+ return 0; -+ -+ domain = mem->data2.domain; -+ -+ ret = reserve_bo_and_vms(mem->data2.bo->adev, mem->data2.bo, -+ &mem->data2.bo_va_list, NULL, true, &ctx); -+ if (likely(ret == 0)) { -+ ret = update_user_pages(mem, mm, &ctx); -+ have_pages = !ret; -+ if (!have_pages) -+ unreserve_bo_and_vms(&ctx, false); -+ } -+ -+ /* update_user_pages drops the lock briefly. Check if someone -+ * else evicted or restored the buffer in the mean time */ -+ if (mem->data2.evicted != 1) { -+ unreserve_bo_and_vms(&ctx, false); -+ return 0; -+ } -+ -+ /* Try to restore all mappings. Mappings that fail to restore -+ * will be marked as unmapped. If we failed to get the user -+ * pages, all mappings will be marked as unmapped. 
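-+	 * Queues are resumed in every case via the resume_kfd label below; a mapping that could not be restored then triggers a GPUVM fault instead of leaving the queues hung.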
*/ -+	list_for_each_entry(entry, &mem->data2.bo_va_list, bo_list) { -+		struct amdgpu_device *adev; -+ -+		if (!entry->is_mapped) -+			continue; -+ -+		adev = (struct amdgpu_device *)entry->kgd_dev; -+ -+		if (unlikely(!have_pages)) { -+			entry->is_mapped = false; -+			goto resume_kfd; -+		} -+ -+		r = try_pin_bo(mem->data2.bo, NULL, false, domain); -+		if (unlikely(r != 0)) { -+			pr_err("Failed to pin BO\n"); -+			entry->is_mapped = false; -+			if (ret == 0) -+				ret = r; -+			goto resume_kfd; -+		} -+ -+		r = map_bo_to_gpuvm(adev, mem->data2.bo, entry->bo_va); -+		if (unlikely(r != 0)) { -+			pr_err("Failed to map BO to gpuvm\n"); -+			entry->is_mapped = false; -+			unpin_bo(mem->data2.bo, true); -+			if (ret == 0) -+				ret = r; -+		} -+ -+		/* Resume queues even if restore failed. Worst case -+		 * the app will get a GPUVM fault. That's better than -+		 * hanging the queues indefinitely. */ -+resume_kfd: -+		r = kgd2kfd->resume_mm(adev->kfd, mm); -+		if (r != 0) { -+			pr_err("Failed to resume KFD\n"); -+			if (ret == 0) -+				ret = r; -+		} -+	} -+ -+	if (have_pages) -+		unreserve_bo_and_vms(&ctx, true); -+ -+	return ret; -+} -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c -index 06b824c..5ce6528 100644 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c -@@ -381,7 +381,7 @@ void amdgpu_ring_lru_touch(struct amdgpu_device *adev, struct amdgpu_ring *ring) - static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf, - size_t size, loff_t *pos) - { -- struct amdgpu_ring *ring = (struct amdgpu_ring*)kcl_file_private(f); -+ struct amdgpu_ring *ring = file_inode(f)->i_private; - int r, i; - uint32_t value, result, early[3]; - -diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig -index e13c67c..ac49532 100644 ---- a/drivers/gpu/drm/amd/amdkfd/Kconfig -+++ b/drivers/gpu/drm/amd/amdkfd/Kconfig -@@ -5,5 +5,6 @@ - config HSA_AMD - tristate "HSA kernel driver for AMD GPU devices" - depends on (DRM_RADEON || DRM_AMDGPU) && AMD_IOMMU_V2 && X86_64 -+ select DRM_AMDGPU_USERPTR - help - Enable this if you want to use HSA features on AMD GPU devices.
-diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile -index b400d56..60c60c0 100644 ---- a/drivers/gpu/drm/amd/amdkfd/Makefile -+++ b/drivers/gpu/drm/amd/amdkfd/Makefile -@@ -14,6 +14,6 @@ amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \ - kfd_process_queue_manager.o kfd_device_queue_manager.o \ - kfd_device_queue_manager_cik.o kfd_device_queue_manager_vi.o \ - kfd_interrupt.o kfd_events.o cik_event_interrupt.o \ -- kfd_dbgdev.o kfd_dbgmgr.o -+ kfd_dbgdev.o kfd_dbgmgr.o kfd_flat_memory.o kfd_crat.o kfd_rdma.o - - obj-$(CONFIG_HSA_AMD) += amdkfd.o -diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c -index 211fc48..02a9082 100644 ---- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c -+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c -@@ -24,40 +24,59 @@ - #include "kfd_events.h" - #include "cik_int.h" - --static bool cik_event_interrupt_isr(struct kfd_dev *dev, -+static bool is_cpc_vm_fault(struct kfd_dev *dev, - const uint32_t *ih_ring_entry) - { -- unsigned int pasid; - const struct cik_ih_ring_entry *ihre = - (const struct cik_ih_ring_entry *)ih_ring_entry; - -- pasid = (ihre->ring_id & 0xffff0000) >> 16; -+ if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || -+ ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) && -+ ihre->vmid >= dev->vm_info.first_vmid_kfd && -+ ihre->vmid <= dev->vm_info.last_vmid_kfd) -+ return true; -+ return false; -+} -+static bool cik_event_interrupt_isr(struct kfd_dev *dev, -+ const uint32_t *ih_ring_entry) -+{ -+ const struct cik_ih_ring_entry *ihre = -+ (const struct cik_ih_ring_entry *)ih_ring_entry; - - /* Do not process in ISR, just request it to be forwarded to WQ. */ -- return (pasid != 0) && -+ return (ihre->pasid != 0) && - (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || - ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG || -- ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE); -+ ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE || -+ is_cpc_vm_fault(dev, ih_ring_entry)); - } - - static void cik_event_interrupt_wq(struct kfd_dev *dev, - const uint32_t *ih_ring_entry) - { -- unsigned int pasid; - const struct cik_ih_ring_entry *ihre = - (const struct cik_ih_ring_entry *)ih_ring_entry; - -- pasid = (ihre->ring_id & 0xffff0000) >> 16; -- -- if (pasid == 0) -+ if (ihre->pasid == 0) - return; - - if (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE) -- kfd_signal_event_interrupt(pasid, 0, 0); -+ kfd_signal_event_interrupt(ihre->pasid, 0, 0); - else if (ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG) -- kfd_signal_event_interrupt(pasid, ihre->data & 0xFF, 8); -+ kfd_signal_event_interrupt(ihre->pasid, ihre->data & 0xFF, 8); - else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE) -- kfd_signal_hw_exception_event(pasid); -+ kfd_signal_hw_exception_event(ihre->pasid); -+ else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || -+ ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { -+ struct kfd_vm_fault_info info; -+ -+ dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info); -+ kfd_process_vm_fault(dev->dqm, ihre->pasid); -+ if (info.vmid == ihre->vmid) -+ kfd_signal_vm_fault_event(dev, ihre->pasid, &info); -+ else -+ kfd_signal_vm_fault_event(dev, ihre->pasid, NULL); -+ } - } - - const struct kfd_event_interrupt_class event_interrupt_class_cik = { -diff --git a/drivers/gpu/drm/amd/amdkfd/cik_int.h b/drivers/gpu/drm/amd/amdkfd/cik_int.h -index 79a16d2..feb3c24 100644 ---- a/drivers/gpu/drm/amd/amdkfd/cik_int.h -+++ b/drivers/gpu/drm/amd/amdkfd/cik_int.h 
-@@ -26,16 +26,30 @@ - #include <linux/types.h> - - struct cik_ih_ring_entry { -- uint32_t source_id; -- uint32_t data; -- uint32_t ring_id; -- uint32_t reserved; -+ uint32_t source_id:8; -+ uint32_t reserved1:8; -+ uint32_t reserved2:16; -+ -+ uint32_t data:28; -+ uint32_t reserved3:4; -+ -+ /* pipeid, meid and unused3 are officially called RINGID, -+ * but for our purposes, they always decode into pipe and ME. */ -+ uint32_t pipeid:2; -+ uint32_t meid:2; -+ uint32_t reserved4:4; -+ uint32_t vmid:8; -+ uint32_t pasid:16; -+ -+ uint32_t reserved5; - }; - - #define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6 - #define CIK_INTSRC_CP_END_OF_PIPE 0xB5 - #define CIK_INTSRC_CP_BAD_OPCODE 0xB7 - #define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF -+#define CIK_INTSRC_GFX_PAGE_INV_FAULT 0x92 -+#define CIK_INTSRC_GFX_MEM_PROT_FAULT 0x93 - - #endif - -diff --git a/drivers/gpu/drm/amd/amdkfd/cik_regs.h b/drivers/gpu/drm/amd/amdkfd/cik_regs.h -index 48769d1..607fc5c 100644 ---- a/drivers/gpu/drm/amd/amdkfd/cik_regs.h -+++ b/drivers/gpu/drm/amd/amdkfd/cik_regs.h -@@ -23,11 +23,33 @@ - #ifndef CIK_REGS_H - #define CIK_REGS_H - -+#define IH_VMID_0_LUT 0x3D40u -+ -+#define BIF_DOORBELL_CNTL 0x530Cu -+ -+#define SRBM_GFX_CNTL 0xE44 -+#define PIPEID(x) ((x) << 0) -+#define MEID(x) ((x) << 2) -+#define VMID(x) ((x) << 4) -+#define QUEUEID(x) ((x) << 8) -+ -+#define SQ_CONFIG 0x8C00 -+ -+#define SH_MEM_BASES 0x8C28 - /* if PTR32, these are the bases for scratch and lds */ - #define PRIVATE_BASE(x) ((x) << 0) /* scratch */ - #define SHARED_BASE(x) ((x) << 16) /* LDS */ -+#define SH_MEM_APE1_BASE 0x8C2C -+/* if PTR32, this is the base location of GPUVM */ -+#define SH_MEM_APE1_LIMIT 0x8C30 -+/* if PTR32, this is the upper limit of GPUVM */ -+#define SH_MEM_CONFIG 0x8C34 - #define PTR32 (1 << 0) -+#define PRIVATE_ATC (1 << 1) - #define ALIGNMENT_MODE(x) ((x) << 2) -+#define SH_MEM_ALIGNMENT_MODE_DWORD 0 -+#define SH_MEM_ALIGNMENT_MODE_DWORD_STRICT 1 -+#define SH_MEM_ALIGNMENT_MODE_STRICT 2 - #define SH_MEM_ALIGNMENT_MODE_UNALIGNED 3 - #define DEFAULT_MTYPE(x) ((x) << 4) - #define APE1_MTYPE(x) ((x) << 7) -@@ -36,37 +58,164 @@ - #define MTYPE_CACHED 0 - #define MTYPE_NONCACHED 3 - -+ -+#define SH_STATIC_MEM_CONFIG 0x9604u -+ -+#define TC_CFG_L1_LOAD_POLICY0 0xAC68 -+#define TC_CFG_L1_LOAD_POLICY1 0xAC6C -+#define TC_CFG_L1_STORE_POLICY 0xAC70 -+#define TC_CFG_L2_LOAD_POLICY0 0xAC74 -+#define TC_CFG_L2_LOAD_POLICY1 0xAC78 -+#define TC_CFG_L2_STORE_POLICY0 0xAC7C -+#define TC_CFG_L2_STORE_POLICY1 0xAC80 -+#define TC_CFG_L2_ATOMIC_POLICY 0xAC84 -+#define TC_CFG_L1_VOLATILE 0xAC88 -+#define TC_CFG_L2_VOLATILE 0xAC8C -+ -+#define CP_PQ_WPTR_POLL_CNTL 0xC20C -+#define WPTR_POLL_EN (1 << 31) -+ -+#define CPC_INT_CNTL 0xC2D0 -+#define CP_ME1_PIPE0_INT_CNTL 0xC214 -+#define CP_ME1_PIPE1_INT_CNTL 0xC218 -+#define CP_ME1_PIPE2_INT_CNTL 0xC21C -+#define CP_ME1_PIPE3_INT_CNTL 0xC220 -+#define CP_ME2_PIPE0_INT_CNTL 0xC224 -+#define CP_ME2_PIPE1_INT_CNTL 0xC228 -+#define CP_ME2_PIPE2_INT_CNTL 0xC22C -+#define CP_ME2_PIPE3_INT_CNTL 0xC230 -+#define DEQUEUE_REQUEST_INT_ENABLE (1 << 13) -+#define WRM_POLL_TIMEOUT_INT_ENABLE (1 << 17) -+#define PRIV_REG_INT_ENABLE (1 << 23) -+#define TIME_STAMP_INT_ENABLE (1 << 26) -+#define GENERIC2_INT_ENABLE (1 << 29) -+#define GENERIC1_INT_ENABLE (1 << 30) -+#define GENERIC0_INT_ENABLE (1 << 31) -+#define CP_ME1_PIPE0_INT_STATUS 0xC214 -+#define CP_ME1_PIPE1_INT_STATUS 0xC218 -+#define CP_ME1_PIPE2_INT_STATUS 0xC21C -+#define CP_ME1_PIPE3_INT_STATUS 0xC220 -+#define CP_ME2_PIPE0_INT_STATUS 0xC224 -+#define 
CP_ME2_PIPE1_INT_STATUS 0xC228 -+#define CP_ME2_PIPE2_INT_STATUS 0xC22C -+#define CP_ME2_PIPE3_INT_STATUS 0xC230 -+#define DEQUEUE_REQUEST_INT_STATUS (1 << 13) -+#define WRM_POLL_TIMEOUT_INT_STATUS (1 << 17) -+#define PRIV_REG_INT_STATUS (1 << 23) -+#define TIME_STAMP_INT_STATUS (1 << 26) -+#define GENERIC2_INT_STATUS (1 << 29) -+#define GENERIC1_INT_STATUS (1 << 30) -+#define GENERIC0_INT_STATUS (1 << 31) -+ -+#define CP_HPD_EOP_BASE_ADDR 0xC904 -+#define CP_HPD_EOP_BASE_ADDR_HI 0xC908 -+#define CP_HPD_EOP_VMID 0xC90C -+#define CP_HPD_EOP_CONTROL 0xC910 -+#define EOP_SIZE(x) ((x) << 0) -+#define EOP_SIZE_MASK (0x3f << 0) -+#define CP_MQD_BASE_ADDR 0xC914 -+#define CP_MQD_BASE_ADDR_HI 0xC918 -+#define CP_HQD_ACTIVE 0xC91C -+#define CP_HQD_VMID 0xC920 -+ -+#define CP_HQD_PERSISTENT_STATE 0xC924u - #define DEFAULT_CP_HQD_PERSISTENT_STATE (0x33U << 8) - #define PRELOAD_REQ (1 << 0) - --#define MQD_CONTROL_PRIV_STATE_EN (1U << 8) -- --#define DEFAULT_MIN_IB_AVAIL_SIZE (3U << 20) -- --#define IB_ATC_EN (1U << 23) -- -+#define CP_HQD_PIPE_PRIORITY 0xC928u -+#define CP_HQD_QUEUE_PRIORITY 0xC92Cu -+#define CP_HQD_QUANTUM 0xC930u - #define QUANTUM_EN 1U - #define QUANTUM_SCALE_1MS (1U << 4) - #define QUANTUM_DURATION(x) ((x) << 8) - -+#define CP_HQD_PQ_BASE 0xC934 -+#define CP_HQD_PQ_BASE_HI 0xC938 -+#define CP_HQD_PQ_RPTR 0xC93C -+#define CP_HQD_PQ_RPTR_REPORT_ADDR 0xC940 -+#define CP_HQD_PQ_RPTR_REPORT_ADDR_HI 0xC944 -+#define CP_HQD_PQ_WPTR_POLL_ADDR 0xC948 -+#define CP_HQD_PQ_WPTR_POLL_ADDR_HI 0xC94C -+#define CP_HQD_PQ_DOORBELL_CONTROL 0xC950 -+#define DOORBELL_OFFSET(x) ((x) << 2) -+#define DOORBELL_OFFSET_MASK (0x1fffff << 2) -+#define DOORBELL_SOURCE (1 << 28) -+#define DOORBELL_SCHD_HIT (1 << 29) -+#define DOORBELL_EN (1 << 30) -+#define DOORBELL_HIT (1 << 31) -+#define CP_HQD_PQ_WPTR 0xC954 -+#define CP_HQD_PQ_CONTROL 0xC958 -+#define QUEUE_SIZE(x) ((x) << 0) -+#define QUEUE_SIZE_MASK (0x3f << 0) - #define RPTR_BLOCK_SIZE(x) ((x) << 8) -+#define RPTR_BLOCK_SIZE_MASK (0x3f << 8) - #define MIN_AVAIL_SIZE(x) ((x) << 20) -+#define PQ_ATC_EN (1 << 23) -+#define PQ_VOLATILE (1 << 26) -+#define NO_UPDATE_RPTR (1 << 27) -+#define UNORD_DISPATCH (1 << 28) -+#define ROQ_PQ_IB_FLIP (1 << 29) -+#define PRIV_STATE (1 << 30) -+#define KMD_QUEUE (1 << 31) -+ - #define DEFAULT_RPTR_BLOCK_SIZE RPTR_BLOCK_SIZE(5) - #define DEFAULT_MIN_AVAIL_SIZE MIN_AVAIL_SIZE(3) - --#define PQ_ATC_EN (1 << 23) --#define NO_UPDATE_RPTR (1 << 27) -+#define CP_HQD_IB_BASE_ADDR 0xC95Cu -+#define CP_HQD_IB_BASE_ADDR_HI 0xC960u -+#define CP_HQD_IB_RPTR 0xC964u -+#define CP_HQD_IB_CONTROL 0xC968u -+#define IB_ATC_EN (1U << 23) -+#define DEFAULT_MIN_IB_AVAIL_SIZE (3U << 20) - --#define DOORBELL_OFFSET(x) ((x) << 2) --#define DOORBELL_EN (1 << 30) -+#define CP_HQD_DEQUEUE_REQUEST 0xC974 -+#define DEQUEUE_REQUEST_DRAIN 1 -+#define DEQUEUE_REQUEST_RESET 2 -+#define DEQUEUE_INT (1U << 8) - --#define PRIV_STATE (1 << 30) --#define KMD_QUEUE (1 << 31) -+#define CP_HQD_SEMA_CMD 0xC97Cu -+#define CP_HQD_MSG_TYPE 0xC980u -+#define CP_HQD_ATOMIC0_PREOP_LO 0xC984u -+#define CP_HQD_ATOMIC0_PREOP_HI 0xC988u -+#define CP_HQD_ATOMIC1_PREOP_LO 0xC98Cu -+#define CP_HQD_ATOMIC1_PREOP_HI 0xC990u -+#define CP_HQD_HQ_SCHEDULER0 0xC994u -+#define CP_HQD_HQ_SCHEDULER1 0xC998u - --#define AQL_ENABLE 1 -+ -+#define CP_MQD_CONTROL 0xC99C -+#define MQD_VMID(x) ((x) << 0) -+#define MQD_VMID_MASK (0xf << 0) -+#define MQD_CONTROL_PRIV_STATE_EN (1U << 8) - - #define GRBM_GFX_INDEX 0x30800 -+#define INSTANCE_INDEX(x) ((x) << 0) -+#define SH_INDEX(x) ((x) << 8) 
-+#define SE_INDEX(x) ((x) << 16) -+#define SH_BROADCAST_WRITES (1 << 29) -+#define INSTANCE_BROADCAST_WRITES (1 << 30) -+#define SE_BROADCAST_WRITES (1 << 31) - -+#define SQC_CACHES 0x30d20 -+#define SQC_POLICY 0x8C38u -+#define SQC_VOLATILE 0x8C3Cu -+ -+#define CP_PERFMON_CNTL 0x36020 -+ -+#define ATC_VMID0_PASID_MAPPING 0x339Cu -+#define ATC_VMID_PASID_MAPPING_UPDATE_STATUS 0x3398u - #define ATC_VMID_PASID_MAPPING_VALID (1U << 31) - -+#define ATC_VM_APERTURE0_CNTL 0x3310u -+#define ATS_ACCESS_MODE_NEVER 0 -+#define ATS_ACCESS_MODE_ALWAYS 1 -+ -+#define ATC_VM_APERTURE0_CNTL2 0x3318u -+#define ATC_VM_APERTURE0_HIGH_ADDR 0x3308u -+#define ATC_VM_APERTURE0_LOW_ADDR 0x3300u -+#define ATC_VM_APERTURE1_CNTL 0x3314u -+#define ATC_VM_APERTURE1_CNTL2 0x331Cu -+#define ATC_VM_APERTURE1_HIGH_ADDR 0x330Cu -+#define ATC_VM_APERTURE1_LOW_ADDR 0x3304u -+ - #endif -diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h -new file mode 100644 -index 0000000..1880dc0 ---- /dev/null -+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h -@@ -0,0 +1,1377 @@ -+/* -+ * Copyright 2015 Advanced Micro Devices, Inc. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. -+ */ -+ -+#if 0 -+ HW (CARRIZO) source code for CWSR trap handler -+ -+var G8SR_WDMEM_HWREG_OFFSET = 0 -+var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes -+ -+// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore. 
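-+// Note: the s_g8sr_ts_* SGPR pairs below receive s_memrealtime samples at the save/restore milestones named in their comments; they are only written when G8SR_DEBUG_TIMESTAMP is non-zero.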
-+ -+var G8SR_DEBUG_TIMESTAMP = 0 -+var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset -+var s_g8sr_ts_save_s = s[34:35] // save start -+var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader sends the SAVEWAVE msg to SPI -+var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI writes the SR address to SQ -+var s_g8sr_ts_save_d = s[40:41] // save end -+var s_g8sr_ts_restore_s = s[42:43] // restore start -+var s_g8sr_ts_restore_d = s[44:45] // restore end -+ -+var G8SR_VGPR_SR_IN_DWX4 = 0 -+var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes -+var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 -+ -+ -+/*************************************************************************/ -+/* control on how to run the shader */ -+/*************************************************************************/ -+//any hack that needs to be made to run this code in EMU (either because various EMU code is not ready or no compute save & restore in EMU run) -+var EMU_RUN_HACK = 0 -+var EMU_RUN_HACK_RESTORE_NORMAL = 0 -+var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 -+var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 -+var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK -+var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK -+var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK -+var SAVE_LDS = 1 -+var WG_BASE_ADDR_LO = 0x9000a000 -+var WG_BASE_ADDR_HI = 0x0 -+var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem -+var CTX_SAVE_CONTROL = 0x0 -+var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL -+var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code is not ready or no compute save & restore in RTL run) -+var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write -+var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes -+var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing -+ -+/**************************************************************************/ -+/* variables */ -+/**************************************************************************/ -+var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 -+var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 -+ -+var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 -+var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 -+var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 -+var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 -+var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 -+var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits -+ -+var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 -+var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask -+var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 -+var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 -+var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 -+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF -+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 -+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 -+var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 -+var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 -+var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 -+ -+var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME -+var SQ_WAVE_IB_STS_RCNT_SIZE = 4 //FIXME -+var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME -+var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1 //FIXME -+var
SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME -+ -+var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 -+var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 -+ -+ -+/* Save */ -+var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes -+var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE -+ -+var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit -+var S_SAVE_SPI_INIT_ATC_SHIFT = 27 -+var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype -+var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 -+var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG -+var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 -+ -+var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used -+var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME -+var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME -+var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME -+ -+var s_save_spi_init_lo = exec_lo -+var s_save_spi_init_hi = exec_hi -+ -+ //tba_lo and tba_hi need to be saved/restored -+var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} -+var s_save_pc_hi = ttmp1 -+var s_save_exec_lo = ttmp2 -+var s_save_exec_hi = ttmp3 -+var s_save_status = ttmp4 -+var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine -+var s_save_xnack_mask_lo = ttmp6 -+var s_save_xnack_mask_hi = ttmp7 -+var s_save_buf_rsrc0 = ttmp8 -+var s_save_buf_rsrc1 = ttmp9 -+var s_save_buf_rsrc2 = ttmp10 -+var s_save_buf_rsrc3 = ttmp11 -+ -+var s_save_mem_offset = tma_lo -+var s_save_alloc_size = s_save_trapsts //conflict -+var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time) -+var s_save_m0 = tma_hi -+ -+/* Restore */ -+var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE -+var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC -+ -+var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit -+var S_RESTORE_SPI_INIT_ATC_SHIFT = 27 -+var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype -+var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28 -+var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG -+var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 -+ -+var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT -+var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK -+var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT -+var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK -+ -+var s_restore_spi_init_lo = exec_lo -+var s_restore_spi_init_hi = exec_hi -+ -+var s_restore_mem_offset = ttmp2 -+var s_restore_alloc_size = ttmp3 -+var s_restore_tmp = ttmp6 //tba_lo/hi need to be restored -+var s_restore_mem_offset_save = s_restore_tmp //no conflict -+ -+var s_restore_m0 = s_restore_alloc_size //no conflict -+ -+var s_restore_mode = ttmp7 -+ -+var s_restore_pc_lo = ttmp0 -+var s_restore_pc_hi = ttmp1 -+var s_restore_exec_lo = tma_lo //no conflict -+var s_restore_exec_hi = tma_hi //no conflict -+var s_restore_status = ttmp4 -+var s_restore_trapsts = ttmp5 -+var s_restore_xnack_mask_lo = xnack_mask_lo -+var s_restore_xnack_mask_hi = xnack_mask_hi -+var s_restore_buf_rsrc0 = ttmp8 -+var s_restore_buf_rsrc1 = ttmp9 -+var s_restore_buf_rsrc2 = ttmp10 -+var s_restore_buf_rsrc3 = ttmp11 -+ -+/**************************************************************************/ -+/*
trap handler entry points */ -+/**************************************************************************/ -+/* Shader Main*/ -+ -+shader main -+ asic(CARRIZO) -+ type(CS) -+ -+ -+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore -+ //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC -+ s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC -+ s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. -+ s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE -+ //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE -+ s_branch L_SKIP_RESTORE //NOT restore, SAVE actually -+ else -+ s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save -+ end -+ -+L_JUMP_TO_RESTORE: -+ s_branch L_RESTORE //restore -+ -+L_SKIP_RESTORE: -+ -+ s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC -+ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) -+ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save -+ s_cbranch_scc1 L_SAVE //this is the operation for save -+ -+ // ********* Handle non-CWSR traps ******************* -+if (!EMU_RUN_HACK) -+ /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */ -+ s_load_dwordx4 [ttmp8,ttmp9,ttmp10, ttmp11], [tma_lo,tma_hi], 0 -+ s_waitcnt lgkmcnt(0) -+ s_or_b32 ttmp7, ttmp8, ttmp9 -+ s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set -+ s_mov_b32 tma_lo, ttmp10 //set tma_lo/hi for next level trap handler -+ s_mov_b32 tma_hi, ttmp11 -+ s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) -+ s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler -+ -+L_NO_NEXT_TRAP: -+ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) -+ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception -+ s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. -+ s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 -+ s_addc_u32 ttmp1, ttmp1, 0 -+L_EXCP_CASE: -+ s_and_b32 ttmp1, ttmp1, 0xFFFF -+ s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) -+ s_rfe_b64 [ttmp0, ttmp1] -+end -+ // ********* End handling of non-CWSR traps ******************* -+ -+/**************************************************************************/ -+/* save routine */ -+/**************************************************************************/ -+ -+L_SAVE: -+ -+if G8SR_DEBUG_TIMESTAMP -+ s_memrealtime s_g8sr_ts_save_s -+ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? 
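-+    // s_memrealtime returns over the scalar memory path and counts against lgkmcnt, so the s_waitcnt above ensures the timestamp is in the SGPR pair before the save sequence continues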
-+end
-+
-+ //check whether there is mem_viol
-+ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
-+ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
-+ s_cbranch_scc0 L_NO_PC_REWIND
-+
-+ //if so, need to rewind the PC, assuming the GDS operation got NACKed
-+ s_mov_b32 s_save_tmp, 0 //clear mem_viol bit
-+ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit
-+ s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
-+ s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8
-+ s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc
-+
-+L_NO_PC_REWIND:
-+ s_mov_b32 s_save_tmp, 0 //clear saveCtx bit
-+ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit
-+
-+ s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK
-+ s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //save XNACK_MASK; must be done before any memory operation
-+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT
-+ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
-+ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
-+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY
-+ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
-+ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
-+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS
-+ s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
-+
-+ s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp
-+
-+ /* inform SPI of readiness and wait for SPI's go signal */
-+ s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
-+ s_mov_b32 s_save_exec_hi, exec_hi
-+ s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive
-+
-+if G8SR_DEBUG_TIMESTAMP
-+ s_memrealtime s_g8sr_ts_sq_save_msg
-+ s_waitcnt lgkmcnt(0)
-+end
-+
-+ if (EMU_RUN_HACK)
-+
-+ else
-+ s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
-+ end
-+
-+ L_SLEEP:
-+ s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD and would hang SQ: the 7th/8th wave could not win arbitration to execute an instruction while the other waves are stuck in the sleep loop waiting for wrexec!=0
-+
-+ if (EMU_RUN_HACK)
-+
-+ else
-+ s_cbranch_execz L_SLEEP
-+ end
-+
-+if G8SR_DEBUG_TIMESTAMP
-+ s_memrealtime s_g8sr_ts_spi_wrexec
-+ s_waitcnt lgkmcnt(0)
-+end
-+
-+ /* setup Resource Constants */
-+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
-+ //calculate wd_addr using absolute thread id
-+ v_readlane_b32 s_save_tmp, v9, 0
-+ s_lshr_b32 s_save_tmp, s_save_tmp, 6
-+ s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
-+ s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
-+ s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
-+ s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
-+ else
-+ end
-+ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
-+ s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
-+ s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
-+ s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
-+ else
-+ end
-+
-+
-+ s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
-+ s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
-+ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
-+ s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not necessarily initialized
-+ s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
-+ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
-+ s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
-+ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC
-+ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
-+ s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
-+ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE
-+
-+ //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?)
-+ s_mov_b32 s_save_m0, m0 //save M0
-+
-+ /* global mem offset */
-+ s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0
-+
-+
-+
-+
-+ /* save HW registers */
-+ //////////////////////////////
-+
-+ L_SAVE_HWREG:
-+ // HWREG SR memory offset : size(VGPR)+size(SGPR)
-+ get_vgpr_size_bytes(s_save_mem_offset)
-+ get_sgpr_size_bytes(s_save_tmp)
-+ s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
-+
-+
-+ s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
-+ if (SWIZZLE_EN)
-+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
-+ else
-+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-+ end
-+
-+
-+ write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0
-+
-+ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
-+ s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
-+ s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
-+ s_mov_b32 tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO
-+ s_mov_b32 tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI
-+ end
-+
-+ write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC
-+ write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
-+ write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC
-+ write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
-+ // Save the tma_lo and tma_hi content from exec_lo and ttmp5
-+ s_mov_b32 s_save_exec_lo, exec_lo
-+ s_mov_b32 s_save_exec_hi, ttmp5
-+ write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS
-+
-+ //s_save_trapsts conflicts with s_save_alloc_size
-+ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
-+ write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS
-+
-+ write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO
-+ write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI
-+
-+ //using s_save_tmp would introduce a conflict here between s_save_tmp and s_save_buf_rsrc2
-+ s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE
-+ write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
-+ write_hwreg_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset) //TBA_LO
-+ write_hwreg_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset) //TBA_HI
-+ write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //TMA_LO
-+ write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) //TMA_HI
-+
-+ /* the first wave in the threadgroup */
-+ // save first_wave bit in tba_hi unused bit.26
-+ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract first wave bit
-+ //s_or_b32 tba_hi, s_save_tmp, tba_hi // save first wave bit to tba_hi.bits[26]
-+ s_mov_b32 s_save_exec_hi, 0x0
-+ s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26]
-+
-+
-+ /* save SGPRs */
-+ // Save SGPRs before the LDS save, so that s0 to s4 can be used during the LDS save...
-+ //////////////////////////////
-+
-+ // SGPR SR memory offset : size(VGPR)
-+ get_vgpr_size_bytes(s_save_mem_offset)
-+ // TODO, change RSRC word to rearrange memory layout for SGPRS
-+
-+ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
-+ s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
-+ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
-+
-+ if (SGPR_SAVE_USE_SQC)
-+ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes
-+ else
-+ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
-+ end
-+
-+ if (SWIZZLE_EN)
-+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
-+ else
-+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-+ end
-+
-+
-+ // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0
-+ //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0
-+ s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0
-+ s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
-+
-+ s_mov_b32 m0, 0x0 //SGPR initial index value =0
-+ L_SAVE_SGPR_LOOP:
-+ // SGPR is allocated in 16 SGPR granularity
-+ s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0]
-+ s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0]
-+ s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0]
-+ s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0]
-+ s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0]
-+ s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0]
-+ s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0]
-+ s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0]
-+
-+ write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4
-+ s_add_u32 m0, m0, 16 //next sgpr index
-+ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
-+ s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete?
-+ // restore s_save_buf_rsrc0,1
-+ //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo
-+ s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo
-+
-+
-+
-+
-+ /* save the first 4 VGPRs, so the LDS save can use them */
-+ // each wave allocates at least 4 vgprs...
-+ /////////////////////////////////////////////////////////////////////////////////////
-+
-+ s_mov_b32 s_save_mem_offset, 0
-+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
-+ s_mov_b32 exec_hi, 0xFFFFFFFF
-+
-+ if (SWIZZLE_EN)
-+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
-+ else
-+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-+ end
-+
-+
-+ // VGPR Allocated in 4-GPR granularity
-+
-+if G8SR_VGPR_SR_IN_DWX4
-+ // the const stride for DWx4 is 4*4 bytes
-+ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
-+ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes
-+
-+ buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
-+
-+ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
-+ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes
-+else
-+ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
-+ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
-+ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
-+ buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
-+end
-+
-+
-+
-+ /* save LDS */
-+ //////////////////////////////
-+
-+ L_SAVE_LDS:
-+
-+ // Change EXEC to all threads...
-+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
-+ s_mov_b32 exec_hi, 0xFFFFFFFF
-+
-+ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
-+ s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero?
-+ s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_LDS_DONE
-+
-+ s_barrier //LDS is used? wait for other waves in the same TG
-+ //s_and_b32 s_save_tmp, tba_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
-+ s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
-+ s_cbranch_scc0 L_SAVE_LDS_DONE
-+
-+ // first wave does the LDS save;
-+
-+ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
-+ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes
-+ s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes
-+
-+ // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
-+ //
-+ get_vgpr_size_bytes(s_save_mem_offset)
-+ get_sgpr_size_bytes(s_save_tmp)
-+ s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
-+ s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()
-+
-+
-+ if (SWIZZLE_EN)
-+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
-+ else
-+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-+ end
-+
-+ s_mov_b32 m0, 0x0 //lds_offset initial value = 0
-+
-+
-+var LDS_DMA_ENABLE = 0
-+var UNROLL = 0
-+if UNROLL==0 && LDS_DMA_ENABLE==1
-+ s_mov_b32 s3, 256*2
-+ s_nop 0
-+ s_nop 0
-+ s_nop 0
-+ L_SAVE_LDS_LOOP:
-+ //TODO: it looks like the two buffer_store/load clauses for save/restore will hurt performance???
-+ if (SAVE_LDS) //SPI always allocates LDS space in 128DW granularity
-+ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW
-+ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
-+ end
-+
-+ s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes
-+ s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes
-+ s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
-+ s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete?
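-+ // Worked example (hypothetical sizes, not taken from the original): each
-+ // pass through L_SAVE_LDS_LOOP above issues two buffer_store_lds_dword,
-+ // i.e. 2*64 DW = 512 bytes, and advances m0 and s_save_mem_offset by
-+ // s3 = 256*2 = 512, so a wave with 2048 bytes of LDS allocated finishes
-+ // the save in 2048/512 = 4 iterations.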
-+
-+elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROLL, has icache miss
-+ // store from highest LDS address to lowest
-+ s_mov_b32 s3, 256*2
-+ s_sub_u32 m0, s_save_alloc_size, s3
-+ s_add_u32 s_save_mem_offset, s_save_mem_offset, m0
-+ s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128-DW chunks...
-+ s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from highest addr to lowest
-+ s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block costs 6*4 bytes of instructions
-+ s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //3*4 covers the 3 instructions below: s_add, s_addc and s_setpc
-+ s_nop 0
-+ s_nop 0
-+ s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes
-+ s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved
-+ s_add_u32 s0, s0,s_save_alloc_size
-+ s_addc_u32 s1, s1, 0
-+ s_setpc_b64 s[0:1]
-+
-+
-+ for var i =0; i< 128; i++
-+ // be careful to make this a 64-byte aligned address, which could improve performance...
-+ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW
-+ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
-+
-+ if i!=127
-+ s_sub_u32 m0, m0, s3 // use an sgpr to shrink the 2DW inst to a 1DW inst to improve performance, i.e. pack more LDS_DMA inst into one cacheline
-+ s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3
-+ end
-+ end
-+
-+else // BUFFER_STORE
-+ v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0
-+ v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid
-+ v_mul_i32_i24 v2, v3, 8 // tid*8
-+ v_mov_b32 v3, 256*2
-+ s_mov_b32 m0, 0x10000
-+ s_mov_b32 s0, s_save_buf_rsrc3
-+ s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid
-+ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT
-+
-+L_SAVE_LDS_LOOP_VECTOR:
-+ ds_read_b64 v[0:1], v2 //x = LDS[a], byte address
-+ s_waitcnt lgkmcnt(0)
-+ buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1
-+// s_waitcnt vmcnt(0)
-+ v_add_u32 v2, vcc[0:1], v2, v3
-+ v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
-+ s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR
-+
-+ // restore rsrc3
-+ s_mov_b32 s_save_buf_rsrc3, s0
-+
-+end
-+
-+L_SAVE_LDS_DONE:
-+
-+
-+ /* save VGPRs - the rest of the VGPRs */
-+ //////////////////////////////////////////////////////////////////////////////////////
-+ L_SAVE_VGPR:
-+ // VGPR SR memory offset: 0
-+ // TODO rearrange the RSRC words to use swizzle for VGPR save...
-+
-+ s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest of the VGPRs
-+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
-+ s_mov_b32 exec_hi, 0xFFFFFFFF
-+
-+ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
-+ s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
-+ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible
-+ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
-+ if (SWIZZLE_EN)
-+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
-+ else
-+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-+ end
-+
-+
-+ // VGPR Allocated in 4-GPR granularity
-+
-+if G8SR_VGPR_SR_IN_DWX4
-+ // the const stride for DWx4 is 4*4 bytes
-+ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
-+ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes
-+
-+ s_mov_b32 m0, 4 // skip first 4 VGPRs
-+ s_cmp_lt_u32 m0, s_save_alloc_size
-+ s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs
-+
-+ s_set_gpr_idx_on m0, 0x1 // This will change M0
-+ s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because the above inst changed m0
-+L_SAVE_VGPR_LOOP:
-+ v_mov_b32 v0, v0 // v0 = v[0+m0]
-+ v_mov_b32 v1, v1
-+ v_mov_b32 v2, v2
-+ v_mov_b32 v3, v3
-+
-+
-+ buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
-+ s_add_u32 m0, m0, 4
-+ s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4
-+ s_cmp_lt_u32 m0, s_save_alloc_size
-+ s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
-+ s_set_gpr_idx_off
-+L_SAVE_VGPR_LOOP_END:
-+
-+ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
-+ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes
-+else
-+ // VGPR store using dw burst
-+ s_mov_b32 m0, 0x4 //VGPR initial index value = 4 (the first 4 VGPRs are already saved)
-+ s_cmp_lt_u32 m0, s_save_alloc_size
-+ s_cbranch_scc0 L_SAVE_VGPR_END
-+
-+
-+ s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
-+ s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later
-+
-+ L_SAVE_VGPR_LOOP:
-+ v_mov_b32 v0, v0 //v0 = v[0+m0]
-+ v_mov_b32 v1, v1 //v1 = v[1+m0]
-+ v_mov_b32 v2, v2 //v2 = v[2+m0]
-+ v_mov_b32 v3, v3 //v3 = v[3+m0]
-+
-+ if(USE_MTBUF_INSTEAD_OF_MUBUF)
-+ tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
-+ else
-+ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
-+ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
-+ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
-+ buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
-+ end
-+
-+ s_add_u32 m0, m0, 4 //next vgpr index
-+ s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes
-+ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
-+ s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
-+ s_set_gpr_idx_off
-+end
-+
-+L_SAVE_VGPR_END:
-+
-+
-+
-+
-+
-+
-+ /* S_PGM_END_SAVED */ //FIXME graphics ONLY
-+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))
-+ s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
-+ s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
-+ s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
-+ s_rfe_b64 s_save_pc_lo //Return to the main shader program
-+ else
-+ end
-+
-+// Save Done timestamp
-+if G8SR_DEBUG_TIMESTAMP
-+ s_memrealtime s_g8sr_ts_save_d
-+ // SGPR SR memory offset : size(VGPR)
-+ get_vgpr_size_bytes(s_save_mem_offset)
-+ s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET
-+ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
-+ // Need reset rsrc2??
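-+ // Note on the question above (editorial observation, grounded in the two
-+ // instructions that follow): rsrc2 is indeed re-armed right below, where
-+ // m0 takes the byte offset and NUM_RECORDS is set to the open 0x1000000
-+ // bound before the timestamp s_buffer_store.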
-+ s_mov_b32 m0, s_save_mem_offset
-+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-+ s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1
-+end
-+
-+
-+ s_branch L_END_PGM
-+
-+
-+
-+/**************************************************************************/
-+/* restore routine */
-+/**************************************************************************/
-+
-+L_RESTORE:
-+ /* Setup Resource Constants */
-+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
-+ //calculate wd_addr using absolute thread id
-+ v_readlane_b32 s_restore_tmp, v9, 0
-+ s_lshr_b32 s_restore_tmp, s_restore_tmp, 6
-+ s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
-+ s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
-+ s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
-+ s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
-+ else
-+ end
-+
-+if G8SR_DEBUG_TIMESTAMP
-+ s_memrealtime s_g8sr_ts_restore_s
-+ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
-+ // tma_lo/hi are sgprs 110, 111, which are not used in the 112-SGPR allocated case...
-+ s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0]
-+ s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, since exec will be finally restored..
-+end
-+
-+
-+
-+ s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
-+ s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
-+ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
-+ s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
-+ s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
-+ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
-+ s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
-+ s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC
-+ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
-+ s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
-+ s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE
-+
-+ /* global mem offset */
-+// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0
-+
-+ /* the first wave in the threadgroup */
-+ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
-+ s_cbranch_scc0 L_RESTORE_VGPR
-+
-+ /* restore LDS */
-+ //////////////////////////////
-+ L_RESTORE_LDS:
-+
-+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
-+ s_mov_b32 exec_hi, 0xFFFFFFFF
-+
-+ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
-+ s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
-+ s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR
-+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
-+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes
-+ s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes
-+
-+ // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
-+ //
-+ get_vgpr_size_bytes(s_restore_mem_offset)
-+ get_sgpr_size_bytes(s_restore_tmp)
-+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
-+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow???
-+
-+
-+ if (SWIZZLE_EN)
-+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
-+ else
-+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-+ end
-+ s_mov_b32 m0, 0x0 //lds_offset initial value = 0
-+
-+ L_RESTORE_LDS_LOOP:
-+ if (SAVE_LDS)
-+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW
-+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW
-+ end
-+ s_add_u32 m0, m0, 256*2 // 128 DW
-+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW
-+ s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
-+ s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete?
-+
-+
-+ /* restore VGPRs */
-+ //////////////////////////////
-+ L_RESTORE_VGPR:
-+ // VGPR SR memory offset : 0
-+ s_mov_b32 s_restore_mem_offset, 0x0
-+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
-+ s_mov_b32 exec_hi, 0xFFFFFFFF
-+
-+ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
-+ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
-+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
-+ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
-+ if (SWIZZLE_EN)
-+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
-+ else
-+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-+ end
-+
-+if G8SR_VGPR_SR_IN_DWX4
-+ get_vgpr_size_bytes(s_restore_mem_offset)
-+ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
-+
-+ // the const stride for DWx4 is 4*4 bytes
-+ s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0
-+ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes
-+
-+ s_mov_b32 m0, s_restore_alloc_size
-+ s_set_gpr_idx_on m0, 0x8 // Note: this will change m0
-+
-+L_RESTORE_VGPR_LOOP:
-+ buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
-+ s_waitcnt vmcnt(0)
-+ s_sub_u32 m0, m0, 4
-+ v_mov_b32 v0, v0 // v[0+m0] = v0
-+ v_mov_b32 v1, v1
-+ v_mov_b32 v2, v2
-+ v_mov_b32 v3, v3
-+ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
-+ s_cmp_eq_u32 m0, 0x8000
-+ s_cbranch_scc0 L_RESTORE_VGPR_LOOP
-+ s_set_gpr_idx_off
-+
-+ s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0
-+ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes
-+
-+else
-+ // VGPR load using dw burst
-+ s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore starts with v4; v0..v3 are restored last
-+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
-+ s_mov_b32 m0, 4 //VGPR initial index value = 4
-+ s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
-+ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later
-+
-+ L_RESTORE_VGPR_LOOP:
-+ if(USE_MTBUF_INSTEAD_OF_MUBUF)
-+ tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
-+ else
-+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
-+ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
-+ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
-+ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
-+ end
-+ s_waitcnt vmcnt(0) //ensure data ready
-+ v_mov_b32 v0, v0 //v[0+m0] = v0
-+ v_mov_b32 v1, v1
-+ v_mov_b32 v2, v2
-+ v_mov_b32 v3, v3
-+ s_add_u32 m0, m0, 4 //next vgpr index
-+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes
-+ s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
-+ s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete?
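-+ // Why the 0x8000 bias above: s_set_gpr_idx_on m0, 0x8 keeps 0x8 in
-+ // M0[15:12], so the value read back from m0 carries an extra 0x8000.
-+ // Biasing s_restore_alloc_size by the same 0x8000 keeps the s_cmp_lt_u32
-+ // comparing like with like. (Same idea as the 0x1000 bias in the save
-+ // path, where the index mode bits are 0x1.)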
-+ s_set_gpr_idx_off
-+ /* VGPR restore on v0 */
-+ if(USE_MTBUF_INSTEAD_OF_MUBUF)
-+ tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
-+ else
-+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
-+ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256
-+ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2
-+ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3
-+ end
-+
-+end
-+
-+ /* restore SGPRs */
-+ //////////////////////////////
-+
-+ // SGPR SR memory offset : size(VGPR)
-+ get_vgpr_size_bytes(s_restore_mem_offset)
-+ get_sgpr_size_bytes(s_restore_tmp)
-+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
-+ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPRs from S[n] to S[0], in groups of 16 sgprs
-+ // TODO, change RSRC word to rearrange memory layout for SGPRS
-+
-+ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
-+ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
-+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
-+
-+ if (SGPR_SAVE_USE_SQC)
-+ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes
-+ else
-+ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
-+ end
-+ if (SWIZZLE_EN)
-+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
-+ else
-+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-+ end
-+
-+ /* If 112 SGPRs are allocated, 4 sgprs are not used: TBA(108,109) and TMA(110,111).
-+ However, it is safe to restore these 4 SGPRs anyway, since TBA/TMA will later be restored by HWREG
-+ */
-+ s_mov_b32 m0, s_restore_alloc_size
-+
-+ L_RESTORE_SGPR_LOOP:
-+ read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made
-+ s_waitcnt lgkmcnt(0) //ensure data ready
-+
-+ s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0]
-+
-+ s_movreld_b64 s0, s0 //s[0+m0] = s0
-+ s_movreld_b64 s2, s2
-+ s_movreld_b64 s4, s4
-+ s_movreld_b64 s6, s6
-+ s_movreld_b64 s8, s8
-+ s_movreld_b64 s10, s10
-+ s_movreld_b64 s12, s12
-+ s_movreld_b64 s14, s14
-+
-+ s_cmp_eq_u32 m0, 0 //scc = (m0 == 0) ? 1 : 0
-+ s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete?
-+
-+ /* restore HW registers */
-+ //////////////////////////////
-+ L_RESTORE_HWREG:
-+
-+
-+if G8SR_DEBUG_TIMESTAMP
-+ s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo
-+ s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi
-+end
-+
-+ // HWREG SR memory offset : size(VGPR)+size(SGPR)
-+ get_vgpr_size_bytes(s_restore_mem_offset)
-+ get_sgpr_size_bytes(s_restore_tmp)
-+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
-+
-+
-+ s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
-+ if (SWIZZLE_EN)
-+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
-+ else
-+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-+ end
-+
-+ read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0
-+ read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC
-+ read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
-+ read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC
-+ read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
-+ read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS
-+ read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS
-+ read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO
-+ read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI
-+ read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE
-+ read_hwreg_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_LO
-+ read_hwreg_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_HI
-+
-+ s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS
-+
-+ s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
-+
-+ //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
-+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
-+ s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore)
-+ s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
-+ end
-+ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
-+ s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save was hacked through s_trap but restore is normal
-+ s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
-+ end
-+
-+ s_mov_b32 m0, s_restore_m0
-+ s_mov_b32 exec_lo, s_restore_exec_lo
-+ s_mov_b32 exec_hi, s_restore_exec_hi
-+
-+ read_hwreg_from_mem(tma_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //tma_lo
-+ read_hwreg_from_mem(tma_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //tma_hi
-+ s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS
-+ s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
-+ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
-+ s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
-+ s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
-+ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
-+ //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
-+ s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode
-+ //reuse s_restore_m0 as a temp register
-+ s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
-+ s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
-+ s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
-+ s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero
-+ s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
-+ s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
-+ s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
-+ s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
-+ s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
-+ s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
-+ s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
-+ s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp
-+
-+ s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
-+ s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
-+ s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu
-+
-+ s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time
-+
-+if G8SR_DEBUG_TIMESTAMP
-+ s_memrealtime s_g8sr_ts_restore_d
-+ s_waitcnt lgkmcnt(0)
-+end
-+
-+// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution
-+ s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc
-+
-+
-+/**************************************************************************/
-+/* the END */
-+/**************************************************************************/
-+L_END_PGM:
-+ s_endpgm
-+
-+end
-+
-+
-+/**************************************************************************/
-+/* the helper functions */
-+/**************************************************************************/
-+
-+//Only for save hwreg to mem
-+function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
-+ s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on
-+ s_mov_b32 m0, s_mem_offset
-+ s_buffer_store_dword s, s_rsrc, m0 glc:0
-+ s_add_u32 s_mem_offset, s_mem_offset, 4
-+ s_mov_b32 m0, exec_lo
-+end
-+
-+//Only for save hwreg to mem
-+function write_tma_to_mem(s, s_rsrc, offset_imm)
-+ s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on
-+ s_mov_b32 m0, offset_imm
-+ s_buffer_store_dword s, s_rsrc, m0 glc:0
-+ s_mov_b32 m0, exec_lo
-+end
-+
-+// HWREGs are saved before SGPRs, so all HWREGs can be used.
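-+// Worked example of the save-area layout implied by the helpers below
-+// (sizes are hypothetical): with vgpr_size = 3 and sgpr_size = 5, one
-+// 64-lane wave saves (3+1)*4*64*4 = 4096 bytes of VGPRs at offset 0,
-+// (5+1)*16*4 = 384 bytes of SGPRs at offset 4096, 128 bytes of HWREGs at
-+// offset 4480, and (for the first wave in the threadgroup) LDS starting
-+// at offset 4608.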
-+function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) -+ -+ s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:0 -+ s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:0 -+ s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:0 -+ s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:0 -+ s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 -+ s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc -+end -+ -+ -+function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) -+ s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 -+ s_add_u32 s_mem_offset, s_mem_offset, 4 -+end -+ -+function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) -+ s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 -+ s_sub_u32 s_mem_offset, s_mem_offset, 4*16 -+end -+ -+ -+ -+function get_lds_size_bytes(s_lds_size_byte) -+ // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW -+ s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size -+ s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW -+end -+ -+function get_vgpr_size_bytes(s_vgpr_size_byte) -+ s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size -+ s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 -+ s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible -+end -+ -+function get_sgpr_size_bytes(s_sgpr_size_byte) -+ s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size -+ s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1 -+ s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value) -+end -+ -+function get_hwreg_size_bytes -+ return 128 //HWREG size 128 bytes -+end -+ -+#endif -+ -+static const uint32_t cwsr_trap_carrizo_hex[] = { -+ 0xbf820001, 0xbf820131, -+ 0xb8f4f802, 0xb8f5f803, -+ 0x8675ff75, 0x00000400, -+ 0xbf850013, 0xc00a1e37, -+ 0x00000000, 0xbf8c007f, -+ 0x87777978, 0xbf840004, -+ 0xbeee007a, 0xbeef007b, -+ 0xb974f802, 0xbe801d78, -+ 0xb8f5f803, 0x8675ff75, -+ 0x000001ff, 0xbf850002, -+ 0x80708470, 0x82718071, -+ 0x8671ff71, 0x0000ffff, -+ 0xb974f802, 0xbe801f70, -+ 0xb8f5f803, 0x8675ff75, -+ 0x00000100, 0xbf840006, -+ 0xbefa0080, 0xb97a0203, -+ 0x8671ff71, 0x0000ffff, -+ 0x80f08870, 0x82f18071, -+ 0xbefa0080, 0xb97a0283, -+ 0xbef60068, 0xbef70069, -+ 0xb8fa1c07, 0x8e7a9c7a, -+ 0x87717a71, 0xb8fa03c7, -+ 0x8e7a9b7a, 0x87717a71, -+ 0xb8faf807, 0x867aff7a, -+ 0x00007fff, 0xb97af807, -+ 0xbef2007e, 0xbef3007f, -+ 0xbefe0180, 0xbf900004, -+ 0xbf8e0002, 0xbf88fffe, -+ 0xbef8007e, 0x8679ff7f, -+ 0x0000ffff, 0x8779ff79, -+ 0x00040000, 0xbefa0080, -+ 0xbefb00ff, 0x00807fac, -+ 0x867aff7f, 0x08000000, -+ 0x8f7a837a, 0x877b7a7b, -+ 0x867aff7f, 0x70000000, -+ 0x8f7a817a, 0x877b7a7b, -+ 0xbeef007c, 0xbeee0080, -+ 0xb8ee2a05, 0x806e816e, -+ 0x8e6e8a6e, 0xb8fa1605, -+ 0x807a817a, 0x8e7a867a, -+ 0x806e7a6e, 0xbefa0084, -+ 0xbefa00ff, 0x01000000, -+ 0xbefe007c, 0xbefc006e, -+ 0xc0601bfc, 0x0000007c, -+ 0x806e846e, 0xbefc007e, -+ 0xbefe007c, 0xbefc006e, -+ 0xc0601c3c, 0x0000007c, -+ 0x806e846e, 0xbefc007e, -+ 0xbefe007c, 0xbefc006e, -+ 0xc0601c7c, 0x0000007c, -+ 0x806e846e, 0xbefc007e, -+ 0xbefe007c, 0xbefc006e, -+ 0xc0601cbc, 0x0000007c, -+ 0x806e846e, 0xbefc007e, -+ 0xbefe007c, 0xbefc006e, -+ 0xc0601cfc, 0x0000007c, -+ 0x806e846e, 0xbefc007e, -+ 0xbef2007e, 0xbef30075, -+ 
0xbefe007c, 0xbefc006e, -+ 0xc0601d3c, 0x0000007c, -+ 0x806e846e, 0xbefc007e, -+ 0xb8f5f803, 0xbefe007c, -+ 0xbefc006e, 0xc0601d7c, -+ 0x0000007c, 0x806e846e, -+ 0xbefc007e, 0xbefe007c, -+ 0xbefc006e, 0xc0601dbc, -+ 0x0000007c, 0x806e846e, -+ 0xbefc007e, 0xbefe007c, -+ 0xbefc006e, 0xc0601dfc, -+ 0x0000007c, 0x806e846e, -+ 0xbefc007e, 0xb8eff801, -+ 0xbefe007c, 0xbefc006e, -+ 0xc0601bfc, 0x0000007c, -+ 0x806e846e, 0xbefc007e, -+ 0xbefe007c, 0xbefc006e, -+ 0xc0601b3c, 0x0000007c, -+ 0x806e846e, 0xbefc007e, -+ 0xbefe007c, 0xbefc006e, -+ 0xc0601b7c, 0x0000007c, -+ 0x806e846e, 0xbefc007e, -+ 0xbefe007c, 0xbefc006e, -+ 0xc0601cbc, 0x0000007c, -+ 0x806e846e, 0xbefc007e, -+ 0xbefe007c, 0xbefc006e, -+ 0xc0601cfc, 0x0000007c, -+ 0x806e846e, 0xbefc007e, -+ 0x867aff7f, 0x04000000, -+ 0xbef30080, 0x8773737a, -+ 0xb8ee2a05, 0x806e816e, -+ 0x8e6e8a6e, 0xb8f51605, -+ 0x80758175, 0x8e758475, -+ 0x8e7a8275, 0xbefa00ff, -+ 0x01000000, 0xbef60178, -+ 0x80786e78, 0xbefc0080, -+ 0xbe802b00, 0xbe822b02, -+ 0xbe842b04, 0xbe862b06, -+ 0xbe882b08, 0xbe8a2b0a, -+ 0xbe8c2b0c, 0xbe8e2b0e, -+ 0xc06a003c, 0x00000000, -+ 0xc06a013c, 0x00000010, -+ 0xc06a023c, 0x00000020, -+ 0xc06a033c, 0x00000030, -+ 0x8078c078, 0x82798079, -+ 0x807c907c, 0xbf0a757c, -+ 0xbf85ffeb, 0xbef80176, -+ 0xbeee0080, 0xbefe00c1, -+ 0xbeff00c1, 0xbefa00ff, -+ 0x01000000, 0xe0724000, -+ 0x6e1e0000, 0xe0724100, -+ 0x6e1e0100, 0xe0724200, -+ 0x6e1e0200, 0xe0724300, -+ 0x6e1e0300, 0xbefe00c1, -+ 0xbeff00c1, 0xb8f54306, -+ 0x8675c175, 0xbf84002c, -+ 0xbf8a0000, 0x867aff73, -+ 0x04000000, 0xbf840028, -+ 0x8e758675, 0x8e758275, -+ 0xbefa0075, 0xb8ee2a05, -+ 0x806e816e, 0x8e6e8a6e, -+ 0xb8fa1605, 0x807a817a, -+ 0x8e7a867a, 0x806e7a6e, -+ 0x806eff6e, 0x00000080, -+ 0xbefa00ff, 0x01000000, -+ 0xbefc0080, 0xd28c0002, -+ 0x000100c1, 0xd28d0003, -+ 0x000204c1, 0xd1060002, -+ 0x00011103, 0x7e0602ff, -+ 0x00000200, 0xbefc00ff, -+ 0x00010000, 0xbe80007b, -+ 0x867bff7b, 0xff7fffff, -+ 0x877bff7b, 0x00058000, -+ 0xd8ec0000, 0x00000002, -+ 0xbf8c007f, 0xe0765000, -+ 0x6e1e0002, 0x32040702, -+ 0xd0c9006a, 0x0000eb02, -+ 0xbf87fff7, 0xbefb0000, -+ 0xbeee00ff, 0x00000400, -+ 0xbefe00c1, 0xbeff00c1, -+ 0xb8f52a05, 0x80758175, -+ 0x8e758275, 0x8e7a8875, -+ 0xbefa00ff, 0x01000000, -+ 0xbefc0084, 0xbf0a757c, -+ 0xbf840015, 0xbf11017c, -+ 0x8075ff75, 0x00001000, -+ 0x7e000300, 0x7e020301, -+ 0x7e040302, 0x7e060303, -+ 0xe0724000, 0x6e1e0000, -+ 0xe0724100, 0x6e1e0100, -+ 0xe0724200, 0x6e1e0200, -+ 0xe0724300, 0x6e1e0300, -+ 0x807c847c, 0x806eff6e, -+ 0x00000400, 0xbf0a757c, -+ 0xbf85ffef, 0xbf9c0000, -+ 0xbf8200d1, 0xbef8007e, -+ 0x8679ff7f, 0x0000ffff, -+ 0x8779ff79, 0x00040000, -+ 0xbefa0080, 0xbefb00ff, -+ 0x00807fac, 0x8676ff7f, -+ 0x08000000, 0x8f768376, -+ 0x877b767b, 0x8676ff7f, -+ 0x70000000, 0x8f768176, -+ 0x877b767b, 0x8676ff7f, -+ 0x04000000, 0xbf84001e, -+ 0xbefe00c1, 0xbeff00c1, -+ 0xb8f34306, 0x8673c173, -+ 0xbf840019, 0x8e738673, -+ 0x8e738273, 0xbefa0073, -+ 0xb8f22a05, 0x80728172, -+ 0x8e728a72, 0xb8f61605, -+ 0x80768176, 0x8e768676, -+ 0x80727672, 0x8072ff72, -+ 0x00000080, 0xbefa00ff, -+ 0x01000000, 0xbefc0080, -+ 0xe0510000, 0x721e0000, -+ 0xe0510100, 0x721e0000, -+ 0x807cff7c, 0x00000200, -+ 0x8072ff72, 0x00000200, -+ 0xbf0a737c, 0xbf85fff6, -+ 0xbef20080, 0xbefe00c1, -+ 0xbeff00c1, 0xb8f32a05, -+ 0x80738173, 0x8e738273, -+ 0x8e7a8873, 0xbefa00ff, -+ 0x01000000, 0xbef60072, -+ 0x8072ff72, 0x00000400, -+ 0xbefc0084, 0xbf11087c, -+ 0x8073ff73, 0x00008000, -+ 0xe0524000, 0x721e0000, -+ 0xe0524100, 0x721e0100, -+ 0xe0524200, 0x721e0200, -+ 0xe0524300, 
0x721e0300, -+ 0xbf8c0f70, 0x7e000300, -+ 0x7e020301, 0x7e040302, -+ 0x7e060303, 0x807c847c, -+ 0x8072ff72, 0x00000400, -+ 0xbf0a737c, 0xbf85ffee, -+ 0xbf9c0000, 0xe0524000, -+ 0x761e0000, 0xe0524100, -+ 0x761e0100, 0xe0524200, -+ 0x761e0200, 0xe0524300, -+ 0x761e0300, 0xb8f22a05, -+ 0x80728172, 0x8e728a72, -+ 0xb8f61605, 0x80768176, -+ 0x8e768676, 0x80727672, -+ 0x80f2c072, 0xb8f31605, -+ 0x80738173, 0x8e738473, -+ 0x8e7a8273, 0xbefa00ff, -+ 0x01000000, 0xbefc0073, -+ 0xc031003c, 0x00000072, -+ 0x80f2c072, 0xbf8c007f, -+ 0x80fc907c, 0xbe802d00, -+ 0xbe822d02, 0xbe842d04, -+ 0xbe862d06, 0xbe882d08, -+ 0xbe8a2d0a, 0xbe8c2d0c, -+ 0xbe8e2d0e, 0xbf06807c, -+ 0xbf84fff1, 0xb8f22a05, -+ 0x80728172, 0x8e728a72, -+ 0xb8f61605, 0x80768176, -+ 0x8e768676, 0x80727672, -+ 0xbefa0084, 0xbefa00ff, -+ 0x01000000, 0xc0211cfc, -+ 0x00000072, 0x80728472, -+ 0xc0211c3c, 0x00000072, -+ 0x80728472, 0xc0211c7c, -+ 0x00000072, 0x80728472, -+ 0xc0211bbc, 0x00000072, -+ 0x80728472, 0xc0211bfc, -+ 0x00000072, 0x80728472, -+ 0xc0211d3c, 0x00000072, -+ 0x80728472, 0xc0211d7c, -+ 0x00000072, 0x80728472, -+ 0xc0211a3c, 0x00000072, -+ 0x80728472, 0xc0211a7c, -+ 0x00000072, 0x80728472, -+ 0xc0211dfc, 0x00000072, -+ 0x80728472, 0xc0211b3c, -+ 0x00000072, 0x80728472, -+ 0xc0211b7c, 0x00000072, -+ 0x80728472, 0xbf8c007f, -+ 0x8671ff71, 0x0000ffff, -+ 0xbefc0073, 0xbefe006e, -+ 0xbeff006f, 0xc0211bbc, -+ 0x00000072, 0x80728472, -+ 0xc0211bfc, 0x00000072, -+ 0x80728472, 0xbf8c007f, -+ 0x867375ff, 0x000003ff, -+ 0xb9734803, 0x867375ff, -+ 0xfffff800, 0x8f738b73, -+ 0xb973a2c3, 0xb977f801, -+ 0x8673ff71, 0xf0000000, -+ 0x8f739c73, 0x8e739073, -+ 0xbef60080, 0x87767376, -+ 0x8673ff71, 0x08000000, -+ 0x8f739b73, 0x8e738f73, -+ 0x87767376, 0x8673ff74, -+ 0x00800000, 0x8f739773, -+ 0xb976f807, 0x86fe7e7e, -+ 0x86ea6a6a, 0xb974f802, -+ 0xbf8a0000, 0x95807370, -+ 0xbf810000, 0x00000000, -+}; -+ -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c -index 6316aad..595640a 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c -@@ -31,16 +31,23 @@ - #include <uapi/linux/kfd_ioctl.h> - #include <linux/time.h> - #include <linux/mm.h> --#include <linux/mman.h> -+#include <uapi/asm-generic/mman-common.h> - #include <asm/processor.h> -+ - #include "kfd_priv.h" - #include "kfd_device_queue_manager.h" - #include "kfd_dbgmgr.h" -+#include "cik_regs.h" - - static long kfd_ioctl(struct file *, unsigned int, unsigned long); - static int kfd_open(struct inode *, struct file *); - static int kfd_mmap(struct file *, struct vm_area_struct *); -+static uint32_t kfd_convert_user_mem_alloction_flags( -+ struct kfd_dev *dev, -+ uint32_t userspace_flags); -+static bool kfd_is_large_bar(struct kfd_dev *dev); - -+static int kfd_evict(struct file *filep, struct kfd_process *p, void *data); - static const char kfd_dev_name[] = "kfd"; - - static const struct file_operations kfd_fops = { -@@ -117,7 +124,7 @@ static int kfd_open(struct inode *inode, struct file *filep) - return -EPERM; - } - -- process = kfd_create_process(current); -+ process = kfd_create_process(filep); - if (IS_ERR(process)) - return PTR_ERR(process); - -@@ -206,6 +213,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, - q_properties->ctx_save_restore_area_address = - args->ctx_save_restore_address; - q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size; -+ q_properties->ctl_stack_size = args->ctl_stack_size; - if (args->queue_type == 
KFD_IOC_QUEUE_TYPE_COMPUTE || - args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) - q_properties->type = KFD_QUEUE_TYPE_COMPUTE; -@@ -270,7 +278,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, - return -EINVAL; - } - -- mutex_lock(&p->mutex); -+ down_write(&p->lock); - - pdd = kfd_bind_process_to_device(dev, p); - if (IS_ERR(pdd)) { -@@ -282,8 +290,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, - p->pasid, - dev->id); - -- err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, -- 0, q_properties.type, &queue_id); -+ err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id); - if (err != 0) - goto err_create_queue; - -@@ -291,10 +298,10 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, - - - /* Return gpu_id as doorbell offset for mmap usage */ -- args->doorbell_offset = (KFD_MMAP_DOORBELL_MASK | args->gpu_id); -+ args->doorbell_offset = (KFD_MMAP_TYPE_DOORBELL | args->gpu_id); - args->doorbell_offset <<= PAGE_SHIFT; - -- mutex_unlock(&p->mutex); -+ up_write(&p->lock); - - pr_debug("kfd: queue id %d was created successfully\n", args->queue_id); - -@@ -311,7 +318,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, - - err_create_queue: - err_bind_process: -- mutex_unlock(&p->mutex); -+ up_write(&p->lock); - return err; - } - -@@ -325,11 +332,11 @@ static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p, - args->queue_id, - p->pasid); - -- mutex_lock(&p->mutex); -+ down_write(&p->lock); - - retval = pqm_destroy_queue(&p->pqm, args->queue_id); - -- mutex_unlock(&p->mutex); -+ up_write(&p->lock); - return retval; - } - -@@ -371,11 +378,33 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, - pr_debug("kfd: updating queue id %d for PASID %d\n", - args->queue_id, p->pasid); - -- mutex_lock(&p->mutex); -+ down_write(&p->lock); - - retval = pqm_update_queue(&p->pqm, args->queue_id, &properties); - -- mutex_unlock(&p->mutex); -+ up_write(&p->lock); -+ -+ return retval; -+} -+ -+static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p, -+ void *data) -+{ -+ int retval; -+ struct kfd_ioctl_set_cu_mask_args *args = data; -+ struct queue_properties properties; -+ uint32_t __user *cu_mask_ptr = (uint32_t __user *)args->cu_mask_ptr; -+ -+ if (get_user(properties.cu_mask, cu_mask_ptr)) -+ return -EFAULT; -+ if (properties.cu_mask == 0) -+ return 0; -+ -+ down_write(&p->lock); -+ -+ retval = pqm_set_cu_mask(&p->pqm, args->queue_id, &properties); -+ -+ up_write(&p->lock); - - return retval; - } -@@ -403,7 +432,7 @@ static int kfd_ioctl_set_memory_policy(struct file *filep, - if (dev == NULL) - return -EINVAL; - -- mutex_lock(&p->mutex); -+ down_write(&p->lock); - - pdd = kfd_bind_process_to_device(dev, p); - if (IS_ERR(pdd)) { -@@ -427,46 +456,80 @@ static int kfd_ioctl_set_memory_policy(struct file *filep, - err = -EINVAL; - - out: -- mutex_unlock(&p->mutex); -+ up_write(&p->lock); - - return err; - } - --static int kfd_ioctl_dbg_register(struct file *filep, -- struct kfd_process *p, void *data) -+static int kfd_ioctl_set_trap_handler(struct file *filep, -+ struct kfd_process *p, void *data) - { -- struct kfd_ioctl_dbg_register_args *args = data; -+ struct kfd_ioctl_set_trap_handler_args *args = data; - struct kfd_dev *dev; -- struct kfd_dbgmgr *dbgmgr_ptr; -+ int err = 0; - struct kfd_process_device *pdd; -- bool create_ok; -- long status = 0; - - dev = kfd_device_by_id(args->gpu_id); - if (dev == 
NULL)
- 		return -EINVAL;
- 
--	if (dev->device_info->asic_family == CHIP_CARRIZO) {
--		pr_debug("kfd_ioctl_dbg_register not supported on CZ\n");
--		return -EINVAL;
-+	down_write(&p->lock);
-+
-+	pdd = kfd_bind_process_to_device(dev, p);
-+	if (IS_ERR(pdd)) {
-+		err = -ESRCH;
-+		goto out;
-+	}
-+	if (!dev->cwsr_enabled || !pdd->qpd.cwsr_kaddr) {
-+		pr_err("kfd: CWSR is not enabled, can't set trap handler.\n");
-+		err = -EINVAL;
-+		goto out;
- 	}
- 
--	mutex_lock(kfd_get_dbgmgr_mutex());
--	mutex_lock(&p->mutex);
-+	if (dev->dqm->ops.set_trap_handler(dev->dqm,
-+					&pdd->qpd,
-+					args->tba_addr,
-+					args->tma_addr))
-+		err = -EINVAL;
- 
--	/*
--	 * make sure that we have pdd, if this the first queue created for
--	 * this process
--	 */
-+out:
-+	up_write(&p->lock);
-+
-+	return err;
-+}
-+
-+static int
-+kfd_ioctl_dbg_register(struct file *filep, struct kfd_process *p, void *data)
-+{
-+	long status = -EFAULT;
-+	struct kfd_ioctl_dbg_register_args *args = data;
-+	struct kfd_dev *dev;
-+	struct kfd_dbgmgr *dbgmgr_ptr;
-+	struct kfd_process_device *pdd;
-+	bool create_ok = false;
-+
-+	pr_debug("kfd:dbg: %s\n", __func__);
-+
-+	dev = kfd_device_by_id(args->gpu_id);
-+	if (!dev) {
-+		dev_info(NULL, "Error! kfd: In func %s >> getting device by id failed\n", __func__);
-+		return status;
-+	}
-+
-+	down_write(&p->lock);
-+	mutex_lock(get_dbgmgr_mutex());
-+
-+	/* make sure that we have pdd, if this is the first queue created for this process */
- 	pdd = kfd_bind_process_to_device(dev, p);
--	if (IS_ERR(pdd)) {
--		mutex_unlock(&p->mutex);
--		mutex_unlock(kfd_get_dbgmgr_mutex());
-+	if (IS_ERR(pdd)) {
-+		mutex_unlock(get_dbgmgr_mutex());
-+		up_write(&p->lock);
- 		return PTR_ERR(pdd);
- 	}
- 
- 	if (dev->dbgmgr == NULL) {
- 		/* In case of a legal call, we have no dbgmgr yet */
-+
- 		create_ok = kfd_dbgmgr_create(&dbgmgr_ptr, dev);
- 		if (create_ok) {
- 			status = kfd_dbgmgr_register(dbgmgr_ptr, p);
-@@ -475,13 +538,10 @@ static int kfd_ioctl_dbg_register(struct file *filep,
- 			else
- 				dev->dbgmgr = dbgmgr_ptr;
- 		}
--	} else {
--		pr_debug("debugger already registered\n");
--		status = -EINVAL;
- 	}
- 
--	mutex_unlock(&p->mutex);
--	mutex_unlock(kfd_get_dbgmgr_mutex());
-+	mutex_unlock(get_dbgmgr_mutex());
-+	up_write(&p->lock);
- 
- 	return status;
- }
-@@ -489,9 +549,9 @@ static int kfd_ioctl_dbg_register(struct file *filep,
- static int kfd_ioctl_dbg_unregister(struct file *filep,
- 		struct kfd_process *p, void *data)
- {
-+	long status = -EFAULT;
- 	struct kfd_ioctl_dbg_unregister_args *args = data;
- 	struct kfd_dev *dev;
--	long status;
- 
- 	dev = kfd_device_by_id(args->gpu_id);
- 	if (dev == NULL)
-@@ -502,7 +562,7 @@ static int kfd_ioctl_dbg_unregister(struct file *filep,
- 		return -EINVAL;
- 	}
- 
--	mutex_lock(kfd_get_dbgmgr_mutex());
-+	mutex_lock(get_dbgmgr_mutex());
- 
- 	status = kfd_dbgmgr_unregister(dev->dbgmgr, p);
- 	if (status == 0) {
-@@ -510,7 +570,7 @@ static int kfd_ioctl_dbg_unregister(struct file *filep,
- 		dev->dbgmgr = NULL;
- 	}
- 
--	mutex_unlock(kfd_get_dbgmgr_mutex());
-+	mutex_unlock(get_dbgmgr_mutex());
- 
- 	return status;
- }
-@@ -519,125 +579,144 @@ static int kfd_ioctl_dbg_unregister(struct file *filep,
-  * Parse and generate variable size data structure for address watch.
-  * Total size of the buffer and # watch points is limited in order
-  * to prevent kernel abuse. (no bearing to the much smaller HW limitation
--  * which is enforced by dbgdev module)
-+  * which is enforced by dbgdev module.
- * please also note that the watch addresses themselves are not "copied from user",
- * since they are set into the HW using user-mode values.
- *
- */
--static int kfd_ioctl_dbg_address_watch(struct file *filep,
--					struct kfd_process *p, void *data)
-+
-+static int
-+kfd_ioctl_dbg_address_watch(struct file *filep,
-+			    struct kfd_process *p,
-+			    void *data)
- {
-+	long status = -EFAULT;
- 	struct kfd_ioctl_dbg_address_watch_args *args = data;
- 	struct kfd_dev *dev;
- 	struct dbg_address_watch_info aw_info;
--	unsigned char *args_buff;
--	long status;
--	void __user *cmd_from_user;
--	uint64_t watch_mask_value = 0;
-+	unsigned char *args_buff = NULL;
- 	unsigned int args_idx = 0;
-+	uint64_t watch_mask_value = 0;
-
- 	memset((void *) &aw_info, 0, sizeof(struct dbg_address_watch_info));
-
--	dev = kfd_device_by_id(args->gpu_id);
--	if (dev == NULL)
--		return -EINVAL;
-+	do {
-+		dev = kfd_device_by_id(args->gpu_id);
-+		if (!dev) {
-+			dev_info(NULL,
-+				"Error! kfd: In func %s >> get device by id failed\n",
-+				__func__);
-+			break;
-+		}
-
--	if (dev->device_info->asic_family == CHIP_CARRIZO) {
--		pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n");
--		return -EINVAL;
--	}
-+		if (args->buf_size_in_bytes > MAX_ALLOWED_AW_BUFF_SIZE) {
-+			status = -EINVAL;
-+			break;
-+		}
-
--	cmd_from_user = (void __user *) args->content_ptr;
-+		if (args->buf_size_in_bytes <= sizeof(*args)) {
-+			status = -EINVAL;
-+			break;
-+		}
-
--	/* Validate arguments */
-+		/* this is the actual buffer to work with */
-
--	if ((args->buf_size_in_bytes > MAX_ALLOWED_AW_BUFF_SIZE) ||
--		(args->buf_size_in_bytes <= sizeof(*args) + sizeof(int) * 2) ||
--		(cmd_from_user == NULL))
--		return -EINVAL;
-+		args_buff = kzalloc(args->buf_size_in_bytes -
-+					sizeof(*args), GFP_KERNEL);
-+		if (args_buff == NULL) {
-+			status = -ENOMEM;
-+			break;
-+		}
-
--	/* this is the actual buffer to work with */
--	args_buff = memdup_user(cmd_from_user,
--				args->buf_size_in_bytes - sizeof(*args));
--	if (IS_ERR(args_buff))
--		return PTR_ERR(args_buff);
-+		if (copy_from_user(args_buff,
-+				(void __user *) args->content_ptr,
-+				args->buf_size_in_bytes - sizeof(*args))) {
-+			status = -EFAULT;
-+			break;
-+		}
-
--	aw_info.process = p;
-+		aw_info.process = p;
-
--	aw_info.num_watch_points = *((uint32_t *)(&args_buff[args_idx]));
--	args_idx += sizeof(aw_info.num_watch_points);
-+		aw_info.num_watch_points = *((uint32_t *)(&args_buff[args_idx]));
-+		args_idx += sizeof(aw_info.num_watch_points);
-
--	aw_info.watch_mode = (enum HSA_DBG_WATCH_MODE *) &args_buff[args_idx];
--	args_idx += sizeof(enum HSA_DBG_WATCH_MODE) * aw_info.num_watch_points;
-+		aw_info.watch_mode = (HSA_DBG_WATCH_MODE *) &args_buff[args_idx];
-+		args_idx += sizeof(HSA_DBG_WATCH_MODE) * aw_info.num_watch_points;
-
--	/*
--	 * set watch address base pointer to point on the array base
--	 * within args_buff
--	 */
--	aw_info.watch_address = (uint64_t *) &args_buff[args_idx];
-+		/* set watch address base pointer to point on the array base within args_buff */
-
--	/* skip over the addresses buffer */
--	args_idx += sizeof(aw_info.watch_address) * aw_info.num_watch_points;
-+		aw_info.watch_address = (uint64_t *) &args_buff[args_idx];
-
--	if (args_idx >= args->buf_size_in_bytes - sizeof(*args)) {
--		kfree(args_buff);
--		return -EINVAL;
--	}
-+		/* skip over the addresses buffer */
-+		args_idx += sizeof(aw_info.watch_address) * aw_info.num_watch_points;
-
--	watch_mask_value = (uint64_t) args_buff[args_idx];
-+		if (args_idx >= args->buf_size_in_bytes) {
-+			status = -EINVAL;
-+			break;
-+		}
-
--	if (watch_mask_value > 0) {
--		/*
--		 * There is an array of masks.
--		 * set watch mask base pointer to point on the array base
--		 * within args_buff
--		 */
--		aw_info.watch_mask = (uint64_t *) &args_buff[args_idx];
-+		watch_mask_value = (uint64_t) args_buff[args_idx];
-
--		/* skip over the masks buffer */
--		args_idx += sizeof(aw_info.watch_mask) *
--				aw_info.num_watch_points;
--	} else {
--		/* just the NULL mask, set to NULL and skip over it */
--		aw_info.watch_mask = NULL;
--		args_idx += sizeof(aw_info.watch_mask);
--	}
-+		if (watch_mask_value > 0) {
-+			/* there is an array of masks */
-
--	if (args_idx >= args->buf_size_in_bytes - sizeof(args)) {
--		kfree(args_buff);
--		return -EINVAL;
--	}
-+			/* set watch mask base pointer to point on the array base within args_buff */
-+			aw_info.watch_mask = (uint64_t *) &args_buff[args_idx];
-+
-+			/* skip over the masks buffer */
-+			args_idx += sizeof(aw_info.watch_mask) * aw_info.num_watch_points;
-+		} else {
-+			/* just the NULL mask, set to NULL and skip over it */
-+			aw_info.watch_mask = NULL;
-+			args_idx += sizeof(aw_info.watch_mask);
-+		}
-+
-+		if (args_idx > args->buf_size_in_bytes) {
-+			status = -EINVAL;
-+			break;
-+		}
-+
-+		aw_info.watch_event = NULL;	/* Currently HSA Event is not supported for DBG */
-+		status = 0;
-+
-+	} while (0);
-
--	/* Currently HSA Event is not supported for DBG */
--	aw_info.watch_event = NULL;
-+	if (status == 0) {
-+		mutex_lock(get_dbgmgr_mutex());
-
--	mutex_lock(kfd_get_dbgmgr_mutex());
-+		status = kfd_dbgmgr_address_watch(dev->dbgmgr, &aw_info);
-
--	status = kfd_dbgmgr_address_watch(dev->dbgmgr, &aw_info);
-+		mutex_unlock(get_dbgmgr_mutex());
-
--	mutex_unlock(kfd_get_dbgmgr_mutex());
-+	}
-
- 	kfree(args_buff);
-
- 	return status;
- }
-
--/* Parse and generate fixed size data structure for wave control */
--static int kfd_ioctl_dbg_wave_control(struct file *filep,
--		struct kfd_process *p, void *data)
-+/*
-+ * Parse and generate fixed size data structure for wave control.
-+ * Buffer is generated in a "packed" form, for avoiding structure packing/padding dependencies.
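Before the handler itself, it may help to restate the size contract it enforces: user space must pass a buffer whose length equals a field-by-field "compact" sum, not sizeof() of any packed struct. A minimal sketch, assuming 4-byte enums and an 8-byte MemoryVA pointer (the summand elided at the hunk boundary above is taken to be the 4-byte wave-message value):

#include <stdint.h>

/* Sketch, not the real user library: the expected buffer length for
 * the wave-control ioctl, mirroring the kernel-side computed_buff_size. */
static uint32_t wac_compact_size(uint32_t ioctl_args_size)
{
	return ioctl_args_size +
	       sizeof(uint32_t) +	/* mode (enum, assumed 4 bytes)    */
	       sizeof(uint32_t) +	/* operand (enum, assumed 4 bytes) */
	       sizeof(uint32_t) +	/* WaveMsgInfoGen2.Value           */
	       sizeof(uint64_t) +	/* MemoryVA (64-bit pointer)       */
	       sizeof(uint32_t);	/* trapId                          */
}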
-+ */ -+ -+static int -+kfd_ioctl_dbg_wave_control(struct file *filep, struct kfd_process *p, void *data) - { -+ long status = -EFAULT; - struct kfd_ioctl_dbg_wave_control_args *args = data; - struct kfd_dev *dev; - struct dbg_wave_control_info wac_info; -- unsigned char *args_buff; -- uint32_t computed_buff_size; -- long status; -- void __user *cmd_from_user; -+ unsigned char *args_buff = NULL; - unsigned int args_idx = 0; -+ uint32_t computed_buff_size; - - memset((void *) &wac_info, 0, sizeof(struct dbg_wave_control_info)); - - /* we use compact form, independent of the packing attribute value */ -+ - computed_buff_size = sizeof(*args) + - sizeof(wac_info.mode) + - sizeof(wac_info.operand) + -@@ -645,26 +724,25 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep, - sizeof(wac_info.dbgWave_msg.MemoryVA) + - sizeof(wac_info.trapId); - -- dev = kfd_device_by_id(args->gpu_id); -- if (dev == NULL) -- return -EINVAL; - -- if (dev->device_info->asic_family == CHIP_CARRIZO) { -- pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); -- return -EINVAL; -- } -+ dev_info(NULL, "kfd: In func %s - start\n", __func__); - -- /* input size must match the computed "compact" size */ -- if (args->buf_size_in_bytes != computed_buff_size) { -- pr_debug("size mismatch, computed : actual %u : %u\n", -- args->buf_size_in_bytes, computed_buff_size); -- return -EINVAL; -- } -+ do { -+ dev = kfd_device_by_id(args->gpu_id); -+ if (!dev) { -+ dev_info(NULL, "Error! kfd: In func %s >> getting device by id failed\n", __func__); -+ break; -+ } - -- cmd_from_user = (void __user *) args->content_ptr; -+ /* input size must match the computed "compact" size */ - -- if (cmd_from_user == NULL) -- return -EINVAL; -+ if (args->buf_size_in_bytes != computed_buff_size) { -+ dev_info(NULL, -+ "Error! kfd: In func %s >> size mismatch, computed : actual %u : %u\n", -+ __func__, args->buf_size_in_bytes, computed_buff_size); -+ status = -EINVAL; -+ break; -+ } - - /* copy the entire buffer from user */ - -@@ -673,34 +751,51 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep, - if (IS_ERR(args_buff)) - return PTR_ERR(args_buff); - -- /* move ptr to the start of the "pay-load" area */ -- wac_info.process = p; -+ if (copy_from_user(args_buff, -+ (void __user *) args->content_ptr, -+ args->buf_size_in_bytes - sizeof(*args))) { -+ dev_info(NULL, -+ "Error! 
kfd: In func %s >> copy_from_user failed\n", -+ __func__); -+ break; -+ } -+ -+ /* move ptr to the start of the "pay-load" area */ - -- wac_info.operand = *((enum HSA_DBG_WAVEOP *)(&args_buff[args_idx])); -- args_idx += sizeof(wac_info.operand); - -- wac_info.mode = *((enum HSA_DBG_WAVEMODE *)(&args_buff[args_idx])); -- args_idx += sizeof(wac_info.mode); -+ wac_info.process = p; - -- wac_info.trapId = *((uint32_t *)(&args_buff[args_idx])); -- args_idx += sizeof(wac_info.trapId); -+ wac_info.operand = (HSA_DBG_WAVEOP) *((HSA_DBG_WAVEOP *)(&args_buff[args_idx])); -+ args_idx += sizeof(wac_info.operand); - -- wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value = -- *((uint32_t *)(&args_buff[args_idx])); -- wac_info.dbgWave_msg.MemoryVA = NULL; -+ wac_info.mode = (HSA_DBG_WAVEMODE) *((HSA_DBG_WAVEMODE *)(&args_buff[args_idx])); -+ args_idx += sizeof(wac_info.mode); - -- mutex_lock(kfd_get_dbgmgr_mutex()); -+ wac_info.trapId = (uint32_t) *((uint32_t *)(&args_buff[args_idx])); -+ args_idx += sizeof(wac_info.trapId); - -- pr_debug("Calling dbg manager process %p, operand %u, mode %u, trapId %u, message %u\n", -- wac_info.process, wac_info.operand, -- wac_info.mode, wac_info.trapId, -- wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); -+ wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value = *((uint32_t *)(&args_buff[args_idx])); -+ wac_info.dbgWave_msg.MemoryVA = NULL; - -- status = kfd_dbgmgr_wave_control(dev->dbgmgr, &wac_info); - -- pr_debug("Returned status of dbg manager is %ld\n", status); -+ status = 0; - -- mutex_unlock(kfd_get_dbgmgr_mutex()); -+ } while (0); -+ if (status == 0) { -+ mutex_lock(get_dbgmgr_mutex()); -+ -+ dev_info(NULL, -+ "kfd: In func %s >> calling dbg manager process %p, operand %u, mode %u, trapId %u, message %u\n", -+ __func__, wac_info.process, wac_info.operand, wac_info.mode, wac_info.trapId, -+ wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); -+ -+ status = kfd_dbgmgr_wave_control(dev->dbgmgr, &wac_info); -+ -+ dev_info(NULL, "kfd: In func %s >> returned status of dbg manager is %ld\n", __func__, status); -+ -+ mutex_unlock(get_dbgmgr_mutex()); -+ -+ } - - kfree(args_buff); - -@@ -715,12 +810,13 @@ static int kfd_ioctl_get_clock_counters(struct file *filep, - struct timespec64 time; - - dev = kfd_device_by_id(args->gpu_id); -- if (dev == NULL) -- return -EINVAL; -- -- /* Reading GPU clock counter from KGD */ -- args->gpu_clock_counter = -- dev->kfd2kgd->get_gpu_clock_counter(dev->kgd); -+ if (dev) -+ /* Reading GPU clock counter from KGD */ -+ args->gpu_clock_counter = -+ dev->kfd2kgd->get_gpu_clock_counter(dev->kgd); -+ else -+ /* Node without GPU resource */ -+ args->gpu_clock_counter = 0; - - /* No access to rdtsc. 
Using raw monotonic time */ - getrawmonotonic64(&time); -@@ -747,7 +843,7 @@ static int kfd_ioctl_get_process_apertures(struct file *filp, - - args->num_of_nodes = 0; - -- mutex_lock(&p->mutex); -+ down_write(&p->lock); - - /*if the process-device list isn't empty*/ - if (kfd_has_process_device_data(p)) { -@@ -786,52 +882,180 @@ static int kfd_ioctl_get_process_apertures(struct file *filp, - (args->num_of_nodes < NUM_OF_SUPPORTED_GPUS)); - } - -- mutex_unlock(&p->mutex); -+ up_write(&p->lock); - - return 0; - } - --static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p, -- void *data) -+static int kfd_ioctl_get_process_apertures_new(struct file *filp, -+ struct kfd_process *p, void *data) -+{ -+ struct kfd_ioctl_get_process_apertures_new_args *args = data; -+ struct kfd_process_device_apertures *pa; -+ struct kfd_process_device *pdd; -+ uint32_t nodes = 0; -+ int ret; -+ -+ dev_dbg(kfd_device, "get apertures for PASID %d", p->pasid); -+ -+ if (args->num_of_nodes == 0) { -+ /* Return number of nodes, so that user space can alloacate -+ * sufficient memory */ -+ down_write(&p->lock); -+ -+ if (!kfd_has_process_device_data(p)) { -+ up_write(&p->lock); -+ return 0; -+ } -+ -+ /* Run over all pdd of the process */ -+ pdd = kfd_get_first_process_device_data(p); -+ do { -+ args->num_of_nodes++; -+ } while ((pdd = -+ kfd_get_next_process_device_data(p, pdd)) != NULL); -+ -+ up_write(&p->lock); -+ return 0; -+ } -+ -+ /* Fill in process-aperture information for all available -+ * nodes, but not more than args->num_of_nodes as that is -+ * the amount of memory allocated by user */ -+ pa = kzalloc((sizeof(struct kfd_process_device_apertures) * -+ args->num_of_nodes), GFP_KERNEL); -+ if (!pa) -+ return -ENOMEM; -+ -+ down_write(&p->lock); -+ -+ if (!kfd_has_process_device_data(p)) { -+ up_write(&p->lock); -+ args->num_of_nodes = 0; -+ kfree(pa); -+ return 0; -+ } -+ -+ /* Run over all pdd of the process */ -+ pdd = kfd_get_first_process_device_data(p); -+ do { -+ pa[nodes].gpu_id = pdd->dev->id; -+ pa[nodes].lds_base = pdd->lds_base; -+ pa[nodes].lds_limit = pdd->lds_limit; -+ pa[nodes].gpuvm_base = pdd->gpuvm_base; -+ pa[nodes].gpuvm_limit = pdd->gpuvm_limit; -+ pa[nodes].scratch_base = pdd->scratch_base; -+ pa[nodes].scratch_limit = pdd->scratch_limit; -+ -+ dev_dbg(kfd_device, -+ "gpu id %u\n", pdd->dev->id); -+ dev_dbg(kfd_device, -+ "lds_base %llX\n", pdd->lds_base); -+ dev_dbg(kfd_device, -+ "lds_limit %llX\n", pdd->lds_limit); -+ dev_dbg(kfd_device, -+ "gpuvm_base %llX\n", pdd->gpuvm_base); -+ dev_dbg(kfd_device, -+ "gpuvm_limit %llX\n", pdd->gpuvm_limit); -+ dev_dbg(kfd_device, -+ "scratch_base %llX\n", pdd->scratch_base); -+ dev_dbg(kfd_device, -+ "scratch_limit %llX\n", pdd->scratch_limit); -+ nodes++; -+ } while ( -+ (pdd = kfd_get_next_process_device_data(p, pdd)) != NULL && -+ (nodes < args->num_of_nodes)); -+ up_write(&p->lock); -+ -+ args->num_of_nodes = nodes; -+ ret = copy_to_user( -+ (void __user *)args->kfd_process_device_apertures_ptr, -+ pa, -+ (nodes * sizeof(struct kfd_process_device_apertures))); -+ kfree(pa); -+ return ret ? 
-EFAULT : 0; -+} -+ -+static int -+kfd_ioctl_create_event(struct file *filp, struct kfd_process *p, void *data) - { - struct kfd_ioctl_create_event_args *args = data; -- int err; -+ struct kfd_dev *kfd; -+ struct kfd_process_device *pdd; -+ int err = -EINVAL; -+ void *mem, *kern_addr = NULL; -+ -+ pr_debug("amdkfd: Event page offset 0x%llx\n", args->event_page_offset); -+ -+ if (args->event_page_offset) { -+ kfd = kfd_device_by_id(GET_GPU_ID(args->event_page_offset)); -+ if (!kfd) { -+ pr_err("amdkfd: can't find kfd device\n"); -+ return -EFAULT; -+ } -+ if (KFD_IS_DGPU(kfd->device_info->asic_family)) { -+ down_write(&p->lock); -+ pdd = kfd_bind_process_to_device(kfd, p); -+ if (IS_ERR(pdd) < 0) { -+ err = PTR_ERR(pdd); -+ up_write(&p->lock); -+ return -EFAULT; -+ } -+ mem = kfd_process_device_translate_handle(pdd, -+ GET_IDR_HANDLE(args->event_page_offset)); -+ if (!mem) { -+ pr_err("amdkfd: can't find BO offset is 0x%llx\n", -+ args->event_page_offset); -+ up_write(&p->lock); -+ return -EFAULT; -+ } -+ up_write(&p->lock); -+ -+ /* Map dGPU gtt BO to kernel */ -+ kfd->kfd2kgd->map_gtt_bo_to_kernel(kfd->kgd, -+ mem, &kern_addr); -+ } -+ } - -- err = kfd_event_create(filp, p, args->event_type, -- args->auto_reset != 0, args->node_id, -- &args->event_id, &args->event_trigger_data, -- &args->event_page_offset, -- &args->event_slot_index); -+ err = kfd_event_create(filp, p, -+ args->event_type, -+ args->auto_reset != 0, -+ args->node_id, -+ &args->event_id, -+ &args->event_trigger_data, -+ &args->event_page_offset, -+ &args->event_slot_index, -+ kern_addr); - - return err; - } - --static int kfd_ioctl_destroy_event(struct file *filp, struct kfd_process *p, -- void *data) -+static int -+kfd_ioctl_destroy_event(struct file *filp, struct kfd_process *p, void *data) - { - struct kfd_ioctl_destroy_event_args *args = data; - - return kfd_event_destroy(p, args->event_id); - } - --static int kfd_ioctl_set_event(struct file *filp, struct kfd_process *p, -- void *data) -+static int -+kfd_ioctl_set_event(struct file *filp, struct kfd_process *p, void *data) - { - struct kfd_ioctl_set_event_args *args = data; - - return kfd_set_event(p, args->event_id); - } - --static int kfd_ioctl_reset_event(struct file *filp, struct kfd_process *p, -- void *data) -+static int -+kfd_ioctl_reset_event(struct file *filp, struct kfd_process *p, void *data) - { - struct kfd_ioctl_reset_event_args *args = data; - - return kfd_reset_event(p, args->event_id); - } - --static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, -- void *data) -+static int -+kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, void *data) - { - struct kfd_ioctl_wait_events_args *args = data; - enum kfd_event_wait_result wait_result; -@@ -846,6 +1070,711 @@ static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, - - return err; - } -+static int kfd_ioctl_alloc_scratch_memory(struct file *filep, -+ struct kfd_process *p, void *data) -+{ -+ struct kfd_ioctl_alloc_memory_of_gpu_args *args = -+ (struct kfd_ioctl_alloc_memory_of_gpu_args *)data; -+ struct kfd_process_device *pdd; -+ struct kfd_dev *dev; -+ long err; -+ -+ if (args->size == 0) -+ return -EINVAL; -+ -+ dev = kfd_device_by_id(args->gpu_id); -+ if (dev == NULL) -+ return -EINVAL; -+ -+ down_write(&p->lock); -+ -+ pdd = kfd_bind_process_to_device(dev, p); -+ if (IS_ERR(pdd) < 0) { -+ err = PTR_ERR(pdd); -+ goto bind_process_to_device_fail; -+ } -+ -+ pdd->sh_hidden_private_base_vmid = args->va_addr; -+ pdd->qpd.sh_hidden_private_base = 
args->va_addr; -+ -+ up_write(&p->lock); -+ -+ if (sched_policy == KFD_SCHED_POLICY_NO_HWS && pdd->qpd.vmid != 0) { -+ err = dev->kfd2kgd->alloc_memory_of_scratch( -+ dev->kgd, args->va_addr, pdd->qpd.vmid); -+ if (err != 0) -+ goto alloc_memory_of_scratch_failed; -+ } -+ -+ return 0; -+ -+bind_process_to_device_fail: -+ up_write(&p->lock); -+alloc_memory_of_scratch_failed: -+ return -EFAULT; -+} -+ -+static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, -+ struct kfd_process *p, void *data) -+{ -+ struct kfd_ioctl_alloc_memory_of_gpu_args *args = data; -+ struct kfd_process_device *pdd; -+ void *mem; -+ struct kfd_dev *dev; -+ int idr_handle; -+ long err; -+ -+ if (args->size == 0) -+ return -EINVAL; -+ -+ dev = kfd_device_by_id(args->gpu_id); -+ if (dev == NULL) -+ return -EINVAL; -+ -+ down_write(&p->lock); -+ pdd = kfd_bind_process_to_device(dev, p); -+ up_write(&p->lock); -+ if (IS_ERR(pdd) < 0) -+ return PTR_ERR(pdd); -+ -+ err = dev->kfd2kgd->alloc_memory_of_gpu( -+ dev->kgd, args->va_addr, args->size, -+ pdd->vm, (struct kgd_mem **) &mem, NULL, NULL, pdd, 0); -+ -+ if (err != 0) -+ return err; -+ -+ down_write(&p->lock); -+ idr_handle = kfd_process_device_create_obj_handle(pdd, mem, -+ args->va_addr, args->size); -+ up_write(&p->lock); -+ if (idr_handle < 0) { -+ dev->kfd2kgd->free_memory_of_gpu(dev->kgd, -+ (struct kgd_mem *) mem); -+ return -EFAULT; -+ } -+ -+ args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); -+ -+ return 0; -+} -+ -+bool kfd_is_large_bar(struct kfd_dev *dev) -+{ -+ struct kfd_local_mem_info mem_info; -+ -+ if (debug_largebar) { -+ pr_debug("amdkfd: simulate large-bar allocation on non large-bar machine\n"); -+ return true; -+ } -+ -+ if (!KFD_IS_DGPU(dev->device_info->asic_family)) -+ return false; -+ -+ dev->kfd2kgd->get_local_mem_info(dev->kgd, &mem_info); -+ if (mem_info.local_mem_size_private == 0 && -+ mem_info.local_mem_size_public > 0) -+ return true; -+ return false; -+} -+ -+static uint32_t kfd_convert_user_mem_alloction_flags( -+ struct kfd_dev *dev, -+ uint32_t userspace_flags) -+{ -+ uint32_t kernel_allocation_flags; -+ -+ kernel_allocation_flags = 0; -+ -+ /* Allocate VRAM bo */ -+ if ((userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE) || -+ (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_APU_DEVICE)) { -+ kernel_allocation_flags = ALLOC_MEM_FLAGS_VRAM; -+ if ((userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE) && -+ kfd_is_large_bar(dev)) -+ kernel_allocation_flags |= ALLOC_MEM_FLAGS_PUBLIC; -+ goto out; -+ } -+ /* -+ * Since currently user space library doesn't uses scratch -+ * allocation flag I route it to VRAM -+ */ -+ if ((userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_SCRATCH) || -+ (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_APU_SCRATCH)) { -+ kernel_allocation_flags = ALLOC_MEM_FLAGS_VRAM; -+ goto out; -+ } -+ /* -+ * The current usage for *_HOST allocation flags are for GTT memory -+ * Need to verify if we're node zero or we want to allocate bo on -+ * public domain for P2P buffers. 
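A hypothetical user-space view of the allocation path these flags feed (the *_NEW ioctl handled further below): the ioctl and flag names appear in this patch, but the args struct layout is assumed from the matching uapi header, which is not part of this hunk, and error handling is reduced to the minimum.

#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kfd_ioctl.h>	/* assumed uapi header for this patch set */

static void *alloc_and_map_vram(int kfd_fd, uint32_t gpu_id,
				uint64_t va, uint64_t size)
{
	struct kfd_ioctl_alloc_memory_of_gpu_new_args args = {
		.va_addr = va,
		.size    = size,
		.gpu_id  = gpu_id,
		.flags   = KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE,
	};
	void *p;

	if (ioctl(kfd_fd, AMDKFD_IOC_ALLOC_MEMORY_OF_GPU_NEW, &args) < 0)
		return NULL;

	/* mmap_offset comes back 0 on small-BAR boards, where the VRAM BO
	 * is not CPU-visible; otherwise it encodes type, GPU id and BO
	 * offset for kfd_mmap() to decode. */
	if (!args.mmap_offset)
		return NULL;

	p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
		 kfd_fd, args.mmap_offset);
	return p == MAP_FAILED ? NULL : p;
}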
-+ */ -+ if (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_HOST) { -+ kernel_allocation_flags = ALLOC_MEM_FLAGS_GTT; -+ goto out; -+ } -+ /* Allocate userptr BO */ -+ if (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) { -+ kernel_allocation_flags = ALLOC_MEM_FLAGS_USERPTR; -+ goto out; -+ } -+ -+out: -+ if (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_AQL_QUEUE_MEM) -+ kernel_allocation_flags |= ALLOC_MEM_FLAGS_AQL_QUEUE_MEM; -+ /* Current HW doesn't support non paged memory */ -+ kernel_allocation_flags |= ALLOC_MEM_FLAGS_NONPAGED; -+ /* -+ * Set by default execute access as this buffer might be allocated -+ * for CP's ring buffer -+ */ -+ kernel_allocation_flags |= ALLOC_MEM_FLAGS_EXECUTE_ACCESS; -+ kernel_allocation_flags |= ALLOC_MEM_FLAGS_NO_SUBSTITUTE; -+ -+ pr_debug("amdkfd: user allocation flags 0x%x kernel allocation flags: 0x%x\n", -+ userspace_flags, kernel_allocation_flags); -+ -+ return kernel_allocation_flags; -+} -+ -+static int kfd_ioctl_alloc_memory_of_gpu_new(struct file *filep, -+ struct kfd_process *p, void *data) -+{ -+ struct kfd_ioctl_alloc_memory_of_gpu_new_args *args = data; -+ struct kfd_process_device *pdd; -+ void *mem; -+ struct kfd_dev *dev; -+ int idr_handle; -+ long err; -+ uint64_t offset; -+ -+ if (args->size == 0) -+ return -EINVAL; -+ -+ dev = kfd_device_by_id(args->gpu_id); -+ if (dev == NULL) -+ return -EINVAL; -+ -+ down_write(&p->lock); -+ pdd = kfd_bind_process_to_device(dev, p); -+ up_write(&p->lock); -+ if (IS_ERR(pdd) < 0) -+ return PTR_ERR(pdd); -+ -+ offset = args->mmap_offset; -+ err = dev->kfd2kgd->alloc_memory_of_gpu( -+ dev->kgd, args->va_addr, args->size, -+ pdd->vm, (struct kgd_mem **) &mem, &offset, -+ NULL, pdd, -+ kfd_convert_user_mem_alloction_flags(dev, args->flags)); -+ -+ if (err != 0) -+ return err; -+ -+ down_write(&p->lock); -+ idr_handle = kfd_process_device_create_obj_handle(pdd, mem, -+ args->va_addr, args->size); -+ up_write(&p->lock); -+ if (idr_handle < 0) { -+ dev->kfd2kgd->free_memory_of_gpu(dev->kgd, -+ (struct kgd_mem *) mem); -+ return -EFAULT; -+ } -+ -+ args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); -+ if ((args->flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE) != 0 && -+ !kfd_is_large_bar(dev)) { -+ args->mmap_offset = 0; -+ } else { -+ args->mmap_offset = KFD_MMAP_TYPE_MAP_BO; -+ args->mmap_offset |= KFD_MMAP_GPU_ID(args->gpu_id); -+ args->mmap_offset <<= PAGE_SHIFT; -+ args->mmap_offset |= offset; -+ } -+ -+ return 0; -+} -+ -+static int kfd_ioctl_free_memory_of_gpu(struct file *filep, -+ struct kfd_process *p, void *data) -+{ -+ struct kfd_ioctl_free_memory_of_gpu_args *args = data; -+ struct kfd_process_device *pdd; -+ struct kfd_bo *buf_obj; -+ struct kfd_dev *dev; -+ int ret; -+ -+ dev = kfd_device_by_id(GET_GPU_ID(args->handle)); -+ if (dev == NULL) -+ return -EINVAL; -+ -+ down_write(&p->lock); -+ -+ pdd = kfd_get_process_device_data(dev, p); -+ if (!pdd) { -+ pr_err("Process device data doesn't exist\n"); -+ ret = -EINVAL; -+ goto err_unlock; -+ } -+ -+ buf_obj = kfd_process_device_find_bo(pdd, -+ GET_IDR_HANDLE(args->handle)); -+ if (buf_obj == NULL) { -+ ret = -EINVAL; -+ goto err_unlock; -+ } -+ run_rdma_free_callback(buf_obj); -+ -+ up_write(&p->lock); -+ -+ ret = dev->kfd2kgd->free_memory_of_gpu(dev->kgd, buf_obj->mem); -+ -+ /* If freeing the buffer failed, leave the handle in place for -+ * clean-up during process tear-down. 
*/ -+ if (ret == 0) { -+ down_write(&p->lock); -+ kfd_process_device_remove_obj_handle( -+ pdd, GET_IDR_HANDLE(args->handle)); -+ up_write(&p->lock); -+ } -+ -+ return ret; -+ -+err_unlock: -+ up_write(&p->lock); -+ return ret; -+} -+ -+int kfd_map_memory_to_gpu(struct kfd_dev *dev, void *mem, -+ struct kfd_process *p, struct kfd_process_device *pdd) -+{ -+ int err; -+ -+ BUG_ON(!dev); -+ BUG_ON(!pdd); -+ -+ err = dev->kfd2kgd->map_memory_to_gpu( -+ dev->kgd, (struct kgd_mem *) mem, pdd->vm); -+ -+ if (err != 0) -+ return err; -+ -+ radeon_flush_tlb(dev, p->pasid); -+ -+ err = dev->dqm->ops.set_page_directory_base(dev->dqm, &pdd->qpd); -+ if (err != 0) { -+ dev->kfd2kgd->unmap_memory_to_gpu(dev->kgd, -+ (struct kgd_mem *) mem, pdd->vm); -+ return err; -+ } -+ -+ return 0; -+} -+ -+static int kfd_ioctl_map_memory_to_gpu(struct file *filep, -+ struct kfd_process *p, void *data) -+{ -+ struct kfd_ioctl_map_memory_to_gpu_new_args *args = data; -+ struct kfd_process_device *pdd, *peer_pdd; -+ void *mem; -+ struct kfd_dev *dev, *peer; -+ long err = 0; -+ int i, num_dev; -+ uint32_t *devices_arr = NULL; -+ int bo_size; -+ -+ dev = kfd_device_by_id(GET_GPU_ID(args->handle)); -+ if (dev == NULL) -+ return -EINVAL; -+ -+ if (args->device_ids_array_size > 0 && -+ (args->device_ids_array_size < sizeof(uint32_t))) { -+ pr_err("amdkfd: err node IDs array size %u\n", -+ args->device_ids_array_size); -+ return -EFAULT; -+ } -+ -+ if (args->device_ids_array_size > 0) { -+ devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL); -+ if (!devices_arr) -+ return -ENOMEM; -+ -+ err = copy_from_user(devices_arr, -+ (void __user *)args->device_ids_array, -+ args->device_ids_array_size); -+ if (err != 0) { -+ err = -EFAULT; -+ goto copy_from_user_failed; -+ } -+ } -+ -+ down_write(&p->lock); -+ -+ pdd = kfd_bind_process_to_device(dev, p); -+ if (IS_ERR(pdd) < 0) { -+ err = PTR_ERR(pdd); -+ goto bind_process_to_device_failed; -+ } -+ -+ mem = kfd_process_device_translate_handle(pdd, -+ GET_IDR_HANDLE(args->handle)); -+ up_write(&p->lock); -+ -+ if (mem == NULL) { -+ err = PTR_ERR(mem); -+ goto get_mem_obj_from_handle_failed; -+ } -+ -+ if (args->device_ids_array_size > 0) { -+ num_dev = args->device_ids_array_size / sizeof(uint32_t); -+ for (i = 0 ; i < num_dev; i++) { -+ peer = kfd_device_by_id(devices_arr[i]); -+ if (!peer) { -+ pr_err("amdkfd: didn't found kfd-dev for 0x%x\n", -+ devices_arr[i]); -+ err = -EFAULT; -+ goto get_mem_obj_from_handle_failed; -+ } -+ down_write(&p->lock); -+ peer_pdd = kfd_bind_process_to_device(peer, p); -+ up_write(&p->lock); -+ if (!peer_pdd) { -+ err = -EFAULT; -+ goto get_mem_obj_from_handle_failed; -+ } -+ err = kfd_map_memory_to_gpu(peer, mem, p, peer_pdd); -+ if (err != 0) -+ pr_err("amdkfd: failed to map\n"); -+ } -+ } else { -+ err = kfd_map_memory_to_gpu(dev, mem, p, pdd); -+ if (err != 0) -+ pr_err("amdkfd: failed to map\n"); -+ } -+ -+ bo_size = dev->kfd2kgd->return_bo_size(dev->kgd, mem); -+ down_write(&p->lock); -+ pdd->mapped_size += bo_size; -+ up_write(&p->lock); -+ -+ if (args->device_ids_array_size > 0 && devices_arr) -+ kfree(devices_arr); -+ -+ return err; -+ -+bind_process_to_device_failed: -+ up_write(&p->lock); -+get_mem_obj_from_handle_failed: -+copy_from_user_failed: -+ kfree(devices_arr); -+ return err; -+} -+ -+static int kfd_ioctl_map_memory_to_gpu_wrapper(struct file *filep, -+ struct kfd_process *p, void *data) -+{ -+ struct kfd_ioctl_map_memory_to_gpu_args *args = data; -+ struct kfd_ioctl_map_memory_to_gpu_new_args new_args; -+ -+ 
new_args.handle = args->handle; -+ new_args.device_ids_array = NULL; -+ new_args.device_ids_array_size = 0; -+ -+ return kfd_ioctl_map_memory_to_gpu(filep, p, &new_args); -+} -+ -+static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, -+ struct kfd_process *p, void *data) -+{ -+ struct kfd_ioctl_unmap_memory_from_gpu_new_args *args = data; -+ struct kfd_process_device *pdd, *peer_pdd; -+ void *mem; -+ struct kfd_dev *dev, *peer; -+ long err = 0; -+ uint32_t *devices_arr = NULL, num_dev, i; -+ int bo_size; -+ -+ dev = kfd_device_by_id(GET_GPU_ID(args->handle)); -+ if (dev == NULL) -+ return -EINVAL; -+ -+ if (args->device_ids_array_size > 0 && -+ (args->device_ids_array_size < sizeof(uint32_t))) { -+ pr_err("amdkfd: err node IDs array size %u\n", -+ args->device_ids_array_size); -+ return -EFAULT; -+ } -+ -+ if (args->device_ids_array_size > 0) { -+ devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL); -+ if (!devices_arr) -+ return -ENOMEM; -+ -+ err = copy_from_user(devices_arr, -+ (void __user *)args->device_ids_array, -+ args->device_ids_array_size); -+ if (err != 0) { -+ err = -EFAULT; -+ goto copy_from_user_failed; -+ } -+ } -+ -+ down_write(&p->lock); -+ -+ pdd = kfd_get_process_device_data(dev, p); -+ if (!pdd) { -+ pr_err("Process device data doesn't exist\n"); -+ err = PTR_ERR(pdd); -+ goto bind_process_to_device_failed; -+ } -+ -+ mem = kfd_process_device_translate_handle(pdd, -+ GET_IDR_HANDLE(args->handle)); -+ up_write(&p->lock); -+ -+ if (mem == NULL) { -+ err = PTR_ERR(mem); -+ goto get_mem_obj_from_handle_failed; -+ } -+ -+ if (args->device_ids_array_size > 0) { -+ num_dev = args->device_ids_array_size / sizeof(uint32_t); -+ for (i = 0 ; i < num_dev; i++) { -+ peer = kfd_device_by_id(devices_arr[i]); -+ if (!peer) { -+ err = -EFAULT; -+ goto get_mem_obj_from_handle_failed; -+ } -+ down_write(&p->lock); -+ peer_pdd = kfd_get_process_device_data(peer, p); -+ up_write(&p->lock); -+ if (!peer_pdd) { -+ err = -EFAULT; -+ goto get_mem_obj_from_handle_failed; -+ } -+ peer->kfd2kgd->unmap_memory_to_gpu(peer->kgd, -+ mem, peer_pdd->vm); -+ radeon_flush_tlb(peer, p->pasid); -+ } -+ } else { -+ dev->kfd2kgd->unmap_memory_to_gpu(dev->kgd, mem, pdd->vm); -+ radeon_flush_tlb(dev, p->pasid); -+ } -+ -+ bo_size = dev->kfd2kgd->return_bo_size(dev->kgd, mem); -+ down_write(&p->lock); -+ pdd->mapped_size -= bo_size; -+ up_write(&p->lock); -+ -+ return 0; -+ -+bind_process_to_device_failed: -+ up_write(&p->lock); -+get_mem_obj_from_handle_failed: -+copy_from_user_failed: -+ kfree(devices_arr); -+ return err; -+} -+ -+static int kfd_ioctl_unmap_memory_from_gpu_wrapper(struct file *filep, -+ struct kfd_process *p, void *data) -+{ -+ struct kfd_ioctl_unmap_memory_from_gpu_args *args = data; -+ struct kfd_ioctl_unmap_memory_from_gpu_new_args new_args; -+ -+ new_args.handle = args->handle; -+ new_args.device_ids_array = NULL; -+ new_args.device_ids_array_size = 0; -+ -+ return kfd_ioctl_unmap_memory_from_gpu(filep, p, &new_args); -+} -+ -+static int kfd_ioctl_open_graphic_handle(struct file *filep, -+ struct kfd_process *p, -+ void *data) -+{ -+ struct kfd_ioctl_open_graphic_handle_args *args = data; -+ struct kfd_dev *dev; -+ struct kfd_process_device *pdd; -+ void *mem; -+ int idr_handle; -+ long err; -+ -+ dev = kfd_device_by_id(args->gpu_id); -+ if (dev == NULL) -+ return -EINVAL; -+ -+ if (dev->device_info->asic_family != CHIP_KAVERI) { -+ pr_debug("kfd_ioctl_open_graphic_handle only supported on KV\n"); -+ return -EINVAL; -+ } -+ -+ down_write(&p->lock); -+ pdd = 
kfd_bind_process_to_device(dev, p); -+ up_write(&p->lock); -+ if (IS_ERR(pdd) < 0) -+ return PTR_ERR(pdd); -+ -+ err = dev->kfd2kgd->open_graphic_handle(dev->kgd, -+ args->va_addr, -+ (struct kgd_vm *) pdd->vm, -+ args->graphic_device_fd, -+ args->graphic_handle, -+ (struct kgd_mem **) &mem); -+ -+ if (err != 0) -+ return err; -+ -+ down_write(&p->lock); -+ /*TODO: When open_graphic_handle is implemented, we need to create -+ * the corresponding interval tree. We need to know the size of -+ * the buffer through open_graphic_handle(). We use 1 for now.*/ -+ idr_handle = kfd_process_device_create_obj_handle(pdd, mem, -+ args->va_addr, 1); -+ up_write(&p->lock); -+ if (idr_handle < 0) { -+ /* FIXME: destroy_process_gpumem doesn't seem to be -+ * implemented anywhere */ -+ dev->kfd2kgd->destroy_process_gpumem(dev->kgd, mem); -+ return -EFAULT; -+ } -+ -+ args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); -+ -+ return 0; -+} -+ -+static int kfd_ioctl_set_process_dgpu_aperture(struct file *filep, -+ struct kfd_process *p, void *data) -+{ -+ struct kfd_ioctl_set_process_dgpu_aperture_args *args = data; -+ struct kfd_dev *dev; -+ struct kfd_process_device *pdd; -+ long err; -+ -+ dev = kfd_device_by_id(args->gpu_id); -+ if (dev == NULL) -+ return -EINVAL; -+ -+ down_write(&p->lock); -+ -+ pdd = kfd_bind_process_to_device(dev, p); -+ if (IS_ERR(pdd) < 0) { -+ err = PTR_ERR(pdd); -+ goto exit; -+ } -+ -+ err = kfd_set_process_dgpu_aperture(pdd, args->dgpu_base, -+ args->dgpu_limit); -+ -+exit: -+ up_write(&p->lock); -+ return err; -+} -+ -+static int kfd_ioctl_get_dmabuf_info(struct file *filep, -+ struct kfd_process *p, void *data) -+{ -+ struct kfd_ioctl_get_dmabuf_info_args *args = data; -+ struct kfd_dev *dev = NULL; -+ struct kgd_dev *dma_buf_kgd; -+ void *metadata_buffer = NULL; -+ uint32_t flags; -+ unsigned i; -+ int r; -+ -+ /* Find a KFD GPU device that supports the get_dmabuf_info query */ -+ for (i = 0; kfd_topology_enum_kfd_devices(i, &dev) == 0; i++) -+ if (dev && dev->kfd2kgd->get_dmabuf_info) -+ break; -+ if (!dev) -+ return -EINVAL; -+ -+ if (args->metadata_ptr) { -+ metadata_buffer = kzalloc(args->metadata_size, GFP_KERNEL); -+ if (!metadata_buffer) -+ return -ENOMEM; -+ } -+ -+ /* Get dmabuf info from KGD */ -+ r = dev->kfd2kgd->get_dmabuf_info(dev->kgd, args->dmabuf_fd, -+ &dma_buf_kgd, &args->size, -+ metadata_buffer, args->metadata_size, -+ &args->metadata_size, &flags); -+ if (r) -+ goto exit; -+ -+ /* Reverse-lookup gpu_id from kgd pointer */ -+ dev = kfd_device_by_kgd(dma_buf_kgd); -+ if (!dev) { -+ r = -EINVAL; -+ goto exit; -+ } -+ args->gpu_id = kfd_get_gpu_id(dev); -+ -+ /* Translate flags */ -+ if (flags & ALLOC_MEM_FLAGS_VRAM) { -+ args->flags = KFD_IS_DGPU(dev->device_info->asic_family) ? 
-+ KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE : -+ KFD_IOC_ALLOC_MEM_FLAGS_APU_DEVICE; -+ } else -+ args->flags = KFD_IOC_ALLOC_MEM_FLAGS_DGPU_HOST; -+ -+ /* Copy metadata buffer to user mode */ -+ if (metadata_buffer) { -+ r = copy_to_user((void __user *)args->metadata_ptr, -+ metadata_buffer, args->metadata_size); -+ if (r != 0) -+ r = -EFAULT; -+ } -+ -+exit: -+ kfree(metadata_buffer); -+ -+ return r; -+} -+ -+static int kfd_ioctl_import_dmabuf(struct file *filep, -+ struct kfd_process *p, void *data) -+{ -+ struct kfd_ioctl_import_dmabuf_args *args = data; -+ struct kfd_dev *dev; -+ struct kfd_process_device *pdd; -+ void *mem; -+ uint64_t size; -+ int idr_handle; -+ int r; -+ -+ dev = kfd_device_by_id(args->gpu_id); -+ if (!dev || !dev->kfd2kgd->import_dmabuf) -+ return -EINVAL; -+ -+ down_write(&p->lock); -+ pdd = kfd_bind_process_to_device(dev, p); -+ up_write(&p->lock); -+ if (IS_ERR(pdd) < 0) -+ return PTR_ERR(pdd); -+ -+ r = dev->kfd2kgd->import_dmabuf(dev->kgd, args->dmabuf_fd, -+ args->va_addr, pdd->vm, -+ (struct kgd_mem **)&mem, &size); -+ if (r) -+ return r; -+ -+ down_write(&p->lock); -+ idr_handle = kfd_process_device_create_obj_handle(pdd, mem, -+ args->va_addr, size); -+ up_write(&p->lock); -+ if (idr_handle < 0) { -+ dev->kfd2kgd->free_memory_of_gpu(dev->kgd, -+ (struct kgd_mem *)mem); -+ return -EFAULT; -+ } -+ -+ args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); -+ -+ return 0; -+} - - #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ - [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, .cmd_drv = 0, .name = #ioctl} -@@ -899,10 +1828,65 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { - - AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_WAVE_CONTROL, - kfd_ioctl_dbg_wave_control, 0), -+ -+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, -+ kfd_ioctl_alloc_memory_of_gpu, 0), -+ -+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_FREE_MEMORY_OF_GPU, -+ kfd_ioctl_free_memory_of_gpu, 0), -+ -+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_MAP_MEMORY_TO_GPU, -+ kfd_ioctl_map_memory_to_gpu_wrapper, 0), -+ -+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, -+ kfd_ioctl_unmap_memory_from_gpu_wrapper, 0), -+ -+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_OPEN_GRAPHIC_HANDLE, -+ kfd_ioctl_open_graphic_handle, 0), -+ -+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH, -+ kfd_ioctl_alloc_scratch_memory, 0), -+ -+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_CU_MASK, -+ kfd_ioctl_set_cu_mask, 0), -+ -+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE, -+ kfd_ioctl_set_process_dgpu_aperture, 0), -+ -+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_TRAP_HANDLER, -+ kfd_ioctl_set_trap_handler, 0), -+ -+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_GPU_NEW, -+ kfd_ioctl_alloc_memory_of_gpu_new, 0), -+ -+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_MAP_MEMORY_TO_GPU_NEW, -+ kfd_ioctl_map_memory_to_gpu, 0), -+ -+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU_NEW, -+ kfd_ioctl_unmap_memory_from_gpu, 0), -+ -+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_PROCESS_APERTURES_NEW, -+ kfd_ioctl_get_process_apertures_new, 0), -+ -+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_EVICT_MEMORY, -+ kfd_evict, 0), -+ -+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_DMABUF_INFO, -+ kfd_ioctl_get_dmabuf_info, 0), -+ -+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_IMPORT_DMABUF, -+ kfd_ioctl_import_dmabuf, 0) - }; - - #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) - -+static int kfd_evict(struct file *filep, struct kfd_process *p, void *data) -+{ -+ struct kfd_ioctl_eviction_args *args = data; -+ -+ return evict_size(p, args->size, args->type); -+ -+} - static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long 
arg) - { - struct kfd_process *process; -@@ -994,20 +1978,37 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) - static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) - { - struct kfd_process *process; -+ struct kfd_dev *kfd; -+ unsigned long vm_pgoff; -+ int retval; - - process = kfd_get_process(current); - if (IS_ERR(process)) - return PTR_ERR(process); - -- if ((vma->vm_pgoff & KFD_MMAP_DOORBELL_MASK) == -- KFD_MMAP_DOORBELL_MASK) { -- vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_DOORBELL_MASK; -+ vm_pgoff = vma->vm_pgoff; -+ vma->vm_pgoff = KFD_MMAP_OFFSET_VALUE_GET(vma->vm_pgoff); -+ -+ switch (vm_pgoff & KFD_MMAP_TYPE_MASK) { -+ case KFD_MMAP_TYPE_DOORBELL: - return kfd_doorbell_mmap(process, vma); -- } else if ((vma->vm_pgoff & KFD_MMAP_EVENTS_MASK) == -- KFD_MMAP_EVENTS_MASK) { -- vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_EVENTS_MASK; -+ -+ case KFD_MMAP_TYPE_EVENTS: - return kfd_event_mmap(process, vma); -+ -+ case KFD_MMAP_TYPE_MAP_BO: -+ kfd = kfd_device_by_id(KFD_MMAP_GPU_ID_GET(vm_pgoff)); -+ if (!kfd) -+ return -EFAULT; -+ retval = kfd->kfd2kgd->mmap_bo(kfd->kgd, vma); -+ return retval; -+ -+ case KFD_MMAP_TYPE_RESERVED_MEM: -+ return kfd_reserved_mem_mmap(process, vma); -+ - } - - return -EFAULT; - } -+ -+ -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c -new file mode 100644 -index 0000000..b3d4a50 ---- /dev/null -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c -@@ -0,0 +1,1163 @@ -+#include <linux/kernel.h> -+#include <linux/acpi.h> -+#include <linux/mm.h> -+#include <linux/amd-iommu.h> -+#include <linux/pci.h> -+#include "kfd_crat.h" -+#include "kfd_priv.h" -+#include "kfd_topology.h" -+ -+/* GPU Processor ID base for dGPUs for which VCRAT needs to be created. -+ * GPU processor ID are expressed with Bit[31]=1. -+ * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs -+ * used in the CRAT. 
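A worked example of the ID scheme described here, using illustrative CU counts:

/*
 * Two GPUs with 64 CUs each would get, from the allocator just below:
 *
 *	get_and_inc_gpu_processor_id(64) -> 0x80001000   (first GPU)
 *	get_and_inc_gpu_processor_id(64) -> 0x80001040   (second GPU)
 *
 * so each GPU's SIMD ID range stays unique and keeps Bit[31] set.
 */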
*/ -+static uint32_t gpu_processor_id_low = 0x80001000; -+ -+/* Return the next available gpu_processor_id and increment it for next GPU -+ * @total_cu_count - Total CUs present in the GPU including ones masked off -+ */ -+static inline unsigned int get_and_inc_gpu_processor_id( -+ unsigned int total_cu_count) -+{ -+ int current_id = gpu_processor_id_low; -+ -+ gpu_processor_id_low += total_cu_count; -+ return current_id; -+} -+ -+/* Static table to describe GPU Cache information */ -+struct kfd_gpu_cache_info { -+ uint32_t cache_size; -+ uint32_t cache_level; -+ uint32_t flags; -+ /* Indicates how many Compute Units share this cache -+ * Value = 1 indicates the cache is not shared */ -+ uint32_t num_cu_shared; -+}; -+ -+static struct kfd_gpu_cache_info kaveri_cache_info[] = { -+ { -+ /* TCP L1 Cache per CU */ -+ .cache_size = 16, -+ .cache_level = 1, -+ .flags = (CRAT_CACHE_FLAGS_ENABLED | -+ CRAT_CACHE_FLAGS_DATA_CACHE | -+ CRAT_CACHE_FLAGS_SIMD_CACHE), -+ .num_cu_shared = 1, -+ -+ }, -+ { -+ /* Scalar L1 Instruction Cache (in SQC module) per bank */ -+ .cache_size = 16, -+ .cache_level = 1, -+ .flags = (CRAT_CACHE_FLAGS_ENABLED | -+ CRAT_CACHE_FLAGS_INST_CACHE | -+ CRAT_CACHE_FLAGS_SIMD_CACHE), -+ .num_cu_shared = 2, -+ }, -+ { -+ /* Scalar L1 Data Cache (in SQC module) per bank */ -+ .cache_size = 8, -+ .cache_level = 1, -+ .flags = (CRAT_CACHE_FLAGS_ENABLED | -+ CRAT_CACHE_FLAGS_DATA_CACHE | -+ CRAT_CACHE_FLAGS_SIMD_CACHE), -+ .num_cu_shared = 2, -+ }, -+ -+ /* TODO: Add L2 Cache information */ -+}; -+ -+ -+static struct kfd_gpu_cache_info carrizo_cache_info[] = { -+ { -+ /* TCP L1 Cache per CU */ -+ .cache_size = 16, -+ .cache_level = 1, -+ .flags = (CRAT_CACHE_FLAGS_ENABLED | -+ CRAT_CACHE_FLAGS_DATA_CACHE | -+ CRAT_CACHE_FLAGS_SIMD_CACHE), -+ .num_cu_shared = 1, -+ }, -+ { -+ /* Scalar L1 Instruction Cache (in SQC module) per bank */ -+ .cache_size = 8, -+ .cache_level = 1, -+ .flags = (CRAT_CACHE_FLAGS_ENABLED | -+ CRAT_CACHE_FLAGS_INST_CACHE | -+ CRAT_CACHE_FLAGS_SIMD_CACHE), -+ .num_cu_shared = 4, -+ }, -+ { -+ /* Scalar L1 Data Cache (in SQC module) per bank. */ -+ .cache_size = 4, -+ .cache_level = 1, -+ .flags = (CRAT_CACHE_FLAGS_ENABLED | -+ CRAT_CACHE_FLAGS_DATA_CACHE | -+ CRAT_CACHE_FLAGS_SIMD_CACHE), -+ .num_cu_shared = 4, -+ }, -+ -+ /* TODO: Add L2 Cache information */ -+}; -+ -+/* NOTE: In future if more information is added to struct kfd_gpu_cache_info -+ * the following ASICs may need a separate table. 
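The two aliases just below are the cheap path for ASICs whose cache geometry matches Carrizo. A hypothetical ASIC with different geometry would instead add its own table, plus a matching case in kfd_fill_gpu_cache_info() further down. Placeholder values only, not real hardware data:

static struct kfd_gpu_cache_info new_asic_cache_info[] = {
	{
		/* TCP L1 cache per CU -- placeholder numbers */
		.cache_size = 16,
		.cache_level = 1,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
			  CRAT_CACHE_FLAGS_DATA_CACHE |
			  CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
};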
*/ -+#define tonga_cache_info carrizo_cache_info -+#define fiji_cache_info carrizo_cache_info -+ -+static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, -+ struct crat_subtype_computeunit *cu) -+{ -+ BUG_ON(!dev); -+ BUG_ON(!cu); -+ -+ dev->node_props.cpu_cores_count = cu->num_cpu_cores; -+ dev->node_props.cpu_core_id_base = cu->processor_id_low; -+ if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) -+ dev->node_props.capability |= HSA_CAP_ATS_PRESENT; -+ -+ pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, -+ cu->processor_id_low); -+} -+ -+static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, -+ struct crat_subtype_computeunit *cu) -+{ -+ BUG_ON(!dev); -+ BUG_ON(!cu); -+ -+ dev->node_props.simd_id_base = cu->processor_id_low; -+ dev->node_props.simd_count = cu->num_simd_cores; -+ dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; -+ dev->node_props.max_waves_per_simd = cu->max_waves_simd; -+ dev->node_props.wave_front_size = cu->wave_front_size; -+ dev->node_props.array_count = cu->array_count; -+ dev->node_props.cu_per_simd_array = cu->num_cu_per_array; -+ dev->node_props.simd_per_cu = cu->num_simd_per_cu; -+ dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; -+ if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) -+ dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; -+ pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low); -+} -+ -+/* kfd_parse_subtype_cu - parse compute unit subtypes and attach it to correct -+ * topology device present in the device_list -+ */ -+static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu, -+ struct list_head *device_list) -+{ -+ struct kfd_topology_device *dev; -+ -+ BUG_ON(!cu); -+ -+ pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", -+ cu->proximity_domain, cu->hsa_capability); -+ list_for_each_entry(dev, device_list, list) { -+ if (cu->proximity_domain == dev->proximity_domain) { -+ if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) -+ kfd_populated_cu_info_cpu(dev, cu); -+ -+ if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) -+ kfd_populated_cu_info_gpu(dev, cu); -+ break; -+ } -+ } -+ -+ return 0; -+} -+ -+/* kfd_parse_subtype_mem - parse memory subtypes and attach it to correct -+ * topology device present in the device_list -+ */ -+static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem, -+ struct list_head *device_list) -+{ -+ struct kfd_mem_properties *props; -+ struct kfd_topology_device *dev; -+ -+ BUG_ON(!mem); -+ -+ pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n", -+ mem->proximity_domain); -+ list_for_each_entry(dev, device_list, list) { -+ if (mem->proximity_domain == dev->proximity_domain) { -+ props = kfd_alloc_struct(props); -+ if (props == NULL) -+ return -ENOMEM; -+ -+ /* -+ * We're on GPU node -+ */ -+ if (dev->node_props.cpu_cores_count == 0) { -+ /* APU */ -+ if (mem->visibility_type == 0) -+ props->heap_type = -+ HSA_MEM_HEAP_TYPE_FB_PRIVATE; -+ /* dGPU */ -+ else -+ props->heap_type = mem->visibility_type; -+ } -+ else -+ props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; -+ -+ if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) -+ props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; -+ if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) -+ props->flags |= HSA_MEM_FLAGS_NON_VOLATILE; -+ -+ props->size_in_bytes = -+ ((uint64_t)mem->length_high << 32) + -+ mem->length_low; -+ props->width = mem->width; -+ -+ dev->node_props.mem_banks_count++; -+ list_add_tail(&props->list, &dev->mem_props); -+ -+ break; -+ } -+ } -+ -+ return 0; -+} -+ 
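One detail worth pinning down from the function above: bank sizes travel as two 32-bit halves in the CRAT memory subtype and are reassembled with a shift-and-add. A standalone restatement with a worked value:

#include <stdint.h>

/* Restates the reassembly used in kfd_parse_subtype_mem() (illustrative). */
static uint64_t crat_mem_size(uint32_t length_high, uint32_t length_low)
{
	return ((uint64_t)length_high << 32) + length_low;
}

/* e.g. crat_mem_size(0x1, 0x0) == 1ULL << 32 -- a 4 GiB bank. */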
-+/* kfd_parse_subtype_cache - parse cache subtypes and attach it to correct
-+ * topology device present in the device_list
-+ */
-+static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache,
-+			struct list_head *device_list)
-+{
-+	struct kfd_cache_properties *props;
-+	struct kfd_topology_device *dev;
-+	uint32_t id;
-+	uint32_t total_num_of_cu;
-+
-+	BUG_ON(!cache);
-+
-+	id = cache->processor_id_low;
-+
-+	list_for_each_entry(dev, device_list, list) {
-+		total_num_of_cu = (dev->node_props.array_count *
-+					dev->node_props.cu_per_simd_array);
-+
-+		/* Cache information in CRAT doesn't have proximity_domain
-+		 * information as it is associated with a CPU core or GPU
-+		 * Compute Unit. So map the cache using CPU core ID or SIMD
-+		 * (GPU) ID.
-+		 * TODO: This works because currently we can safely assume that
-+		 *  Compute Units are parsed before caches are parsed. In the
-+		 *  future, remove this dependency.
-+		 */
-+		if ((id >= dev->node_props.cpu_core_id_base &&
-+			id <= dev->node_props.cpu_core_id_base +
-+				dev->node_props.cpu_cores_count) ||
-+			(id >= dev->node_props.simd_id_base &&
-+			id < dev->node_props.simd_id_base +
-+				total_num_of_cu)) {
-+			props = kfd_alloc_struct(props);
-+			if (props == NULL)
-+				return -ENOMEM;
-+
-+			props->processor_id_low = id;
-+			props->cache_level = cache->cache_level;
-+			props->cache_size = cache->cache_size;
-+			props->cacheline_size = cache->cache_line_size;
-+			props->cachelines_per_tag = cache->lines_per_tag;
-+			props->cache_assoc = cache->associativity;
-+			props->cache_latency = cache->cache_latency;
-+			memcpy(props->sibling_map, cache->sibling_map,
-+					sizeof(props->sibling_map));
-+
-+			if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE)
-+				props->cache_type |= HSA_CACHE_TYPE_DATA;
-+			if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE)
-+				props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;
-+			if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE)
-+				props->cache_type |= HSA_CACHE_TYPE_CPU;
-+			if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
-+				props->cache_type |= HSA_CACHE_TYPE_HSACU;
-+
-+			dev->cache_count++;
-+			dev->node_props.caches_count++;
-+			list_add_tail(&props->list, &dev->cache_props);
-+
-+			break;
-+		}
-+	}
-+
-+	return 0;
-+}
-+
-+/* kfd_parse_subtype_iolink - parse iolink subtypes and attach it to correct
-+ * topology device present in the device_list
-+ */
-+static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,
-+					struct list_head *device_list)
-+{
-+	struct kfd_iolink_properties *props;
-+	struct kfd_topology_device *dev;
-+	uint32_t i = 0;
-+	uint32_t id_from;
-+	uint32_t id_to;
-+
-+	BUG_ON(!iolink);
-+
-+	id_from = iolink->proximity_domain_from;
-+	id_to = iolink->proximity_domain_to;
-+
-+	pr_debug("Found IO link entry in CRAT table with id_from=%d\n", id_from);
-+	list_for_each_entry(dev, device_list, list) {
-+		if (id_from == dev->proximity_domain) {
-+			props = kfd_alloc_struct(props);
-+			if (props == NULL)
-+				return -ENOMEM;
-+
-+			props->node_from = id_from;
-+			props->node_to = id_to;
-+			props->ver_maj = iolink->version_major;
-+			props->ver_min = iolink->version_minor;
-+			props->iolink_type = iolink->io_interface_type;
-+
-+			/*
-+			 * weight factor (derived from CDIR), currently always 1
-+			 */
-+			props->weight = 1;
-+
-+			props->min_latency = iolink->minimum_latency;
-+			props->max_latency = iolink->maximum_latency;
-+			props->min_bandwidth = iolink->minimum_bandwidth_mbs;
-+			props->max_bandwidth = iolink->maximum_bandwidth_mbs;
-+			props->rec_transfer_size =
-+					iolink->recommended_transfer_size;
-+
-+			dev->io_link_count++;
-+
dev->node_props.io_links_count++; -+ list_add_tail(&props->list, &dev->io_link_props); -+ -+ break; -+ } -+ i++; -+ } -+ -+ return 0; -+} -+ -+/* kfd_parse_subtype - parse subtypes and attach it to correct topology device -+ * present in the device_list -+ * @sub_type_hdr - subtype section of crat_image -+ * @device_list - list of topology devices present in this crat_image -+ */ -+static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr, -+ struct list_head *device_list) -+{ -+ struct crat_subtype_computeunit *cu; -+ struct crat_subtype_memory *mem; -+ struct crat_subtype_cache *cache; -+ struct crat_subtype_iolink *iolink; -+ int ret = 0; -+ -+ BUG_ON(!sub_type_hdr); -+ -+ switch (sub_type_hdr->type) { -+ case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: -+ cu = (struct crat_subtype_computeunit *)sub_type_hdr; -+ ret = kfd_parse_subtype_cu(cu, device_list); -+ break; -+ case CRAT_SUBTYPE_MEMORY_AFFINITY: -+ mem = (struct crat_subtype_memory *)sub_type_hdr; -+ ret = kfd_parse_subtype_mem(mem, device_list); -+ break; -+ case CRAT_SUBTYPE_CACHE_AFFINITY: -+ cache = (struct crat_subtype_cache *)sub_type_hdr; -+ ret = kfd_parse_subtype_cache(cache, device_list); -+ break; -+ case CRAT_SUBTYPE_TLB_AFFINITY: -+ /* -+ * For now, nothing to do here -+ */ -+ pr_debug("Found TLB entry in CRAT table (not processing)\n"); -+ break; -+ case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: -+ /* -+ * For now, nothing to do here -+ */ -+ pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n"); -+ break; -+ case CRAT_SUBTYPE_IOLINK_AFFINITY: -+ iolink = (struct crat_subtype_iolink *)sub_type_hdr; -+ ret = kfd_parse_subtype_iolink(iolink, device_list); -+ break; -+ default: -+ pr_warn("Unknown subtype (%d) in CRAT\n", -+ sub_type_hdr->type); -+ } -+ -+ return ret; -+} -+ -+/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT -+ * create a kfd_topology_device and add in to device_list. 
Also parse -+ * CRAT subtypes and attach it to appropriate kfd_topology_device -+ * @crat_image - input image containing CRAT -+ * @device_list - [OUT] list of kfd_topology_device generated after parsing -+ * crat_image -+ * @proximity_domain - Proximity domain of the first device in the table -+ * Return - 0 if successful else -ve value -+ */ -+int kfd_parse_crat_table(void *crat_image, -+ struct list_head *device_list, -+ uint32_t proximity_domain) -+{ -+ struct kfd_topology_device *top_dev = NULL; -+ struct crat_subtype_generic *sub_type_hdr; -+ uint16_t node_id; -+ int ret; -+ struct crat_header *crat_table = (struct crat_header *)crat_image; -+ uint16_t num_nodes; -+ uint32_t image_len; -+ uint32_t last_header_type, last_header_length; -+ -+ if (!crat_image) -+ return -EINVAL; -+ -+ if (!list_empty(device_list)) { -+ pr_warn("Error device list should be empty\n"); -+ } -+ -+ num_nodes = crat_table->num_domains; -+ image_len = crat_table->length; -+ -+ pr_info("Parsing CRAT table with %d nodes\n", num_nodes); -+ -+ for (node_id = 0; node_id < num_nodes; node_id++) { -+ top_dev = kfd_create_topology_device(device_list); -+ if (!top_dev) -+ break; -+ top_dev->proximity_domain = proximity_domain++; -+ } -+ -+ if (!top_dev) -+ return -ENOMEM; -+ -+ memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH); -+ memcpy(top_dev->oem_table_id, crat_table->oem_table_id, CRAT_OEMTABLEID_LENGTH); -+ top_dev->oem_revision = crat_table->oem_revision; -+ -+ last_header_type = last_header_length = 0; -+ sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); -+ while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) < -+ ((char *)crat_image) + image_len) { -+ pr_debug("kfd parsing crat sub type header %p enabled: %s type: 0x%x length %d\n", -+ sub_type_hdr, -+ (sub_type_hdr->flags & -+ CRAT_SUBTYPE_FLAGS_ENABLED) -+ ? "true" : "false", -+ sub_type_hdr->type, -+ sub_type_hdr->length); -+ -+ if (sub_type_hdr->length == 0) { -+ pr_err("amdkfd: Parsing wrong CRAT's sub header last header type: %d last header len %d\n", -+ last_header_type, last_header_type); -+ pr_err("amdkfd: Current header type %d length %d\n", -+ sub_type_hdr->type, sub_type_hdr->length); -+ break; -+ } -+ -+ if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) { -+ ret = kfd_parse_subtype(sub_type_hdr, device_list); -+ if (ret != 0) -+ return ret; -+ } -+ -+ last_header_type = sub_type_hdr->type; -+ last_header_length = sub_type_hdr->length; -+ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + -+ sub_type_hdr->length); -+ } -+ -+ return 0; -+} -+ -+/* Helper function. See kfd_fill_gpu_cache_info for parameter description */ -+static int fill_in_pcache(struct crat_subtype_cache *pcache, -+ struct kfd_gpu_cache_info *pcache_info, -+ struct kfd_cu_info *cu_info, -+ int mem_available, -+ int cu_bitmask, -+ int cache_type, unsigned int cu_processor_id, -+ int cu_block) -+{ -+ unsigned int cu_sibling_map_mask; -+ int first_active_cu; -+ -+ /* First check if enough memory is available */ -+ if (mem_available - sizeof(struct crat_subtype_cache) < 0) -+ return -ENOMEM; -+ -+ cu_sibling_map_mask = cu_bitmask; -+ cu_sibling_map_mask >>= cu_block; -+ cu_sibling_map_mask &= -+ ((1 << pcache_info[cache_type].num_cu_shared) - 1); -+ first_active_cu = ffs(cu_sibling_map_mask); -+ -+ /* CU could be inactive. In case of shared cache find the first active -+ * CU. and incase of non-shared cache check if the CU is inactive. 
If
-+	 * inactive, skip it. */
-+	if (first_active_cu) {
-+		memset(pcache, 0, sizeof(struct crat_subtype_cache));
-+		pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY;
-+		pcache->length = sizeof(struct crat_subtype_cache);
-+		pcache->flags = pcache_info[cache_type].flags;
-+		pcache->processor_id_low = cu_processor_id
-+					 + (first_active_cu - 1);
-+		pcache->cache_level = pcache_info[cache_type].cache_level;
-+		pcache->cache_size = pcache_info[cache_type].cache_size;
-+
-+		/* Sibling map is w.r.t processor_id_low, so shift out
-+		 * inactive CU */
-+		cu_sibling_map_mask =
-+			cu_sibling_map_mask >> (first_active_cu - 1);
-+
-+		pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF);
-+		pcache->sibling_map[1] =
-+				(uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
-+		pcache->sibling_map[2] =
-+				(uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
-+		pcache->sibling_map[3] =
-+				(uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
-+		return 0;
-+	}
-+	return 1;
-+}
-+
-+/* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info tables
-+ * @kdev - [IN] GPU device
-+ * @gpu_processor_id - [IN] GPU processor ID to which these caches associate
-+ * @available_size - [IN] Amount of memory available in pcache
-+ * @cu_info - [IN] Compute Unit info obtained from KGD
-+ * @pcache - [OUT] memory into which cache data is to be filled in.
-+ * @size_filled - [OUT] amount of data used up in pcache.
-+ * @num_of_entries - [OUT] number of caches added
-+ */
-+static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
-+			int gpu_processor_id,
-+			int available_size,
-+			struct kfd_cu_info *cu_info,
-+			struct crat_subtype_cache *pcache,
-+			int *size_filled,
-+			int *num_of_entries)
-+{
-+	struct kfd_gpu_cache_info *pcache_info;
-+	int num_of_cache_types = 0;
-+	int i, j, k;
-+	int ct = 0;
-+	int mem_available = available_size;
-+	unsigned int cu_processor_id;
-+	int ret;
-+
-+	switch (kdev->device_info->asic_family) {
-+	case CHIP_KAVERI:
-+		pcache_info = kaveri_cache_info;
-+		num_of_cache_types = ARRAY_SIZE(kaveri_cache_info);
-+		break;
-+	case CHIP_CARRIZO:
-+		pcache_info = carrizo_cache_info;
-+		num_of_cache_types = ARRAY_SIZE(carrizo_cache_info);
-+		break;
-+	case CHIP_TONGA:
-+		pcache_info = tonga_cache_info;
-+		num_of_cache_types = ARRAY_SIZE(tonga_cache_info);
-+		break;
-+	case CHIP_FIJI:
-+		pcache_info = fiji_cache_info;
-+		num_of_cache_types = ARRAY_SIZE(fiji_cache_info);
-+		break;
-+	default:
-+		return -EINVAL;
-+	}
-+
-+	*size_filled = 0;
-+	*num_of_entries = 0;
-+
-+	/* For each type of cache listed in the kfd_gpu_cache_info table,
-+	 * go through all available Compute Units.
-+ * The [i,j,k] loop will -+ * if kfd_gpu_cache_info.num_cu_shared = 1 -+ * will parse through all available CU -+ * If (kfd_gpu_cache_info.num_cu_shared != 1) -+ * then it will consider only one CU from -+ * the shared unit -+ */ -+ -+ for (ct = 0; ct < num_of_cache_types; ct++) { -+ cu_processor_id = gpu_processor_id; -+ for (i = 0; i < cu_info->num_shader_engines; i++) { -+ for (j = 0; j < cu_info->num_shader_arrays_per_engine; -+ j++) { -+ for (k = 0; k < cu_info->num_cu_per_sh; -+ k += pcache_info[ct].num_cu_shared) { -+ -+ ret = fill_in_pcache(pcache, -+ pcache_info, -+ cu_info, -+ mem_available, -+ cu_info->cu_bitmap[i][j], -+ ct, -+ cu_processor_id, -+ k); -+ -+ if (ret < 0) -+ break; -+ -+ if (!ret) { -+ pcache++; -+ (*num_of_entries)++; -+ mem_available -= -+ sizeof(*pcache); -+ (*size_filled) += -+ sizeof(*pcache); -+ } -+ -+ /* Move to next CU block */ -+ cu_processor_id += -+ pcache_info[ct].num_cu_shared; -+ } -+ } -+ } -+ } -+ -+ pr_debug("Added [%d] GPU cache entries\n", *num_of_entries); -+ -+ return 0; -+} -+ -+/* -+ * kfd_create_crat_image_acpi - Allocates memory for CRAT image and -+ * copies CRAT from ACPI (if available). -+ * -+ * NOTE: Call kfd_destroy_crat_image to free CRAT image memory -+ * -+ * @crat_image: CRAT read from ACPI. If no CRAT in ACPI then -+ * *crat_image will be NULL -+ * @size: [OUT] size of crat_image -+ * -+ * Return 0 if successful else return -ve value -+ */ -+int kfd_create_crat_image_acpi(void **crat_image, size_t *size) -+{ -+ struct acpi_table_header *crat_table; -+ acpi_status status; -+ void *pcrat_image; -+ -+ if (!crat_image) -+ return -EINVAL; -+ -+ *crat_image = NULL; -+ -+ /* -+ * Fetch the CRAT table from ACPI -+ */ -+ status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); -+ if (status == AE_NOT_FOUND) { -+ pr_warn("CRAT table not found\n"); -+ return -ENODATA; -+ } else if (ACPI_FAILURE(status)) { -+ const char *err = acpi_format_exception(status); -+ pr_err("CRAT table error: %s\n", err); -+ return -EINVAL; -+ } -+ -+ pcrat_image = kmalloc(crat_table->length, GFP_KERNEL); -+ if (!pcrat_image) { -+ pr_err("No memory for allocating CRAT image\n"); -+ return -ENOMEM; -+ } -+ -+ memcpy(pcrat_image, crat_table, crat_table->length); -+ -+ *crat_image = pcrat_image; -+ *size = crat_table->length; -+ -+ return 0; -+} -+ -+/* Memory required to create Virtual CRAT. -+ * Since there is no easy way to predict the amount of memory required, the -+ * following amount are allocated for CPU and GPU Virtual CRAT. This is -+ * expected to cover all known conditions. But to be safe additional check -+ * is put in the code to ensure we don't overwrite. 
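A worked pass over the mask arithmetic in fill_in_pcache() above, with illustrative values only (num_cu_shared = 2, cu_block = 2, CU 0 fused off):

#include <strings.h>	/* ffs() */

static void sibling_mask_example(void)
{
	unsigned int mask = 0xE;	/* cu_bitmap: CUs 1..3 on, CU 0 off      */
	int first_active_cu;

	mask >>= 2;			/* cu_block = 2: shift that block down   */
	mask &= (1 << 2) - 1;		/* num_cu_shared = 2: keep 2 bits -> 0x3 */
	first_active_cu = ffs(mask);	/* = 1: the block's first CU is active   */

	/* processor_id_low = cu_processor_id + (first_active_cu - 1) */
	(void)first_active_cu;
}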
-+ */ -+#define VCRAT_SIZE_FOR_CPU PAGE_SIZE -+#define VCRAT_SIZE_FOR_GPU (3 * PAGE_SIZE) -+ -+/* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node -+ * -+ * @numa_node_id: CPU NUMA node id -+ * @avail_size: Available size in the memory -+ * @sub_type_hdr: Memory into which compute info will be filled in -+ * -+ * Return 0 if successful else return -ve value -+ */ -+static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size, -+ int proximity_domain, -+ struct crat_subtype_computeunit *sub_type_hdr) -+{ -+ const struct cpumask *cpumask; -+ -+ *avail_size -= sizeof(struct crat_subtype_computeunit); -+ if (*avail_size < 0) -+ return -ENOMEM; -+ -+ memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); -+ -+ /* Fill in subtype header data */ -+ sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; -+ sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); -+ sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; -+ -+ cpumask = cpumask_of_node(numa_node_id); -+ -+ /* Fill in CU data */ -+ sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT; -+ sub_type_hdr->proximity_domain = proximity_domain; -+ sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id); -+ if (sub_type_hdr->processor_id_low == -1) -+ return -EINVAL; -+ -+ sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask); -+ -+ return 0; -+} -+ -+/* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node -+ * -+ * @numa_node_id: CPU NUMA node id -+ * @avail_size: Available size in the memory -+ * @sub_type_hdr: Memory into which compute info will be filled in -+ * -+ * Return 0 if successful else return -ve value -+ */ -+static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size, -+ int proximity_domain, -+ struct crat_subtype_memory *sub_type_hdr) -+{ -+ uint64_t mem_in_bytes = 0; -+ pg_data_t *pgdat; -+ int zone_type; -+ -+ *avail_size -= sizeof(struct crat_subtype_computeunit); -+ if (*avail_size < 0) -+ return -ENOMEM; -+ -+ memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); -+ -+ /* Fill in subtype header data */ -+ sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; -+ sub_type_hdr->length = sizeof(struct crat_subtype_memory); -+ sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; -+ -+ /* Fill in Memory Subunit data */ -+ -+ /* Unlike si_meminfo, si_meminfo_node is not exported. So -+ * the following lines are duplicated from si_meminfo_node -+ * function */ -+ pgdat = NODE_DATA(numa_node_id); -+ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) -+ mem_in_bytes += pgdat->node_zones[zone_type].managed_pages; -+ mem_in_bytes <<= PAGE_SHIFT; -+ -+ sub_type_hdr->length_low = lower_32_bits(mem_in_bytes); -+ sub_type_hdr->length_high = upper_32_bits(mem_in_bytes); -+ sub_type_hdr->proximity_domain = proximity_domain; -+ -+ return 0; -+} -+ -+/* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU -+ * -+ * @pcrat_image: Fill in VCRAT for CPU -+ * @size: [IN] allocated size of crat_image. -+ * [OUT] actual size of data filled in crat_image -+ */ -+static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size) -+{ -+ struct crat_header *crat_table = (struct crat_header *)pcrat_image; -+ struct acpi_table_header *acpi_table; -+ acpi_status status; -+ struct crat_subtype_generic *sub_type_hdr; -+ int avail_size = *size; -+ int numa_node_id; -+ int ret = 0; -+ -+ if (pcrat_image == NULL || avail_size < VCRAT_SIZE_FOR_CPU) -+ return -EINVAL; -+ -+ /* Fill in CRAT Header. -+ * Modify length and total_entries as subunits are added. 
-+ */ -+ avail_size -= sizeof(struct crat_header); -+ if (avail_size < 0) -+ return -ENOMEM; -+ -+ memset(crat_table, 0, sizeof(struct crat_header)); -+ memcpy(&crat_table->signature, CRAT_SIGNATURE, sizeof(crat_table->signature)); -+ crat_table->length = sizeof(struct crat_header); -+ -+ status = acpi_get_table("DSDT", 0, &acpi_table); -+ if (status == AE_NOT_FOUND) -+ pr_warn("DSDT table not found for OEM information\n"); -+ else { -+ crat_table->oem_revision = acpi_table->revision; -+ memcpy(crat_table->oem_id, acpi_table->oem_id, CRAT_OEMID_LENGTH); -+ memcpy(crat_table->oem_table_id, acpi_table->oem_table_id, CRAT_OEMTABLEID_LENGTH); -+ } -+ crat_table->total_entries = 0; -+ crat_table->num_domains = 0; -+ -+ sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); -+ -+ for_each_online_node(numa_node_id) { -+ /* Fill in Subtype: Compute Unit */ -+ ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size, -+ crat_table->num_domains, -+ (struct crat_subtype_computeunit *)sub_type_hdr); -+ if (ret < 0) -+ return ret; -+ crat_table->length += sub_type_hdr->length; -+ crat_table->total_entries++; -+ -+ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + -+ sub_type_hdr->length); -+ -+ /* Fill in Subtype: Memory */ -+ ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size, -+ crat_table->num_domains, -+ (struct crat_subtype_memory *)sub_type_hdr); -+ if (ret < 0) -+ return ret; -+ crat_table->length += sub_type_hdr->length; -+ crat_table->total_entries++; -+ -+ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + -+ sub_type_hdr->length); -+ -+ crat_table->num_domains++; -+ } -+ -+ /* TODO: Add cache Subtype for CPU. -+ * Currently, CPU cache information is available in function -+ * detect_cache_attributes(cpu) defined in the file -+ * ./arch/x86/kernel/cpu/intel_cacheinfo.c. This function is not exported -+ * and to get the same information the code needs to be duplicated. 
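-+ *
-+ * A hypothetical sketch of what that duplication could look like (not
-+ * compiled here; the helper name is illustrative only, not an existing
-+ * function):
-+ *
-+ * for_each_cpu(cpu, cpumask_of_node(numa_node_id))
-+ * fill_cache_subtypes_for_cpu(cpu, &sub_type_hdr, &avail_size);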
-+ */
-+
-+ *size = crat_table->length;
-+ pr_info("Virtual CRAT table created for CPU\n");
-+
-+ return 0;
-+}
-+
-+static int kfd_fill_gpu_memory_affinity(int *avail_size,
-+ struct kfd_dev *kdev, uint8_t type, uint64_t size,
-+ struct crat_subtype_memory *sub_type_hdr,
-+ uint32_t proximity_domain,
-+ const struct kfd_local_mem_info *local_mem_info)
-+{
-+ *avail_size -= sizeof(struct crat_subtype_memory);
-+ if (*avail_size < 0)
-+ return -ENOMEM;
-+
-+ memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory));
-+ sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;
-+ sub_type_hdr->length = sizeof(struct crat_subtype_memory);
-+ sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
-+
-+ sub_type_hdr->proximity_domain = proximity_domain;
-+
-+ pr_debug("amdkfd: fill gpu memory affinity - type 0x%x size 0x%llx\n",
-+ type, size);
-+
-+ sub_type_hdr->length_low = lower_32_bits(size);
-+ sub_type_hdr->length_high = upper_32_bits(size);
-+
-+ sub_type_hdr->width = local_mem_info->vram_width;
-+ sub_type_hdr->visibility_type = type;
-+
-+ return 0;
-+}
-+
-+/* kfd_fill_gpu_direct_io_link - Fill in direct io link from GPU
-+ * to its NUMA node
-+ *
-+ * @avail_size: Available size in the memory
-+ * @kdev: [IN] GPU device
-+ * @sub_type_hdr: Memory into which io link info will be filled in
-+ * @proximity_domain: proximity domain of the GPU node
-+ *
-+ * Return 0 if successful else return -ve value
-+ */
-+static int kfd_fill_gpu_direct_io_link(int *avail_size,
-+ struct kfd_dev *kdev,
-+ struct crat_subtype_iolink *sub_type_hdr,
-+ uint32_t proximity_domain)
-+{
-+ int proximity_domain_to;
-+
-+ *avail_size -= sizeof(struct crat_subtype_iolink);
-+ if (*avail_size < 0)
-+ return -ENOMEM;
-+
-+ memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));
-+
-+ /* Fill in subtype header data */
-+ sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
-+ sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
-+ sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
-+
-+ /* Fill in IOLINK subtype.
-+ * TODO: Fill-in other fields of iolink subtype */
-+ sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;
-+ sub_type_hdr->proximity_domain_from = proximity_domain;
-+ proximity_domain_to =
-+ kfd_get_proximity_domain(kdev->pdev->bus);
-+ if (proximity_domain_to == -1)
-+ return -EINVAL;
-+
-+ sub_type_hdr->proximity_domain_to = proximity_domain_to;
-+ return 0;
-+}
-+
-+/* kfd_create_vcrat_image_gpu - Create Virtual CRAT for GPU
-+ *
-+ * @pcrat_image: Fill in VCRAT for GPU
-+ * @size: [IN] allocated size of crat_image.
-+ * [OUT] actual size of data filled in crat_image
-+ */
-+static int kfd_create_vcrat_image_gpu(void *pcrat_image,
-+ size_t *size, struct kfd_dev *kdev,
-+ uint32_t proximity_domain)
-+{
-+ struct crat_header *crat_table = (struct crat_header *)pcrat_image;
-+ struct crat_subtype_generic *sub_type_hdr;
-+ struct crat_subtype_computeunit *cu;
-+ struct kfd_cu_info cu_info;
-+ struct amd_iommu_device_info iommu_info;
-+ int avail_size = *size;
-+ uint32_t total_num_of_cu;
-+ int num_of_cache_entries = 0;
-+ int cache_mem_filled = 0;
-+ int ret = 0;
-+ const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP |
-+ AMD_IOMMU_DEVICE_FLAG_PRI_SUP |
-+ AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
-+ struct kfd_local_mem_info local_mem_info;
-+
-+ if (pcrat_image == NULL || avail_size < VCRAT_SIZE_FOR_GPU)
-+ return -EINVAL;
-+
-+ /* Fill the CRAT Header.
-+ * Modify length and total_entries as subunits are added.
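-+ *
-+ * The GPU table built below ends up laid out roughly as:
-+ *
-+ * [crat_header][compute unit][memory][cache entries...][io link]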
-+ */ -+ avail_size -= sizeof(struct crat_header); -+ if (avail_size < 0) -+ return -ENOMEM; -+ -+ memset(crat_table, 0, sizeof(struct crat_header)); -+ -+ memcpy(&crat_table->signature, CRAT_SIGNATURE, sizeof(crat_table->signature)); -+ crat_table->length = sizeof(struct crat_header); /* Change length as we add more subtypes*/ -+ crat_table->num_domains = 1; -+ crat_table->total_entries = 0; -+ -+ /* Fill in Subtype: Compute Unit -+ * First fill in the sub type header and then sub type data -+ */ -+ avail_size -= sizeof(struct crat_subtype_computeunit); -+ if (avail_size < 0) -+ return -ENOMEM; -+ -+ sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1); -+ memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); -+ -+ sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; -+ sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); -+ sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; -+ -+ /* Fill CU subtype data */ -+ cu = (struct crat_subtype_computeunit *)sub_type_hdr; -+ cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT; -+ cu->proximity_domain = proximity_domain; -+ -+ kdev->kfd2kgd->get_cu_info(kdev->kgd, &cu_info); -+ cu->num_simd_per_cu = cu_info.simd_per_cu; -+ cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number; -+ cu->max_waves_simd = cu_info.max_waves_per_simd; -+ -+ cu->wave_front_size = cu_info.wave_front_size; -+ cu->array_count = cu_info.num_shader_arrays_per_engine * -+ cu_info.num_shader_engines; -+ total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh); -+ cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu); -+ cu->num_cu_per_array = cu_info.num_cu_per_sh; -+ cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu; -+ cu->num_banks = cu_info.num_shader_engines; -+ cu->lds_size_in_kb = cu_info.lds_size; -+ -+ cu->hsa_capability = 0; -+ -+ /* Check if this node supports IOMMU. During parsing this flag will -+ * translate to HSA_CAP_ATS_PRESENT */ -+ iommu_info.flags = 0; -+ if (0 == amd_iommu_device_info(kdev->pdev, &iommu_info)) { -+ if ((iommu_info.flags & required_iommu_flags) == required_iommu_flags) -+ cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT; -+ } -+ -+ crat_table->length += sub_type_hdr->length; -+ crat_table->total_entries++; -+ -+ /* Fill in Subtype: Memory. Only on systems with large BAR (no -+ * private FB), report memory as public. On other systems -+ * report the total FB size (public+private) as a single -+ * private heap. */ -+ kdev->kfd2kgd->get_local_mem_info(kdev->kgd, &local_mem_info); -+ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + -+ sub_type_hdr->length); -+ -+ if (local_mem_info.local_mem_size_private == 0) -+ ret = kfd_fill_gpu_memory_affinity(&avail_size, -+ kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC, -+ local_mem_info.local_mem_size_public, -+ (struct crat_subtype_memory *)sub_type_hdr, -+ proximity_domain, -+ &local_mem_info); -+ else -+ ret = kfd_fill_gpu_memory_affinity(&avail_size, -+ kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE, -+ local_mem_info.local_mem_size_public + -+ local_mem_info.local_mem_size_private, -+ (struct crat_subtype_memory *)sub_type_hdr, -+ proximity_domain, -+ &local_mem_info); -+ if (ret < 0) -+ return ret; -+ -+ crat_table->length += sizeof(struct crat_subtype_memory); -+ crat_table->total_entries++; -+ -+ /* TODO: Fill in cache information. 
This information is NOT readily
-+ * available in KGD */
-+ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
-+ sub_type_hdr->length);
-+ ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low,
-+ avail_size,
-+ &cu_info,
-+ (struct crat_subtype_cache *)sub_type_hdr,
-+ &cache_mem_filled,
-+ &num_of_cache_entries);
-+
-+ if (ret < 0)
-+ return ret;
-+
-+ crat_table->length += cache_mem_filled;
-+ crat_table->total_entries += num_of_cache_entries;
-+ avail_size -= cache_mem_filled;
-+
-+ /* Fill in Subtype: IO_LINKS
-+ * Only direct links are added here, i.e. the link from the GPU
-+ * to its NUMA node. Indirect links are added by userspace.
-+ */
-+ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
-+ cache_mem_filled);
-+ ret = kfd_fill_gpu_direct_io_link(&avail_size, kdev,
-+ (struct crat_subtype_iolink *)sub_type_hdr, proximity_domain);
-+
-+ if (ret < 0)
-+ return ret;
-+
-+ crat_table->length += sub_type_hdr->length;
-+ crat_table->total_entries++;
-+
-+ *size = crat_table->length;
-+ pr_info("Virtual CRAT table created for GPU\n");
-+
-+ return ret;
-+}
-+
-+/* kfd_create_crat_image_virtual - Allocates memory for CRAT image and
-+ * creates a Virtual CRAT (VCRAT) image
-+ *
-+ * NOTE: Call kfd_destroy_crat_image to free CRAT image memory
-+ *
-+ * @crat_image: VCRAT image created because ACPI does not have a
-+ * CRAT for this device
-+ * @size: [OUT] size of virtual crat_image
-+ * @flags: COMPUTE_UNIT_CPU - Create VCRAT for CPU device
-+ * COMPUTE_UNIT_GPU - Create VCRAT for GPU
-+ * (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU
-+ * -- this option is not currently implemented. The assumption
-+ * is that all AMD APUs will have CRAT
-+ * @kdev: Valid kfd_device required if flags contain COMPUTE_UNIT_GPU
-+ *
-+ * Return 0 if successful else return -ve value
-+ */
-+int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
-+ int flags, struct kfd_dev *kdev, uint32_t proximity_domain)
-+{
-+ void *pcrat_image;
-+ int ret = 0;
-+
-+ if (!crat_image)
-+ return -EINVAL;
-+
-+ *crat_image = NULL;
-+
-+ /* Allocate one VCRAT_SIZE_FOR_CPU for CPU virtual CRAT image and
-+ * VCRAT_SIZE_FOR_GPU for GPU virtual CRAT image. This should cover
-+ * all the current conditions. A check is put in place to avoid
-+ * writing beyond the allocated size
-+ */
-+ switch (flags) {
-+ case COMPUTE_UNIT_CPU:
-+ pcrat_image = kmalloc(VCRAT_SIZE_FOR_CPU, GFP_KERNEL);
-+ if (!pcrat_image)
-+ return -ENOMEM;
-+ *size = VCRAT_SIZE_FOR_CPU;
-+ ret = kfd_create_vcrat_image_cpu(pcrat_image, size);
-+ break;
-+ case COMPUTE_UNIT_GPU:
-+ if (!kdev)
-+ return -EINVAL;
-+ pcrat_image = kmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL);
-+ if (!pcrat_image)
-+ return -ENOMEM;
-+ *size = VCRAT_SIZE_FOR_GPU;
-+ ret = kfd_create_vcrat_image_gpu(pcrat_image, size,
-+ kdev, proximity_domain);
-+ break;
-+ case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU):
-+ /*TODO:*/
-+ ret = -EINVAL;
-+ pr_err("VCRAT not implemented for APU\n");
-+ break;
-+ default:
-+ ret = -EINVAL;
-+ }
-+
-+ if (ret == 0)
-+ *crat_image = pcrat_image;
-+
-+ return ret;
-+}
-+
-+
-+/* kfd_destroy_crat_image
-+ *
-+ * @crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..)
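-+ *
-+ * Minimal usage sketch pairing creation and destruction (device_list
-+ * is an assumed caller-owned list head; error handling elided):
-+ *
-+ * void *img;
-+ * size_t size;
-+ *
-+ * if (!kfd_create_crat_image_acpi(&img, &size))
-+ * kfd_parse_crat_table(img, &device_list, 0);
-+ * kfd_destroy_crat_image(img);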
-+ * -+ */ -+void kfd_destroy_crat_image(void *crat_image) -+{ -+ if (crat_image) -+ kfree(crat_image); -+ return; -+} -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h -index a374fa3..9af3745 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h -@@ -24,6 +24,7 @@ - #define KFD_CRAT_H_INCLUDED - - #include <linux/types.h> -+#include "kfd_priv.h" - - #pragma pack(1) - -@@ -44,6 +45,10 @@ - - #define CRAT_OEMID_64BIT_MASK ((1ULL << (CRAT_OEMID_LENGTH * 8)) - 1) - -+/* Compute Unit flags */ -+#define COMPUTE_UNIT_CPU (1 << 0) /* Create Virtual CRAT for CPU */ -+#define COMPUTE_UNIT_GPU (1 << 1) /* Create Virtual CRAT for GPU */ -+ - struct crat_header { - uint32_t signature; - uint32_t length; -@@ -105,7 +110,7 @@ struct crat_subtype_computeunit { - uint8_t wave_front_size; - uint8_t num_banks; - uint16_t micro_engine_id; -- uint8_t num_arrays; -+ uint8_t array_count; - uint8_t num_cu_per_array; - uint8_t num_simd_per_cu; - uint8_t max_slots_scatch_cu; -@@ -127,13 +132,14 @@ struct crat_subtype_memory { - uint8_t length; - uint16_t reserved; - uint32_t flags; -- uint32_t promixity_domain; -+ uint32_t proximity_domain; - uint32_t base_addr_low; - uint32_t base_addr_high; - uint32_t length_low; - uint32_t length_high; - uint32_t width; -- uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH]; -+ uint8_t visibility_type; /* for virtual (dGPU) CRAT */ -+ uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH - 1]; - }; - - /* -@@ -222,9 +228,12 @@ struct crat_subtype_ccompute { - /* - * HSA IO Link Affinity structure and definitions - */ --#define CRAT_IOLINK_FLAGS_ENABLED 0x00000001 --#define CRAT_IOLINK_FLAGS_COHERENCY 0x00000002 --#define CRAT_IOLINK_FLAGS_RESERVED 0xfffffffc -+#define CRAT_IOLINK_FLAGS_ENABLED (1 << 0) -+#define CRAT_IOLINK_FLAGS_NON_COHERENT (1 << 1) -+#define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT (1 << 2) -+#define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT (1 << 3) -+#define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA (1 << 4) -+#define CRAT_IOLINK_FLAGS_RESERVED_MASK 0xffffffe0 - - /* - * IO interface types -@@ -232,8 +241,16 @@ struct crat_subtype_ccompute { - #define CRAT_IOLINK_TYPE_UNDEFINED 0 - #define CRAT_IOLINK_TYPE_HYPERTRANSPORT 1 - #define CRAT_IOLINK_TYPE_PCIEXPRESS 2 --#define CRAT_IOLINK_TYPE_OTHER 3 --#define CRAT_IOLINK_TYPE_MAX 255 -+#define CRAT_IOLINK_TYPE_AMBA 3 -+#define CRAT_IOLINK_TYPE_MIPI 4 -+#define CRAT_IOLINK_TYPE_QPI_1_1 5 -+#define CRAT_IOLINK_TYPE_RESERVED1 6 -+#define CRAT_IOLINK_TYPE_RESERVED2 7 -+#define CRAT_IOLINK_TYPE_RAPID_IO 8 -+#define CRAT_IOLINK_TYPE_INFINIBAND 9 -+#define CRAT_IOLINK_TYPE_RESERVED3 10 -+#define CRAT_IOLINK_TYPE_OTHER 11 -+#define CRAT_IOLINK_TYPE_MAX 255 - - #define CRAT_IOLINK_RESERVED_LENGTH 24 - -@@ -291,4 +308,11 @@ struct cdit_header { - - #pragma pack() - -+int kfd_create_crat_image_acpi(void **crat_image, size_t *size); -+void kfd_destroy_crat_image(void *crat_image); -+int kfd_parse_crat_table(void *crat_image, -+ struct list_head *device_list, -+ uint32_t proximity_domain); -+int kfd_create_crat_image_virtual(void **crat_image, size_t *size, -+ int flags, struct kfd_dev *kdev, uint32_t proximity_domain); - #endif /* KFD_CRAT_H_INCLUDED */ -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c -index d5e19b5..4f2311e 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c -@@ -42,8 +42,6 @@ - - static void dbgdev_address_watch_disable_nodiq(struct kfd_dev *dev) - { 
-- BUG_ON(!dev || !dev->kfd2kgd); -- - dev->kfd2kgd->address_watch_disable(dev->kgd); - } - -@@ -51,129 +49,118 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, - unsigned int pasid, uint64_t vmid0_address, - uint32_t *packet_buff, size_t size_in_bytes) - { -+ int status = 0; -+ unsigned int *ib_packet_buff = NULL; - struct pm4__release_mem *rm_packet; - struct pm4__indirect_buffer_pasid *ib_packet; -+ struct kernel_queue *kq = dbgdev->kq; -+ size_t pq_packets_size_in_bytes = sizeof(struct pm4__release_mem) + sizeof(struct pm4__indirect_buffer_pasid); - struct kfd_mem_obj *mem_obj; -- size_t pq_packets_size_in_bytes; -+ -+ uint64_t *rm_state = NULL; -+ - union ULARGE_INTEGER *largep; - union ULARGE_INTEGER addr; -- struct kernel_queue *kq; -- uint64_t *rm_state; -- unsigned int *ib_packet_buff; -- int status; -- -- BUG_ON(!dbgdev || !dbgdev->kq || !packet_buff || !size_in_bytes); -- -- kq = dbgdev->kq; -- -- pq_packets_size_in_bytes = sizeof(struct pm4__release_mem) + -- sizeof(struct pm4__indirect_buffer_pasid); -- -- /* -- * We acquire a buffer from DIQ -- * The receive packet buff will be sitting on the Indirect Buffer -- * and in the PQ we put the IB packet + sync packet(s). -- */ -- status = kq->ops.acquire_packet_buffer(kq, -- pq_packets_size_in_bytes / sizeof(uint32_t), -- &ib_packet_buff); -- if (status != 0) { -- pr_err("amdkfd: acquire_packet_buffer failed\n"); -- return status; -- } - -- memset(ib_packet_buff, 0, pq_packets_size_in_bytes); -+ do { -+ if ((kq == NULL) || (packet_buff == NULL) || (size_in_bytes == 0)) { -+ pr_debug("Error! kfd: In func %s >> Illegal packet parameters\n", __func__); -+ status = -EINVAL; -+ break; -+ } -+ /* todo - enter proper locking to be multithreaded safe */ - -- ib_packet = (struct pm4__indirect_buffer_pasid *) (ib_packet_buff); -+ /* We acquire a buffer from DIQ -+ * The receive packet buff will be sitting on the Indirect Buffer -+ * and in the PQ we put the IB packet + sync packet(s). -+ */ -+ status = kq->ops.acquire_packet_buffer(kq, pq_packets_size_in_bytes / sizeof(uint32_t), &ib_packet_buff); -+ if (status != 0) { -+ pr_debug("Error! kfd: In func %s >> acquire_packet_buffer failed\n", __func__); -+ break; -+ } - -- ib_packet->header.count = 3; -- ib_packet->header.opcode = IT_INDIRECT_BUFFER_PASID; -- ib_packet->header.type = PM4_TYPE_3; -+ memset(ib_packet_buff, 0, pq_packets_size_in_bytes); - -- largep = (union ULARGE_INTEGER *) &vmid0_address; -+ ib_packet = (struct pm4__indirect_buffer_pasid *) (ib_packet_buff); - -- ib_packet->bitfields2.ib_base_lo = largep->u.low_part >> 2; -- ib_packet->bitfields3.ib_base_hi = largep->u.high_part; -+ ib_packet->header.count = 3; -+ ib_packet->header.opcode = IT_INDIRECT_BUFFER_PASID; -+ ib_packet->header.type = PM4_TYPE_3; - -- ib_packet->control = (1 << 23) | (1 << 31) | -- ((size_in_bytes / sizeof(uint32_t)) & 0xfffff); -+ largep = (union ULARGE_INTEGER *) &vmid0_address; - -- ib_packet->bitfields5.pasid = pasid; -+ ib_packet->bitfields2.ib_base_lo = largep->u.low_part >> 2; -+ ib_packet->bitfields3.ib_base_hi = largep->u.high_part; - -- /* -- * for now we use release mem for GPU-CPU synchronization -- * Consider WaitRegMem + WriteData as a better alternative -- * we get a GART allocations ( gpu/cpu mapping), -- * for the sync variable, and wait until: -- * (a) Sync with HW -- * (b) Sync var is written by CP to mem. 
-- */ -- rm_packet = (struct pm4__release_mem *) (ib_packet_buff + -- (sizeof(struct pm4__indirect_buffer_pasid) / -- sizeof(unsigned int))); -+ ib_packet->control = (1 << 23) | (1 << 31) | -+ ((size_in_bytes / sizeof(uint32_t)) & 0xfffff); - -- status = kfd_gtt_sa_allocate(dbgdev->dev, sizeof(uint64_t), -- &mem_obj); -+ ib_packet->bitfields5.pasid = pasid; - -- if (status != 0) { -- pr_err("amdkfd: Failed to allocate GART memory\n"); -- kq->ops.rollback_packet(kq); -- return status; -- } -+ /* -+ * for now we use release mem for GPU-CPU synchronization -+ * Consider WaitRegMem + WriteData as a better alternative -+ * we get a GART allocations ( gpu/cpu mapping), -+ * for the sync variable, and wait until: -+ * (a) Sync with HW -+ * (b) Sync var is written by CP to mem. -+ */ -+ rm_packet = (struct pm4__release_mem *) (ib_packet_buff + -+ (sizeof(struct pm4__indirect_buffer_pasid) / sizeof(unsigned int))); - -- rm_state = (uint64_t *) mem_obj->cpu_ptr; -+ status = kfd_gtt_sa_allocate(dbgdev->dev, sizeof(uint64_t), -+ &mem_obj); - -- *rm_state = QUEUESTATE__ACTIVE_COMPLETION_PENDING; -+ if (status == 0) { - -- rm_packet->header.opcode = IT_RELEASE_MEM; -- rm_packet->header.type = PM4_TYPE_3; -- rm_packet->header.count = sizeof(struct pm4__release_mem) / -- sizeof(unsigned int) - 2; -+ rm_state = (uint64_t *) mem_obj->cpu_ptr; - -- rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; -- rm_packet->bitfields2.event_index = -- event_index___release_mem__end_of_pipe; -+ *rm_state = QUEUESTATE__ACTIVE_COMPLETION_PENDING; - -- rm_packet->bitfields2.cache_policy = cache_policy___release_mem__lru; -- rm_packet->bitfields2.atc = 0; -- rm_packet->bitfields2.tc_wb_action_ena = 1; -+ rm_packet->header.opcode = IT_RELEASE_MEM; -+ rm_packet->header.type = PM4_TYPE_3; -+ rm_packet->header.count = sizeof(struct pm4__release_mem) / sizeof(unsigned int) - 2; - -- addr.quad_part = mem_obj->gpu_addr; -+ rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; -+ rm_packet->bitfields2.event_index = event_index___release_mem__end_of_pipe; -+ rm_packet->bitfields2.cache_policy = cache_policy___release_mem__lru; -+ rm_packet->bitfields2.atc = 0; -+ rm_packet->bitfields2.tc_wb_action_ena = 1; - -- rm_packet->bitfields4.address_lo_32b = addr.u.low_part >> 2; -- rm_packet->address_hi = addr.u.high_part; -+ addr.quad_part = mem_obj->gpu_addr; - -- rm_packet->bitfields3.data_sel = -- data_sel___release_mem__send_64_bit_data; -+ rm_packet->bitfields4.address_lo_32b = addr.u.low_part >> 2; -+ rm_packet->address_hi = addr.u.high_part; - -- rm_packet->bitfields3.int_sel = -- int_sel___release_mem__send_data_after_write_confirm; -+ rm_packet->bitfields3.data_sel = data_sel___release_mem__send_64_bit_data; -+ rm_packet->bitfields3.int_sel = int_sel___release_mem__send_data_after_write_confirm; -+ rm_packet->bitfields3.dst_sel = dst_sel___release_mem__memory_controller; - -- rm_packet->bitfields3.dst_sel = -- dst_sel___release_mem__memory_controller; -+ rm_packet->data_lo = QUEUESTATE__ACTIVE; - -- rm_packet->data_lo = QUEUESTATE__ACTIVE; -+ kq->ops.submit_packet(kq); - -- kq->ops.submit_packet(kq); -+ /* Wait till CP writes sync code: */ - -- /* Wait till CP writes sync code: */ -- status = amdkfd_fence_wait_timeout( -- (unsigned int *) rm_state, -- QUEUESTATE__ACTIVE, 1500); -+ status = amdkfd_fence_wait_timeout( -+ (unsigned int *) rm_state, -+ QUEUESTATE__ACTIVE, 1500); -+ -+ } else { -+ pr_debug("Error! 
kfd: In func %s >> failed to allocate GART memory\n", __func__); -+ } -+ } while (false); - -- kfd_gtt_sa_free(dbgdev->dev, mem_obj); -+ if (rm_state != NULL) -+ kfd_gtt_sa_free(dbgdev->dev, mem_obj); - - return status; - } - - static int dbgdev_register_nodiq(struct kfd_dbgdev *dbgdev) - { -- BUG_ON(!dbgdev); -- -- /* -- * no action is needed in this case, -- * just make sure diq will not be used -- */ -+ /* no action is needed in this case, just make sure diq will not be used */ - - dbgdev->kq = NULL; - -@@ -182,57 +169,68 @@ static int dbgdev_register_nodiq(struct kfd_dbgdev *dbgdev) - - static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev) - { -+ -+ int status = 0; -+ struct kernel_queue *kq = NULL; - struct queue_properties properties; - unsigned int qid; -- struct kernel_queue *kq = NULL; -- int status; -+ struct process_queue_manager *pqm = dbgdev->pqm; - -- BUG_ON(!dbgdev || !dbgdev->pqm || !dbgdev->dev); -+ do { - -- status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL, -- &properties, 0, KFD_QUEUE_TYPE_DIQ, -- &qid); -+ if (!pqm) { -+ pr_debug("Error! kfd: In func %s >> No PQM\n", __func__); -+ status = -EFAULT; -+ break; -+ } - -- if (status) { -- pr_err("amdkfd: Failed to create DIQ\n"); -- return status; -- } -+ properties.type = KFD_QUEUE_TYPE_DIQ; - -- pr_debug("DIQ Created with queue id: %d\n", qid); -+ status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL, -+ &properties, &qid); - -- kq = pqm_get_kernel_queue(dbgdev->pqm, qid); -+ if (status != 0) { -+ pr_debug("Error! kfd: In func %s >> Create Queue failed\n", __func__); -+ break; -+ } - -- if (kq == NULL) { -- pr_err("amdkfd: Error getting DIQ\n"); -- pqm_destroy_queue(dbgdev->pqm, qid); -- return -EFAULT; -- } -+ pr_debug("kfd: DIQ Created with queue id: %d\n", qid); -+ -+ kq = pqm_get_kernel_queue(dbgdev->pqm, qid); -+ -+ if (kq == NULL) { -+ pr_debug("Error! 
kfd: In func %s >> Error getting Kernel Queue\n", __func__); -+ status = -ENOMEM; -+ break; -+ } -+ -+ dbgdev->kq = kq; - -- dbgdev->kq = kq; -+ } while (false); - - return status; - } - - static int dbgdev_unregister_nodiq(struct kfd_dbgdev *dbgdev) - { -- BUG_ON(!dbgdev || !dbgdev->dev); -- - /* disable watch address */ -+ - dbgdev_address_watch_disable_nodiq(dbgdev->dev); - return 0; - } - - static int dbgdev_unregister_diq(struct kfd_dbgdev *dbgdev) - { -- /* todo - disable address watch */ -- int status; -- -- BUG_ON(!dbgdev || !dbgdev->pqm || !dbgdev->kq); -- -- status = pqm_destroy_queue(dbgdev->pqm, -- dbgdev->kq->queue->properties.queue_id); -- dbgdev->kq = NULL; -- -+ /* todo - if needed, kill wavefronts and disable watch */ -+ int status = 0; -+ if ((dbgdev == NULL) || (dbgdev->pqm == NULL) || (dbgdev->kq == NULL)) { -+ pr_debug("kfd Err:In func %s >> can't destroy diq\n", __func__); -+ status = -EFAULT; -+ } else { -+ pqm_destroy_queue(dbgdev->pqm, -+ dbgdev->kq->queue->properties.queue_id); -+ dbgdev->kq = NULL; -+ } - return status; - } - -@@ -241,341 +239,350 @@ static void dbgdev_address_watch_set_registers( - union TCP_WATCH_ADDR_H_BITS *addrHi, - union TCP_WATCH_ADDR_L_BITS *addrLo, - union TCP_WATCH_CNTL_BITS *cntl, -- unsigned int index, unsigned int vmid) -+ unsigned int index, unsigned int vmid, -+ unsigned int asic_family) - { - union ULARGE_INTEGER addr; - -- BUG_ON(!adw_info || !addrHi || !addrLo || !cntl); -- - addr.quad_part = 0; - addrHi->u32All = 0; - addrLo->u32All = 0; - cntl->u32All = 0; - - if (adw_info->watch_mask != NULL) -- cntl->bitfields.mask = -- (uint32_t) (adw_info->watch_mask[index] & -- ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK); -+ cntl->bitfields.mask = (uint32_t) (adw_info->watch_mask[index] & ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK); - else - cntl->bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK; - - addr.quad_part = (unsigned long long) adw_info->watch_address[index]; - -- addrHi->bitfields.addr = addr.u.high_part & -- ADDRESS_WATCH_REG_ADDHIGH_MASK; -+ addrHi->bitfields.addr = addr.u.high_part & ADDRESS_WATCH_REG_ADDHIGH_MASK; - addrLo->bitfields.addr = - (addr.u.low_part >> ADDRESS_WATCH_REG_ADDLOW_SHIFT); - - cntl->bitfields.mode = adw_info->watch_mode[index]; - cntl->bitfields.vmid = (uint32_t) vmid; -- /* for now assume it is an ATC address */ -- cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT; -- -+ /* for APU assume it is an ATC address. 
*/ -+ if (KFD_IS_DGPU(asic_family) == false) -+ cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT; - pr_debug("\t\t%20s %08x\n", "set reg mask :", cntl->bitfields.mask); -- pr_debug("\t\t%20s %08x\n", "set reg add high :", -- addrHi->bitfields.addr); -- pr_debug("\t\t%20s %08x\n", "set reg add low :", -- addrLo->bitfields.addr); -+ pr_debug("\t\t%20s %08x\n", "set reg add high :", addrHi->bitfields.addr); -+ pr_debug("\t\t%20s %08x\n", "set reg add low :", addrLo->bitfields.addr); -+ - } - - static int dbgdev_address_watch_nodiq(struct kfd_dbgdev *dbgdev, - struct dbg_address_watch_info *adw_info) - { -+ -+ int status = 0; -+ - union TCP_WATCH_ADDR_H_BITS addrHi; - union TCP_WATCH_ADDR_L_BITS addrLo; - union TCP_WATCH_CNTL_BITS cntl; -- struct kfd_process_device *pdd; -+ -+ unsigned int vmid; - unsigned int i; - -- BUG_ON(!dbgdev || !dbgdev->dev || !adw_info); -+ struct kfd_process_device *pdd; - -- /* taking the vmid for that process on the safe way using pdd */ -- pdd = kfd_get_process_device_data(dbgdev->dev, -- adw_info->process); -- if (!pdd) { -- pr_err("amdkfd: Failed to get pdd for wave control no DIQ\n"); -- return -EFAULT; -- } -+ do { -+ /* taking the vmid for that process on the safe way using pdd */ -+ pdd = kfd_get_process_device_data(dbgdev->dev, -+ adw_info->process); -+ if (!pdd) { -+ pr_debug("Error! kfd: In func %s >> no PDD available\n", __func__); -+ status = -EFAULT; -+ break; -+ } - -- addrHi.u32All = 0; -- addrLo.u32All = 0; -- cntl.u32All = 0; -+ addrHi.u32All = 0; -+ addrLo.u32All = 0; -+ cntl.u32All = 0; - -- if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) || -- (adw_info->num_watch_points == 0)) { -- pr_err("amdkfd: num_watch_points is invalid\n"); -- return -EINVAL; -- } -+ vmid = pdd->qpd.vmid; - -- if ((adw_info->watch_mode == NULL) || -- (adw_info->watch_address == NULL)) { -- pr_err("amdkfd: adw_info fields are not valid\n"); -- return -EINVAL; -- } -+ if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) -+ || (adw_info->num_watch_points == 0)) { -+ status = -EINVAL; -+ break; -+ } - -- for (i = 0 ; i < adw_info->num_watch_points ; i++) { -- dbgdev_address_watch_set_registers(adw_info, &addrHi, &addrLo, -- &cntl, i, pdd->qpd.vmid); -- -- pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); -- pr_debug("\t\t%20s %08x\n", "register index :", i); -- pr_debug("\t\t%20s %08x\n", "vmid is :", pdd->qpd.vmid); -- pr_debug("\t\t%20s %08x\n", "Address Low is :", -- addrLo.bitfields.addr); -- pr_debug("\t\t%20s %08x\n", "Address high is :", -- addrHi.bitfields.addr); -- pr_debug("\t\t%20s %08x\n", "Address high is :", -- addrHi.bitfields.addr); -- pr_debug("\t\t%20s %08x\n", "Control Mask is :", -- cntl.bitfields.mask); -- pr_debug("\t\t%20s %08x\n", "Control Mode is :", -- cntl.bitfields.mode); -- pr_debug("\t\t%20s %08x\n", "Control Vmid is :", -- cntl.bitfields.vmid); -- pr_debug("\t\t%20s %08x\n", "Control atc is :", -- cntl.bitfields.atc); -- pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); -- -- pdd->dev->kfd2kgd->address_watch_execute( -- dbgdev->dev->kgd, -- i, -- cntl.u32All, -- addrHi.u32All, -- addrLo.u32All); -- } -+ if ((adw_info->watch_mode == NULL) || (adw_info->watch_address == NULL)) { -+ status = -EINVAL; -+ break; -+ } - -- return 0; -+ for (i = 0; i < adw_info->num_watch_points; i++) { -+ -+ dbgdev_address_watch_set_registers( -+ adw_info, -+ &addrHi, -+ &addrLo, -+ &cntl, -+ i, -+ vmid, -+ dbgdev->dev->device_info->asic_family -+ ); -+ -+ pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); -+ 
pr_debug("\t\t%20s %08x\n", "register index :", i); -+ pr_debug("\t\t%20s %08x\n", "vmid is :", vmid); -+ pr_debug("\t\t%20s %08x\n", "Address Low is :", addrLo.bitfields.addr); -+ pr_debug("\t\t%20s %08x\n", "Address high is :", addrHi.bitfields.addr); -+ pr_debug("\t\t%20s %08x\n", "Address high is :", addrHi.bitfields.addr); -+ pr_debug("\t\t%20s %08x\n", "Control Mask is :", cntl.bitfields.mask); -+ pr_debug("\t\t%20s %08x\n", "Control Mode is :", cntl.bitfields.mode); -+ pr_debug("\t\t%20s %08x\n", "Control Vmid is :", cntl.bitfields.vmid); -+ pr_debug("\t\t%20s %08x\n", "Control atc is :", cntl.bitfields.atc); -+ pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); -+ -+ pdd->dev->kfd2kgd->address_watch_execute( -+ dbgdev->dev->kgd, -+ i, -+ cntl.u32All, -+ addrHi.u32All, -+ addrLo.u32All); -+ } -+ -+ } while (false); -+ -+ return status; - } - - static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, - struct dbg_address_watch_info *adw_info) - { -- struct pm4__set_config_reg *packets_vec; -+ -+ int status = 0; -+ unsigned int i = 0; - union TCP_WATCH_ADDR_H_BITS addrHi; - union TCP_WATCH_ADDR_L_BITS addrLo; - union TCP_WATCH_CNTL_BITS cntl; -- struct kfd_mem_obj *mem_obj; -- unsigned int aw_reg_add_dword; -- uint32_t *packet_buff_uint; -- unsigned int i; -- int status; -- size_t ib_size = sizeof(struct pm4__set_config_reg) * 4; -+ - /* we do not control the vmid in DIQ mode, just a place holder */ - unsigned int vmid = 0; - -- BUG_ON(!dbgdev || !dbgdev->dev || !adw_info); -+ struct kfd_mem_obj *mem_obj; -+ uint32_t *packet_buff_uint = NULL; -+ -+ struct pm4__set_config_reg *packets_vec = NULL; -+ -+ size_t ib_size = sizeof(struct pm4__set_config_reg) * 4; -+ -+ unsigned int aw_reg_add_dword; - - addrHi.u32All = 0; - addrLo.u32All = 0; - cntl.u32All = 0; - -- if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) || -- (adw_info->num_watch_points == 0)) { -- pr_err("amdkfd: num_watch_points is invalid\n"); -- return -EINVAL; -- } -+ do { - -- if ((NULL == adw_info->watch_mode) || -- (NULL == adw_info->watch_address)) { -- pr_err("amdkfd: adw_info fields are not valid\n"); -- return -EINVAL; -- } -+ if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) || (adw_info->num_watch_points == 0)) { -+ status = -EINVAL; -+ break; -+ } -+ -+ if ((NULL == adw_info->watch_mode) || (NULL == adw_info->watch_address)) { -+ status = -EINVAL; -+ break; -+ } - -- status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); -+ status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); - -- if (status != 0) { -- pr_err("amdkfd: Failed to allocate GART memory\n"); -- return status; -- } -+ if (status != 0) -+ break; - -- packet_buff_uint = mem_obj->cpu_ptr; -- -- memset(packet_buff_uint, 0, ib_size); -- -- packets_vec = (struct pm4__set_config_reg *) (packet_buff_uint); -- -- packets_vec[0].header.count = 1; -- packets_vec[0].header.opcode = IT_SET_CONFIG_REG; -- packets_vec[0].header.type = PM4_TYPE_3; -- packets_vec[0].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET; -- packets_vec[0].bitfields2.insert_vmid = 1; -- packets_vec[1].ordinal1 = packets_vec[0].ordinal1; -- packets_vec[1].bitfields2.insert_vmid = 0; -- packets_vec[2].ordinal1 = packets_vec[0].ordinal1; -- packets_vec[2].bitfields2.insert_vmid = 0; -- packets_vec[3].ordinal1 = packets_vec[0].ordinal1; -- packets_vec[3].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET; -- packets_vec[3].bitfields2.insert_vmid = 1; -- -- for (i = 0; i < adw_info->num_watch_points; i++) { -- dbgdev_address_watch_set_registers(adw_info, 
-- &addrHi, -- &addrLo, -- &cntl, -- i, -- vmid); -- -- pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); -- pr_debug("\t\t%20s %08x\n", "register index :", i); -- pr_debug("\t\t%20s %08x\n", "vmid is :", vmid); -- pr_debug("\t\t%20s %p\n", "Add ptr is :", -- adw_info->watch_address); -- pr_debug("\t\t%20s %08llx\n", "Add is :", -- adw_info->watch_address[i]); -- pr_debug("\t\t%20s %08x\n", "Address Low is :", -- addrLo.bitfields.addr); -- pr_debug("\t\t%20s %08x\n", "Address high is :", -- addrHi.bitfields.addr); -- pr_debug("\t\t%20s %08x\n", "Control Mask is :", -- cntl.bitfields.mask); -- pr_debug("\t\t%20s %08x\n", "Control Mode is :", -- cntl.bitfields.mode); -- pr_debug("\t\t%20s %08x\n", "Control Vmid is :", -- cntl.bitfields.vmid); -- pr_debug("\t\t%20s %08x\n", "Control atc is :", -- cntl.bitfields.atc); -- pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); -- -- aw_reg_add_dword = -- dbgdev->dev->kfd2kgd->address_watch_get_offset( -- dbgdev->dev->kgd, -- i, -- ADDRESS_WATCH_REG_CNTL); -+ packet_buff_uint = mem_obj->cpu_ptr; -+ -+ memset(packet_buff_uint, 0, ib_size); - -- aw_reg_add_dword /= sizeof(uint32_t); -+ packets_vec = (struct pm4__set_config_reg *) (packet_buff_uint); - -- packets_vec[0].bitfields2.reg_offset = -- aw_reg_add_dword - AMD_CONFIG_REG_BASE; -+ packets_vec[0].header.count = 1; -+ packets_vec[0].header.opcode = IT_SET_CONFIG_REG; -+ packets_vec[0].header.type = PM4_TYPE_3; -+ packets_vec[0].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET; -+ packets_vec[0].bitfields2.insert_vmid = 1; -+ packets_vec[1].ordinal1 = packets_vec[0].ordinal1; -+ packets_vec[1].bitfields2.insert_vmid = 0; -+ packets_vec[2].ordinal1 = packets_vec[0].ordinal1; -+ packets_vec[2].bitfields2.insert_vmid = 0; -+ packets_vec[3].ordinal1 = packets_vec[0].ordinal1; -+ packets_vec[3].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET; -+ packets_vec[3].bitfields2.insert_vmid = 1; - -- packets_vec[0].reg_data[0] = cntl.u32All; -+ for (i = 0; i < adw_info->num_watch_points; i++) { - -- aw_reg_add_dword = -- dbgdev->dev->kfd2kgd->address_watch_get_offset( -- dbgdev->dev->kgd, -+ dbgdev_address_watch_set_registers( -+ adw_info, -+ &addrHi, -+ &addrLo, -+ &cntl, - i, -- ADDRESS_WATCH_REG_ADDR_HI); -+ vmid, -+ dbgdev->dev->device_info->asic_family -+ ); -+ -+ pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); -+ pr_debug("\t\t%20s %08x\n", "register index :", i); -+ pr_debug("\t\t%20s %08x\n", "vmid is :", vmid); -+ pr_debug("\t\t%20s %p\n", "Add ptr is :", adw_info->watch_address); -+ pr_debug("\t\t%20s %08llx\n", "Add is :", adw_info->watch_address[i]); -+ pr_debug("\t\t%20s %08x\n", "Address Low is :", addrLo.bitfields.addr); -+ pr_debug("\t\t%20s %08x\n", "Address high is :", addrHi.bitfields.addr); -+ pr_debug("\t\t%20s %08x\n", "Control Mask is :", cntl.bitfields.mask); -+ pr_debug("\t\t%20s %08x\n", "Control Mode is :", cntl.bitfields.mode); -+ pr_debug("\t\t%20s %08x\n", "Control Vmid is :", cntl.bitfields.vmid); -+ pr_debug("\t\t%20s %08x\n", "Control atc is :", cntl.bitfields.atc); -+ pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); -+ -+ aw_reg_add_dword = -+ dbgdev->dev->kfd2kgd -+ ->address_watch_get_offset( -+ dbgdev->dev->kgd, -+ i, -+ ADDRESS_WATCH_REG_CNTL); -+ -+ packets_vec[0].bitfields2.reg_offset = aw_reg_add_dword - CONFIG_REG_BASE; -+ packets_vec[0].reg_data[0] = cntl.u32All; - -- aw_reg_add_dword /= sizeof(uint32_t); -+ aw_reg_add_dword = -+ dbgdev->dev->kfd2kgd -+ ->address_watch_get_offset( -+ dbgdev->dev->kgd, -+ i, -+ 
ADDRESS_WATCH_REG_ADDR_HI); - -- packets_vec[1].bitfields2.reg_offset = -- aw_reg_add_dword - AMD_CONFIG_REG_BASE; -- packets_vec[1].reg_data[0] = addrHi.u32All; - -- aw_reg_add_dword = -- dbgdev->dev->kfd2kgd->address_watch_get_offset( -- dbgdev->dev->kgd, -- i, -- ADDRESS_WATCH_REG_ADDR_LO); -+ packets_vec[1].bitfields2.reg_offset = aw_reg_add_dword - CONFIG_REG_BASE; -+ packets_vec[1].reg_data[0] = addrHi.u32All; - -- aw_reg_add_dword /= sizeof(uint32_t); -+ aw_reg_add_dword = -+ dbgdev->dev->kfd2kgd -+ ->address_watch_get_offset( -+ dbgdev->dev->kgd, -+ i, -+ ADDRESS_WATCH_REG_ADDR_LO); - -- packets_vec[2].bitfields2.reg_offset = -- aw_reg_add_dword - AMD_CONFIG_REG_BASE; -- packets_vec[2].reg_data[0] = addrLo.u32All; - -- /* enable watch flag if address is not zero*/ -- if (adw_info->watch_address[i] > 0) -- cntl.bitfields.valid = 1; -- else -- cntl.bitfields.valid = 0; -+ packets_vec[2].bitfields2.reg_offset = aw_reg_add_dword - CONFIG_REG_BASE; -+ packets_vec[2].reg_data[0] = addrLo.u32All; - -- aw_reg_add_dword = -- dbgdev->dev->kfd2kgd->address_watch_get_offset( -- dbgdev->dev->kgd, -- i, -- ADDRESS_WATCH_REG_CNTL); -+ /* enable watch flag if address is not zero*/ -+ if (adw_info->watch_address[i] > 0) -+ cntl.bitfields.valid = 1; -+ else -+ cntl.bitfields.valid = 0; - -- aw_reg_add_dword /= sizeof(uint32_t); -+ aw_reg_add_dword = -+ dbgdev->dev->kfd2kgd -+ ->address_watch_get_offset( -+ dbgdev->dev->kgd, -+ i, -+ ADDRESS_WATCH_REG_CNTL); - -- packets_vec[3].bitfields2.reg_offset = -- aw_reg_add_dword - AMD_CONFIG_REG_BASE; -- packets_vec[3].reg_data[0] = cntl.u32All; - -- status = dbgdev_diq_submit_ib( -- dbgdev, -- adw_info->process->pasid, -- mem_obj->gpu_addr, -- packet_buff_uint, -- ib_size); -+ packets_vec[3].bitfields2.reg_offset = aw_reg_add_dword - CONFIG_REG_BASE; -+ packets_vec[3].reg_data[0] = cntl.u32All; -+ -+ status = dbgdev_diq_submit_ib( -+ dbgdev, -+ adw_info->process->pasid, -+ mem_obj->gpu_addr, -+ packet_buff_uint, -+ ib_size); -+ -+ if (status != 0) { -+ pr_debug("Error! 
kfd: In func %s >> failed to submit DIQ packet\n", __func__); -+ break; -+ } - -- if (status != 0) { -- pr_err("amdkfd: Failed to submit IB to DIQ\n"); -- break; - } -- } - -- kfd_gtt_sa_free(dbgdev->dev, mem_obj); -+ } while (false); -+ if (packet_buff_uint != NULL) -+ kfd_gtt_sa_free(dbgdev->dev, mem_obj); -+ - return status; -+ - } - - static int dbgdev_wave_control_set_registers( - struct dbg_wave_control_info *wac_info, - union SQ_CMD_BITS *in_reg_sq_cmd, -- union GRBM_GFX_INDEX_BITS *in_reg_gfx_index) -+ union GRBM_GFX_INDEX_BITS *in_reg_gfx_index, -+ unsigned int asic_family) - { - int status = 0; - union SQ_CMD_BITS reg_sq_cmd; - union GRBM_GFX_INDEX_BITS reg_gfx_index; -- struct HsaDbgWaveMsgAMDGen2 *pMsg; -- -- BUG_ON(!wac_info || !in_reg_sq_cmd || !in_reg_gfx_index); - - reg_sq_cmd.u32All = 0; -+ - reg_gfx_index.u32All = 0; -- pMsg = &wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2; - - switch (wac_info->mode) { -- /* Send command to single wave */ -- case HSA_DBG_WAVEMODE_SINGLE: -- /* -- * Limit access to the process waves only, -- * by setting vmid check -- */ -+ case HSA_DBG_WAVEMODE_SINGLE: /* Send command to single wave */ -+ /*limit access to the process waves only,by setting vmid check */ - reg_sq_cmd.bits.check_vmid = 1; -- reg_sq_cmd.bits.simd_id = pMsg->ui32.SIMD; -- reg_sq_cmd.bits.wave_id = pMsg->ui32.WaveId; -+ reg_sq_cmd.bits.simd_id = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.SIMD; -+ reg_sq_cmd.bits.wave_id = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.WaveId; - reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_SINGLE; - -- reg_gfx_index.bits.sh_index = pMsg->ui32.ShaderArray; -- reg_gfx_index.bits.se_index = pMsg->ui32.ShaderEngine; -- reg_gfx_index.bits.instance_index = pMsg->ui32.HSACU; -+ reg_gfx_index.bits.sh_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.ShaderArray; -+ reg_gfx_index.bits.se_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.ShaderEngine; -+ reg_gfx_index.bits.instance_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.HSACU; - - break; - -- /* Send command to all waves with matching VMID */ -- case HSA_DBG_WAVEMODE_BROADCAST_PROCESS: -+ case HSA_DBG_WAVEMODE_BROADCAST_PROCESS: /* Send command to all waves with matching VMID */ -+ - - reg_gfx_index.bits.sh_broadcast_writes = 1; - reg_gfx_index.bits.se_broadcast_writes = 1; - reg_gfx_index.bits.instance_broadcast_writes = 1; - - reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_BROADCAST; -- - break; - -- /* Send command to all CU waves with matching VMID */ -- case HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU: -+ case HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU: /* Send command to all CU waves with matching VMID */ - - reg_sq_cmd.bits.check_vmid = 1; - reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_BROADCAST; - -- reg_gfx_index.bits.sh_index = pMsg->ui32.ShaderArray; -- reg_gfx_index.bits.se_index = pMsg->ui32.ShaderEngine; -- reg_gfx_index.bits.instance_index = pMsg->ui32.HSACU; -+ reg_gfx_index.bits.sh_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.ShaderArray; -+ reg_gfx_index.bits.se_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.ShaderEngine; -+ reg_gfx_index.bits.instance_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.HSACU; - - break; - - default: -- return -EINVAL; -+ status = -EINVAL; -+ break; - } - - switch (wac_info->operand) { - case HSA_DBG_WAVEOP_HALT: -- reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT; -+ if (asic_family == CHIP_KAVERI) { -+ reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT; -+ pr_debug("kfd:dbgdev: halting 
KV\n"); -+ } else { -+ reg_sq_cmd.bits_sethalt.cmd = SQ_IND_CMD_NEW_SETHALT; -+ reg_sq_cmd.bits_sethalt.data = SQ_IND_CMD_DATA_HALT; -+ pr_debug("kfd:dbgdev: halting CZ\n"); -+ } - break; - - case HSA_DBG_WAVEOP_RESUME: -- reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME; -+ if (asic_family == CHIP_KAVERI) { -+ reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME; -+ pr_debug("kfd:dbgdev: resuming KV\n"); -+ } else { -+ reg_sq_cmd.bits_sethalt.cmd = SQ_IND_CMD_NEW_SETHALT; -+ reg_sq_cmd.bits_sethalt.data = SQ_IND_CMD_DATA_RESUME; -+ pr_debug("kfd:dbgdev: resuming CZ\n"); -+ } - break; - - case HSA_DBG_WAVEOP_KILL: -@@ -601,128 +608,114 @@ static int dbgdev_wave_control_set_registers( - } - - if (status == 0) { -- *in_reg_sq_cmd = reg_sq_cmd; -+ *in_reg_sq_cmd = reg_sq_cmd; - *in_reg_gfx_index = reg_gfx_index; - } -- - return status; -+ - } - - static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, - struct dbg_wave_control_info *wac_info) - { - -- int status; -+ int status = 0; - union SQ_CMD_BITS reg_sq_cmd; - union GRBM_GFX_INDEX_BITS reg_gfx_index; - struct kfd_mem_obj *mem_obj; -- uint32_t *packet_buff_uint; -- struct pm4__set_config_reg *packets_vec; -+ uint32_t *packet_buff_uint = NULL; -+ struct pm4__set_config_reg *packets_vec = NULL; - size_t ib_size = sizeof(struct pm4__set_config_reg) * 3; - -- BUG_ON(!dbgdev || !wac_info); -- - reg_sq_cmd.u32All = 0; -+ do { - -- status = dbgdev_wave_control_set_registers(wac_info, ®_sq_cmd, -- ®_gfx_index); -- if (status) { -- pr_err("amdkfd: Failed to set wave control registers\n"); -- return status; -- } -- -- /* we do not control the VMID in DIQ,so reset it to a known value */ -- reg_sq_cmd.bits.vm_id = 0; -- -- pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); -- -- pr_debug("\t\t mode is: %u\n", wac_info->mode); -- pr_debug("\t\t operand is: %u\n", wac_info->operand); -- pr_debug("\t\t trap id is: %u\n", wac_info->trapId); -- pr_debug("\t\t msg value is: %u\n", -- wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); -- pr_debug("\t\t vmid is: N/A\n"); -- -- pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid); -- pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd); -- pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id); -- pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id); -- pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode); -- pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id); -- pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id); -- -- pr_debug("\t\t ibw is : %u\n", -- reg_gfx_index.bitfields.instance_broadcast_writes); -- pr_debug("\t\t ii is : %u\n", -- reg_gfx_index.bitfields.instance_index); -- pr_debug("\t\t sebw is : %u\n", -- reg_gfx_index.bitfields.se_broadcast_writes); -- pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index); -- pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); -- pr_debug("\t\t sbw is : %u\n", -- reg_gfx_index.bitfields.sh_broadcast_writes); -- -- pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); -- -- status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); -- -- if (status != 0) { -- pr_err("amdkfd: Failed to allocate GART memory\n"); -- return status; -- } -- -- packet_buff_uint = mem_obj->cpu_ptr; -+ status = dbgdev_wave_control_set_registers(wac_info, -+ ®_sq_cmd, -+ ®_gfx_index, -+ dbgdev->dev->device_info->asic_family); - -- memset(packet_buff_uint, 0, ib_size); -+ /* we do not control the VMID in DIQ,so reset it to a known value */ -+ 
reg_sq_cmd.bits.vm_id = 0; -+ if (status != 0) -+ break; -+ pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); -+ -+ pr_debug("\t\t mode is: %u\n", wac_info->mode); -+ pr_debug("\t\t operand is: %u\n", wac_info->operand); -+ pr_debug("\t\t trap id is: %u\n", wac_info->trapId); -+ pr_debug("\t\t msg value is: %u\n", wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); -+ pr_debug("\t\t vmid is: N/A\n"); -+ -+ pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid); -+ pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd); -+ pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id); -+ pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id); -+ pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode); -+ pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id); -+ pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id); -+ -+ pr_debug("\t\t ibw is : %u\n", reg_gfx_index.bitfields.instance_broadcast_writes); -+ pr_debug("\t\t ii is : %u\n", reg_gfx_index.bitfields.instance_index); -+ pr_debug("\t\t sebw is : %u\n", reg_gfx_index.bitfields.se_broadcast_writes); -+ pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index); -+ pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); -+ pr_debug("\t\t sbw is : %u\n", reg_gfx_index.bitfields.sh_broadcast_writes); -+ -+ pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); -+ -+ status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); -+ -+ if (status != 0) -+ break; - -- packets_vec = (struct pm4__set_config_reg *) packet_buff_uint; -- packets_vec[0].header.count = 1; -- packets_vec[0].header.opcode = IT_SET_UCONFIG_REG; -- packets_vec[0].header.type = PM4_TYPE_3; -- packets_vec[0].bitfields2.reg_offset = -- GRBM_GFX_INDEX / (sizeof(uint32_t)) - -- USERCONFIG_REG_BASE; -+ packet_buff_uint = mem_obj->cpu_ptr; - -- packets_vec[0].bitfields2.insert_vmid = 0; -- packets_vec[0].reg_data[0] = reg_gfx_index.u32All; -+ memset(packet_buff_uint, 0, ib_size); - -- packets_vec[1].header.count = 1; -- packets_vec[1].header.opcode = IT_SET_CONFIG_REG; -- packets_vec[1].header.type = PM4_TYPE_3; -- packets_vec[1].bitfields2.reg_offset = SQ_CMD / (sizeof(uint32_t)) - -- AMD_CONFIG_REG_BASE; -+ packets_vec = (struct pm4__set_config_reg *) packet_buff_uint; -+ packets_vec[0].header.count = 1; -+ packets_vec[0].header.opcode = IT_SET_UCONFIG_REG; -+ packets_vec[0].header.type = PM4_TYPE_3; -+ packets_vec[0].bitfields2.reg_offset = GRBM_GFX_INDEX / (sizeof(uint32_t)) - USERCONFIG_REG_BASE; -+ packets_vec[0].bitfields2.insert_vmid = 0; -+ packets_vec[0].reg_data[0] = reg_gfx_index.u32All; - -- packets_vec[1].bitfields2.vmid_shift = SQ_CMD_VMID_OFFSET; -- packets_vec[1].bitfields2.insert_vmid = 1; -- packets_vec[1].reg_data[0] = reg_sq_cmd.u32All; -+ packets_vec[1].header.count = 1; -+ packets_vec[1].header.opcode = IT_SET_CONFIG_REG; -+ packets_vec[1].header.type = PM4_TYPE_3; -+ packets_vec[1].bitfields2.reg_offset = SQ_CMD / (sizeof(uint32_t)) - CONFIG_REG_BASE; -+ packets_vec[1].bitfields2.vmid_shift = SQ_CMD_VMID_OFFSET; -+ packets_vec[1].bitfields2.insert_vmid = 1; -+ packets_vec[1].reg_data[0] = reg_sq_cmd.u32All; - -- /* Restore the GRBM_GFX_INDEX register */ -+ /* Restore the GRBM_GFX_INDEX register */ - -- reg_gfx_index.u32All = 0; -- reg_gfx_index.bits.sh_broadcast_writes = 1; -- reg_gfx_index.bits.instance_broadcast_writes = 1; -- reg_gfx_index.bits.se_broadcast_writes = 1; -+ reg_gfx_index.u32All = 0; -+ reg_gfx_index.bits.sh_broadcast_writes = 1; -+ 
reg_gfx_index.bits.instance_broadcast_writes = 1; -+ reg_gfx_index.bits.se_broadcast_writes = 1; - - -- packets_vec[2].ordinal1 = packets_vec[0].ordinal1; -- packets_vec[2].bitfields2.reg_offset = -- GRBM_GFX_INDEX / (sizeof(uint32_t)) - -- USERCONFIG_REG_BASE; -+ packets_vec[2].ordinal1 = packets_vec[0].ordinal1; -+ packets_vec[2].bitfields2.reg_offset = GRBM_GFX_INDEX / (sizeof(uint32_t)) - USERCONFIG_REG_BASE; -+ packets_vec[2].bitfields2.insert_vmid = 0; -+ packets_vec[2].reg_data[0] = reg_gfx_index.u32All; - -- packets_vec[2].bitfields2.insert_vmid = 0; -- packets_vec[2].reg_data[0] = reg_gfx_index.u32All; -+ status = dbgdev_diq_submit_ib( -+ dbgdev, -+ wac_info->process->pasid, -+ mem_obj->gpu_addr, -+ packet_buff_uint, -+ ib_size); - -- status = dbgdev_diq_submit_ib( -- dbgdev, -- wac_info->process->pasid, -- mem_obj->gpu_addr, -- packet_buff_uint, -- ib_size); -+ if (status != 0) -+ pr_debug("%s\n", " Critical Error ! Submit diq packet failed "); - -- if (status != 0) -- pr_err("amdkfd: Failed to submit IB to DIQ\n"); -+ } while (false); - -- kfd_gtt_sa_free(dbgdev->dev, mem_obj); -+ if (packet_buff_uint != NULL) -+ kfd_gtt_sa_free(dbgdev->dev, mem_obj); - - return status; - } -@@ -730,66 +723,69 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, - static int dbgdev_wave_control_nodiq(struct kfd_dbgdev *dbgdev, - struct dbg_wave_control_info *wac_info) - { -- int status; -+ int status = 0; -+ unsigned int vmid = 0xffff; - union SQ_CMD_BITS reg_sq_cmd; - union GRBM_GFX_INDEX_BITS reg_gfx_index; -- struct kfd_process_device *pdd; - -- BUG_ON(!dbgdev || !dbgdev->dev || !wac_info); -+ struct kfd_process_device *pdd = NULL; - - reg_sq_cmd.u32All = 0; -+ status = 0; - - /* taking the VMID for that process on the safe way using PDD */ - pdd = kfd_get_process_device_data(dbgdev->dev, wac_info->process); - -- if (!pdd) { -- pr_err("amdkfd: Failed to get pdd for wave control no DIQ\n"); -- return -EFAULT; -- } -- status = dbgdev_wave_control_set_registers(wac_info, ®_sq_cmd, -- ®_gfx_index); -- if (status) { -- pr_err("amdkfd: Failed to set wave control registers\n"); -- return status; -+ if (pdd) { -+ status = dbgdev_wave_control_set_registers(wac_info, -+ ®_sq_cmd, -+ ®_gfx_index, -+ dbgdev->dev->device_info->asic_family); -+ if (status == 0) { -+ -+ /* for non DIQ we need to patch the VMID: */ -+ -+ vmid = pdd->qpd.vmid; -+ reg_sq_cmd.bits.vm_id = vmid; -+ -+ pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); -+ -+ pr_debug("\t\t mode is: %u\n", wac_info->mode); -+ pr_debug("\t\t operand is: %u\n", wac_info->operand); -+ pr_debug("\t\t trap id is: %u\n", wac_info->trapId); -+ pr_debug("\t\t msg value is: %u\n", wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); -+ pr_debug("\t\t vmid is: %u\n", vmid); -+ -+ pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid); -+ pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd); -+ pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id); -+ pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id); -+ pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode); -+ pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id); -+ pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id); -+ -+ pr_debug("\t\t ibw is : %u\n", reg_gfx_index.bitfields.instance_broadcast_writes); -+ pr_debug("\t\t ii is : %u\n", reg_gfx_index.bitfields.instance_index); -+ pr_debug("\t\t sebw is : %u\n", reg_gfx_index.bitfields.se_broadcast_writes); -+ pr_debug("\t\t se_ind is : %u\n", 
reg_gfx_index.bitfields.se_index); -+ pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); -+ pr_debug("\t\t sbw is : %u\n", reg_gfx_index.bitfields.sh_broadcast_writes); -+ -+ pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); -+ -+ dbgdev->dev->kfd2kgd -+ ->wave_control_execute(dbgdev->dev->kgd, -+ reg_gfx_index.u32All, -+ reg_sq_cmd.u32All); -+ } else { -+ status = -EINVAL; -+ } -+ } else { -+ status = -EFAULT; - } - -- /* for non DIQ we need to patch the VMID: */ -+ return status; - -- reg_sq_cmd.bits.vm_id = pdd->qpd.vmid; -- -- pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); -- -- pr_debug("\t\t mode is: %u\n", wac_info->mode); -- pr_debug("\t\t operand is: %u\n", wac_info->operand); -- pr_debug("\t\t trap id is: %u\n", wac_info->trapId); -- pr_debug("\t\t msg value is: %u\n", -- wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); -- pr_debug("\t\t vmid is: %u\n", pdd->qpd.vmid); -- -- pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid); -- pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd); -- pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id); -- pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id); -- pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode); -- pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id); -- pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id); -- -- pr_debug("\t\t ibw is : %u\n", -- reg_gfx_index.bitfields.instance_broadcast_writes); -- pr_debug("\t\t ii is : %u\n", -- reg_gfx_index.bitfields.instance_index); -- pr_debug("\t\t sebw is : %u\n", -- reg_gfx_index.bitfields.se_broadcast_writes); -- pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index); -- pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); -- pr_debug("\t\t sbw is : %u\n", -- reg_gfx_index.bitfields.sh_broadcast_writes); -- -- pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); -- -- return dbgdev->dev->kfd2kgd->wave_control_execute(dbgdev->dev->kgd, -- reg_gfx_index.u32All, -- reg_sq_cmd.u32All); - } - - int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) -@@ -800,13 +796,8 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) - union GRBM_GFX_INDEX_BITS reg_gfx_index; - struct kfd_process_device *pdd; - struct dbg_wave_control_info wac_info; -- int temp; -- int first_vmid_to_scan = 8; -- int last_vmid_to_scan = 15; -- -- first_vmid_to_scan = ffs(dev->shared_resources.compute_vmid_bitmap) - 1; -- temp = dev->shared_resources.compute_vmid_bitmap >> first_vmid_to_scan; -- last_vmid_to_scan = first_vmid_to_scan + ffz(temp); -+ int first_vmid_to_scan = dev->vm_info.first_vmid_kfd; -+ int last_vmid_to_scan = dev->vm_info.last_vmid_kfd; - - reg_sq_cmd.u32All = 0; - status = 0; -@@ -823,7 +814,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) - for (vmid = first_vmid_to_scan; vmid <= last_vmid_to_scan; vmid++) { - if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_valid - (dev->kgd, vmid)) { -- if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_valid -+ if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_pasid - (dev->kgd, vmid) == p->pasid) { - pr_debug("Killing wave fronts of vmid %d and pasid %d\n", - vmid, p->pasid); -@@ -833,7 +824,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) - } - - if (vmid > last_vmid_to_scan) { -- pr_err("amdkfd: didn't found vmid for pasid (%d)\n", p->pasid); -+ pr_err("amdkfd: didn't find vmid for pasid 
(%d)\n", p->pasid);
- return -EFAULT;
- }
-
-@@ -843,7 +834,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p)
- return -EFAULT;
-
- status = dbgdev_wave_control_set_registers(&wac_info, &reg_sq_cmd,
-- &reg_gfx_index);
-+ &reg_gfx_index, dev->device_info->asic_family);
- if (status != 0)
- return -EINVAL;
-
-@@ -858,15 +849,12 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p)
- }
-
- void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev,
-- enum DBGDEV_TYPE type)
-+ DBGDEV_TYPE type)
- {
-- BUG_ON(!pdbgdev || !pdev);
--
- pdbgdev->dev = pdev;
- pdbgdev->kq = NULL;
- pdbgdev->type = type;
- pdbgdev->pqm = NULL;
--
- switch (type) {
- case DBGDEV_TYPE_NODIQ:
- pdbgdev->dbgdev_register = dbgdev_register_nodiq;
-@@ -876,10 +864,12 @@ void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev,
- break;
- case DBGDEV_TYPE_DIQ:
- default:
-+
- pdbgdev->dbgdev_register = dbgdev_register_diq;
- pdbgdev->dbgdev_unregister = dbgdev_unregister_diq;
- pdbgdev->dbgdev_wave_control = dbgdev_wave_control_diq;
- pdbgdev->dbgdev_address_watch = dbgdev_address_watch_diq;
-+
- break;
- }
-
-diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h
-index 03424c2..82f48ff 100644
---- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h
-+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h
-@@ -23,6 +23,10 @@
- #ifndef KFD_DBGDEV_H_
- #define KFD_DBGDEV_H_
-
-+/*
-+ * SQ_IND_CMD_CMD enum
-+ */
-+
- enum {
- SQ_CMD_VMID_OFFSET = 28,
- ADDRESS_WATCH_CNTL_OFFSET = 24
-@@ -48,9 +52,9 @@ enum {
-
- /* CONFIG reg space definition */
- enum {
-- AMD_CONFIG_REG_BASE = 0x2000, /* in dwords */
-- AMD_CONFIG_REG_END = 0x2B00,
-- AMD_CONFIG_REG_SIZE = AMD_CONFIG_REG_END - AMD_CONFIG_REG_BASE
-+ CONFIG_REG_BASE = 0x2000, /* in dwords */
-+ CONFIG_REG_END = 0x2B00,
-+ CONFIG_REG_SIZE = CONFIG_REG_END - CONFIG_REG_BASE
- };
-
- /* SH reg space definition */
-@@ -60,22 +64,43 @@ enum {
- SH_REG_SIZE = SH_REG_END - SH_REG_BASE
- };
-
-+/* SQ_CMD definitions */
-+
-+enum {
-+ SQ_IND_CMD_DATA_RESUME = 0,
-+ SQ_IND_CMD_DATA_HALT = 1
-+};
-+
-+enum SQ_IND_CMD_NEW {
-+ SQ_IND_CMD_NEW_NULL = 0x00000000,
-+ SQ_IND_CMD_NEW_SETHALT = 0x00000001,
-+ SQ_IND_CMD_NEW_SAVECTX = 0x00000002,
-+ SQ_IND_CMD_NEW_KILL = 0x00000003,
-+ SQ_IND_CMD_NEW_DEBUG = 0x00000004,
-+ SQ_IND_CMD_NEW_TRAP = 0x00000005,
-+ SQ_IND_CMD_NEW_SET_PRIO = 0x00000006
-+
-+};
-+
- enum SQ_IND_CMD_CMD {
- SQ_IND_CMD_CMD_NULL = 0x00000000,
- SQ_IND_CMD_CMD_HALT = 0x00000001,
- SQ_IND_CMD_CMD_RESUME = 0x00000002,
- SQ_IND_CMD_CMD_KILL = 0x00000003,
- SQ_IND_CMD_CMD_DEBUG = 0x00000004,
-- SQ_IND_CMD_CMD_TRAP = 0x00000005,
-+ SQ_IND_CMD_CMD_TRAP = 0x00000005
- };
-+/*
-+ * SQ_IND_CMD_MODE enum
-+ */
-
--enum SQ_IND_CMD_MODE {
-+typedef enum SQ_IND_CMD_MODE {
- SQ_IND_CMD_MODE_SINGLE = 0x00000000,
- SQ_IND_CMD_MODE_BROADCAST = 0x00000001,
- SQ_IND_CMD_MODE_BROADCAST_QUEUE = 0x00000002,
- SQ_IND_CMD_MODE_BROADCAST_PIPE = 0x00000003,
- SQ_IND_CMD_MODE_BROADCAST_ME = 0x00000004,
--};
-+} SQ_IND_CMD_MODE;
-
- union SQ_IND_INDEX_BITS {
- struct {
-@@ -106,18 +131,32 @@ union SQ_IND_CMD_BITS {
- union SQ_CMD_BITS {
- struct {
- uint32_t cmd:3;
-- uint32_t:1;
-+ uint32_t:1;
- uint32_t mode:3;
- uint32_t check_vmid:1;
- uint32_t trap_id:3;
-- uint32_t:5;
-+ uint32_t:5;
- uint32_t wave_id:4;
- uint32_t simd_id:2;
-- uint32_t:2;
-+ uint32_t:2;
- uint32_t queue_id:3;
-- uint32_t:1;
-+ uint32_t:1;
- uint32_t vm_id:4;
- } bitfields, bits;
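The bitfield union above is how the driver composes a wave-control command: individual fields are set through .bits and the packed register word is read back through .u32All, which is what wave_control_execute() ultimately receives (see the reg_sq_cmd debug prints earlier in kfd_dbgdev.c). A minimal sketch of that encoding, assuming only the SQ_CMD_BITS union and SQ_IND_CMD_* enums shown above; the helper name build_halt_cmd and its argument are illustrative, not part of the patch:

/* Illustrative sketch, not part of the deleted patch. */
static uint32_t build_halt_cmd(uint32_t vmid)
{
	union SQ_CMD_BITS reg_sq_cmd;

	reg_sq_cmd.u32All = 0;					/* start from a clean command word */
	reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT;		/* 3-bit SQ opcode */
	reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_BROADCAST;	/* all waves, not a single SIMD */
	reg_sq_cmd.bits.check_vmid = 1;				/* restrict to waves of one VMID */
	reg_sq_cmd.bits.vm_id = vmid;				/* a VMID from the KFD range */

	return reg_sq_cmd.u32All;	/* packed word handed to wave_control_execute() */
}

The patch then adds an alternative bitfields_sethalt layout of the same word:

-+ struct {
-+ uint32_t cmd:3;
-+ uint32_t:1;
-+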
uint32_t mode:3; -+ uint32_t check_vmid:1; -+ uint32_t data:3; -+ uint32_t:5; -+ uint32_t wave_id:4; -+ uint32_t simd_id:2; -+ uint32_t:2; -+ uint32_t queue_id:3; -+ uint32_t:1; -+ uint32_t vm_id:4; -+ } bitfields_sethalt, bits_sethalt; - uint32_t u32All; - signed int i32All; - float f32All; -@@ -169,7 +208,7 @@ union TCP_WATCH_ADDR_L_BITS { - }; - - enum { -- QUEUESTATE__INVALID = 0, /* so by default we'll get invalid state */ -+ QUEUESTATE__INVALID = 0, /* so by default we'll get invalid state */ - QUEUESTATE__ACTIVE_COMPLETION_PENDING, - QUEUESTATE__ACTIVE - }; -@@ -187,7 +226,6 @@ union ULARGE_INTEGER { - #define KFD_CIK_VMID_END_OFFSET (KFD_CIK_VMID_START_OFFSET + (8)) - - --void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, -- enum DBGDEV_TYPE type); -+void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, DBGDEV_TYPE type); - --#endif /* KFD_DBGDEV_H_ */ -+#endif /* KFD_DBGDEV_H_ */ -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c -index 56d6763..5d269ea 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c -@@ -36,42 +36,50 @@ - - static DEFINE_MUTEX(kfd_dbgmgr_mutex); - --struct mutex *kfd_get_dbgmgr_mutex(void) -+struct mutex * -+get_dbgmgr_mutex(void) - { - return &kfd_dbgmgr_mutex; - } - -+/*===========================================================================*/ - --static void kfd_dbgmgr_uninitialize(struct kfd_dbgmgr *pmgr) -+static void -+kfd_dbgmgr_uninitialize(struct kfd_dbgmgr *pmgr) - { -- BUG_ON(!pmgr); -- - kfree(pmgr->dbgdev); -- - pmgr->dbgdev = NULL; - pmgr->pasid = 0; - pmgr->dev = NULL; - } - --void kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr) -+/*===========================================================================*/ -+ -+void -+kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr) - { - if (pmgr != NULL) { - kfd_dbgmgr_uninitialize(pmgr); - kfree(pmgr); -+ pmgr = NULL; - } - } - --bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) -+/*===========================================================================*/ -+ -+bool -+kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) - { -- enum DBGDEV_TYPE type = DBGDEV_TYPE_DIQ; -+ DBGDEV_TYPE type = DBGDEV_TYPE_DIQ; - struct kfd_dbgmgr *new_buff; - - BUG_ON(pdev == NULL); - BUG_ON(!pdev->init_complete); - - new_buff = kfd_alloc_struct(new_buff); -- if (!new_buff) { -- pr_err("amdkfd: Failed to allocate dbgmgr instance\n"); -+ if (!new_buff) -+ { -+ dev_err(NULL, "Error! kfd: In func %s >> failed to allocate dbgmgr instance\n", __func__); - return false; - } - -@@ -79,7 +87,7 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) - new_buff->dev = pdev; - new_buff->dbgdev = kfd_alloc_struct(new_buff->dbgdev); - if (!new_buff->dbgdev) { -- pr_err("amdkfd: Failed to allocate dbgdev instance\n"); -+ dev_err(NULL, "Error! 
kfd: In func %s >> failed to allocate dbgdev\n", __func__); - kfree(new_buff); - return false; - } -@@ -94,75 +102,200 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) - return true; - } - --long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p) -+/*===========================================================================*/ -+ -+long -+kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p) - { -- BUG_ON(!p || !pmgr || !pmgr->dbgdev); -+ long status = 0; - -- if (pmgr->pasid != 0) { -- pr_debug("H/W debugger is already active using pasid %d\n", -- pmgr->pasid); -- return -EBUSY; -- } -+ do { -+ -+ if ((pmgr == NULL) || (pmgr->dev == NULL) || (pmgr->dbgdev == NULL)) { -+ dev_info(NULL, "Error! kfd: In func %s >> Illegal pointers\n", __func__); -+ /* Invalid Pointer. */ -+ status = -EINVAL; -+ break; -+ } -+ if (pmgr->pasid != 0) { -+ /* HW debugger is already active. */ -+ status = -EBUSY; -+ break; -+ } -+ -+ /* remember pasid */ -+ -+ pmgr->pasid = p->pasid; -+ -+ /* provide the pqm for diq generation */ - -- /* remember pasid */ -- pmgr->pasid = p->pasid; -+ pmgr->dbgdev->pqm = &p->pqm; - -- /* provide the pqm for diq generation */ -- pmgr->dbgdev->pqm = &p->pqm; -+ /* activate the actual registering */ -+ /* todo: you should lock with the process mutex here */ -+ pmgr->dbgdev->dbgdev_register(pmgr->dbgdev); -+ /* todo: you should unlock with the process mutex here */ - -- /* activate the actual registering */ -- pmgr->dbgdev->dbgdev_register(pmgr->dbgdev); -+ } while (false); - -- return 0; -+ return status; - } - --long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p) -+/* ========================================================================== */ -+ -+long -+kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p) - { -- BUG_ON(!p || !pmgr || !pmgr->dbgdev); - -- /* Is the requests coming from the already registered process? */ -- if (pmgr->pasid != p->pasid) { -- pr_debug("H/W debugger is not registered by calling pasid %d\n", -- p->pasid); -- return -EINVAL; -- } -+ long status = 0; - -- pmgr->dbgdev->dbgdev_unregister(pmgr->dbgdev); -+ do { - -- pmgr->pasid = 0; -+ if ((pmgr == NULL) || (pmgr->dev == NULL) -+ || (pmgr->dbgdev == NULL) || (p == NULL)) { -+ dev_info(NULL, "Error! kfd: In func %s >> Illegal pointers\n", __func__); -+ /* Invalid Pointer */ -+ status = -EINVAL; -+ break; -+ } -+ if (pmgr->pasid != p->pasid) { -+ /* Is the requests coming from the already registered process? */ -+ status = -EINVAL; -+ break; -+ } -+ -+ /* todo: you should lock with the process mutex here */ -+ -+ pmgr->dbgdev->dbgdev_unregister(pmgr->dbgdev); - -- return 0; -+ /* todo: you should unlock with the process mutex here */ -+ -+ pmgr->pasid = 0; -+ -+ } while (false); -+ -+ return status; - } - --long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, -- struct dbg_wave_control_info *wac_info) -+/* =========================================================================== */ -+ -+long -+kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, struct dbg_wave_control_info *wac_info) - { -- BUG_ON(!pmgr || !pmgr->dbgdev || !wac_info); -+ long status = 0; - -- /* Is the requests coming from the already registered process? 
*/ -- if (pmgr->pasid != wac_info->process->pasid) { -- pr_debug("H/W debugger support was not registered for requester pasid %d\n", -- wac_info->process->pasid); -- return -EINVAL; -- } -+ dev_info(NULL, "kfd: In func %s\n", __func__); -+ -+ do { -+ -+ if ((pmgr == NULL) || (pmgr->dev == NULL) || (pmgr->dbgdev == NULL) || (wac_info == NULL) -+ || (wac_info->process == NULL)) { -+ /* Invalid Pointer */ -+ dev_info(NULL, "Error! kfd: In func %s >> Illegal pointers\n", __func__); -+ status = -EINVAL; -+ break; -+ } -+ /* Is the requests coming from the already registered process? */ -+ if (pmgr->pasid != wac_info->process->pasid) { -+ /* HW debugger support was not registered for requester process */ -+ status = -EINVAL; -+ break; -+ } -+ -+ status = (long) pmgr->dbgdev->dbgdev_wave_control(pmgr->dbgdev, wac_info); -+ -+ } while (false); -+ -+ return status; - -- return (long) pmgr->dbgdev->dbgdev_wave_control(pmgr->dbgdev, wac_info); - } - --long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, -- struct dbg_address_watch_info *adw_info) -+/* =========================================================================== */ -+ -+long -+kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, struct dbg_address_watch_info *adw_info) - { -- BUG_ON(!pmgr || !pmgr->dbgdev || !adw_info); -+ long status = 0; - -+ dev_info(NULL, "kfd: In func %s\n", __func__); - -- /* Is the requests coming from the already registered process? */ -- if (pmgr->pasid != adw_info->process->pasid) { -- pr_debug("H/W debugger support was not registered for requester pasid %d\n", -- adw_info->process->pasid); -- return -EINVAL; -- } -+ do { -+ -+ if ((pmgr == NULL) || (pmgr->dev == NULL) || (pmgr->dbgdev == NULL) || (adw_info == NULL) -+ || (adw_info->process == NULL)) { -+ /* Invalid Pointer */ -+ dev_info(NULL, "Error! kfd: In func %s >> Illegal pointers\n", __func__); -+ status = -EINVAL; -+ break; -+ } -+ /* Is the requests coming from the already registered process? */ -+ if (pmgr->pasid != adw_info->process->pasid) { -+ /* HW debugger support was not registered for requester process */ -+ status = -EINVAL; -+ break; -+ } -+ -+ status = (long) pmgr->dbgdev->dbgdev_address_watch(pmgr->dbgdev, adw_info); -+ -+ } while (false); -+ -+ return status; - -- return (long) pmgr->dbgdev->dbgdev_address_watch(pmgr->dbgdev, -- adw_info); - } - -+ -+/* =========================================================================== */ -+/* -+ * Handle abnormal process termination -+ * if we are in the midst of a debug session, we should kill all pending waves -+ * of the debugged process and unregister the process from the Debugger. -+ */ -+long -+kfd_dbgmgr_abnormal_termination(struct kfd_dbgmgr *pmgr, struct kfd_process *process) -+{ -+ long status = 0; -+ struct dbg_wave_control_info wac_info; -+ -+ dev_info(NULL, "kfd: In func %s\n", __func__); -+ -+ do { -+ -+ if ((pmgr == NULL) || (pmgr->dev == NULL) || (pmgr->dbgdev == NULL)) { -+ /* Invalid Pointer */ -+ dev_info(NULL, "Error! 
kfd: In func %s >> Illegal pointers\n", __func__); -+ status = -EINVAL; -+ break; -+ } -+ /* first, we kill all the wavefronts of this process */ -+ -+ wac_info.process = process; -+ wac_info.mode = HSA_DBG_WAVEMODE_BROADCAST_PROCESS; -+ wac_info.operand = HSA_DBG_WAVEOP_KILL; -+ wac_info.trapId = 0x0; /* not used for the KILL */ -+ wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value = 0; /* not used for kill */ -+ wac_info.dbgWave_msg.MemoryVA = NULL; /* not used for kill */ -+ -+ status = (long) pmgr->dbgdev->dbgdev_wave_control(pmgr->dbgdev, &wac_info); -+ -+ if (status != 0) { -+ dev_info(NULL, "Error! kfd: In func %s: wave control failed, status is: %ld\n", __func__, status); -+ break; -+ } -+ if (pmgr->pasid == wac_info.process->pasid) { -+ /* if terminated process was registered for debug, then unregister it */ -+ status = kfd_dbgmgr_unregister(pmgr, process); -+ pmgr->pasid = 0; -+ } -+ if (status != 0) -+ dev_info(NULL, -+ "Error! kfd: In func %s: unregister failed, status is: %ld debugger can not be reused\n", -+ __func__, status); -+ -+ } while (false); -+ -+ return status; -+ -+} -+ -+ -+/*///////////////////////////////////////////////////////////////////////////////////////// */ -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h -index 257a745..2b6484e 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h -@@ -26,252 +26,242 @@ - - #include "kfd_priv.h" - --/* must align with hsakmttypes definition */ -+/* -+ * SQ_IND_CMD_CMD enum -+ */ -+ -+ -+/* must align with hsakmttypes definition. */ - #pragma pack(push, 4) - --enum HSA_DBG_WAVEOP { -- HSA_DBG_WAVEOP_HALT = 1, /* Halts a wavefront */ -- HSA_DBG_WAVEOP_RESUME = 2, /* Resumes a wavefront */ -- HSA_DBG_WAVEOP_KILL = 3, /* Kills a wavefront */ -- HSA_DBG_WAVEOP_DEBUG = 4, /* Causes wavefront to enter -- debug mode */ -- HSA_DBG_WAVEOP_TRAP = 5, /* Causes wavefront to take -- a trap */ -+typedef enum _HSA_DBG_WAVEOP { -+ HSA_DBG_WAVEOP_HALT = 1, /* Halts a wavefront */ -+ HSA_DBG_WAVEOP_RESUME = 2, /* Resumes a wavefront */ -+ HSA_DBG_WAVEOP_KILL = 3, /* Kills a wavefront */ -+ HSA_DBG_WAVEOP_DEBUG = 4, /* Causes wavefront to enter debug mode */ -+ HSA_DBG_WAVEOP_TRAP = 5, /* Causes wavefront to take a trap */ - HSA_DBG_NUM_WAVEOP = 5, - HSA_DBG_MAX_WAVEOP = 0xFFFFFFFF --}; -+} HSA_DBG_WAVEOP; - --enum HSA_DBG_WAVEMODE { -- /* send command to a single wave */ -- HSA_DBG_WAVEMODE_SINGLE = 0, -- /* -- * Broadcast to all wavefronts of all processes is not -- * supported for HSA user mode -- */ -- -- /* send to waves within current process */ -- HSA_DBG_WAVEMODE_BROADCAST_PROCESS = 2, -- /* send to waves within current process on CU */ -- HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU = 3, -+typedef enum _HSA_DBG_WAVEMODE { -+ HSA_DBG_WAVEMODE_SINGLE = 0, /* send command to a single wave */ -+ /* Broadcast to all wavefronts of all processes is not supported for HSA user mode */ -+ HSA_DBG_WAVEMODE_BROADCAST_PROCESS = 2, /* send to waves within current process */ -+ HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU = 3, /* send to waves within current process on CU */ - HSA_DBG_NUM_WAVEMODE = 3, - HSA_DBG_MAX_WAVEMODE = 0xFFFFFFFF --}; -+} HSA_DBG_WAVEMODE; - --enum HSA_DBG_WAVEMSG_TYPE { -+typedef enum _HSA_DBG_WAVEMSG_TYPE { - HSA_DBG_WAVEMSG_AUTO = 0, - HSA_DBG_WAVEMSG_USER = 1, - HSA_DBG_WAVEMSG_ERROR = 2, - HSA_DBG_NUM_WAVEMSG, - HSA_DBG_MAX_WAVEMSG = 0xFFFFFFFF --}; -+} HSA_DBG_WAVEMSG_TYPE; - --enum HSA_DBG_WATCH_MODE { -- HSA_DBG_WATCH_READ = 0, /* 
Read operations only */ -- HSA_DBG_WATCH_NONREAD = 1, /* Write or Atomic operations only */ -- HSA_DBG_WATCH_ATOMIC = 2, /* Atomic Operations only */ -- HSA_DBG_WATCH_ALL = 3, /* Read, Write or Atomic operations */ -+typedef enum _HSA_DBG_WATCH_MODE { -+ HSA_DBG_WATCH_READ = 0, /* Read operations only */ -+ HSA_DBG_WATCH_NONREAD = 1, /* Write or Atomic operations only */ -+ HSA_DBG_WATCH_ATOMIC = 2, /* Atomic Operations only */ -+ HSA_DBG_WATCH_ALL = 3, /* Read, Write or Atomic operations */ - HSA_DBG_WATCH_NUM, - HSA_DBG_WATCH_SIZE = 0xFFFFFFFF --}; -+} HSA_DBG_WATCH_MODE; - - /* This structure is hardware specific and may change in the future */ --struct HsaDbgWaveMsgAMDGen2 { -+typedef struct _HsaDbgWaveMsgAMDGen2 { - union { -- struct ui32 { -- uint32_t UserData:8; /* user data */ -- uint32_t ShaderArray:1; /* Shader array */ -- uint32_t Priv:1; /* Privileged */ -- uint32_t Reserved0:4; /* This field is reserved, -- should be 0 */ -- uint32_t WaveId:4; /* wave id */ -- uint32_t SIMD:2; /* SIMD id */ -- uint32_t HSACU:4; /* Compute unit */ -- uint32_t ShaderEngine:2;/* Shader engine */ -- uint32_t MessageType:2; /* see HSA_DBG_WAVEMSG_TYPE */ -- uint32_t Reserved1:4; /* This field is reserved, -- should be 0 */ -+ struct { -+ uint32_t UserData:8; /* user data */ -+ uint32_t ShaderArray:1; /* Shader array */ -+ uint32_t Priv:1; /* Privileged */ -+ uint32_t Reserved0:4; /* This field is reserved, should be 0 */ -+ uint32_t WaveId:4; /* wave id */ -+ uint32_t SIMD:2; /* SIMD id */ -+ uint32_t HSACU:4; /* Compute unit */ -+ uint32_t ShaderEngine:2; /* Shader engine */ -+ uint32_t MessageType:2; /* see HSA_DBG_WAVEMSG_TYPE */ -+ uint32_t Reserved1:4; /* This field is reserved, should be 0 */ - } ui32; - uint32_t Value; - }; -- uint32_t Reserved2; --}; - --union HsaDbgWaveMessageAMD { -- struct HsaDbgWaveMsgAMDGen2 WaveMsgInfoGen2; -- /* for future HsaDbgWaveMsgAMDGen3; */ --}; -- --struct HsaDbgWaveMessage { -- void *MemoryVA; /* ptr to associated host-accessible data */ -- union HsaDbgWaveMessageAMD DbgWaveMsg; --}; -+ uint32_t Reserved2; - --/* -- * TODO: This definitions to be MOVED to kfd_event, once it is implemented. -- * -- * HSA sync primitive, Event and HW Exception notification API definitions. -- * The API functions allow the runtime to define a so-called sync-primitive, -- * a SW object combining a user-mode provided "syncvar" and a scheduler event -- * that can be signaled through a defined GPU interrupt. A syncvar is -- * a process virtual memory location of a certain size that can be accessed -- * by CPU and GPU shader code within the process to set and query the content -- * within that memory. The definition of the content is determined by the HSA -- * runtime and potentially GPU shader code interfacing with the HSA runtime. -- * The syncvar values may be commonly written through an PM4 WRITE_DATA packet -- * in the user mode instruction stream. The OS scheduler event is typically -- * associated and signaled by an interrupt issued by the GPU, but other HSA -- * system interrupt conditions from other HW (e.g. IOMMUv2) may be surfaced -- * by the KFD by this mechanism, too. 
*/
--
--/* these are the new definitions for events */
--enum HSA_EVENTTYPE {
-- HSA_EVENTTYPE_SIGNAL = 0, /* user-mode generated GPU signal */
-- HSA_EVENTTYPE_NODECHANGE = 1, /* HSA node change (attach/detach) */
-- HSA_EVENTTYPE_DEVICESTATECHANGE = 2, /* HSA device state change
-- (start/stop) */
-- HSA_EVENTTYPE_HW_EXCEPTION = 3, /* GPU shader exception event */
-- HSA_EVENTTYPE_SYSTEM_EVENT = 4, /* GPU SYSCALL with parameter info */
-- HSA_EVENTTYPE_DEBUG_EVENT = 5, /* GPU signal for debugging */
-- HSA_EVENTTYPE_PROFILE_EVENT = 6,/* GPU signal for profiling */
-- HSA_EVENTTYPE_QUEUE_EVENT = 7, /* GPU signal queue idle state
-- (EOP pm4) */
-+} HsaDbgWaveMsgAMDGen2;
-+
-+typedef union _HsaDbgWaveMessageAMD {
-+ HsaDbgWaveMsgAMDGen2 WaveMsgInfoGen2;
-+ /* for future HsaDbgWaveMsgAMDGen3; */
-+} HsaDbgWaveMessageAMD;
-+
-+typedef struct _HsaDbgWaveMessage {
-+ void *MemoryVA; /* ptr to associated host-accessible data */
-+ HsaDbgWaveMessageAMD DbgWaveMsg;
-+} HsaDbgWaveMessage;
-+
-+/* TODO: These definitions are to be MOVED to kfd_event, once it is implemented.
-+
-+ HSA sync primitive, Event and HW Exception notification API definitions
-+ The API functions allow the runtime to define a so-called sync-primitive, a SW object
-+ combining a user-mode provided "syncvar" and a scheduler event that can be signaled
-+ through a defined GPU interrupt. A syncvar is a process virtual memory location of
-+ a certain size that can be accessed by CPU and GPU shader code within the process to set
-+ and query the content within that memory. The definition of the content is determined by
-+ the HSA runtime and potentially GPU shader code interfacing with the HSA runtime.
-+ The syncvar values may be commonly written through a PM4 WRITE_DATA packet in the
-+ user mode instruction stream. The OS scheduler event is typically associated and
-+ signaled by an interrupt issued by the GPU, but other HSA system interrupt conditions
-+ from other HW (e.g. IOMMUv2) may be surfaced by the KFD by this mechanism, too. */
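As a conceptual sketch of the syncvar handshake described in the comment above, with plain C11 atomics standing in for the GPU-side PM4 WRITE_DATA and for the interrupt-driven event wait; the names syncvar, gpu_side_signal and cpu_side_poll are illustrative, not KFD API:

#include <stdatomic.h>
#include <stdint.h>

/* A "syncvar" is just a word in process-visible memory. */
static _Atomic uint64_t syncvar;

/* Models the GPU writing the syncvar via a PM4 WRITE_DATA packet. */
static void gpu_side_signal(uint64_t value)
{
	atomic_store_explicit(&syncvar, value, memory_order_release);
}

/* Models the CPU checking the syncvar; the real flow blocks on the
 * scheduler event signaled by the GPU interrupt instead of polling. */
static int cpu_side_poll(uint64_t expected)
{
	return atomic_load_explicit(&syncvar, memory_order_acquire) == expected;
}

The patch continues with the event type definitions that build on this mechanism:

-+
-+/* these are the new definitions for events */
-+
-+typedef enum _HSA_EVENTTYPE {
-+ HSA_EVENTTYPE_SIGNAL = 0, /* /user-mode generated GPU signal */
-+ HSA_EVENTTYPE_NODECHANGE = 1, /* HSA node change (attach/detach) */
-+ HSA_EVENTTYPE_DEVICESTATECHANGE = 2, /* HSA device state change( start/stop ) */
-+ HSA_EVENTTYPE_HW_EXCEPTION = 3, /* GPU shader exception event */
-+ HSA_EVENTTYPE_SYSTEM_EVENT = 4, /* GPU SYSCALL with parameter info */
-+ HSA_EVENTTYPE_DEBUG_EVENT = 5, /* GPU signal for debugging */
-+ HSA_EVENTTYPE_PROFILE_EVENT = 6, /* GPU signal for profiling */
-+ HSA_EVENTTYPE_QUEUE_EVENT = 7, /* GPU signal queue idle state (EOP pm4) */
- /* ...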
*/ - HSA_EVENTTYPE_MAXID, - HSA_EVENTTYPE_TYPE_SIZE = 0xFFFFFFFF --}; -+} HSA_EVENTTYPE; -+ -+typedef uint32_t HSA_EVENTID; - --/* Sub-definitions for various event types: Syncvar */ --struct HsaSyncVar { -- union SyncVar { -- void *UserData; /* pointer to user mode data */ -- uint64_t UserDataPtrValue; /* 64bit compatibility of value */ -+/* Subdefinitions for various event types: Syncvar */ -+ -+typedef struct _HsaSyncVar { -+ union { -+ void *UserData; /* pointer to user mode data */ -+ uint64_t UserDataPtrValue; /* 64bit compatibility of value */ - } SyncVar; - uint64_t SyncVarSize; --}; -+} HsaSyncVar; - --/* Sub-definitions for various event types: NodeChange */ -+/* -+ Subdefinitions for various event types: NodeChange -+*/ - --enum HSA_EVENTTYPE_NODECHANGE_FLAGS { -+typedef enum _HSA_EVENTTYPE_NODECHANGE_FLAGS { - HSA_EVENTTYPE_NODECHANGE_ADD = 0, - HSA_EVENTTYPE_NODECHANGE_REMOVE = 1, - HSA_EVENTTYPE_NODECHANGE_SIZE = 0xFFFFFFFF --}; -+} HSA_EVENTTYPE_NODECHANGE_FLAGS; - --struct HsaNodeChange { -- /* HSA node added/removed on the platform */ -- enum HSA_EVENTTYPE_NODECHANGE_FLAGS Flags; --}; -+typedef struct _HsaNodeChange { -+ HSA_EVENTTYPE_NODECHANGE_FLAGS Flags; /* HSA node added/removed on the platform */ -+} HsaNodeChange; -+ -+/* -+ Sub-definitions for various event types: DeviceStateChange -+*/ - --/* Sub-definitions for various event types: DeviceStateChange */ --enum HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS { -- /* device started (and available) */ -- HSA_EVENTTYPE_DEVICESTATUSCHANGE_START = 0, -- /* device stopped (i.e. unavailable) */ -- HSA_EVENTTYPE_DEVICESTATUSCHANGE_STOP = 1, -+typedef enum _HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS { -+ HSA_EVENTTYPE_DEVICESTATUSCHANGE_START = 0, /* device started (and available) */ -+ HSA_EVENTTYPE_DEVICESTATUSCHANGE_STOP = 1, /* device stopped (i.e. unavailable) */ - HSA_EVENTTYPE_DEVICESTATUSCHANGE_SIZE = 0xFFFFFFFF --}; -+} HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS; - --enum HSA_DEVICE { -+typedef enum _HSA_DEVICE { - HSA_DEVICE_CPU = 0, - HSA_DEVICE_GPU = 1, - MAX_HSA_DEVICE = 2 --}; -+} HSA_DEVICE; - --struct HsaDeviceStateChange { -+typedef struct _HsaDeviceStateChange { - uint32_t NodeId; /* F-NUMA node that contains the device */ -- enum HSA_DEVICE Device; /* device type: GPU or CPU */ -- enum HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS Flags; /* event flags */ --}; -+ HSA_DEVICE Device; /* device type: GPU or CPU */ -+ HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS Flags; /* event flags */ -+} HsaDeviceStateChange; - --struct HsaEventData { -- enum HSA_EVENTTYPE EventType; /* event type */ -- union EventData { -- /* -- * return data associated with HSA_EVENTTYPE_SIGNAL -- * and other events -- */ -- struct HsaSyncVar SyncVar; -+typedef struct _HsaEventData { -+ HSA_EVENTTYPE EventType; /* event type */ -+ union { -+ /* return data associated with HSA_EVENTTYPE_SIGNAL and other events */ -+ HsaSyncVar SyncVar; - - /* data associated with HSA_EVENTTYPE_NODE_CHANGE */ -- struct HsaNodeChange NodeChangeState; -+ HsaNodeChange NodeChangeState; - - /* data associated with HSA_EVENTTYPE_DEVICE_STATE_CHANGE */ -- struct HsaDeviceStateChange DeviceState; -+ HsaDeviceStateChange DeviceState; - } EventData; - -- /* the following data entries are internal to the KFD & thunk itself */ -+ /* the following data entries are internal to the KFD & thunk itself. 
*/ - -- /* internal thunk store for Event data (OsEventHandle) */ -- uint64_t HWData1; -- /* internal thunk store for Event data (HWAddress) */ -- uint64_t HWData2; -- /* internal thunk store for Event data (HWData) */ -- uint32_t HWData3; --}; -+ uint64_t HWData1; /* internal thunk store for Event data (OsEventHandle) */ -+ uint64_t HWData2; /* internal thunk store for Event data (HWAddress) */ -+ uint32_t HWData3; /* internal thunk store for Event data (HWData) */ -+} HsaEventData; - --struct HsaEventDescriptor { -- /* event type to allocate */ -- enum HSA_EVENTTYPE EventType; -- /* H-NUMA node containing GPU device that is event source */ -- uint32_t NodeId; -- /* pointer to user mode syncvar data, syncvar->UserDataPtrValue -- * may be NULL -- */ -- struct HsaSyncVar SyncVar; --}; -+typedef struct _HsaEventDescriptor { -+ HSA_EVENTTYPE EventType; /* event type to allocate */ -+ uint32_t NodeId; /* H-NUMA node containing GPU device that is event source */ -+ HsaSyncVar SyncVar; /* pointer to user mode syncvar data, syncvar->UserDataPtrValue may be NULL */ -+} HsaEventDescriptor; -+ -+typedef struct _HsaEvent { -+ HSA_EVENTID EventId; -+ HsaEventData EventData; -+} HsaEvent; - --struct HsaEvent { -- uint32_t EventId; -- struct HsaEventData EventData; --}; - - #pragma pack(pop) - --enum DBGDEV_TYPE { -+typedef enum _DBGDEV_TYPE { - DBGDEV_TYPE_ILLEGAL = 0, - DBGDEV_TYPE_NODIQ = 1, - DBGDEV_TYPE_DIQ = 2, - DBGDEV_TYPE_TEST = 3 --}; -+} DBGDEV_TYPE; - - struct dbg_address_watch_info { - struct kfd_process *process; -- enum HSA_DBG_WATCH_MODE *watch_mode; -+ HSA_DBG_WATCH_MODE *watch_mode; - uint64_t *watch_address; - uint64_t *watch_mask; -- struct HsaEvent *watch_event; -+ HsaEvent *watch_event; - uint32_t num_watch_points; - }; - - struct dbg_wave_control_info { - struct kfd_process *process; - uint32_t trapId; -- enum HSA_DBG_WAVEOP operand; -- enum HSA_DBG_WAVEMODE mode; -- struct HsaDbgWaveMessage dbgWave_msg; -+ HSA_DBG_WAVEOP operand; -+ HSA_DBG_WAVEMODE mode; -+ HsaDbgWaveMessage dbgWave_msg; - }; - - struct kfd_dbgdev { - - /* The device that owns this data. */ -+ - struct kfd_dev *dev; - - /* kernel queue for DIQ */ -+ - struct kernel_queue *kq; - - /* a pointer to the pqm of the calling process */ -+ - struct process_queue_manager *pqm; - - /* type of debug device ( DIQ, non DIQ, etc. 
) */ -- enum DBGDEV_TYPE type; -+ -+ DBGDEV_TYPE type; - - /* virtualized function pointers to device dbg */ -+ - int (*dbgdev_register)(struct kfd_dbgdev *dbgdev); - int (*dbgdev_unregister)(struct kfd_dbgdev *dbgdev); -- int (*dbgdev_address_watch)(struct kfd_dbgdev *dbgdev, -- struct dbg_address_watch_info *adw_info); -- int (*dbgdev_wave_control)(struct kfd_dbgdev *dbgdev, -- struct dbg_wave_control_info *wac_info); -+ int (*dbgdev_address_watch)(struct kfd_dbgdev *dbgdev, struct dbg_address_watch_info *adw_info); -+ int (*dbgdev_wave_control)(struct kfd_dbgdev *dbgdev, struct dbg_wave_control_info *wac_info); - - }; - -@@ -282,13 +272,12 @@ struct kfd_dbgmgr { - }; - - /* prototypes for debug manager functions */ --struct mutex *kfd_get_dbgmgr_mutex(void); -+struct mutex *get_dbgmgr_mutex(void); - void kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr); - bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev); - long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p); - long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p); --long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, -- struct dbg_wave_control_info *wac_info); --long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, -- struct dbg_address_watch_info *adw_info); --#endif /* KFD_DBGMGR_H_ */ -+long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, struct dbg_wave_control_info *wac_info); -+long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, struct dbg_address_watch_info *adw_info); -+long kfd_dbgmgr_abnormal_termination(struct kfd_dbgmgr *pmgr, struct kfd_process *process); -+#endif /* KFD_DBGMGR_H_ */ -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c -index 3f95f7c..20592ba 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c -@@ -24,9 +24,11 @@ - #include <linux/bsearch.h> - #include <linux/pci.h> - #include <linux/slab.h> -+#include <linux/highmem.h> - #include "kfd_priv.h" - #include "kfd_device_queue_manager.h" - #include "kfd_pm4_headers.h" -+#include "cwsr_trap_handler_carrizo.h" - - #define MQD_SIZE_ALIGNED 768 - -@@ -38,7 +40,8 @@ static const struct kfd_device_info kaveri_device_info = { - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, -- .mqd_size_aligned = MQD_SIZE_ALIGNED -+ .mqd_size_aligned = MQD_SIZE_ALIGNED, -+ .is_need_iommu_device = true - }; - - static const struct kfd_device_info carrizo_device_info = { -@@ -49,14 +52,50 @@ static const struct kfd_device_info carrizo_device_info = { - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, -- .mqd_size_aligned = MQD_SIZE_ALIGNED -+ .mqd_size_aligned = MQD_SIZE_ALIGNED, -+ .is_need_iommu_device = true - }; - -+static const struct kfd_device_info tonga_device_info = { -+ .asic_family = CHIP_TONGA, -+ .max_pasid_bits = 16, -+ .max_no_of_hqd = 24, -+ .ih_ring_entry_size = 4 * sizeof(uint32_t), -+ .event_interrupt_class = &event_interrupt_class_cik, -+ .num_of_watch_points = 4, -+ .mqd_size_aligned = MQD_SIZE_ALIGNED, -+ .is_need_iommu_device = false -+}; -+ -+static const struct kfd_device_info fiji_device_info = { -+ .asic_family = CHIP_FIJI, -+ .max_pasid_bits = 16, -+ .max_no_of_hqd = 24, -+ .ih_ring_entry_size = 4 * sizeof(uint32_t), -+ .event_interrupt_class = &event_interrupt_class_cik, -+ .num_of_watch_points = 4, -+ .mqd_size_aligned = MQD_SIZE_ALIGNED, -+ 
.is_need_iommu_device = false -+} -+; - struct kfd_deviceid { - unsigned short did; - const struct kfd_device_info *device_info; - }; - -+/* -+ * // -+// TONGA/AMETHYST device IDs (performance segment) -+// -+#define DEVICE_ID_VI_TONGA_P_6920 0x6920 // unfused -+#define DEVICE_ID_VI_TONGA_P_6921 0x6921 // Amethyst XT -+#define DEVICE_ID_VI_TONGA_P_6928 0x6928 // Tonga GL XT -+#define DEVICE_ID_VI_TONGA_P_692B 0x692B // Tonga GL PRO -+#define DEVICE_ID_VI_TONGA_P_692F 0x692F // Tonga GL PRO VF -+#define DEVICE_ID_VI_TONGA_P_6938 0x6938 // Tonga XT -+#define DEVICE_ID_VI_TONGA_P_6939 0x6939 // Tonga PRO -+ * -+ */ - /* Please keep this sorted by increasing device id. */ - static const struct kfd_deviceid supported_devices[] = { - { 0x1304, &kaveri_device_info }, /* Kaveri */ -@@ -85,13 +124,23 @@ static const struct kfd_deviceid supported_devices[] = { - { 0x9874, &carrizo_device_info }, /* Carrizo */ - { 0x9875, &carrizo_device_info }, /* Carrizo */ - { 0x9876, &carrizo_device_info }, /* Carrizo */ -- { 0x9877, &carrizo_device_info } /* Carrizo */ -+ { 0x9877, &carrizo_device_info }, /* Carrizo */ -+ { 0x6920, &tonga_device_info }, /* Tonga */ -+ { 0x6921, &tonga_device_info }, /* Tonga */ -+ { 0x6928, &tonga_device_info }, /* Tonga */ -+ { 0x692B, &tonga_device_info }, /* Tonga */ -+ { 0x692F, &tonga_device_info }, /* Tonga */ -+ { 0x6938, &tonga_device_info }, /* Tonga */ -+ { 0x6939, &tonga_device_info }, /* Tonga */ -+ { 0x7300, &fiji_device_info } /* Fiji */ - }; - - static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, - unsigned int chunk_size); - static void kfd_gtt_sa_fini(struct kfd_dev *kfd); - -+static int kfd_resume(struct kfd_dev *kfd); -+ - static const struct kfd_device_info *lookup_device_info(unsigned short did) - { - size_t i; -@@ -117,6 +166,8 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, - if (!device_info) - return NULL; - -+ BUG_ON(!f2g); -+ - kfd = kzalloc(sizeof(*kfd), GFP_KERNEL); - if (!kfd) - return NULL; -@@ -170,15 +221,8 @@ static bool device_iommu_pasid_init(struct kfd_dev *kfd) - pasid_limit, - kfd->doorbell_process_limit - 1); - -- err = amd_iommu_init_device(kfd->pdev, pasid_limit); -- if (err < 0) { -- dev_err(kfd_device, "error initializing iommu device\n"); -- return false; -- } -- - if (!kfd_set_pasid_limit(pasid_limit)) { - dev_err(kfd_device, "error setting pasid limit\n"); -- amd_iommu_free_device(kfd->pdev); - return false; - } - -@@ -219,13 +263,81 @@ static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid, - return AMD_IOMMU_INV_PRI_RSP_INVALID; - } - -+static int kfd_cwsr_init(struct kfd_dev *kfd) -+{ -+ /* -+ * Initialize the CWSR required memory for TBA and TMA -+ * only support CWSR on VI and up with FW version >=625. 
-+ */ -+ if (cwsr_enable && -+ (kfd->mec_fw_version >= KFD_CWSR_CZ_FW_VER)) { -+ void *cwsr_addr = NULL; -+ unsigned int size = sizeof(cwsr_trap_carrizo_hex); -+ -+ if (size > PAGE_SIZE) { -+ pr_err("amdkfd: wrong CWSR ISA size.\n"); -+ return -EINVAL; -+ } -+ kfd->cwsr_size = -+ ALIGN(size, PAGE_SIZE) + PAGE_SIZE; -+ kfd->cwsr_pages = alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, -+ get_order(kfd->cwsr_size)); -+ if (!kfd->cwsr_pages) { -+ pr_err("amdkfd: error alloc CWSR isa memory.\n"); -+ return -ENOMEM; -+ } -+ /*Only first page used for cwsr ISA code */ -+ cwsr_addr = kmap(kfd->cwsr_pages); -+ memset(cwsr_addr, 0, PAGE_SIZE); -+ memcpy(cwsr_addr, cwsr_trap_carrizo_hex, size); -+ kunmap(kfd->cwsr_pages); -+ kfd->tma_offset = ALIGN(size, PAGE_SIZE); -+ kfd->cwsr_enabled = true; -+ dev_info(kfd_device, -+ "Reserved %d pages for cwsr.\n", -+ (kfd->cwsr_size >> PAGE_SHIFT)); -+ } -+ -+ return 0; -+} -+ -+static void kfd_cwsr_fini(struct kfd_dev *kfd) -+{ -+ if (kfd->cwsr_pages) -+ __free_pages(kfd->cwsr_pages, get_order(kfd->cwsr_size)); -+} -+ - bool kgd2kfd_device_init(struct kfd_dev *kfd, - const struct kgd2kfd_shared_resources *gpu_resources) - { - unsigned int size; -+ unsigned int vmid_bitmap_kfd, vmid_num_kfd; -+ -+ kfd->mec_fw_version = kfd->kfd2kgd->get_fw_version(kfd->kgd, -+ KGD_ENGINE_MEC1); - - kfd->shared_resources = *gpu_resources; - -+ vmid_bitmap_kfd = kfd->shared_resources.compute_vmid_bitmap; -+ kfd->vm_info.first_vmid_kfd = ffs(vmid_bitmap_kfd) - 1; -+ kfd->vm_info.last_vmid_kfd = fls(vmid_bitmap_kfd) - 1; -+ vmid_num_kfd = kfd->vm_info.last_vmid_kfd -+ - kfd->vm_info.first_vmid_kfd + 1; -+ kfd->vm_info.vmid_num_kfd = vmid_num_kfd; -+ -+ /* If MEC firmware is too old, turn off hws multiple process mapping */ -+ if (kfd->mec_fw_version < KFD_MULTI_PROC_MAPPING_HWS_SUPPORT) -+ kfd->max_proc_per_quantum = 0; -+ /* Verify module parameters regarding mapped process number*/ -+ else if ((hws_max_conc_proc < 0) -+ || (hws_max_conc_proc > vmid_num_kfd)) { -+ dev_err(kfd_device, -+ "hws_max_conc_proc (%d) must be between 0 and %d, use %d instead\n", -+ hws_max_conc_proc, vmid_num_kfd, vmid_num_kfd); -+ kfd->max_proc_per_quantum = vmid_num_kfd; -+ } else -+ kfd->max_proc_per_quantum = hws_max_conc_proc; -+ - /* calculate max size of mqds needed for queues */ - size = max_num_of_queues_per_device * - kfd->device_info->mqd_size_aligned; -@@ -280,16 +392,6 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, - goto kfd_interrupt_error; - } - -- if (!device_iommu_pasid_init(kfd)) { -- dev_err(kfd_device, -- "Error initializing iommuv2 for device (%x:%x)\n", -- kfd->pdev->vendor, kfd->pdev->device); -- goto device_iommu_pasid_error; -- } -- amd_iommu_set_invalidate_ctx_cb(kfd->pdev, -- iommu_pasid_shutdown_callback); -- amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb); -- - kfd->dqm = device_queue_manager_init(kfd); - if (!kfd->dqm) { - dev_err(kfd_device, -@@ -298,13 +400,21 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, - goto device_queue_manager_error; - } - -- if (kfd->dqm->ops.start(kfd->dqm) != 0) { -- dev_err(kfd_device, -- "Error starting queuen manager for device (%x:%x)\n", -- kfd->pdev->vendor, kfd->pdev->device); -- goto dqm_start_error; -+ if (kfd->device_info->is_need_iommu_device) { -+ if (!device_iommu_pasid_init(kfd)) { -+ dev_err(kfd_device, -+ "Error initializing iommuv2 for device (%x:%x)\n", -+ kfd->pdev->vendor, kfd->pdev->device); -+ goto device_iommu_pasid_error; -+ } - } - -+ if (kfd_cwsr_init(kfd)) -+ goto device_iommu_pasid_error; -+ -+ if 
(kfd_resume(kfd)) -+ goto kfd_resume_error; -+ - kfd->dbgmgr = NULL; - - kfd->init_complete = true; -@@ -316,11 +426,11 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, - - goto out; - --dqm_start_error: -+kfd_resume_error: -+ kfd_cwsr_fini(kfd); -+device_iommu_pasid_error: - device_queue_manager_uninit(kfd->dqm); - device_queue_manager_error: -- amd_iommu_free_device(kfd->pdev); --device_iommu_pasid_error: - kfd_interrupt_exit(kfd); - kfd_interrupt_error: - kfd_topology_remove_device(kfd); -@@ -338,8 +448,9 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, - void kgd2kfd_device_exit(struct kfd_dev *kfd) - { - if (kfd->init_complete) { -+ kgd2kfd_suspend(kfd); -+ kfd_cwsr_fini(kfd); - device_queue_manager_uninit(kfd->dqm); -- amd_iommu_free_device(kfd->pdev); - kfd_interrupt_exit(kfd); - kfd_topology_remove_device(kfd); - kfd_gtt_sa_fini(kfd); -@@ -355,32 +466,68 @@ void kgd2kfd_suspend(struct kfd_dev *kfd) - - if (kfd->init_complete) { - kfd->dqm->ops.stop(kfd->dqm); -- amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); -- amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); -- amd_iommu_free_device(kfd->pdev); -+ if (kfd->device_info->is_need_iommu_device) { -+ amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); -+ amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); -+ amd_iommu_free_device(kfd->pdev); -+ } - } - } - --int kgd2kfd_resume(struct kfd_dev *kfd) -+int kgd2kfd_evict_bo(struct kfd_dev *dev, void *mem) - { -- unsigned int pasid_limit; -- int err; -+ return evict_bo(dev, mem); -+} - -+int kgd2kfd_restore(struct kfd_dev *kfd) -+{ -+ return restore(kfd); -+} -+ -+int kgd2kfd_resume(struct kfd_dev *kfd) -+{ - BUG_ON(kfd == NULL); - -- pasid_limit = kfd_get_pasid_limit(); -+ if (!kfd->init_complete) -+ return 0; -+ -+ return kfd_resume(kfd); -+ -+} -+ -+static int kfd_resume(struct kfd_dev *kfd) -+{ -+ int err = 0; -+ -+ if (kfd->device_info->is_need_iommu_device) { -+ unsigned int pasid_limit = kfd_get_pasid_limit(); - -- if (kfd->init_complete) { - err = amd_iommu_init_device(kfd->pdev, pasid_limit); -- if (err < 0) -+ if (err) - return -ENXIO; - amd_iommu_set_invalidate_ctx_cb(kfd->pdev, -- iommu_pasid_shutdown_callback); -- amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb); -- kfd->dqm->ops.start(kfd->dqm); -+ iommu_pasid_shutdown_callback); -+ amd_iommu_set_invalid_ppr_cb(kfd->pdev, -+ iommu_invalid_ppr_cb); - } - -- return 0; -+ err = kfd->dqm->ops.start(kfd->dqm); -+ if (err) { -+ dev_err(kfd_device, -+ "Error starting queue manager for device (%x:%x)\n", -+ kfd->pdev->vendor, kfd->pdev->device); -+ goto dqm_start_error; -+ } -+ -+ kfd->kfd2kgd->write_config_static_mem(kfd->kgd, true, 1, 3, 0); -+ -+ return err; -+ -+dqm_start_error: -+ if (kfd->device_info->is_need_iommu_device) -+ amd_iommu_free_device(kfd->pdev); -+ -+ return err; - } - - /* This is called directly from KGD at ISR. */ -@@ -399,6 +546,58 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) - spin_unlock(&kfd->interrupt_lock); - } - -+int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm) -+{ -+ struct kfd_process *p; -+ struct kfd_process_device *pdd; -+ int r; -+ -+ BUG_ON(kfd == NULL); -+ if (!kfd->init_complete) -+ return 0; -+ -+ /* Because we are called from arbitrary context (workqueue) as opposed -+ * to process context, kfd_process could attempt to exit while we are -+ * running so the lookup function returns a read-locked process. 
*/ -+ p = kfd_lookup_process_by_mm(mm); -+ if (!p) -+ return -ENODEV; -+ -+ r = -ENODEV; -+ pdd = kfd_get_process_device_data(kfd, p); -+ if (pdd) -+ r = process_evict_queues(kfd->dqm, &pdd->qpd); -+ -+ up_read(&p->lock); -+ return r; -+} -+ -+int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm) -+{ -+ struct kfd_process *p; -+ struct kfd_process_device *pdd; -+ int r; -+ -+ BUG_ON(kfd == NULL); -+ if (!kfd->init_complete) -+ return 0; -+ -+ /* Because we are called from arbitrary context (workqueue) as opposed -+ * to process context, kfd_process could attempt to exit while we are -+ * running so the lookup function returns a read-locked process. */ -+ p = kfd_lookup_process_by_mm(mm); -+ if (!p) -+ return -ENODEV; -+ -+ r = -ENODEV; -+ pdd = kfd_get_process_device_data(kfd, p); -+ if (pdd) -+ r = process_restore_queues(kfd->dqm, &pdd->qpd); -+ -+ up_read(&p->lock); -+ return r; -+} -+ - static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, - unsigned int chunk_size) - { -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c -index 42de22b..e123390 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c -@@ -44,9 +44,10 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, - struct queue *q, - struct qcm_process_device *qpd); - --static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock); --static int destroy_queues_cpsch(struct device_queue_manager *dqm, -- bool preempt_static_queues, bool lock); -+static int execute_queues_cpsch(struct device_queue_manager *dqm); -+static int unmap_queues_cpsch(struct device_queue_manager *dqm, -+ enum kfd_unmap_queues_filter filter, -+ uint32_t filter_param, bool reset); - - static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, - struct queue *q, -@@ -116,11 +117,11 @@ static int allocate_vmid(struct device_queue_manager *dqm, - if (dqm->vmid_bitmap == 0) - return -ENOMEM; - -- bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap, CIK_VMID_NUM); -+ bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap, -+ dqm->dev->vm_info.vmid_num_kfd); - clear_bit(bit, (unsigned long *)&dqm->vmid_bitmap); - -- /* Kaveri kfd vmid's starts from vmid 8 */ -- allocated_vmid = bit + KFD_VMID_START_OFFSET; -+ allocated_vmid = bit + dqm->dev->vm_info.first_vmid_kfd; - pr_debug("kfd: vmid allocation %d\n", allocated_vmid); - qpd->vmid = allocated_vmid; - q->properties.vmid = allocated_vmid; -@@ -128,6 +129,11 @@ static int allocate_vmid(struct device_queue_manager *dqm, - set_pasid_vmid_mapping(dqm, q->process->pasid, q->properties.vmid); - program_sh_mem_settings(dqm, qpd); - -+ dqm->dev->kfd2kgd->set_vm_context_page_table_base(dqm->dev->kgd, -+ allocated_vmid, -+ qpd->page_table_base); -+ /*invalidate the VM context after pasid and vmid mapping is set up*/ -+ radeon_flush_tlb(dqm->dev, qpd->pqm->process->pasid); - return 0; - } - -@@ -135,7 +141,7 @@ static void deallocate_vmid(struct device_queue_manager *dqm, - struct qcm_process_device *qpd, - struct queue *q) - { -- int bit = qpd->vmid - KFD_VMID_START_OFFSET; -+ int bit = qpd->vmid - dqm->dev->vm_info.first_vmid_kfd; - - /* Release the vmid mapping */ - set_pasid_vmid_mapping(dqm, 0, qpd->vmid); -@@ -175,6 +181,14 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, - } - *allocated_vmid = qpd->vmid; - q->properties.vmid = qpd->vmid; -+ /* -+ * Eviction state logic: we 
only mark active queues as evicted -+ * to avoid the overhead of restoring inactive queues later -+ */ -+ if (qpd->evicted) -+ q->properties.is_evicted = (q->properties.queue_size > 0 && -+ q->properties.queue_percent > 0 && -+ q->properties.queue_address != 0); - - if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) - retval = create_compute_queue_nocpsch(dqm, q, qpd); -@@ -281,8 +295,12 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, - q->pipe, - q->queue); - -+ dqm->dev->kfd2kgd->alloc_memory_of_scratch( -+ dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid); -+ - retval = mqd->load_mqd(mqd, q->mqd, q->pipe, -- q->queue, (uint32_t __user *) q->properties.write_ptr); -+ q->queue, (uint32_t __user *) q->properties.write_ptr, -+ qpd->page_table_base); - if (retval != 0) { - deallocate_hqd(dqm, q); - mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); -@@ -362,34 +380,56 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) - { - int retval; - struct mqd_manager *mqd; -+ struct kfd_process_device *pdd; -+ - bool prev_active = false; - - BUG_ON(!dqm || !q || !q->mqd); - - mutex_lock(&dqm->lock); -+ -+ pdd = kfd_get_process_device_data(q->device, q->process); -+ if (!pdd) { -+ mutex_unlock(&dqm->lock); -+ return -ENODEV; -+ } - mqd = dqm->ops.get_mqd_manager(dqm, - get_mqd_type_from_queue_type(q->properties.type)); - if (mqd == NULL) { - mutex_unlock(&dqm->lock); - return -ENOMEM; - } -+ /* -+ * Eviction state logic: we only mark active queues as evicted -+ * to avoid the overhead of restoring inactive queues later -+ */ -+ if (pdd->qpd.evicted > 0) -+ q->properties.is_evicted = (q->properties.queue_size > 0 && -+ q->properties.queue_percent > 0 && -+ q->properties.queue_address != 0); - -+ /* save previous activity state for counters */ - if (q->properties.is_active) - prev_active = true; - -- /* -- * -- * check active state vs. the previous state -- * and modify counter accordingly -- */ -+ - retval = mqd->update_mqd(mqd, q->mqd, &q->properties); -+ if (sched_policy == KFD_SCHED_POLICY_NO_HWS && -+ q->properties.type == KFD_QUEUE_TYPE_COMPUTE) -+ retval = mqd->load_mqd(mqd, q->mqd, q->pipe, -+ q->queue, -+ (uint32_t __user *)q->properties.write_ptr, 0); -+ /* -+ * check active state vs. 
the previous state -+ * and modify counter accordingly -+ */ - if ((q->properties.is_active) && (!prev_active)) - dqm->queue_count++; - else if ((!q->properties.is_active) && (prev_active)) - dqm->queue_count--; - - if (sched_policy != KFD_SCHED_POLICY_NO_HWS) -- retval = execute_queues_cpsch(dqm, false); -+ retval = execute_queues_cpsch(dqm); - - mutex_unlock(&dqm->lock); - return retval; -@@ -415,15 +455,115 @@ static struct mqd_manager *get_mqd_manager_nocpsch( - return mqd; - } - -+int process_evict_queues(struct device_queue_manager *dqm, -+ struct qcm_process_device *qpd) -+{ -+ struct queue *q, *next; -+ struct mqd_manager *mqd; -+ int retval = 0; -+ -+ BUG_ON(!dqm || !qpd); -+ -+ mutex_lock(&dqm->lock); -+ if (qpd->evicted++ > 0) { /* already evicted, do nothing */ -+ mutex_unlock(&dqm->lock); -+ return 0; -+ } -+ /* unactivate all active queues on the qpd */ -+ list_for_each_entry_safe(q, next, &qpd->queues_list, list) { -+ mqd = dqm->ops.get_mqd_manager(dqm, -+ get_mqd_type_from_queue_type(q->properties.type)); -+ if (!mqd) { /* should not be here */ -+ BUG(); -+ continue; -+ } -+ /* if the queue is not active anyway, it is not evicted */ -+ if (q->properties.is_active == true) -+ q->properties.is_evicted = true; -+ -+ retval = mqd->update_mqd(mqd, q->mqd, &q->properties); -+ if (sched_policy == KFD_SCHED_POLICY_NO_HWS && -+ q->properties.type == KFD_QUEUE_TYPE_COMPUTE) -+ retval = mqd->load_mqd(mqd, q->mqd, q->pipe, -+ q->queue, -+ (uint32_t __user *)q->properties.write_ptr, 0); -+ if (q->properties.is_evicted) -+ dqm->queue_count--; -+ } -+ if (sched_policy != KFD_SCHED_POLICY_NO_HWS) -+ retval = execute_queues_cpsch(dqm); -+ -+ mutex_unlock(&dqm->lock); -+ return retval; -+ -+} -+ -+int process_restore_queues(struct device_queue_manager *dqm, -+ struct qcm_process_device *qpd) -+{ -+ struct queue *q, *next; -+ struct mqd_manager *mqd; -+ int retval = 0; -+ -+ -+ BUG_ON(!dqm || !qpd); -+ -+ mutex_lock(&dqm->lock); -+ if (qpd->evicted == 0) { /* already restored, do nothing */ -+ mutex_unlock(&dqm->lock); -+ return 0; -+ } -+ -+ if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */ -+ qpd->evicted--; -+ mutex_unlock(&dqm->lock); -+ return 0; -+ } -+ -+ /* activate all active queues on the qpd */ -+ list_for_each_entry_safe(q, next, &qpd->queues_list, list) { -+ mqd = dqm->ops.get_mqd_manager(dqm, -+ get_mqd_type_from_queue_type(q->properties.type)); -+ if (!mqd) { /* should not be here */ -+ BUG(); -+ continue; -+ } -+ if (q->properties.is_evicted) { -+ q->properties.is_evicted = false; -+ retval = mqd->update_mqd(mqd, q->mqd, &q->properties); -+ if (sched_policy == KFD_SCHED_POLICY_NO_HWS && -+ q->properties.type == KFD_QUEUE_TYPE_COMPUTE) -+ retval = -+ mqd->load_mqd( -+ mqd, -+ q->mqd, -+ q->pipe, -+ q->queue, -+ (uint32_t __user *)q->properties.write_ptr, -+ 0); -+ dqm->queue_count++; -+ } -+ } -+ if (sched_policy != KFD_SCHED_POLICY_NO_HWS) -+ retval = execute_queues_cpsch(dqm); -+ -+ if (retval == 0) -+ qpd->evicted = 0; -+ mutex_unlock(&dqm->lock); -+ return retval; -+ -+} -+ - static int register_process_nocpsch(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) - { -+ struct kfd_process_device *pdd; - struct device_process_node *n; - int retval; - - BUG_ON(!dqm || !qpd); - -- pr_debug("kfd: In func %s\n", __func__); -+ pr_debug("In func %s\n", __func__); - - n = kzalloc(sizeof(struct device_process_node), GFP_KERNEL); - if (!n) -@@ -434,6 +574,11 @@ static int register_process_nocpsch(struct device_queue_manager *dqm, - 
mutex_lock(&dqm->lock); - list_add(&n->list, &dqm->queues); - -+ pdd = qpd_to_pdd(qpd); -+ qpd->page_table_base = -+ dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm); -+ pr_debug("Retrieved PD address == 0x%08u\n", qpd->page_table_base); -+ - retval = dqm->ops_asic_specific.register_process(dqm, qpd); - - dqm->processes_count++; -@@ -499,7 +644,6 @@ static void init_interrupts(struct device_queue_manager *dqm) - if (is_pipe_enabled(dqm, 0, i)) - dqm->dev->kfd2kgd->init_interrupts(dqm->dev->kgd, i); - } -- - static int init_scheduler(struct device_queue_manager *dqm) - { - int retval = 0; -@@ -534,7 +678,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm) - for (i = 0; i < get_pipes_per_mec(dqm); i++) - dqm->allocated_queues[i] = (1 << get_queues_per_pipe(dqm)) - 1; - -- dqm->vmid_bitmap = (1 << VMID_PER_DEVICE) - 1; -+ dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1; - dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1; - - init_scheduler(dqm); -@@ -607,8 +751,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, - if (retval != 0) - return retval; - -- q->properties.sdma_queue_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; -- q->properties.sdma_engine_id = q->sdma_id / CIK_SDMA_ENGINE_NUM; -+ q->properties.sdma_queue_id = q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; -+ q->properties.sdma_engine_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; - - pr_debug("kfd: sdma id is: %d\n", q->sdma_id); - pr_debug(" sdma queue id: %d\n", q->properties.sdma_queue_id); -@@ -623,7 +767,7 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, - } - - retval = mqd->load_mqd(mqd, q->mqd, 0, -- 0, NULL); -+ 0, NULL, 0); - if (retval != 0) { - deallocate_sdma_queue(dqm, q->sdma_id); - mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); -@@ -646,8 +790,7 @@ static int set_sched_resources(struct device_queue_manager *dqm) - - pr_debug("kfd: In func %s\n", __func__); - -- res.vmid_mask = (1 << VMID_PER_DEVICE) - 1; -- res.vmid_mask <<= KFD_VMID_START_OFFSET; -+ res.vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap; - - res.queue_mask = 0; - for (i = 0; i < KGD_MAX_QUEUES; ++i) { -@@ -696,6 +839,7 @@ static int initialize_cpsch(struct device_queue_manager *dqm) - dqm->queue_count = dqm->processes_count = 0; - dqm->sdma_queue_count = 0; - dqm->active_runlist = false; -+ dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1; - retval = dqm->ops_asic_specific.initialize(dqm); - if (retval != 0) - goto fail_init_pipelines; -@@ -716,7 +860,7 @@ static int start_cpsch(struct device_queue_manager *dqm) - - retval = 0; - -- retval = pm_init(&dqm->packets, dqm); -+ retval = pm_init(&dqm->packets, dqm, dqm->dev->mec_fw_version); - if (retval != 0) - goto fail_packet_manager_init; - -@@ -743,7 +887,9 @@ static int start_cpsch(struct device_queue_manager *dqm) - kfd_bind_process_to_device(dqm->dev, - node->qpd->pqm->process); - -- execute_queues_cpsch(dqm, true); -+ mutex_lock(&dqm->lock); -+ execute_queues_cpsch(dqm); -+ mutex_unlock(&dqm->lock); - - return 0; - fail_allocate_vidmem: -@@ -760,7 +906,11 @@ static int stop_cpsch(struct device_queue_manager *dqm) - - BUG_ON(!dqm); - -- destroy_queues_cpsch(dqm, true, true); -+ mutex_lock(&dqm->lock); -+ -+ unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, false); -+ -+ mutex_unlock(&dqm->lock); - - list_for_each_entry(node, &dqm->queues, list) { - pdd = qpd_to_pdd(node->qpd); -@@ -799,7 +949,7 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm, - list_add(&kq->list, 
&qpd->priv_queue_list); - dqm->queue_count++; - qpd->is_debug = true; -- execute_queues_cpsch(dqm, false); -+ execute_queues_cpsch(dqm); - mutex_unlock(&dqm->lock); - - return 0; -@@ -815,11 +965,11 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm, - - mutex_lock(&dqm->lock); - /* here we actually preempt the DIQ */ -- destroy_queues_cpsch(dqm, true, false); -+ unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, false); - list_del(&kq->list); - dqm->queue_count--; - qpd->is_debug = false; -- execute_queues_cpsch(dqm, false); -+ execute_queues_cpsch(dqm); - /* - * Unconditionally decrement this counter, regardless of the queue's - * type. -@@ -830,14 +980,6 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm, - mutex_unlock(&dqm->lock); - } - --static void select_sdma_engine_id(struct queue *q) --{ -- static int sdma_id; -- -- q->sdma_id = sdma_id; -- sdma_id = (sdma_id + 1) % 2; --} -- - static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, - struct qcm_process_device *qpd, int *allocate_vmid) - { -@@ -860,9 +1002,15 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, - goto out; - } - -- if (q->properties.type == KFD_QUEUE_TYPE_SDMA) -- select_sdma_engine_id(q); -- -+ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { -+ retval = allocate_sdma_queue(dqm, &q->sdma_id); -+ if (retval != 0) -+ goto out; -+ q->properties.sdma_queue_id = -+ q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; -+ q->properties.sdma_engine_id = -+ q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; -+ } - mqd = dqm->ops.get_mqd_manager(dqm, - get_mqd_type_from_queue_type(q->properties.type)); - -@@ -870,8 +1018,19 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, - mutex_unlock(&dqm->lock); - return -ENOMEM; - } -+ /* -+ * Eviction state logic: we only mark active queues as evicted -+ * to avoid the overhead of restoring inactive queues later -+ */ -+ if (qpd->evicted) -+ q->properties.is_evicted = (q->properties.queue_size > 0 && -+ q->properties.queue_percent > 0 && -+ q->properties.queue_address != 0); - - dqm->ops_asic_specific.init_sdma_vm(dqm, q, qpd); -+ -+ q->properties.tba_addr = qpd->tba_addr; -+ q->properties.tma_addr = qpd->tma_addr; - retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, - &q->gart_mqd_addr, &q->properties); - if (retval != 0) -@@ -880,7 +1039,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, - list_add(&q->list, &qpd->queues_list); - if (q->properties.is_active) { - dqm->queue_count++; -- retval = execute_queues_cpsch(dqm, false); -+ retval = execute_queues_cpsch(dqm); - } - - if (q->properties.type == KFD_QUEUE_TYPE_SDMA) -@@ -917,20 +1076,20 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr, - return 0; - } - --static int destroy_sdma_queues(struct device_queue_manager *dqm, -+static int unmap_sdma_queues(struct device_queue_manager *dqm, - unsigned int sdma_engine) - { - return pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA, -- KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES, 0, false, -+ KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false, - sdma_engine); - } - --static int destroy_queues_cpsch(struct device_queue_manager *dqm, -- bool preempt_static_queues, bool lock) -+/* dqm->lock mutex has to be locked before calling this function */ -+static int unmap_queues_cpsch(struct device_queue_manager *dqm, -+ enum kfd_unmap_queues_filter filter, -+ uint32_t filter_param, bool reset) - { - int retval; -- enum 
kfd_preempt_type_filter preempt_type; -- struct kfd_process_device *pdd; - - BUG_ON(!dqm); - -@@ -940,23 +1099,21 @@ static int destroy_queues_cpsch(struct device_queue_manager *dqm, - mutex_lock(&dqm->lock); - if (!dqm->active_runlist) - goto out; -+ if (dqm->active_runlist == false) -+ return retval; - - pr_debug("kfd: Before destroying queues, sdma queue count is : %u\n", - dqm->sdma_queue_count); - - if (dqm->sdma_queue_count > 0) { -- destroy_sdma_queues(dqm, 0); -- destroy_sdma_queues(dqm, 1); -+ unmap_sdma_queues(dqm, 0); -+ unmap_sdma_queues(dqm, 1); - } - -- preempt_type = preempt_static_queues ? -- KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES : -- KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES; -- - retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE, -- preempt_type, 0, false, 0); -+ filter, filter_param, reset, 0); - if (retval != 0) -- goto out; -+ return retval; - - *dqm->fence_addr = KFD_FENCE_INIT; - pm_send_query_status(&dqm->packets, dqm->fence_gpu_addr, -@@ -965,55 +1122,47 @@ static int destroy_queues_cpsch(struct device_queue_manager *dqm, - retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED, - QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS); - if (retval != 0) { -- pdd = kfd_get_process_device_data(dqm->dev, -- kfd_get_process(current)); -- pdd->reset_wavefronts = true; -- goto out; -+ pr_err("kfd: unmapping queues failed."); -+ return retval; - } -+ - pm_release_ib(&dqm->packets); - dqm->active_runlist = false; - --out: -- if (lock) -- mutex_unlock(&dqm->lock); - return retval; - } - --static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock) -+/* dqm->lock mutex has to be locked before calling this function */ -+static int execute_queues_cpsch(struct device_queue_manager *dqm) - { - int retval; - - BUG_ON(!dqm); - -- if (lock) -- mutex_lock(&dqm->lock); -- -- retval = destroy_queues_cpsch(dqm, false, false); -+ retval = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, -+ 0, false); - if (retval != 0) { - pr_err("kfd: the cp might be in an unrecoverable state due to an unsuccessful queues preemption"); -- goto out; -+ return retval; - } - - if (dqm->queue_count <= 0 || dqm->processes_count <= 0) { - retval = 0; -- goto out; -+ return retval; - } - - if (dqm->active_runlist) { - retval = 0; -- goto out; -+ return retval; - } - - retval = pm_send_runlist(&dqm->packets, &dqm->queues); - if (retval != 0) { - pr_err("kfd: failed to execute runlist"); -- goto out; -+ return retval; - } - dqm->active_runlist = true; - --out: -- if (lock) -- mutex_unlock(&dqm->lock); - return retval; - } - -@@ -1051,14 +1200,16 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, - goto failed; - } - -- if (q->properties.type == KFD_QUEUE_TYPE_SDMA) -+ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { - dqm->sdma_queue_count--; -+ deallocate_sdma_queue(dqm, q->sdma_id); -+ } - - list_del(&q->list); - if (q->properties.is_active) - dqm->queue_count--; - -- execute_queues_cpsch(dqm, false); -+ retval = execute_queues_cpsch(dqm); - - mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); - -@@ -1072,7 +1223,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, - - mutex_unlock(&dqm->lock); - -- return 0; -+ return retval; - - failed: - failed_try_destroy_debugged_queue: -@@ -1156,6 +1307,172 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm, - return false; - } - -+static int set_trap_handler(struct device_queue_manager *dqm, -+ struct qcm_process_device *qpd, -+ uint64_t tba_addr, -+ uint64_t tma_addr) -+{ -+ 
uint64_t *tma; -+ -+ tma = (uint64_t *)(qpd->cwsr_kaddr + dqm->dev->tma_offset); -+ tma[0] = tba_addr; -+ tma[1] = tma_addr; -+ return 0; -+} -+ -+ -+static int set_page_directory_base(struct device_queue_manager *dqm, -+ struct qcm_process_device *qpd) -+{ -+ struct kfd_process_device *pdd; -+ uint32_t pd_base; -+ int retval = 0; -+ -+ BUG_ON(!dqm || !qpd); -+ -+ mutex_lock(&dqm->lock); -+ -+ pdd = qpd_to_pdd(qpd); -+ -+ /* Retrieve PD base */ -+ pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm); -+ -+ /* If it has not changed, just get out */ -+ if (qpd->page_table_base == pd_base) -+ goto out; -+ -+ /* Update PD Base in QPD */ -+ qpd->page_table_base = pd_base; -+ pr_debug("Updated PD address == 0x%08u\n", pd_base); -+ -+ /* -+ * Preempt queues, destroy runlist and create new runlist. Queues -+ * will have the updated PD base address -+ */ -+ if (sched_policy != KFD_SCHED_POLICY_NO_HWS) -+ retval = execute_queues_cpsch(dqm); -+ -+out: -+ mutex_unlock(&dqm->lock); -+ -+ return retval; -+} -+ -+static int process_termination_nocpsch(struct device_queue_manager *dqm, -+ struct qcm_process_device *qpd) -+{ -+ struct queue *q, *next; -+ struct mqd_manager *mqd; -+ struct device_process_node *cur, *next_dpn; -+ -+ mutex_lock(&dqm->lock); -+ -+ /* Clear all user mode queues */ -+ list_for_each_entry_safe(q, next, &qpd->queues_list, list) { -+ mqd = dqm->ops.get_mqd_manager(dqm, -+ get_mqd_type_from_queue_type(q->properties.type)); -+ if (!mqd) { -+ mutex_unlock(&dqm->lock); -+ return -ENOMEM; -+ } -+ -+ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { -+ dqm->sdma_queue_count--; -+ deallocate_sdma_queue(dqm, q->sdma_id); -+ } -+ -+ list_del(&q->list); -+ if (q->properties.is_active) -+ dqm->queue_count--; -+ -+ dqm->total_queue_count--; -+ mqd->destroy_mqd(mqd, q->mqd, -+ KFD_PREEMPT_TYPE_WAVEFRONT_RESET, -+ QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS, -+ q->pipe, q->queue); -+ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); -+ if (list_empty(&qpd->queues_list)) -+ deallocate_vmid(dqm, qpd, q); -+ } -+ -+ /* Unregister process */ -+ list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) { -+ if (qpd == cur->qpd) { -+ list_del(&cur->list); -+ kfree(cur); -+ dqm->processes_count--; -+ break; -+ } -+ } -+ -+ mutex_unlock(&dqm->lock); -+ -+ return 0; -+} -+ -+ -+static int process_termination_cpsch(struct device_queue_manager *dqm, -+ struct qcm_process_device *qpd) -+{ -+ int retval; -+ struct queue *q, *next; -+ struct kernel_queue *kq, *kq_next; -+ struct mqd_manager *mqd; -+ struct device_process_node *cur, *next_dpn; -+ -+ retval = 0; -+ -+ mutex_lock(&dqm->lock); -+ -+ /* Clean all kernel queues */ -+ list_for_each_entry_safe(kq, kq_next, &qpd->priv_queue_list, list) { -+ list_del(&kq->list); -+ dqm->queue_count--; -+ qpd->is_debug = false; -+ dqm->total_queue_count--; -+ } -+ -+ /* Clear all user mode queues */ -+ list_for_each_entry(q, &qpd->queues_list, list) { -+ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { -+ dqm->sdma_queue_count--; -+ deallocate_sdma_queue(dqm, q->sdma_id); -+ } -+ -+ if (q->properties.is_active) -+ dqm->queue_count--; -+ -+ dqm->total_queue_count--; -+ } -+ -+ /* Unregister process */ -+ list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) { -+ if (qpd == cur->qpd) { -+ list_del(&cur->list); -+ kfree(cur); -+ dqm->processes_count--; -+ break; -+ } -+ } -+ -+ retval = execute_queues_cpsch(dqm); -+ -+ /* lastly, free mqd resources */ -+ list_for_each_entry_safe(q, next, &qpd->queues_list, list) { -+ mqd = dqm->ops.get_mqd_manager(dqm, -+ 
get_mqd_type_from_queue_type(q->properties.type)); -+ if (!mqd) { -+ mutex_unlock(&dqm->lock); -+ return -ENOMEM; -+ } -+ list_del(&q->list); -+ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); -+ } -+ -+ mutex_unlock(&dqm->lock); -+ return retval; -+} -+ - struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) - { - struct device_queue_manager *dqm; -@@ -1186,6 +1503,9 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) - dqm->ops.create_kernel_queue = create_kernel_queue_cpsch; - dqm->ops.destroy_kernel_queue = destroy_kernel_queue_cpsch; - dqm->ops.set_cache_memory_policy = set_cache_memory_policy; -+ dqm->ops.set_trap_handler = set_trap_handler; -+ dqm->ops.set_page_directory_base = set_page_directory_base; -+ dqm->ops.process_termination = process_termination_cpsch; - break; - case KFD_SCHED_POLICY_NO_HWS: - /* initialize dqm for no cp scheduling */ -@@ -1200,6 +1520,9 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) - dqm->ops.initialize = initialize_nocpsch; - dqm->ops.uninitialize = uninitialize_nocpsch; - dqm->ops.set_cache_memory_policy = set_cache_memory_policy; -+ dqm->ops.set_trap_handler = set_trap_handler; -+ dqm->ops.set_page_directory_base = set_page_directory_base; -+ dqm->ops.process_termination = process_termination_nocpsch; - break; - default: - BUG(); -@@ -1214,6 +1537,11 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) - case CHIP_KAVERI: - device_queue_manager_init_cik(&dqm->ops_asic_specific); - break; -+ -+ case CHIP_TONGA: -+ case CHIP_FIJI: -+ device_queue_manager_init_vi_tonga(&dqm->ops_asic_specific); -+ break; - } - - if (dqm->ops.initialize(dqm) != 0) { -@@ -1231,3 +1559,20 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm) - dqm->ops.uninitialize(dqm); - kfree(dqm); - } -+ -+int kfd_process_vm_fault(struct device_queue_manager *dqm, -+ unsigned int pasid) -+{ -+ struct kfd_process_device *pdd; -+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); -+ int ret = 0; -+ -+ if (!p) -+ return -EINVAL; -+ pdd = kfd_get_process_device_data(dqm->dev, p); -+ if (pdd) -+ ret = process_evict_queues(dqm, &pdd->qpd); -+ up_read(&p->lock); -+ -+ return ret; -+} -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h -index faf820a..d6af017 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h -@@ -29,10 +29,7 @@ - #include "kfd_priv.h" - #include "kfd_mqd_manager.h" - --#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (500) --#define CIK_VMID_NUM (8) --#define KFD_VMID_START_OFFSET (8) --#define VMID_PER_DEVICE CIK_VMID_NUM -+#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (9000) - #define KFD_DQM_FIRST_PIPE (0) - #define CIK_SDMA_QUEUES (4) - #define CIK_SDMA_QUEUES_PER_ENGINE (2) -@@ -79,6 +76,12 @@ struct device_process_node { - * @set_cache_memory_policy: Sets memory policy (cached/ non cached) for the - * memory apertures. - * -+ * @set_page_directory_base: Sets the PD base address (GPU local memory) -+ * in all the queues of the relevant process running on the specified device. -+ * It preempts the queues, updates the value and executes the runlist again. -+ * -+ * @process_termination: Clears all process queues belonging to that device. 
-+ * - */ - - struct device_queue_manager_ops { -@@ -122,6 +125,16 @@ struct device_queue_manager_ops { - enum cache_policy alternate_policy, - void __user *alternate_aperture_base, - uint64_t alternate_aperture_size); -+ -+ int (*set_trap_handler)(struct device_queue_manager *dqm, -+ struct qcm_process_device *qpd, -+ uint64_t tba_addr, -+ uint64_t tma_addr); -+ -+ int (*set_page_directory_base)(struct device_queue_manager *dqm, -+ struct qcm_process_device *qpd); -+ int (*process_termination)(struct device_queue_manager *dqm, -+ struct qcm_process_device *qpd); - }; - - struct device_queue_manager_asic_ops { -@@ -178,12 +191,20 @@ struct device_queue_manager { - - void device_queue_manager_init_cik(struct device_queue_manager_asic_ops *ops); - void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops); -+void device_queue_manager_init_vi_tonga( -+ struct device_queue_manager_asic_ops *ops); - void program_sh_mem_settings(struct device_queue_manager *dqm, - struct qcm_process_device *qpd); - unsigned int get_queues_num(struct device_queue_manager *dqm); - unsigned int get_queues_per_pipe(struct device_queue_manager *dqm); - unsigned int get_pipes_per_mec(struct device_queue_manager *dqm); - -+int process_evict_queues(struct device_queue_manager *dqm, -+ struct qcm_process_device *qpd); -+int process_restore_queues(struct device_queue_manager *dqm, -+ struct qcm_process_device *qpd); -+ -+ - static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) - { - return (pdd->lds_base >> 16) & 0xFF; -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c -index 48dc056..da55e39c 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c -@@ -24,6 +24,7 @@ - #include "kfd_device_queue_manager.h" - #include "cik_regs.h" - #include "oss/oss_2_4_sh_mask.h" -+#include "gca/gfx_7_2_sh_mask.h" - - static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, - struct qcm_process_device *qpd, -@@ -125,6 +126,7 @@ static int register_process_cik(struct device_queue_manager *dqm, - } else { - temp = get_sh_mem_bases_nybble_64(pdd); - qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); -+ qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__PRIVATE_ATC__SHIFT; - } - - pr_debug("kfd: is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c -index 7e9cae9..c023e50 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c -@@ -39,6 +39,31 @@ static int initialize_cpsch_vi(struct device_queue_manager *dqm); - static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, - struct qcm_process_device *qpd); - -+/* -+ * Tonga device queue manager functions -+ */ -+static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm, -+ struct qcm_process_device *qpd, -+ enum cache_policy default_policy, -+ enum cache_policy alternate_policy, -+ void __user *alternate_aperture_base, -+ uint64_t alternate_aperture_size); -+static int register_process_vi_tonga(struct device_queue_manager *dqm, -+ struct qcm_process_device *qpd); -+static void init_sdma_vm_tonga(struct device_queue_manager *dqm, -+ struct queue *q, -+ struct qcm_process_device *qpd); -+ -+void 
device_queue_manager_init_vi_tonga( -+ struct device_queue_manager_asic_ops *ops) -+{ -+ ops->set_cache_memory_policy = set_cache_memory_policy_vi_tonga; -+ ops->register_process = register_process_vi_tonga; -+ ops->initialize = initialize_cpsch_vi; -+ ops->init_sdma_vm = init_sdma_vm_tonga; -+} -+ -+ - void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops) - { - ops->set_cache_memory_policy = set_cache_memory_policy_vi; -@@ -104,6 +129,33 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, - return true; - } - -+static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm, -+ struct qcm_process_device *qpd, -+ enum cache_policy default_policy, -+ enum cache_policy alternate_policy, -+ void __user *alternate_aperture_base, -+ uint64_t alternate_aperture_size) -+{ -+ uint32_t default_mtype; -+ uint32_t ape1_mtype; -+ -+ default_mtype = (default_policy == cache_policy_coherent) ? -+ MTYPE_UC : -+ MTYPE_NC_NV; -+ -+ ape1_mtype = (alternate_policy == cache_policy_coherent) ? -+ MTYPE_UC : -+ MTYPE_NC_NV; -+ -+ qpd->sh_mem_config = -+ SH_MEM_ALIGNMENT_MODE_UNALIGNED << -+ SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | -+ default_mtype << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | -+ ape1_mtype << SH_MEM_CONFIG__APE1_MTYPE__SHIFT; -+ -+ return true; -+} -+ - static int register_process_vi(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) - { -@@ -137,6 +189,8 @@ static int register_process_vi(struct device_queue_manager *dqm, - qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); - qpd->sh_mem_config |= SH_MEM_ADDRESS_MODE_HSA64 << - SH_MEM_CONFIG__ADDRESS_MODE__SHIFT; -+ qpd->sh_mem_config |= 1 << -+ SH_MEM_CONFIG__PRIVATE_ATC__SHIFT; - } - - pr_debug("kfd: is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", -@@ -145,6 +199,41 @@ static int register_process_vi(struct device_queue_manager *dqm, - return 0; - } - -+static int register_process_vi_tonga(struct device_queue_manager *dqm, -+ struct qcm_process_device *qpd) -+{ -+ struct kfd_process_device *pdd; -+ unsigned int temp; -+ -+ BUG_ON(!dqm || !qpd); -+ -+ pdd = qpd_to_pdd(qpd); -+ -+ /* check if sh_mem_config register already configured */ -+ if (qpd->sh_mem_config == 0) { -+ qpd->sh_mem_config = -+ SH_MEM_ALIGNMENT_MODE_UNALIGNED << -+ SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | -+ MTYPE_UC << -+ SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | -+ MTYPE_UC << -+ SH_MEM_CONFIG__APE1_MTYPE__SHIFT; -+ -+ qpd->sh_mem_ape1_limit = 0; -+ qpd->sh_mem_ape1_base = 0; -+ } -+ -+ /* On dGPU we're always in GPUVM64 addressing mode with 64-bit -+ * aperture addresses. 
*/ -+ temp = get_sh_mem_bases_nybble_64(pdd); -+ qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); -+ -+ pr_debug("kfd: sh_mem_bases nybble: 0x%X and register 0x%X\n", -+ temp, qpd->sh_mem_bases); -+ -+ return 0; -+} -+ - static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, - struct qcm_process_device *qpd) - { -@@ -161,6 +250,23 @@ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, - q->properties.sdma_vm_addr = value; - } - -+static void init_sdma_vm_tonga(struct device_queue_manager *dqm, -+ struct queue *q, -+ struct qcm_process_device *qpd) -+{ -+ uint32_t value = 0; -+ -+ if (q->process->is_32bit_user_mode) -+ value |= (1 << SDMA0_RLC0_VIRTUAL_ADDR__PTR32__SHIFT) | -+ get_sh_mem_bases_32(qpd_to_pdd(qpd)); -+ else -+ value |= ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) << -+ SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) & -+ SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK; -+ q->properties.sdma_vm_addr = value; -+} -+ -+ - static int initialize_cpsch_vi(struct device_queue_manager *dqm) - { - return 0; -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c -index 453c5d6..d6a7e2a 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c -@@ -142,12 +142,11 @@ int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma) - - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - -- pr_debug("kfd: mapping doorbell page in %s\n" -+ pr_debug("kfd: mapping doorbell page in kfd_doorbell_mmap\n" - " target user address == 0x%08llX\n" - " physical address == 0x%08llX\n" - " vm_flags == 0x%04lX\n" - " size == 0x%04lX\n", -- __func__, - (unsigned long long) vma->vm_start, address, vma->vm_flags, - doorbell_process_allocation()); - -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c -index d1ce83d..23b5936 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c -@@ -32,11 +32,10 @@ - #include "kfd_events.h" - #include <linux/device.h> - --/* -- * A task can only be on a single wait_queue at a time, but we need to support -+/* A task can only be on a single wait_queue at a time, but we need to support - * waiting on multiple events (any/all). -- * Instead of each event simply having a wait_queue with sleeping tasks, it -- * has a singly-linked list of tasks. -+ * Instead of each event simply having a wait_queue with sleeping tasks, it has a -+ * singly-linked list of tasks. - * A thread that wants to sleep creates an array of these, one for each event - * and adds one to each event's waiter chain. - */ -@@ -52,12 +51,11 @@ struct kfd_event_waiter { - uint32_t input_index; - }; - --/* -- * Over-complicated pooled allocator for event notification slots. -+/* Over-complicated pooled allocator for event notification slots. - * -- * Each signal event needs a 64-bit signal slot where the signaler will write -- * a 1 before sending an interrupt.l (This is needed because some interrupts -- * do not contain enough spare data bits to identify an event.) -+ * Each signal event needs a 64-bit signal slot where the signaler will write a 1 -+ * before sending an interrupt. (This is needed because some interrupts do not -+ * contain enough spare data bits to identify an event.) - * We get whole pages from vmalloc and map them to the process VA. - * Individual signal events are then allocated a slot in a page. 
- */ -@@ -65,6 +63,7 @@ struct kfd_event_waiter { - struct signal_page { - struct list_head event_pages; /* kfd_process.signal_event_pages */ - uint64_t *kernel_address; -+ uint64_t handle; - uint64_t __user *user_address; - uint32_t page_index; /* Index into the mmap aperture. */ - unsigned int free_slots; -@@ -74,8 +73,7 @@ struct signal_page { - #define SLOTS_PER_PAGE KFD_SIGNAL_EVENT_LIMIT - #define SLOT_BITMAP_SIZE BITS_TO_LONGS(SLOTS_PER_PAGE) - #define BITS_PER_PAGE (ilog2(SLOTS_PER_PAGE)+1) --#define SIGNAL_PAGE_SIZE (sizeof(struct signal_page) + \ -- SLOT_BITMAP_SIZE * sizeof(long)) -+#define SIGNAL_PAGE_SIZE (sizeof(struct signal_page) + SLOT_BITMAP_SIZE * sizeof(long)) - - /* - * For signal events, the event ID is used as the interrupt user data. -@@ -85,23 +83,27 @@ struct signal_page { - #define INTERRUPT_DATA_BITS 8 - #define SIGNAL_EVENT_ID_SLOT_SHIFT 0 - -+/* We can only create 8 debug events */ -+ -+#define KFD_DEBUG_EVENT_LIMIT 8 -+#define KFD_DEBUG_EVENT_MASK 0x1F -+#define KFD_DEBUG_EVENT_SHIFT 5 -+ - static uint64_t *page_slots(struct signal_page *page) - { - return page->kernel_address; - } - --static bool allocate_free_slot(struct kfd_process *process, -- struct signal_page **out_page, -- unsigned int *out_slot_index) -+static bool -+allocate_free_slot(struct kfd_process *process, -+ struct signal_page **out_page, -+ unsigned int *out_slot_index) - { - struct signal_page *page; - - list_for_each_entry(page, &process->signal_event_pages, event_pages) { - if (page->free_slots > 0) { -- unsigned int slot = -- find_first_zero_bit(page->used_slot_bitmap, -- SLOTS_PER_PAGE); -- -+ unsigned int slot = find_first_zero_bit(page->used_slot_bitmap, SLOTS_PER_PAGE); - __set_bit(slot, page->used_slot_bitmap); - page->free_slots--; - -@@ -130,6 +132,8 @@ static bool allocate_signal_page(struct file *devkfd, struct kfd_process *p) - { - void *backing_store; - struct signal_page *page; -+ unsigned int slot; -+ int i; - - page = kzalloc(SIGNAL_PAGE_SIZE, GFP_KERNEL); - if (!page) -@@ -137,17 +141,23 @@ static bool allocate_signal_page(struct file *devkfd, struct kfd_process *p) - - page->free_slots = SLOTS_PER_PAGE; - -- backing_store = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, -+ backing_store = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, \ - get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); - if (!backing_store) - goto fail_alloc_signal_store; - - /* prevent user-mode info leaks */ -- memset(backing_store, (uint8_t) UNSIGNALED_EVENT_SLOT, -- KFD_SIGNAL_EVENT_LIMIT * 8); -- -+ memset(backing_store, (uint8_t) UNSIGNALED_EVENT_SLOT, KFD_SIGNAL_EVENT_LIMIT * 8); - page->kernel_address = backing_store; - -+ /* Set bits of debug events to prevent allocation */ -+ for (i = 0 ; i < KFD_DEBUG_EVENT_LIMIT ; i++) { -+ slot = (i << KFD_DEBUG_EVENT_SHIFT) | -+ KFD_DEBUG_EVENT_MASK; -+ __set_bit(slot, page->used_slot_bitmap); -+ page->free_slots--; -+ } -+ - if (list_empty(&p->signal_event_pages)) - page->page_index = 0; - else -@@ -169,10 +179,10 @@ static bool allocate_signal_page(struct file *devkfd, struct kfd_process *p) - return false; - } - --static bool allocate_event_notification_slot(struct file *devkfd, -- struct kfd_process *p, -- struct signal_page **page, -- unsigned int *signal_slot_index) -+static bool -+allocate_event_notification_slot(struct file *devkfd, struct kfd_process *p, -+ struct signal_page **page, -+ unsigned int *signal_slot_index) - { - bool ret; - -@@ -186,6 +196,88 @@ static bool allocate_event_notification_slot(struct file *devkfd, - return ret; - } - 
-+static bool -+allocate_signal_page_dgpu(struct kfd_process *p, -+ uint64_t *kernel_address, uint64_t handle) -+{ -+ struct signal_page *my_page; -+ -+ my_page = kzalloc(SIGNAL_PAGE_SIZE, GFP_KERNEL); -+ if (!my_page) -+ return false; -+ -+ /* prevent user-mode info leaks */ -+ memset(kernel_address, (uint8_t) UNSIGNALED_EVENT_SLOT, -+ KFD_SIGNAL_EVENT_LIMIT * 8); -+ -+ my_page->kernel_address = kernel_address; -+ my_page->handle = handle; -+ my_page->user_address = NULL; -+ my_page->free_slots = SLOTS_PER_PAGE; -+ if (list_empty(&p->signal_event_pages)) -+ my_page->page_index = 0; -+ else -+ my_page->page_index = list_tail_entry(&p->signal_event_pages, -+ struct signal_page, -+ event_pages)->page_index + 1; -+ -+ pr_debug("allocated new event signal page at %p, for process %p\n", -+ my_page, p); -+ pr_debug("page index is %d\n", my_page->page_index); -+ -+ list_add(&my_page->event_pages, &p->signal_event_pages); -+ -+ return true; -+} -+ -+void kfd_free_signal_page_dgpu(struct kfd_process *p, uint64_t handle) -+{ -+ struct signal_page *page, *tmp; -+ -+ list_for_each_entry_safe(page, tmp, &p->signal_event_pages, -+ event_pages) { -+ if (page->handle == handle) { -+ list_del(&page->event_pages); -+ kfree(page); -+ break; -+ } -+ } -+} -+ -+static bool -+allocate_debug_event_notification_slot(struct file *devkfd, -+ struct kfd_process *p, -+ struct signal_page **out_page, -+ unsigned int *out_slot_index) -+{ -+ struct signal_page *page; -+ unsigned int slot; -+ bool ret; -+ -+ if (list_empty(&p->signal_event_pages)) { -+ ret = allocate_signal_page(devkfd, p); -+ if (ret == false) -+ return ret; -+ } -+ -+ page = list_entry((&p->signal_event_pages)->next, struct signal_page, -+ event_pages); -+ slot = (p->debug_event_count << KFD_DEBUG_EVENT_SHIFT) | -+ KFD_DEBUG_EVENT_MASK; -+ -+ pr_debug("page == %p\n", page); -+ pr_debug("slot == %d\n", slot); -+ -+ page_slots(page)[slot] = UNSIGNALED_EVENT_SLOT; -+ *out_page = page; -+ *out_slot_index = slot; -+ -+ pr_debug("allocated debug event signal slot in page %p, slot %d\n", -+ page, slot); -+ -+ return true; -+} -+ - /* Assumes that the process's event_mutex is locked. */ - static void release_event_notification_slot(struct signal_page *page, - size_t slot_index) -@@ -202,10 +294,7 @@ static struct signal_page *lookup_signal_page_by_index(struct kfd_process *p, - { - struct signal_page *page; - -- /* -- * This is safe because we don't delete signal pages until the -- * process exits. -- */ -+ /* This is safe because we don't delete signal pages until the process exits. */ - list_for_each_entry(page, &p->signal_event_pages, event_pages) - if (page->page_index == page_index) - return page; -@@ -213,10 +302,7 @@ static struct signal_page *lookup_signal_page_by_index(struct kfd_process *p, - return NULL; - } - --/* -- * Assumes that p->event_mutex is held and of course that p is not going -- * away (current or locked). -- */ -+/* Assumes that p->event_mutex is held and of course that p is not going away (current or locked). 
*/ - static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id) - { - struct kfd_event *ev; -@@ -231,32 +317,27 @@ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id) - static u32 make_signal_event_id(struct signal_page *page, - unsigned int signal_slot_index) - { -- return page->page_index | -- (signal_slot_index << SIGNAL_EVENT_ID_SLOT_SHIFT); -+ return page->page_index | (signal_slot_index << SIGNAL_EVENT_ID_SLOT_SHIFT); - } - --/* -- * Produce a kfd event id for a nonsignal event. -- * These are arbitrary numbers, so we do a sequential search through -- * the hash table for an unused number. -+/* Produce a kfd event id for a nonsignal event. -+ * These are arbitrary numbers, so we do a sequential search through the hash table -+ * for an unused number. - */ - static u32 make_nonsignal_event_id(struct kfd_process *p) - { - u32 id; - - for (id = p->next_nonsignal_event_id; -- id < KFD_LAST_NONSIGNAL_EVENT_ID && -- lookup_event_by_id(p, id) != NULL; -- id++) -+ id < KFD_LAST_NONSIGNAL_EVENT_ID && lookup_event_by_id(p, id) != NULL; -+ id++) - ; - - if (id < KFD_LAST_NONSIGNAL_EVENT_ID) { - -- /* -- * What if id == LAST_NONSIGNAL_EVENT_ID - 1? -- * Then next_nonsignal_event_id = LAST_NONSIGNAL_EVENT_ID so -- * the first loop fails immediately and we proceed with the -- * wraparound loop below. -+ /* What if id == LAST_NONSIGNAL_EVENT_ID - 1? -+ * Then next_nonsignal_event_id = LAST_NONSIGNAL_EVENT_ID so the first loop -+ * fails immediately and we proceed with the wraparound loop below. - */ - p->next_nonsignal_event_id = id + 1; - -@@ -264,54 +345,68 @@ static u32 make_nonsignal_event_id(struct kfd_process *p) - } - - for (id = KFD_FIRST_NONSIGNAL_EVENT_ID; -- id < KFD_LAST_NONSIGNAL_EVENT_ID && -- lookup_event_by_id(p, id) != NULL; -- id++) -+ id < KFD_LAST_NONSIGNAL_EVENT_ID && lookup_event_by_id(p, id) != NULL; -+ id++) - ; - - - if (id < KFD_LAST_NONSIGNAL_EVENT_ID) { - p->next_nonsignal_event_id = id + 1; - return id; -+ } else { -+ p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID; -+ return 0; - } -- -- p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID; -- return 0; - } - --static struct kfd_event *lookup_event_by_page_slot(struct kfd_process *p, -- struct signal_page *page, -- unsigned int signal_slot) -+static struct kfd_event * -+lookup_event_by_page_slot(struct kfd_process *p, -+ struct signal_page *page, unsigned int signal_slot) - { - return lookup_event_by_id(p, make_signal_event_id(page, signal_slot)); - } - --static int create_signal_event(struct file *devkfd, -- struct kfd_process *p, -- struct kfd_event *ev) -+static int -+create_signal_event(struct file *devkfd, struct kfd_process *p, struct kfd_event *ev) - { -- if (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT) { -+ if ((ev->type == KFD_EVENT_TYPE_SIGNAL) && -+ (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT)) { - pr_warn("amdkfd: Signal event wasn't created because limit was reached\n"); - return -ENOMEM; -+ } else if ((ev->type == KFD_EVENT_TYPE_DEBUG) && -+ (p->debug_event_count == KFD_DEBUG_EVENT_LIMIT)) { -+ pr_warn("amdkfd: Debug event wasn't created because limit was reached\n"); -+ return -ENOMEM; - } - -- if (!allocate_event_notification_slot(devkfd, p, &ev->signal_page, -+ if (ev->type == KFD_EVENT_TYPE_SIGNAL) { -+ if (!allocate_event_notification_slot(devkfd, p, -+ &ev->signal_page, - &ev->signal_slot_index)) { -- pr_warn("amdkfd: Signal event wasn't created because out of kernel memory\n"); -- return -ENOMEM; -- } -+ pr_warn("amdkfd: 
Signal event wasn't created because out of kernel memory\n"); -+ return -ENOMEM; -+ } - -- p->signal_event_count++; -+ p->signal_event_count++; - -- ev->user_signal_address = -- &ev->signal_page->user_address[ev->signal_slot_index]; -+ if ((p->signal_event_count & KFD_DEBUG_EVENT_MASK) == -+ KFD_DEBUG_EVENT_MASK) -+ p->signal_event_count++; - -- ev->event_id = make_signal_event_id(ev->signal_page, -- ev->signal_slot_index); -+ } else if (ev->type == KFD_EVENT_TYPE_DEBUG) { -+ if (!allocate_debug_event_notification_slot(devkfd, p, -+ &ev->signal_page, -+ &ev->signal_slot_index)) { -+ pr_warn("amdkfd: Debug event wasn't created because out of kernel memory\n"); -+ return -ENOMEM; -+ } - -- pr_debug("signal event number %zu created with id %d, address %p\n", -- p->signal_event_count, ev->event_id, -- ev->user_signal_address); -+ p->debug_event_count++; -+ } -+ -+ ev->user_signal_address = &ev->signal_page->user_address[ev->signal_slot_index]; -+ -+ ev->event_id = make_signal_event_id(ev->signal_page, ev->signal_slot_index); - - pr_debug("signal event number %zu created with id %d, address %p\n", - p->signal_event_count, ev->event_id, -@@ -320,12 +415,10 @@ static int create_signal_event(struct file *devkfd, - return 0; - } - --/* -- * No non-signal events are supported yet. -- * We create them as events that never signal. -- * Set event calls from user-mode are failed. -- */ --static int create_other_event(struct kfd_process *p, struct kfd_event *ev) -+/* No non-signal events are supported yet. -+ * We create them as events that never signal. Set event calls from user-mode are failed. */ -+static int -+create_other_event(struct kfd_process *p, struct kfd_event *ev) - { - ev->event_id = make_nonsignal_event_id(p); - if (ev->event_id == 0) -@@ -341,20 +434,25 @@ void kfd_event_init_process(struct kfd_process *p) - INIT_LIST_HEAD(&p->signal_event_pages); - p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID; - p->signal_event_count = 0; -+ p->debug_event_count = 0; - } - - static void destroy_event(struct kfd_process *p, struct kfd_event *ev) - { - if (ev->signal_page != NULL) { -- release_event_notification_slot(ev->signal_page, -- ev->signal_slot_index); -- p->signal_event_count--; -+ if (ev->type == KFD_EVENT_TYPE_SIGNAL) { -+ release_event_notification_slot(ev->signal_page, -+ ev->signal_slot_index); -+ p->signal_event_count--; -+ if ((p->signal_event_count & KFD_DEBUG_EVENT_MASK) == -+ KFD_DEBUG_EVENT_MASK) -+ p->signal_event_count--; -+ } else if (ev->type == KFD_EVENT_TYPE_DEBUG) { -+ p->debug_event_count--; -+ } - } - -- /* -- * Abandon the list of waiters. Individual waiting threads will -- * clean up their own data. -- */ -+ /* Abandon the list of waiters. Individual waiting threads will clean up their own data.*/ - list_del(&ev->waiters); - - hash_del(&ev->events); -@@ -371,18 +469,17 @@ static void destroy_events(struct kfd_process *p) - destroy_event(p, ev); - } - --/* -- * We assume that the process is being destroyed and there is no need to -- * unmap the pages or keep bookkeeping data in order. -- */ -+/* We assume that the process is being destroyed and there is no need to unmap the pages -+ * or keep bookkeeping data in order. 
*/ - static void shutdown_signal_pages(struct kfd_process *p) - { - struct signal_page *page, *tmp; - -- list_for_each_entry_safe(page, tmp, &p->signal_event_pages, -- event_pages) { -- free_pages((unsigned long)page->kernel_address, -- get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); -+ list_for_each_entry_safe(page, tmp, &p->signal_event_pages, event_pages) { -+ if (page->user_address) { -+ free_pages((unsigned long)page->kernel_address, -+ get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); -+ } - kfree(page); - } - } -@@ -395,8 +492,7 @@ void kfd_event_free_process(struct kfd_process *p) - - static bool event_can_be_gpu_signaled(const struct kfd_event *ev) - { -- return ev->type == KFD_EVENT_TYPE_SIGNAL || -- ev->type == KFD_EVENT_TYPE_DEBUG; -+ return ev->type == KFD_EVENT_TYPE_SIGNAL || ev->type == KFD_EVENT_TYPE_DEBUG; - } - - static bool event_can_be_cpu_signaled(const struct kfd_event *ev) -@@ -407,11 +503,12 @@ static bool event_can_be_cpu_signaled(const struct kfd_event *ev) - int kfd_event_create(struct file *devkfd, struct kfd_process *p, - uint32_t event_type, bool auto_reset, uint32_t node_id, - uint32_t *event_id, uint32_t *event_trigger_data, -- uint64_t *event_page_offset, uint32_t *event_slot_index) -+ uint64_t *event_page_offset, uint32_t *event_slot_index, -+ void *kern_addr) - { - int ret = 0; -- struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL); - -+ struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL); - if (!ev) - return -ENOMEM; - -@@ -421,17 +518,20 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, - - INIT_LIST_HEAD(&ev->waiters); - -- *event_page_offset = 0; -- - mutex_lock(&p->event_mutex); - -+ if (kern_addr && list_empty(&p->signal_event_pages)) -+ allocate_signal_page_dgpu(p, kern_addr, *event_page_offset); -+ -+ *event_page_offset = 0; -+ - switch (event_type) { - case KFD_EVENT_TYPE_SIGNAL: - case KFD_EVENT_TYPE_DEBUG: - ret = create_signal_event(devkfd, p, ev); - if (!ret) { - *event_page_offset = (ev->signal_page->page_index | -- KFD_MMAP_EVENTS_MASK); -+ KFD_MMAP_TYPE_EVENTS); - *event_page_offset <<= PAGE_SHIFT; - *event_slot_index = ev->signal_slot_index; - } -@@ -538,8 +638,7 @@ int kfd_reset_event(struct kfd_process *p, uint32_t event_id) - - static void acknowledge_signal(struct kfd_process *p, struct kfd_event *ev) - { -- page_slots(ev->signal_page)[ev->signal_slot_index] = -- UNSIGNALED_EVENT_SLOT; -+ page_slots(ev->signal_page)[ev->signal_slot_index] = UNSIGNALED_EVENT_SLOT; - } - - static bool is_slot_signaled(struct signal_page *page, unsigned int index) -@@ -547,8 +646,7 @@ static bool is_slot_signaled(struct signal_page *page, unsigned int index) - return page_slots(page)[index] != UNSIGNALED_EVENT_SLOT; - } - --static void set_event_from_interrupt(struct kfd_process *p, -- struct kfd_event *ev) -+static void set_event_from_interrupt(struct kfd_process *p, struct kfd_event *ev) - { - if (ev && event_can_be_gpu_signaled(ev)) { - acknowledge_signal(p, ev); -@@ -561,42 +659,39 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, - { - struct kfd_event *ev; - -- /* -- * Because we are called from arbitrary context (workqueue) as opposed -+ /* Because we are called from arbitrary context (workqueue) as opposed - * to process context, kfd_process could attempt to exit while we are -- * running so the lookup function returns a locked process. -- */ -+ * running so the lookup function returns a read-locked process. 
*/ - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); -- - if (!p) - return; /* Presumably process exited. */ - - mutex_lock(&p->event_mutex); - -- if (valid_id_bits >= INTERRUPT_DATA_BITS) { -+ if ((valid_id_bits >= INTERRUPT_DATA_BITS) && -+ ((partial_id & KFD_DEBUG_EVENT_MASK) == -+ KFD_DEBUG_EVENT_MASK)) { - /* Partial ID is a full ID. */ - ev = lookup_event_by_id(p, partial_id); - set_event_from_interrupt(p, ev); - } else { -- /* -- * Partial ID is in fact partial. For now we completely -- * ignore it, but we could use any bits we did receive to -- * search faster. -- */ -+ /* Partial ID is in fact partial. For now we completely ignore it, -+ * but we could use any bits we did receive to search faster. */ - struct signal_page *page; - unsigned i; - -- list_for_each_entry(page, &p->signal_event_pages, event_pages) -- for (i = 0; i < SLOTS_PER_PAGE; i++) -+ list_for_each_entry(page, &p->signal_event_pages, event_pages) { -+ for (i = 0; i < SLOTS_PER_PAGE; i++) { - if (is_slot_signaled(page, i)) { -- ev = lookup_event_by_page_slot(p, -- page, i); -+ ev = lookup_event_by_page_slot(p, page, i); - set_event_from_interrupt(p, ev); - } -+ } -+ } - } - - mutex_unlock(&p->event_mutex); -- mutex_unlock(&p->mutex); -+ up_read(&p->lock); - } - - static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) -@@ -604,20 +699,20 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) - struct kfd_event_waiter *event_waiters; - uint32_t i; - -- event_waiters = kmalloc_array(num_events, -- sizeof(struct kfd_event_waiter), -- GFP_KERNEL); -+ event_waiters = kmalloc(num_events * sizeof(struct kfd_event_waiter), GFP_KERNEL); - -- for (i = 0; (event_waiters) && (i < num_events) ; i++) { -- INIT_LIST_HEAD(&event_waiters[i].waiters); -- event_waiters[i].sleeping_task = current; -- event_waiters[i].activated = false; -+ if (event_waiters) { -+ for (i = 0; i < num_events; i++) { -+ INIT_LIST_HEAD(&event_waiters[i].waiters); -+ event_waiters[i].sleeping_task = current; -+ event_waiters[i].activated = false; -+ } - } - - return event_waiters; - } - --static int init_event_waiter(struct kfd_process *p, -+static int init_event_waiter_get_status(struct kfd_process *p, - struct kfd_event_waiter *waiter, - uint32_t event_id, - uint32_t input_index) -@@ -632,13 +727,21 @@ static int init_event_waiter(struct kfd_process *p, - waiter->activated = ev->signaled; - ev->signaled = ev->signaled && !ev->auto_reset; - -- list_add(&waiter->waiters, &ev->waiters); -- - return 0; - } - -+static void init_event_waiter_add_to_waitlist(struct kfd_event_waiter *waiter) -+{ -+ struct kfd_event *ev = waiter->event; -+ -+ /* Only add to the wait list if we actually need to -+ * wait on this event. 
*/ -+ if (!waiter->activated) -+ list_add(&waiter->waiters, &ev->waiters); -+} -+ - static bool test_event_condition(bool all, uint32_t num_events, -- struct kfd_event_waiter *event_waiters) -+ struct kfd_event_waiter *event_waiters) - { - uint32_t i; - uint32_t activated_count = 0; -@@ -663,23 +766,15 @@ static bool copy_signaled_event_data(uint32_t num_events, - struct kfd_event_waiter *event_waiters, - struct kfd_event_data __user *data) - { -- struct kfd_hsa_memory_exception_data *src; -- struct kfd_hsa_memory_exception_data __user *dst; -- struct kfd_event_waiter *waiter; -- struct kfd_event *event; - uint32_t i; - -- for (i = 0; i < num_events; i++) { -- waiter = &event_waiters[i]; -- event = waiter->event; -- if (waiter->activated && event->type == KFD_EVENT_TYPE_MEMORY) { -- dst = &data[waiter->input_index].memory_exception_data; -- src = &event->memory_exception_data; -- if (copy_to_user(dst, src, -- sizeof(struct kfd_hsa_memory_exception_data))) -+ for (i = 0; i < num_events; i++) -+ if (event_waiters[i].activated && -+ event_waiters[i].event->type == KFD_EVENT_TYPE_MEMORY) -+ if (copy_to_user(&data[event_waiters[i].input_index].memory_exception_data, -+ &event_waiters[i].event->memory_exception_data, -+ sizeof(struct kfd_hsa_memory_exception_data))) - return false; -- } -- } - - return true; - -@@ -695,11 +790,9 @@ static long user_timeout_to_jiffies(uint32_t user_timeout_ms) - if (user_timeout_ms == KFD_EVENT_TIMEOUT_INFINITE) - return MAX_SCHEDULE_TIMEOUT; - -- /* -- * msecs_to_jiffies interprets all values above 2^31-1 as infinite, -+ /* msecs_to_jiffies interprets all values above 2^31-1 as infinite, - * but we consider them finite. -- * This hack is wrong, but nobody is likely to notice. -- */ -+ * This hack is wrong, but nobody is likely to notice. */ - user_timeout_ms = min_t(uint32_t, user_timeout_ms, 0x7FFFFFFF); - - return msecs_to_jiffies(user_timeout_ms) + 1; -@@ -724,11 +817,16 @@ int kfd_wait_on_events(struct kfd_process *p, - (struct kfd_event_data __user *) data; - uint32_t i; - int ret = 0; -+ - struct kfd_event_waiter *event_waiters = NULL; - long timeout = user_timeout_to_jiffies(user_timeout_ms); - - mutex_lock(&p->event_mutex); - -+ /* Set to something unreasonable - this is really -+ * just a bool for now. */ -+ *wait_result = KFD_WAIT_TIMEOUT; -+ - event_waiters = alloc_event_waiters(num_events); - if (!event_waiters) { - ret = -ENOMEM; -@@ -744,14 +842,34 @@ int kfd_wait_on_events(struct kfd_process *p, - goto fail; - } - -- ret = init_event_waiter(p, &event_waiters[i], -+ ret = init_event_waiter_get_status(p, &event_waiters[i], - event_data.event_id, i); - if (ret) - goto fail; - } - -+ /* Check condition once. */ -+ if (test_event_condition(all, num_events, event_waiters)) { -+ if (copy_signaled_event_data(num_events, -+ event_waiters, events)) -+ *wait_result = KFD_WAIT_COMPLETE; -+ else -+ *wait_result = KFD_WAIT_ERROR; -+ free_waiters(num_events, event_waiters); -+ } else { -+ /* Add to wait lists if we need to wait. */ -+ for (i = 0; i < num_events; i++) -+ init_event_waiter_add_to_waitlist(&event_waiters[i]); -+ } -+ - mutex_unlock(&p->event_mutex); - -+ /* Return if all waits were already satisfied. 
*/ -+ if (*wait_result != KFD_WAIT_TIMEOUT) { -+ __set_current_state(TASK_RUNNING); -+ return ret; -+ } -+ - while (true) { - if (fatal_signal_pending(current)) { - ret = -EINTR; -@@ -760,17 +878,17 @@ int kfd_wait_on_events(struct kfd_process *p, - - if (signal_pending(current)) { - /* -- * This is wrong when a nonzero, non-infinite timeout -- * is specified. We need to use -- * ERESTARTSYS_RESTARTBLOCK, but struct restart_block -- * contains a union with data for each user and it's -- * in generic kernel code that I don't want to -- * touch yet. -+ * This is wrong when a nonzero, non-infinite timeout is specified. -+ * We need to use ERESTARTSYS_RESTARTBLOCK, but struct restart_block -+ * contains a union with data for each user and it's in generic -+ * kernel code that I don't want to touch yet. - */ - ret = -ERESTARTSYS; - break; - } - -+ set_current_state(TASK_INTERRUPTIBLE); -+ - if (test_event_condition(all, num_events, event_waiters)) { - if (copy_signaled_event_data(num_events, - event_waiters, events)) -@@ -785,7 +903,7 @@ int kfd_wait_on_events(struct kfd_process *p, - break; - } - -- timeout = schedule_timeout_interruptible(timeout); -+ timeout = schedule_timeout(timeout); - } - __set_current_state(TASK_RUNNING); - -@@ -825,8 +943,7 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) - page = lookup_signal_page_by_index(p, page_index); - if (!page) { - /* Probably KFD bug, but mmap is user-accessible. */ -- pr_debug("signal page could not be found for page_index %u\n", -- page_index); -+ pr_debug("signal page could not be found for page_index %u\n", page_index); - return -EINVAL; - } - -@@ -858,23 +975,29 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) - static void lookup_events_by_type_and_signal(struct kfd_process *p, - int type, void *event_data) - { -- struct kfd_hsa_memory_exception_data *ev_data; - struct kfd_event *ev; - int bkt; - bool send_signal = true; - -- ev_data = (struct kfd_hsa_memory_exception_data *) event_data; -- -- hash_for_each(p->events, bkt, ev, events) -+ hash_for_each(p->events, bkt, ev, events) { - if (ev->type == type) { - send_signal = false; - dev_dbg(kfd_device, - "Event found: id %X type %d", - ev->event_id, ev->type); - set_event(ev); -- if (ev->type == KFD_EVENT_TYPE_MEMORY && ev_data) -- ev->memory_exception_data = *ev_data; -+ if (ev->type == KFD_EVENT_TYPE_MEMORY && event_data) -+ ev->memory_exception_data = -+ *(struct kfd_hsa_memory_exception_data *)event_data; - } -+ } -+ -+ if (type == KFD_EVENT_TYPE_MEMORY) { -+ dev_warn(kfd_device, -+ "Sending SIGSEGV to HSA Process with PID %d ", -+ p->lead_thread->pid); -+ send_sig(SIGSEGV, p->lead_thread, 0); -+ } - - /* Send SIGTERM no event of type "type" has been found*/ - if (send_signal) { -@@ -901,7 +1024,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, - /* - * Because we are called from arbitrary context (workqueue) as opposed - * to process context, kfd_process could attempt to exit while we are -- * running so the lookup function returns a locked process. -+ * running so the lookup function returns a read-locked process. 
- */ - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); - -@@ -916,24 +1039,24 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, - memory_exception_data.gpu_id = dev->id; - memory_exception_data.va = address; - /* Set failure reason */ -- memory_exception_data.failure.NotPresent = 1; -- memory_exception_data.failure.NoExecute = 0; -- memory_exception_data.failure.ReadOnly = 0; -+ memory_exception_data.failure.NotPresent = true; -+ memory_exception_data.failure.NoExecute = false; -+ memory_exception_data.failure.ReadOnly = false; - if (vma) { - if (vma->vm_start > address) { -- memory_exception_data.failure.NotPresent = 1; -- memory_exception_data.failure.NoExecute = 0; -- memory_exception_data.failure.ReadOnly = 0; -+ memory_exception_data.failure.NotPresent = true; -+ memory_exception_data.failure.NoExecute = false; -+ memory_exception_data.failure.ReadOnly = false; - } else { -- memory_exception_data.failure.NotPresent = 0; -+ memory_exception_data.failure.NotPresent = false; - if (is_write_requested && !(vma->vm_flags & VM_WRITE)) -- memory_exception_data.failure.ReadOnly = 1; -+ memory_exception_data.failure.ReadOnly = true; - else -- memory_exception_data.failure.ReadOnly = 0; -+ memory_exception_data.failure.ReadOnly = false; - if (is_execute_requested && !(vma->vm_flags & VM_EXEC)) -- memory_exception_data.failure.NoExecute = 1; -+ memory_exception_data.failure.NoExecute = true; - else -- memory_exception_data.failure.NoExecute = 0; -+ memory_exception_data.failure.NoExecute = false; - } - } - -@@ -946,7 +1069,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, - &memory_exception_data); - - mutex_unlock(&p->event_mutex); -- mutex_unlock(&p->mutex); -+ up_read(&p->lock); - } - - void kfd_signal_hw_exception_event(unsigned int pasid) -@@ -954,7 +1077,7 @@ void kfd_signal_hw_exception_event(unsigned int pasid) - /* - * Because we are called from arbitrary context (workqueue) as opposed - * to process context, kfd_process could attempt to exit while we are -- * running so the lookup function returns a locked process. -+ * running so the lookup function returns a read-locked process. - */ - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); - -@@ -967,5 +1090,42 @@ void kfd_signal_hw_exception_event(unsigned int pasid) - lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_HW_EXCEPTION, NULL); - - mutex_unlock(&p->event_mutex); -- mutex_unlock(&p->mutex); -+ up_read(&p->lock); -+} -+ -+void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, -+ struct kfd_vm_fault_info *info) -+{ -+ struct kfd_event *ev; -+ int bkt; -+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); -+ struct kfd_hsa_memory_exception_data memory_exception_data; -+ -+ if (!p) -+ return; /* Presumably process exited. */ -+ memset(&memory_exception_data, 0, sizeof(memory_exception_data)); -+ memory_exception_data.gpu_id = dev->id; -+ /* Set failure reason */ -+ if (info) { -+ memory_exception_data.va = (info->page_addr) << PAGE_SHIFT; -+ memory_exception_data.failure.NotPresent = -+ info->prot_valid ? true : false; -+ memory_exception_data.failure.NoExecute = -+ info->prot_exec ? true : false; -+ memory_exception_data.failure.ReadOnly = -+ info->prot_write ? 
true : false; -+ } -+ mutex_lock(&p->event_mutex); -+ -+ hash_for_each(p->events, bkt, ev, events) { -+ if (ev->type == KFD_EVENT_TYPE_MEMORY) { -+ ev->memory_exception_data = memory_exception_data; -+ set_event(ev); -+ } -+ } -+ -+ mutex_unlock(&p->event_mutex); -+ up_read(&p->lock); -+ - } -+ -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_events.h -index 28f6838..d7987eb 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_events.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.h -@@ -34,8 +34,7 @@ - #define KFD_FIRST_NONSIGNAL_EVENT_ID KFD_EVENT_ID_NONSIGNAL_MASK - #define KFD_LAST_NONSIGNAL_EVENT_ID UINT_MAX - --/* -- * Written into kfd_signal_slot_t to indicate that the event is not signaled. -+/* Written into kfd_signal_slot_t to indicate that the event is not signaled. - * Since the event protocol may need to write the event ID into memory, this - * must not be a valid event ID. - * For the sake of easy memset-ing, this must be a byte pattern. -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c -index 2b65510..587f847 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c -@@ -33,7 +33,7 @@ - #include <linux/time.h> - #include "kfd_priv.h" - #include <linux/mm.h> --#include <linux/mman.h> -+#include <uapi/asm-generic/mman-common.h> - #include <asm/processor.h> - - /* -@@ -278,21 +278,36 @@ - #define MAKE_GPUVM_APP_BASE(gpu_num) \ - (((uint64_t)(gpu_num) << 61) + 0x1000000000000L) - --#define MAKE_GPUVM_APP_LIMIT(base) \ -- (((uint64_t)(base) & \ -- 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL) -+#define MAKE_GPUVM_APP_LIMIT(base, size) \ -+ (((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1) - --#define MAKE_SCRATCH_APP_BASE(gpu_num) \ -- (((uint64_t)(gpu_num) << 61) + 0x100000000L) -+#define MAKE_SCRATCH_APP_BASE() \ -+ (((uint64_t)(0x1UL) << 61) + 0x100000000L) - - #define MAKE_SCRATCH_APP_LIMIT(base) \ - (((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) - --#define MAKE_LDS_APP_BASE(gpu_num) \ -- (((uint64_t)(gpu_num) << 61) + 0x0) -+#define MAKE_LDS_APP_BASE() \ -+ (((uint64_t)(0x1UL) << 61) + 0x0) -+ - #define MAKE_LDS_APP_LIMIT(base) \ - (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) - -+ -+#define DGPU_VM_BASE_DEFAULT 0x100000 -+ -+int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd, -+ uint64_t base, uint64_t limit) -+{ -+ if (base < (pdd->qpd.cwsr_base + pdd->dev->cwsr_size)) { -+ pr_err("Set dgpu vm base 0x%llx failed.\n", base); -+ return -EINVAL; -+ } -+ pdd->dgpu_base = base; -+ pdd->dgpu_limit = limit; -+ return 0; -+} -+ - int kfd_init_apertures(struct kfd_process *process) - { - uint8_t id = 0; -@@ -300,13 +315,16 @@ int kfd_init_apertures(struct kfd_process *process) - struct kfd_process_device *pdd; - - /*Iterating over all devices*/ -- while ((dev = kfd_topology_enum_kfd_devices(id)) != NULL && -- id < NUM_OF_SUPPORTED_GPUS) { -+ while (kfd_topology_enum_kfd_devices(id, &dev) == 0) { -+ if (!dev) { -+ id++; /* Skip non GPU devices */ -+ continue; -+ } - - pdd = kfd_create_process_device_data(dev, process); - if (pdd == NULL) { - pr_err("Failed to create process device data\n"); -- return -1; -+ goto err; - } - /* - * For 64 bit process aperture will be statically reserved in -@@ -322,19 +340,24 @@ int kfd_init_apertures(struct kfd_process *process) - * node id couldn't be 0 - the three MSB bits of - * aperture shoudn't be 0 - */ -- pdd->lds_base = MAKE_LDS_APP_BASE(id + 1); -+ pdd->lds_base = 
MAKE_LDS_APP_BASE(); - - pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); - - pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1); - -- pdd->gpuvm_limit = -- MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base); -+ pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT( -+ pdd->gpuvm_base, -+ dev->shared_resources.gpuvm_size); - -- pdd->scratch_base = MAKE_SCRATCH_APP_BASE(id + 1); -+ pdd->scratch_base = MAKE_SCRATCH_APP_BASE(); - - pdd->scratch_limit = - MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); -+ -+ if (KFD_IS_DGPU(dev->device_info->asic_family)) -+ pdd->qpd.cwsr_base = DGPU_VM_BASE_DEFAULT; -+ - } - - dev_dbg(kfd_device, "node id %u\n", id); -@@ -350,6 +373,32 @@ int kfd_init_apertures(struct kfd_process *process) - } - - return 0; -+ -+err: -+ return -1; - } - -+void radeon_flush_tlb(struct kfd_dev *dev, uint32_t pasid) -+{ -+ uint8_t vmid; -+ int first_vmid_to_scan = 8; -+ int last_vmid_to_scan = 15; - -+ const struct kfd2kgd_calls *f2g = dev->kfd2kgd; -+ /* Scan all registers in the range ATC_VMID8_PASID_MAPPING .. ATC_VMID15_PASID_MAPPING -+ * to check which VMID the current process is mapped to -+ * and flush TLB for this VMID if found*/ -+ for (vmid = first_vmid_to_scan; vmid <= last_vmid_to_scan; vmid++) { -+ if (f2g->get_atc_vmid_pasid_mapping_valid( -+ dev->kgd, vmid)) { -+ if (f2g->get_atc_vmid_pasid_mapping_pasid( -+ dev->kgd, vmid) == pasid) { -+ dev_dbg(kfd_device, -+ "TLB of vmid %u", vmid); -+ f2g->write_vmid_invalidate_request( -+ dev->kgd, vmid); -+ break; -+ } -+ } -+ } -+} -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c -index 7f134aa..a8cdbc8 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c -@@ -172,8 +172,7 @@ static void interrupt_wq(struct work_struct *work) - sizeof(uint32_t))]; - - while (dequeue_ih_ring_entry(dev, ih_ring_entry)) -- dev->device_info->event_interrupt_class->interrupt_wq(dev, -- ih_ring_entry); -+ dev->device_info->event_interrupt_class->interrupt_wq(dev, ih_ring_entry); - } - - bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry) -@@ -181,8 +180,7 @@ bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry) - /* integer and bitwise OR so there is no boolean short-circuiting */ - unsigned wanted = 0; - -- wanted |= dev->device_info->event_interrupt_class->interrupt_isr(dev, -- ih_ring_entry); -+ wanted |= dev->device_info->event_interrupt_class->interrupt_isr(dev, ih_ring_entry); - - return wanted != 0; - } -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c -index d135cd0..513cfe6 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c -@@ -143,7 +143,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, - kq->queue->pipe = KFD_CIK_HIQ_PIPE; - kq->queue->queue = KFD_CIK_HIQ_QUEUE; - kq->mqd->load_mqd(kq->mqd, kq->queue->mqd, kq->queue->pipe, -- kq->queue->queue, NULL); -+ kq->queue->queue, NULL, 0); - } else { - /* allocate fence for DIQ */ - -@@ -213,20 +213,23 @@ static int acquire_packet_buffer(struct kernel_queue *kq, - - BUG_ON(!kq || !buffer_ptr); - -+ /* When rptr == wptr, the buffer is empty. -+ * When rptr == wptr + 1, the buffer is full. -+ * It is always rptr that advances to the position of wptr, rather than -+ * the opposite. So we can only use up to queue_size_dwords - 1 dwords. 
-+ */ - rptr = *kq->rptr_kernel; - wptr = *kq->wptr_kernel; - queue_address = (unsigned int *)kq->pq_kernel_addr; - queue_size_dwords = kq->queue->properties.queue_size / sizeof(uint32_t); - -- pr_debug("rptr: %d\n", rptr); -- pr_debug("wptr: %d\n", wptr); -- pr_debug("queue_address 0x%p\n", queue_address); -+ pr_debug("amdkfd: In func %s\n rptr: %d\n wptr: %d\n queue_address 0x%p\n", -+ __func__, rptr, wptr, queue_address); - -- available_size = (rptr - 1 - wptr + queue_size_dwords) % -+ available_size = (rptr + queue_size_dwords - 1 - wptr) % - queue_size_dwords; - -- if (packet_size_in_dwords >= queue_size_dwords || -- packet_size_in_dwords >= available_size) { -+ if (packet_size_in_dwords > available_size) { - /* - * make sure calling functions know - * acquire_packet_buffer() failed -@@ -236,6 +239,13 @@ static int acquire_packet_buffer(struct kernel_queue *kq, - } - - if (wptr + packet_size_in_dwords >= queue_size_dwords) { -+ /* make sure after rolling back to position 0, there is -+ * still enough space. */ -+ if (packet_size_in_dwords >= rptr) { -+ *buffer_ptr = NULL; -+ return -ENOMEM; -+ } -+ /* fill nops, roll back and start at position 0 */ - while (wptr > 0) { - queue_address[wptr] = kq->nop_packet; - wptr = (wptr + 1) % queue_size_dwords; -@@ -295,6 +305,8 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, - - switch (dev->device_info->asic_family) { - case CHIP_CARRIZO: -+ case CHIP_TONGA: -+ case CHIP_FIJI: - kernel_queue_init_vi(&kq->ops_asic_specific); - break; - -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c -index 850a562..e9b886d 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c -@@ -29,10 +29,11 @@ - #define KFD_DRIVER_AUTHOR "AMD Inc. 
and others" - - #define KFD_DRIVER_DESC "Standalone HSA driver for AMD's GPUs" --#define KFD_DRIVER_DATE "20150421" --#define KFD_DRIVER_MAJOR 0 --#define KFD_DRIVER_MINOR 7 --#define KFD_DRIVER_PATCHLEVEL 2 -+#define KFD_DRIVER_DATE "20160129" -+#define KFD_DRIVER_MAJOR 1 -+#define KFD_DRIVER_MINOR 8 -+#define KFD_DRIVER_PATCHLEVEL 1 -+#define KFD_DRIVER_RC_LEVEL "" - - static const struct kgd2kfd_calls kgd2kfd = { - .exit = kgd2kfd_exit, -@@ -42,6 +43,10 @@ static const struct kgd2kfd_calls kgd2kfd = { - .interrupt = kgd2kfd_interrupt, - .suspend = kgd2kfd_suspend, - .resume = kgd2kfd_resume, -+ .evict_bo = kgd2kfd_evict_bo, -+ .restore = kgd2kfd_restore, -+ .quiesce_mm = kgd2kfd_quiesce_mm, -+ .resume_mm = kgd2kfd_resume_mm, - }; - - int sched_policy = KFD_SCHED_POLICY_HWS; -@@ -49,6 +54,15 @@ module_param(sched_policy, int, 0444); - MODULE_PARM_DESC(sched_policy, - "Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)"); - -+int hws_max_conc_proc = 0; -+module_param(hws_max_conc_proc, int, 0444); -+MODULE_PARM_DESC(hws_max_conc_proc, -+ "Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency (Default), #VMIDs for KFD = Maximum)"); -+ -+int cwsr_enable = 1; -+module_param(cwsr_enable, int, 0444); -+MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))"); -+ - int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT; - module_param(max_num_of_queues_per_device, int, 0444); - MODULE_PARM_DESC(max_num_of_queues_per_device, -@@ -61,6 +75,11 @@ MODULE_PARM_DESC(send_sigterm, - - static int amdkfd_init_completed; - -+int debug_largebar = 0; -+module_param(debug_largebar, int, 0444); -+MODULE_PARM_DESC(debug_largebar, -+ "Debug large-bar flag used to simulate large-bar capability on non-large bar machine (0 = disable, 1 = enable)"); -+ - int kgd2kfd_init(unsigned interface_version, const struct kgd2kfd_calls **g2f) - { - if (!amdkfd_init_completed) -@@ -149,4 +168,5 @@ MODULE_DESCRIPTION(KFD_DRIVER_DESC); - MODULE_LICENSE("GPL and additional rights"); - MODULE_VERSION(__stringify(KFD_DRIVER_MAJOR) "." - __stringify(KFD_DRIVER_MINOR) "." 
-- __stringify(KFD_DRIVER_PATCHLEVEL)); -+ __stringify(KFD_DRIVER_PATCHLEVEL) -+ KFD_DRIVER_RC_LEVEL); -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c -index b1ef136..ef1dc9b 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c -@@ -31,6 +31,9 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, - return mqd_manager_init_cik(type, dev); - case CHIP_CARRIZO: - return mqd_manager_init_vi(type, dev); -+ case CHIP_TONGA: -+ case CHIP_FIJI: -+ return mqd_manager_init_vi_tonga(type, dev); - } - - return NULL; -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h -index 213a71e..eb60192 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h -@@ -67,7 +67,8 @@ struct mqd_manager { - - int (*load_mqd)(struct mqd_manager *mm, void *mqd, - uint32_t pipe_id, uint32_t queue_id, -- uint32_t __user *wptr); -+ uint32_t __user *wptr, -+ uint32_t page_table_base); - - int (*update_mqd)(struct mqd_manager *mm, void *mqd, - struct queue_properties *q); -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c -index 6acc431..62dbdca 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c -@@ -31,11 +31,71 @@ - #include "cik_structs.h" - #include "oss/oss_2_4_sh_mask.h" - -+#define AQL_ENABLE 1 -+ - static inline struct cik_mqd *get_mqd(void *mqd) - { - return (struct cik_mqd *)mqd; - } - -+static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) -+{ -+ return (struct cik_sdma_rlc_registers *)mqd; -+} -+ -+static void update_cu_mask(struct mqd_manager *mm, void *mqd, -+ struct queue_properties *q) -+{ -+ struct cik_mqd *m; -+ struct kfd_cu_info cu_info; -+ uint32_t mgmt_se_mask; -+ uint32_t cu_sh_mask, cu_sh_shift; -+ uint32_t cu_mask; -+ int se, sh; -+ -+ if (q->cu_mask == 0) -+ return; -+ -+ m = get_mqd(mqd); -+ m->compute_static_thread_mgmt_se0 = 0; -+ m->compute_static_thread_mgmt_se1 = 0; -+ m->compute_static_thread_mgmt_se2 = 0; -+ m->compute_static_thread_mgmt_se3 = 0; -+ -+ mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); -+ cu_mask = q->cu_mask; -+ for (se = 0; se < cu_info.num_shader_engines && cu_mask; se++) { -+ mgmt_se_mask = 0; -+ for (sh = 0; sh < 2 && cu_mask; sh++) { -+ cu_sh_shift = hweight32(cu_info.cu_bitmap[se][sh]); -+ cu_sh_mask = (1 << cu_sh_shift) - 1; -+ mgmt_se_mask |= (cu_mask & cu_sh_mask) << (sh * 16); -+ cu_mask >>= cu_sh_shift; -+ } -+ switch (se) { -+ case 0: -+ m->compute_static_thread_mgmt_se0 = mgmt_se_mask; -+ break; -+ case 1: -+ m->compute_static_thread_mgmt_se1 = mgmt_se_mask; -+ break; -+ case 2: -+ m->compute_static_thread_mgmt_se2 = mgmt_se_mask; -+ break; -+ case 3: -+ m->compute_static_thread_mgmt_se3 = mgmt_se_mask; -+ break; -+ default: -+ break; -+ } -+ } -+ pr_debug("kfd: update cu mask to %#x %#x %#x %#x\n", -+ m->compute_static_thread_mgmt_se0, -+ m->compute_static_thread_mgmt_se1, -+ m->compute_static_thread_mgmt_se2, -+ m->compute_static_thread_mgmt_se3); -+} -+ - static int init_mqd(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, - struct queue_properties *q) -@@ -152,15 +212,16 @@ static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, - } - - static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, -- uint32_t queue_id, 
uint32_t __user *wptr) -+ uint32_t queue_id, uint32_t __user *wptr, -+ uint32_t page_table_base) - { - return mm->dev->kfd2kgd->hqd_load -- (mm->dev->kgd, mqd, pipe_id, queue_id, wptr); -+ (mm->dev->kgd, mqd, pipe_id, queue_id, wptr, page_table_base); - } - - static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, - uint32_t pipe_id, uint32_t queue_id, -- uint32_t __user *wptr) -+ uint32_t __user *wptr, uint32_t page_table_base) - { - return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd); - } -@@ -197,11 +258,14 @@ static int update_mqd(struct mqd_manager *mm, void *mqd, - m->cp_hqd_pq_control |= NO_UPDATE_RPTR; - } - -+ update_cu_mask(mm, mqd, q); -+ - m->cp_hqd_active = 0; - q->is_active = false; - if (q->queue_size > 0 && - q->queue_address != 0 && -- q->queue_percent > 0) { -+ q->queue_percent > 0 && -+ !q->is_evicted) { - m->cp_hqd_active = 1; - q->is_active = true; - } -@@ -217,8 +281,8 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, - BUG_ON(!mm || !mqd || !q); - - m = get_sdma_mqd(mqd); -- m->sdma_rlc_rb_cntl = ffs(q->queue_size / sizeof(unsigned int)) << -- SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | -+ m->sdma_rlc_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) -+ << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | - q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | - 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | - 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; -@@ -239,7 +303,8 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, - q->is_active = false; - if (q->queue_size > 0 && - q->queue_address != 0 && -- q->queue_percent > 0) { -+ q->queue_percent > 0 && -+ !q->is_evicted) { - m->sdma_rlc_rb_cntl |= - 1 << SDMA0_RLC0_RB_CNTL__RB_ENABLE__SHIFT; - -@@ -388,7 +453,8 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, - q->is_active = false; - if (q->queue_size > 0 && - q->queue_address != 0 && -- q->queue_percent > 0) { -+ q->queue_percent > 0 && -+ !q->is_evicted) { - m->cp_hqd_active = 1; - q->is_active = true; - } -@@ -396,16 +462,6 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, - return 0; - } - --struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) --{ -- struct cik_sdma_rlc_registers *m; -- -- BUG_ON(!mqd); -- -- m = (struct cik_sdma_rlc_registers *)mqd; -- -- return m; --} - - struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, - struct kfd_dev *dev) -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c -index a9b9882..4260c2f 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c -@@ -28,9 +28,9 @@ - #include "kfd_priv.h" - #include "kfd_mqd_manager.h" - #include "vi_structs.h" --#include "gca/gfx_8_0_sh_mask.h" --#include "gca/gfx_8_0_enum.h" -- -+#include "asic_reg/gca/gfx_8_0_sh_mask.h" -+#include "asic_reg/gca/gfx_8_0_enum.h" -+#include "oss/oss_3_0_sh_mask.h" - #define CP_MQD_CONTROL__PRIV_STATE__SHIFT 0x8 - - static inline struct vi_mqd *get_mqd(void *mqd) -@@ -38,6 +38,64 @@ static inline struct vi_mqd *get_mqd(void *mqd) - return (struct vi_mqd *)mqd; - } - -+static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) -+{ -+ return (struct vi_sdma_mqd *)mqd; -+} -+ -+static void update_cu_mask(struct mqd_manager *mm, void *mqd, -+ struct queue_properties *q) -+{ -+ struct vi_mqd *m; -+ struct kfd_cu_info cu_info; -+ uint32_t mgmt_se_mask; -+ uint32_t cu_sh_mask, cu_sh_shift; -+ uint32_t cu_mask; -+ int se, sh; -+ -+ if (q->cu_mask == 0) -+ return; -+ -+ m = 
get_mqd(mqd); -+ m->compute_static_thread_mgmt_se0 = 0; -+ m->compute_static_thread_mgmt_se1 = 0; -+ m->compute_static_thread_mgmt_se2 = 0; -+ m->compute_static_thread_mgmt_se3 = 0; -+ -+ mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); -+ cu_mask = q->cu_mask; -+ for (se = 0; se < cu_info.num_shader_engines && cu_mask; se++) { -+ mgmt_se_mask = 0; -+ for (sh = 0; sh < 2 && cu_mask; sh++) { -+ cu_sh_shift = hweight32(cu_info.cu_bitmap[se][sh]); -+ cu_sh_mask = (1 << cu_sh_shift) - 1; -+ mgmt_se_mask |= (cu_mask & cu_sh_mask) << (sh * 16); -+ cu_mask >>= cu_sh_shift; -+ } -+ switch (se) { -+ case 0: -+ m->compute_static_thread_mgmt_se0 = mgmt_se_mask; -+ break; -+ case 1: -+ m->compute_static_thread_mgmt_se1 = mgmt_se_mask; -+ break; -+ case 2: -+ m->compute_static_thread_mgmt_se2 = mgmt_se_mask; -+ break; -+ case 3: -+ m->compute_static_thread_mgmt_se3 = mgmt_se_mask; -+ break; -+ default: -+ break; -+ } -+ } -+ pr_debug("kfd: update cu mask to %#x %#x %#x %#x\n", -+ m->compute_static_thread_mgmt_se0, -+ m->compute_static_thread_mgmt_se1, -+ m->compute_static_thread_mgmt_se2, -+ m->compute_static_thread_mgmt_se3); -+} -+ - static int init_mqd(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, - struct queue_properties *q) -@@ -84,6 +142,25 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, - if (q->format == KFD_QUEUE_FORMAT_AQL) - m->cp_hqd_iq_rptr = 1; - -+ if (q->tba_addr) { -+ m->cp_hqd_persistent_state |= -+ (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); -+ m->compute_pgm_rsrc2 |= -+ (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); -+ m->cp_hqd_ctx_save_base_addr_lo = -+ lower_32_bits(q->ctx_save_restore_area_address); -+ m->cp_hqd_ctx_save_base_addr_hi = -+ upper_32_bits(q->ctx_save_restore_area_address); -+ m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; -+ m->cp_hqd_cntl_stack_size = q->ctl_stack_size; -+ m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; -+ m->cp_hqd_wg_state_offset = q->ctl_stack_size; -+ m->compute_tba_lo = lower_32_bits(q->tba_addr >> 8); -+ m->compute_tba_hi = upper_32_bits(q->tba_addr >> 8); -+ m->compute_tma_lo = lower_32_bits(q->tma_addr >> 8); -+ m->compute_tma_hi = upper_32_bits(q->tma_addr >> 8); -+ } -+ - *mqd = m; - if (gart_addr != NULL) - *gart_addr = addr; -@@ -94,10 +171,10 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, - - static int load_mqd(struct mqd_manager *mm, void *mqd, - uint32_t pipe_id, uint32_t queue_id, -- uint32_t __user *wptr) -+ uint32_t __user *wptr, uint32_t page_table_base) - { - return mm->dev->kfd2kgd->hqd_load -- (mm->dev->kgd, mqd, pipe_id, queue_id, wptr); -+ (mm->dev->kgd, mqd, pipe_id, queue_id, wptr, page_table_base); - } - - static int __update_mqd(struct mqd_manager *mm, void *mqd, -@@ -155,12 +232,19 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, - m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | - 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT; - } -+ if (q->tba_addr) -+ m->cp_hqd_ctx_save_control = -+ atc_bit << CP_HQD_CTX_SAVE_CONTROL__ATC__SHIFT | -+ mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT; -+ -+ update_cu_mask(mm, mqd, q); - - m->cp_hqd_active = 0; - q->is_active = false; - if (q->queue_size > 0 && - q->queue_address != 0 && -- q->queue_percent > 0) { -+ q->queue_percent > 0 && -+ !q->is_evicted) { - m->cp_hqd_active = 1; - q->is_active = true; - } -@@ -175,6 +259,12 @@ static int update_mqd(struct mqd_manager *mm, void *mqd, - return __update_mqd(mm, mqd, q, MTYPE_CC, 1); - } - -+static int 
update_mqd_tonga(struct mqd_manager *mm, void *mqd, -+ struct queue_properties *q) -+{ -+ return __update_mqd(mm, mqd, q, MTYPE_UC, 0); -+} -+ - static int destroy_mqd(struct mqd_manager *mm, void *mqd, - enum kfd_preempt_type type, - unsigned int timeout, uint32_t pipe_id, -@@ -233,6 +323,111 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, - return retval; - } - -+static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, -+ struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, -+ struct queue_properties *q) -+{ -+ int retval; -+ struct vi_sdma_mqd *m; -+ -+ -+ BUG_ON(!mm || !mqd || !mqd_mem_obj); -+ -+ retval = kfd_gtt_sa_allocate(mm->dev, -+ sizeof(struct vi_sdma_mqd), -+ mqd_mem_obj); -+ -+ if (retval != 0) -+ return -ENOMEM; -+ -+ m = (struct vi_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; -+ -+ memset(m, 0, sizeof(struct vi_sdma_mqd)); -+ -+ *mqd = m; -+ if (gart_addr != NULL) -+ *gart_addr = (*mqd_mem_obj)->gpu_addr; -+ -+ retval = mm->update_mqd(mm, m, q); -+ -+ return retval; -+} -+ -+static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, -+ struct kfd_mem_obj *mqd_mem_obj) -+{ -+ BUG_ON(!mm || !mqd); -+ kfd_gtt_sa_free(mm->dev, mqd_mem_obj); -+} -+ -+static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, -+ uint32_t pipe_id, uint32_t queue_id, -+ uint32_t __user *wptr, uint32_t page_table_base) -+{ -+ return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd); -+} -+ -+static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, -+ struct queue_properties *q) -+{ -+ struct vi_sdma_mqd *m; -+ BUG_ON(!mm || !mqd || !q); -+ -+ m = get_sdma_mqd(mqd); -+ m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) -+ << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | -+ q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | -+ 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | -+ 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; -+ -+ m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); -+ m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); -+ m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); -+ m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); -+ m->sdmax_rlcx_doorbell = q->doorbell_off << -+ SDMA0_RLC0_DOORBELL__OFFSET__SHIFT | -+ 1 << SDMA0_RLC0_DOORBELL__ENABLE__SHIFT; -+ -+ m->sdmax_rlcx_virtual_addr = q->sdma_vm_addr; -+ -+ m->sdma_engine_id = q->sdma_engine_id; -+ m->sdma_queue_id = q->sdma_queue_id; -+ -+ q->is_active = false; -+ if (q->queue_size > 0 && -+ q->queue_address != 0 && -+ q->queue_percent > 0 && -+ !q->is_evicted) { -+ m->sdmax_rlcx_rb_cntl |= -+ 1 << SDMA0_RLC0_RB_CNTL__RB_ENABLE__SHIFT; -+ -+ q->is_active = true; -+ } -+ -+ return 0; -+} -+ -+/* -+ * * preempt type here is ignored because there is only one way -+ * * to preempt sdma queue -+ */ -+static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, -+ enum kfd_preempt_type type, -+ unsigned int timeout, uint32_t pipe_id, -+ uint32_t queue_id) -+{ -+ return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); -+} -+ -+static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, -+ uint64_t queue_address, uint32_t pipe_id, -+ uint32_t queue_id) -+{ -+ return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); -+} -+ -+ -+ - struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, - struct kfd_dev *dev) - { -@@ -268,6 +463,12 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, - mqd->is_occupied = is_occupied; - break; - case KFD_MQD_TYPE_SDMA: -+ mqd->init_mqd = 
init_mqd_sdma; -+ mqd->uninit_mqd = uninit_mqd_sdma; -+ mqd->load_mqd = load_mqd_sdma; -+ mqd->update_mqd = update_mqd_sdma; -+ mqd->destroy_mqd = destroy_mqd_sdma; -+ mqd->is_occupied = is_occupied_sdma; - break; - default: - kfree(mqd); -@@ -276,3 +477,17 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, - - return mqd; - } -+ -+struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, -+ struct kfd_dev *dev) -+{ -+ struct mqd_manager *mqd; -+ -+ mqd = mqd_manager_init_vi(type, dev); -+ if (!mqd) -+ return NULL; -+ if ((type == KFD_MQD_TYPE_CP) || (type == KFD_MQD_TYPE_COMPUTE)) -+ mqd->update_mqd = update_mqd_tonga; -+ return mqd; -+} -+ -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c -index 7e92921..55f7098 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c -@@ -57,25 +57,37 @@ static void pm_calc_rlib_size(struct packet_manager *pm, - { - unsigned int process_count, queue_count; - unsigned int map_queue_size; -+ unsigned int max_proc_per_quantum = 1; - -- BUG_ON(!pm || !rlib_size || !over_subscription); -+ struct kfd_dev *dev = pm->dqm->dev; -+ -+ BUG_ON(!pm || !rlib_size || !over_subscription || !dev); - - process_count = pm->dqm->processes_count; - queue_count = pm->dqm->queue_count; - -- /* check if there is over subscription*/ -+ /* check if there is over subscription -+ * Note: the arbitration between the number of VMIDs and -+ * hws_max_conc_proc has been done in -+ * kgd2kfd_device_init(). -+ */ -+ - *over_subscription = false; -- if ((process_count > 1) || queue_count > get_queues_num(pm->dqm)) { -+ -+ if (dev->max_proc_per_quantum > 1) -+ max_proc_per_quantum = dev->max_proc_per_quantum; -+ -+ if ((process_count > max_proc_per_quantum) || -+ queue_count > get_queues_num(pm->dqm)) { - *over_subscription = true; - pr_debug("kfd: over subscribed runlist\n"); - } - -- map_queue_size = -- (pm->dqm->dev->device_info->asic_family == CHIP_CARRIZO) ? -+ map_queue_size = KFD_IS_VI(pm->dqm->dev->device_info->asic_family) ? 
- sizeof(struct pm4_mes_map_queues) : - sizeof(struct pm4_map_queues); - /* calculate run list ib allocation size */ -- *rlib_size = process_count * sizeof(struct pm4_map_process) + -+ *rlib_size = process_count * pm->pmf->get_map_process_packet_size() + - queue_count * map_queue_size; - - /* -@@ -102,11 +114,14 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, - - pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription); - -+ mutex_lock(&pm->lock); -+ - retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size, - &pm->ib_buffer_obj); - - if (retval != 0) { - pr_err("kfd: failed to allocate runlist IB\n"); -+ mutex_unlock(&pm->lock); - return retval; - } - -@@ -115,6 +130,8 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, - - memset(*rl_buffer, 0, *rl_buffer_size); - pm->allocated = true; -+ -+ mutex_unlock(&pm->lock); - return retval; - } - -@@ -122,9 +139,24 @@ static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer, - uint64_t ib, size_t ib_size_in_dwords, bool chain) - { - struct pm4_runlist *packet; -+ int concurrent_proc_cnt = 0; -+ struct kfd_dev *kfd = pm->dqm->dev; - - BUG_ON(!pm || !buffer || !ib); - -+ /* Determine the number of processes to map together to HW: -+ * it can not exceed the number of VMIDs available to the -+ * scheduler, and it is determined by the smaller of the number -+ * of processes in the runlist and kfd module parameter -+ * hws_max_conc_proc. -+ * Note: the arbitration between the number of VMIDs and -+ * hws_max_conc_proc has been done in -+ * kgd2kfd_device_init(). -+ */ -+ concurrent_proc_cnt = min(pm->dqm->processes_count, -+ kfd->max_proc_per_quantum); -+ -+ - packet = (struct pm4_runlist *)buffer; - - memset(buffer, 0, sizeof(struct pm4_runlist)); -@@ -135,6 +167,7 @@ static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer, - packet->bitfields4.chain = chain ? 1 : 0; - packet->bitfields4.offload_polling = 0; - packet->bitfields4.valid = 1; -+ packet->bitfields4.process_cnt = concurrent_proc_cnt; - packet->ordinal2 = lower_32_bits(ib); - packet->bitfields3.ib_base_hi = upper_32_bits(ib); - -@@ -181,6 +214,90 @@ static int pm_create_map_process(struct packet_manager *pm, uint32_t *buffer, - return 0; - } - -+static int pm_create_map_process_scratch_kv(struct packet_manager *pm, -+ uint32_t *buffer, struct qcm_process_device *qpd) -+{ -+ struct pm4_map_process_scratch_kv *packet; -+ struct queue *cur; -+ uint32_t num_queues; -+ -+ BUG_ON(!pm || !buffer || !qpd); -+ -+ packet = (struct pm4_map_process_scratch_kv *)buffer; -+ -+ pr_debug("kfd: In func %s\n", __func__); -+ -+ memset(buffer, 0, sizeof(struct pm4_map_process_scratch_kv)); -+ -+ packet->header.u32all = build_pm4_header(IT_MAP_PROCESS, -+ sizeof(struct pm4_map_process_scratch_kv)); -+ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; -+ packet->bitfields2.process_quantum = 1; -+ packet->bitfields2.pasid = qpd->pqm->process->pasid; -+ packet->bitfields3.page_table_base = qpd->page_table_base; -+ packet->bitfields14.gds_size = qpd->gds_size; -+ packet->bitfields14.num_gws = qpd->num_gws; -+ packet->bitfields14.num_oac = qpd->num_oac; -+ num_queues = 0; -+ list_for_each_entry(cur, &qpd->queues_list, list) -+ num_queues++; -+ packet->bitfields14.num_queues = (qpd->is_debug) ? 
0 : num_queues; -+ -+ packet->sh_mem_config = qpd->sh_mem_config; -+ packet->sh_mem_bases = qpd->sh_mem_bases; -+ packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; -+ packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; -+ -+ packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; -+ -+ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); -+ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); -+ -+ return 0; -+} -+ -+static int pm_create_map_process_scratch(struct packet_manager *pm, -+ uint32_t *buffer, struct qcm_process_device *qpd) -+{ -+ struct pm4_map_process_scratch *packet; -+ struct queue *cur; -+ uint32_t num_queues; -+ -+ BUG_ON(!pm || !buffer || !qpd); -+ -+ packet = (struct pm4_map_process_scratch *)buffer; -+ -+ pr_debug("kfd: In func %s\n", __func__); -+ -+ memset(buffer, 0, sizeof(struct pm4_map_process_scratch)); -+ -+ packet->header.u32all = build_pm4_header(IT_MAP_PROCESS, -+ sizeof(struct pm4_map_process_scratch)); -+ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; -+ packet->bitfields2.process_quantum = 1; -+ packet->bitfields2.pasid = qpd->pqm->process->pasid; -+ packet->bitfields3.page_table_base = qpd->page_table_base; -+ packet->bitfields10.gds_size = qpd->gds_size; -+ packet->bitfields10.num_gws = qpd->num_gws; -+ packet->bitfields10.num_oac = qpd->num_oac; -+ num_queues = 0; -+ list_for_each_entry(cur, &qpd->queues_list, list) -+ num_queues++; -+ packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : num_queues; -+ -+ packet->sh_mem_config = qpd->sh_mem_config; -+ packet->sh_mem_bases = qpd->sh_mem_bases; -+ packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; -+ packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; -+ -+ packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; -+ -+ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); -+ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); -+ -+ return 0; -+} -+ - static int pm_create_map_queue_vi(struct packet_manager *pm, uint32_t *buffer, - struct queue *q, bool is_static) - { -@@ -218,7 +335,7 @@ static int pm_create_map_queue_vi(struct packet_manager *pm, uint32_t *buffer, - queue_type__mes_map_queues__debug_interface_queue_vi; - break; - case KFD_QUEUE_TYPE_SDMA: -- packet->bitfields2.engine_sel = -+ packet->bitfields2.engine_sel = q->properties.sdma_engine_id + - engine_sel__mes_map_queues__sdma0_vi; - use_static = false; /* no static queues under SDMA */ - break; -@@ -278,7 +395,7 @@ static int pm_create_map_queue(struct packet_manager *pm, uint32_t *buffer, - engine_sel__mes_map_queues__compute; - break; - case KFD_QUEUE_TYPE_SDMA: -- packet->bitfields2.engine_sel = -+ packet->bitfields2.engine_sel = q->properties.sdma_engine_id + - engine_sel__mes_map_queues__sdma0; - use_static = false; /* no static queues under SDMA */ - break; -@@ -347,12 +464,12 @@ static int pm_create_runlist_ib(struct packet_manager *pm, - return -ENOMEM; - } - -- retval = pm_create_map_process(pm, &rl_buffer[rl_wptr], qpd); -+ retval = pm->pmf->map_process(pm, &rl_buffer[rl_wptr], qpd); - if (retval != 0) - return retval; - - proccesses_mapped++; -- inc_wptr(&rl_wptr, sizeof(struct pm4_map_process), -+ inc_wptr(&rl_wptr, pm->pmf->get_map_process_packet_size(), - alloc_size_bytes); - - list_for_each_entry(kq, &qpd->priv_queue_list, list) { -@@ -362,8 +479,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, - pr_debug("kfd: static_queue, mapping kernel q %d, is debug status %d\n", - kq->queue->queue, qpd->is_debug); - -- if (pm->dqm->dev->device_info->asic_family == 
-- CHIP_CARRIZO) -+ if (KFD_IS_VI(pm->dqm->dev->device_info->asic_family)) - retval = pm_create_map_queue_vi(pm, - &rl_buffer[rl_wptr], - kq->queue, -@@ -388,8 +504,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, - pr_debug("kfd: static_queue, mapping user queue %d, is debug status %d\n", - q->queue, qpd->is_debug); - -- if (pm->dqm->dev->device_info->asic_family == -- CHIP_CARRIZO) -+ if (KFD_IS_VI(pm->dqm->dev->device_info->asic_family)) - retval = pm_create_map_queue_vi(pm, - &rl_buffer[rl_wptr], - q, -@@ -422,7 +537,23 @@ static int pm_create_runlist_ib(struct packet_manager *pm, - return 0; - } - --int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) -+static int get_map_process_packet_size(void) -+{ -+ return sizeof(struct pm4_map_process); -+} -+ -+static int get_map_process_packet_size_scratch_kv(void) -+{ -+ return sizeof(struct pm4_map_process_scratch_kv); -+} -+ -+static int get_map_process_packet_size_scratch(void) -+{ -+ return sizeof(struct pm4_map_process_scratch); -+} -+ -+int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, -+ uint16_t fw_ver) - { - BUG_ON(!dqm); - -@@ -433,8 +564,37 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) - mutex_destroy(&pm->lock); - return -ENOMEM; - } -+ pm->pmf = kzalloc(sizeof(struct packet_manager_firmware), GFP_KERNEL); - pm->allocated = false; - -+ switch (pm->dqm->dev->device_info->asic_family) { -+ case CHIP_KAVERI: -+ if (fw_ver >= KFD_SCRATCH_KV_FW_VER) { -+ pm->pmf->map_process = pm_create_map_process_scratch_kv; -+ pm->pmf->get_map_process_packet_size = -+ get_map_process_packet_size_scratch_kv; -+ } else { -+ pm->pmf->map_process = pm_create_map_process; -+ pm->pmf->get_map_process_packet_size = -+ get_map_process_packet_size; -+ } -+ break; -+ case CHIP_CARRIZO: -+ case CHIP_TONGA: -+ case CHIP_FIJI: -+ if (fw_ver >= KFD_SCRATCH_CZ_FW_VER) { -+ pm->pmf->map_process = pm_create_map_process_scratch; -+ pm->pmf->get_map_process_packet_size = -+ get_map_process_packet_size_scratch; -+ } else { -+ pm->pmf->map_process = pm_create_map_process; -+ pm->pmf->get_map_process_packet_size = -+ get_map_process_packet_size; -+ } -+ break; -+ -+ } -+ - return 0; - } - -@@ -444,6 +604,7 @@ void pm_uninit(struct packet_manager *pm) - - mutex_destroy(&pm->lock); - kernel_queue_uninit(pm->priv_queue); -+ kfree(pm->pmf); - } - - int pm_send_set_resources(struct packet_manager *pm, -@@ -576,7 +737,7 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, - } - - int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, -- enum kfd_preempt_type_filter mode, -+ enum kfd_unmap_queues_filter filter, - uint32_t filter_param, bool reset, - unsigned int sdma_engine) - { -@@ -596,8 +757,8 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, - - packet = (struct pm4_unmap_queues *)buffer; - memset(buffer, 0, sizeof(struct pm4_unmap_queues)); -- pr_debug("kfd: static_queue: unmapping queues: mode is %d , reset is %d , type is %d\n", -- mode, reset, type); -+ pr_debug("kfd: static_queue: unmapping queues: filter is %d , reset is %d , type is %d\n", -+ filter, reset, type); - packet->header.u32all = build_pm4_header(IT_UNMAP_QUEUES, - sizeof(struct pm4_unmap_queues)); - switch (type) { -@@ -622,26 +783,26 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, - packet->bitfields2.action = - action__mes_unmap_queues__preempt_queues; - -- switch (mode) { -- case 
KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE: -+ switch (filter) { -+ case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: - packet->bitfields2.queue_sel = - queue_sel__mes_unmap_queues__perform_request_on_specified_queues; - packet->bitfields2.num_queues = 1; - packet->bitfields3b.doorbell_offset0 = filter_param; - break; -- case KFD_PREEMPT_TYPE_FILTER_BY_PASID: -+ case KFD_UNMAP_QUEUES_FILTER_BY_PASID: - packet->bitfields2.queue_sel = - queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; - packet->bitfields3a.pasid = filter_param; - break; -- case KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES: -+ case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: - packet->bitfields2.queue_sel = - queue_sel__mes_unmap_queues__perform_request_on_all_active_queues; - break; -- case KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES: -+ case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: - /* in this case, we do not preempt static queues */ -- packet->bitfields2.queue_sel = -- queue_sel__mes_unmap_queues__perform_request_on_dynamic_queues_only; -+ packet->bitfields2.queue_sel = -+ queue_sel__mes_unmap_queues__perform_request_on_dynamic_queues_only; - break; - default: - BUG(); -@@ -669,3 +830,4 @@ void pm_release_ib(struct packet_manager *pm) - } - mutex_unlock(&pm->lock); - } -+ -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h -index 5b393f3..e7570cc 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h -@@ -127,7 +127,8 @@ struct pm4_runlist { - uint32_t offload_polling:1; - uint32_t reserved3:1; - uint32_t valid:1; -- uint32_t reserved4:8; -+ uint32_t process_cnt:4; -+ uint32_t reserved4:4; - } bitfields4; - uint32_t ordinal4; - }; -@@ -186,6 +187,123 @@ struct pm4_map_process { - }; - #endif - -+/*--------------------MES_MAP_PROCESS_SCRATCH-------------------- */ -+ -+#ifndef PM4_MES_MAP_PROCESS_SCRATCH_DEFINED -+#define PM4_MES_MAP_PROCESS_SCRATCH_DEFINED -+ -+struct pm4_map_process_scratch { -+ union { -+ union PM4_MES_TYPE_3_HEADER header; /* header */ -+ uint32_t ordinal1; -+ }; -+ -+ union { -+ struct { -+ uint32_t pasid:16; -+ uint32_t reserved1:8; -+ uint32_t diq_enable:1; -+ uint32_t process_quantum:7; -+ } bitfields2; -+ uint32_t ordinal2; -+ }; -+ -+ union { -+ struct { -+ uint32_t page_table_base:28; -+ uint32_t reserved3:4; -+ } bitfields3; -+ uint32_t ordinal3; -+ }; -+ -+ uint32_t reserved; -+ -+ uint32_t sh_mem_bases; -+ uint32_t sh_mem_config; -+ uint32_t sh_mem_ape1_base; -+ uint32_t sh_mem_ape1_limit; -+ -+ uint32_t sh_hidden_private_base_vmid; -+ -+ uint32_t reserved2; -+ uint32_t reserved3; -+ -+ uint32_t gds_addr_lo; -+ uint32_t gds_addr_hi; -+ -+ union { -+ struct { -+ uint32_t num_gws:6; -+ uint32_t reserved4:2; -+ uint32_t num_oac:4; -+ uint32_t reserved5:4; -+ uint32_t gds_size:6; -+ uint32_t num_queues:10; -+ } bitfields10; -+ uint32_t ordinal10; -+ }; -+ -+ uint32_t completion_signal_lo; -+ uint32_t completion_signal_hi; -+ -+}; -+#endif -+ -+#ifndef PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH -+#define PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH -+ -+struct pm4_map_process_scratch_kv { -+ union { -+ union PM4_MES_TYPE_3_HEADER header; /* header */ -+ uint32_t ordinal1; -+ }; -+ -+ union { -+ struct { -+ uint32_t pasid:16; -+ uint32_t reserved1:8; -+ uint32_t diq_enable:1; -+ uint32_t process_quantum:7; -+ } bitfields2; -+ uint32_t ordinal2; -+ }; -+ -+ union { -+ struct { -+ uint32_t page_table_base:28; -+ uint32_t reserved2:4; -+ } bitfields3; -+ uint32_t ordinal3; -+ }; -+ -+ uint32_t reserved3; -+ uint32_t 
sh_mem_bases; -+ uint32_t sh_mem_config; -+ uint32_t sh_mem_ape1_base; -+ uint32_t sh_mem_ape1_limit; -+ uint32_t sh_hidden_private_base_vmid; -+ uint32_t reserved4; -+ uint32_t reserved5; -+ uint32_t gds_addr_lo; -+ uint32_t gds_addr_hi; -+ -+ union { -+ struct { -+ uint32_t num_gws:6; -+ uint32_t reserved6:2; -+ uint32_t num_oac:4; -+ uint32_t reserved7:4; -+ uint32_t gds_size:6; -+ uint32_t num_queues:10; -+ } bitfields14; -+ uint32_t ordinal14; -+ }; -+ -+ uint32_t completion_signal_lo32; -+uint32_t completion_signal_hi32; -+}; -+#endif -+ - /*--------------------MES_MAP_QUEUES--------------------*/ - - #ifndef PM4_MES_MAP_QUEUES_DEFINED -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h -index 4750cab..c654471 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h -@@ -30,13 +30,45 @@ - #include <linux/atomic.h> - #include <linux/workqueue.h> - #include <linux/spinlock.h> -+#include <linux/idr.h> - #include <linux/kfd_ioctl.h> -+#include <linux/pid.h> -+#include <linux/interval_tree.h> - #include <kgd_kfd_interface.h> - -+#include <drm/amd_rdma.h> -+ - #define KFD_SYSFS_FILE_MODE 0444 - --#define KFD_MMAP_DOORBELL_MASK 0x8000000000000 --#define KFD_MMAP_EVENTS_MASK 0x4000000000000 -+/* GPU ID hash width in bits */ -+#define KFD_GPU_ID_HASH_WIDTH 16 -+ -+/* Use upper bits of mmap offset to store KFD driver specific information. -+ * BITS[63:62] - Encode MMAP type -+ * BITS[61:46] - Encode gpu_id. To identify to which GPU the offset belongs to -+ * BITS[45:40] - Reserved. Not Used. -+ * BITS[39:0] - MMAP offset value. Used by TTM. -+ * -+ * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. Hence, these -+ * defines are w.r.t to PAGE_SIZE -+ */ -+#define KFD_MMAP_TYPE_SHIFT (62 - PAGE_SHIFT) -+#define KFD_MMAP_TYPE_MASK (0x3ULL << KFD_MMAP_TYPE_SHIFT) -+#define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT) -+#define KFD_MMAP_TYPE_EVENTS (0x2ULL << KFD_MMAP_TYPE_SHIFT) -+#define KFD_MMAP_TYPE_MAP_BO (0x1ULL << KFD_MMAP_TYPE_SHIFT) -+#define KFD_MMAP_TYPE_RESERVED_MEM (0x0ULL << KFD_MMAP_TYPE_SHIFT) -+ -+#define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT) -+#define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \ -+ << KFD_MMAP_GPU_ID_SHIFT) -+#define KFD_MMAP_GPU_ID(gpu_id) ((((uint64_t)gpu_id) << KFD_MMAP_GPU_ID_SHIFT)\ -+ & KFD_MMAP_GPU_ID_MASK) -+#define KFD_MMAP_GPU_ID_GET(offset) ((offset & KFD_MMAP_GPU_ID_MASK) \ -+ >> KFD_MMAP_GPU_ID_SHIFT) -+ -+#define KFD_MMAP_OFFSET_VALUE_MASK (0xFFFFFFFFFFULL >> PAGE_SHIFT) -+#define KFD_MMAP_OFFSET_VALUE_GET(offset) (offset & KFD_MMAP_OFFSET_VALUE_MASK) - - /* - * When working with cp scheduler we should assign the HIQ manually or via -@@ -48,8 +80,6 @@ - #define KFD_CIK_HIQ_PIPE 4 - #define KFD_CIK_HIQ_QUEUE 0 - --/* GPU ID hash width in bits */ --#define KFD_GPU_ID_HASH_WIDTH 16 - - /* Macro for allocating structures */ - #define kfd_alloc_struct(ptr_to_struct) \ -@@ -74,12 +104,26 @@ extern int max_num_of_queues_per_device; - /* Kernel module parameter to specify the scheduling policy */ - extern int sched_policy; - -+extern int cwsr_enable; -+ -+/* -+ * Kernel module parameter to specify the maximum process -+ * number per HW scheduler -+ */ -+extern int hws_max_conc_proc; -+ - /* - * Kernel module parameter to specify whether to send sigterm to HSA process on - * unhandled exception - */ - extern int send_sigterm; - -+/* -+ * This kernel module is used to simulate large bar machine on non-large bar -+ * enabled machines. 
-+ */ -+extern int debug_largebar; -+ - /** - * enum kfd_sched_policy - * -@@ -114,14 +158,17 @@ enum cache_policy { - - enum asic_family_type { - CHIP_KAVERI = 0, -- CHIP_CARRIZO -+ CHIP_CARRIZO, -+ CHIP_TONGA, -+ CHIP_FIJI - }; - -+#define KFD_IS_VI(chip) ((chip) >= CHIP_CARRIZO && (chip) <= CHIP_FIJI) -+#define KFD_IS_DGPU(chip) ((chip) >= CHIP_TONGA && (chip) <= CHIP_FIJI) -+ - struct kfd_event_interrupt_class { -- bool (*interrupt_isr)(struct kfd_dev *dev, -- const uint32_t *ih_ring_entry); -- void (*interrupt_wq)(struct kfd_dev *dev, -- const uint32_t *ih_ring_entry); -+ bool (*interrupt_isr)(struct kfd_dev *dev, const uint32_t *ih_ring_entry); -+ void (*interrupt_wq)(struct kfd_dev *dev, const uint32_t *ih_ring_entry); - }; - - struct kfd_device_info { -@@ -132,6 +179,7 @@ struct kfd_device_info { - size_t ih_ring_entry_size; - uint8_t num_of_watch_points; - uint16_t mqd_size_aligned; -+ bool is_need_iommu_device; - }; - - struct kfd_mem_obj { -@@ -141,6 +189,12 @@ struct kfd_mem_obj { - uint32_t *cpu_ptr; - }; - -+struct kfd_vmid_info { -+ uint32_t first_vmid_kfd; -+ uint32_t last_vmid_kfd; -+ uint32_t vmid_num_kfd; -+}; -+ - struct kfd_dev { - struct kgd_dev *kgd; - -@@ -165,11 +219,12 @@ struct kfd_dev { - */ - - struct kgd2kfd_shared_resources shared_resources; -+ struct kfd_vmid_info vm_info; - - const struct kfd2kgd_calls *kfd2kgd; - struct mutex doorbell_mutex; -- DECLARE_BITMAP(doorbell_available_index, -- KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); -+ unsigned long doorbell_available_index[DIV_ROUND_UP( -+ KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, BITS_PER_LONG)]; - - void *gtt_mem; - uint64_t gtt_start_gpu_addr; -@@ -179,6 +234,11 @@ struct kfd_dev { - unsigned int gtt_sa_chunk_size; - unsigned int gtt_sa_num_of_chunks; - -+ /* QCM Device instance */ -+ struct device_queue_manager *dqm; -+ -+ bool init_complete; -+ - /* Interrupts */ - void *interrupt_ring; - size_t interrupt_ring_size; -@@ -187,10 +247,6 @@ struct kfd_dev { - struct work_struct interrupt_work; - spinlock_t interrupt_lock; - -- /* QCM Device instance */ -- struct device_queue_manager *dqm; -- -- bool init_complete; - /* - * Interrupts of interest to KFD are copied - * from the HW ring into a SW ring. -@@ -198,7 +254,26 @@ struct kfd_dev { - bool interrupts_active; - - /* Debug manager */ -- struct kfd_dbgmgr *dbgmgr; -+ struct kfd_dbgmgr *dbgmgr; -+ -+ /* MEC firmware version*/ -+ uint16_t mec_fw_version; -+ -+ /* Maximum process number mapped to HW scheduler */ -+ unsigned int max_proc_per_quantum; -+ -+ /* cwsr */ -+ bool cwsr_enabled; -+ struct page *cwsr_pages; -+ uint32_t cwsr_size; -+ uint32_t tma_offset; /*Offset for TMA from the start of cwsr_mem*/ -+}; -+ -+struct kfd_bo { -+ void *mem; -+ struct interval_tree_node it; -+ struct kfd_dev *dev; -+ struct list_head cb_data_head; - }; - - /* KGD2KFD callbacks */ -@@ -221,22 +296,22 @@ void kfd_chardev_exit(void); - struct device *kfd_chardev(void); - - /** -- * enum kfd_preempt_type_filter -+ * enum kfd_unmap_queues_filter - * -- * @KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE: Preempts single queue. -+ * @KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: Preempts single queue. - * -- * @KFD_PRERMPT_TYPE_FILTER_ALL_QUEUES: Preempts all queues in the -+ * @KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: Preempts all queues in the - * running queues list. - * -- * @KFD_PRERMPT_TYPE_FILTER_BY_PASID: Preempts queues that belongs to -+ * @KFD_UNMAP_QUEUES_FILTER_BY_PASID: Preempts queues that belongs to - * specific process. 
- * - */ --enum kfd_preempt_type_filter { -- KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE, -- KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES, -- KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES, -- KFD_PREEMPT_TYPE_FILTER_BY_PASID -+enum kfd_unmap_queues_filter { -+ KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE, -+ KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, -+ KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, -+ KFD_UNMAP_QUEUES_FILTER_BY_PASID - }; - - enum kfd_preempt_type { -@@ -324,6 +399,7 @@ struct queue_properties { - uint32_t __iomem *doorbell_ptr; - uint32_t doorbell_off; - bool is_interop; -+ bool is_evicted; /* true -> queue is evicted */ - bool is_active; - /* Not relevant for user mode queues in cp scheduling */ - unsigned int vmid; -@@ -336,6 +412,11 @@ struct queue_properties { - uint32_t eop_ring_buffer_size; - uint64_t ctx_save_restore_area_address; - uint32_t ctx_save_restore_area_size; -+ uint32_t ctl_stack_size; -+ uint64_t tba_addr; -+ uint64_t tma_addr; -+ /* Relevant for CU */ -+ uint32_t cu_mask; - }; - - /** -@@ -424,6 +505,7 @@ struct qcm_process_device { - unsigned int queue_count; - unsigned int vmid; - bool is_debug; -+ unsigned evicted; /* eviction counter, 0=active */ - /* - * All the memory management data should be here too - */ -@@ -436,8 +518,22 @@ struct qcm_process_device { - uint32_t gds_size; - uint32_t num_gws; - uint32_t num_oac; -+ uint32_t sh_hidden_private_base; -+ -+ /*cwsr memory*/ -+ int cwsr_mem_handle; -+ uint64_t cwsr_base; -+ uint64_t tba_addr; -+ uint64_t tma_addr; -+ void *cwsr_kaddr; - }; - -+/*8 byte handle containing GPU ID in the most significant 4 bytes and -+ * idr_handle in the least significant 4 bytes*/ -+#define MAKE_HANDLE(gpu_id, idr_handle) (((uint64_t)(gpu_id) << 32) + idr_handle) -+#define GET_GPU_ID(handle) (handle >> 32) -+#define GET_IDR_HANDLE(handle) (handle & 0xFFFFFFFF) -+ - /* Data that is per-process-per device. */ - struct kfd_process_device { - /* -@@ -449,6 +545,8 @@ struct kfd_process_device { - /* The device that owns this data. */ - struct kfd_dev *dev; - -+ /* The process that owns this kfd_process_device. */ -+ struct kfd_process *process; - - /* per-process-per device QCM data structure */ - struct qcm_process_device qpd; -@@ -460,10 +558,23 @@ struct kfd_process_device { - uint64_t gpuvm_limit; - uint64_t scratch_base; - uint64_t scratch_limit; -+ uint64_t dgpu_base; -+ uint64_t dgpu_limit; -+ uint64_t mapped_size; -+ uint64_t last_eviction; -+ bool evicted; -+ -+ uint64_t sh_hidden_private_base_vmid; - - /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */ - bool bound; - -+ /* VM context for GPUVM allocations */ -+ void *vm; -+ -+ /* GPUVM allocations storage */ -+ struct idr alloc_idr; -+ - /* This flag tells if we should reset all - * wavefronts on process termination - */ -@@ -482,7 +593,7 @@ struct kfd_process { - - struct mm_struct *mm; - -- struct mutex mutex; -+ struct rw_semaphore lock; - - /* - * In any process, the thread that started main() is the lead -@@ -513,6 +624,8 @@ struct kfd_process { - /* Size is queue_array_size, up to MAX_PROCESS_QUEUES. */ - struct kfd_queue **queues; - -+ unsigned long allocated_queue_bitmap[DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, BITS_PER_LONG)]; -+ - /*Is the user space process 32 bit?*/ - bool is_32bit_user_mode; - -@@ -520,10 +633,12 @@ struct kfd_process { - struct mutex event_mutex; - /* All events in process hashed by ID, linked on kfd_event.events. */ - DECLARE_HASHTABLE(events, 4); -- struct list_head signal_event_pages; /* struct slot_page_header. 
-- event_pages */ -+ struct list_head signal_event_pages; /* struct slot_page_header.event_pages */ - u32 next_nonsignal_event_id; - size_t signal_event_count; -+ size_t debug_event_count; -+ -+ struct rb_root bo_interval_tree; - }; - - /** -@@ -546,9 +661,10 @@ struct amdkfd_ioctl_desc { - - void kfd_process_create_wq(void); - void kfd_process_destroy_wq(void); --struct kfd_process *kfd_create_process(const struct task_struct *); -+struct kfd_process *kfd_create_process(struct file *filep); - struct kfd_process *kfd_get_process(const struct task_struct *); - struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); -+struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm); - - struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, - struct kfd_process *p); -@@ -558,6 +674,29 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, - struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, - struct kfd_process *p); - -+int kfd_reserved_mem_mmap(struct kfd_process *process, struct vm_area_struct *vma); -+ -+/* KFD process API for creating and translating handles */ -+int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, -+ void *mem, uint64_t start, -+ uint64_t length); -+void *kfd_process_device_translate_handle(struct kfd_process_device *p, -+ int handle); -+struct kfd_bo *kfd_process_device_find_bo(struct kfd_process_device *pdd, -+ int handle); -+void *kfd_process_find_bo_from_interval(struct kfd_process *p, -+ uint64_t start_addr, -+ uint64_t last_addr); -+void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, -+ int handle); -+ -+void run_rdma_free_callback(struct kfd_bo *buf_obj); -+struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid); -+ -+/* kfd dgpu memory */ -+int kfd_map_memory_to_gpu(struct kfd_dev *dev, void *mem, -+ struct kfd_process *p, struct kfd_process_device *pdd); -+ - /* Process device data iterator */ - struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p); - struct kfd_process_device *kfd_get_next_process_device_data(struct kfd_process *p, -@@ -600,7 +739,11 @@ int kfd_topology_add_device(struct kfd_dev *gpu); - int kfd_topology_remove_device(struct kfd_dev *gpu); - struct kfd_dev *kfd_device_by_id(uint32_t gpu_id); - struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev); --struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx); -+struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd); -+uint32_t kfd_get_gpu_id(struct kfd_dev *dev); -+int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev); -+int kfd_numa_node_to_apic_id(int numa_node_id); -+int kfd_get_proximity_domain(const struct pci_bus *bus); - - /* Interrupts */ - int kfd_interrupt_init(struct kfd_dev *dev); -@@ -615,9 +758,12 @@ int kgd2kfd_resume(struct kfd_dev *kfd); - - /* amdkfd Apertures */ - int kfd_init_apertures(struct kfd_process *process); -+int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd, -+ uint64_t base, uint64_t limit); - - /* Queue Context Management */ --struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd); -+inline uint32_t lower_32(uint64_t x); -+inline uint32_t upper_32(uint64_t x); - - int init_queue(struct queue **q, const struct queue_properties *properties); - void uninit_queue(struct queue *q); -@@ -630,11 +776,15 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, - struct kfd_dev *dev); - struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE 
type, - struct kfd_dev *dev); -+struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, -+ struct kfd_dev *dev); - struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); - void device_queue_manager_uninit(struct device_queue_manager *dqm); - struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, - enum kfd_queue_type type); - void kernel_queue_uninit(struct kernel_queue *kq); -+int kfd_process_vm_fault(struct device_queue_manager *dqm, -+ unsigned int pasid); - - /* Process Queue Manager */ - struct process_queue_node { -@@ -649,18 +799,16 @@ int pqm_create_queue(struct process_queue_manager *pqm, - struct kfd_dev *dev, - struct file *f, - struct queue_properties *properties, -- unsigned int flags, -- enum kfd_queue_type type, - unsigned int *qid); - int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid); - int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, - struct queue_properties *p); -+int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, -+ struct queue_properties *p); - struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm, - unsigned int qid); -- --int amdkfd_fence_wait_timeout(unsigned int *fence_addr, -- unsigned int fence_value, -- unsigned long timeout); -+int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm); -+int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm); - - /* Packet Manager */ - -@@ -668,7 +816,9 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr, - - #define KFD_FENCE_COMPLETED (100) - #define KFD_FENCE_INIT (10) --#define KFD_UNMAP_LATENCY (150) -+#define KFD_UNMAP_LATENCY (40) -+ -+struct packet_manager_firmware; - - struct packet_manager { - struct device_queue_manager *dqm; -@@ -676,9 +826,19 @@ struct packet_manager { - struct mutex lock; - bool allocated; - struct kfd_mem_obj *ib_buffer_obj; -+ -+ struct packet_manager_firmware *pmf; - }; - --int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); -+struct packet_manager_firmware { -+ /* Support different firmware versions for map process packet */ -+ int (*map_process)(struct packet_manager *pm, uint32_t *buffer, -+ struct qcm_process_device *qpd); -+ int (*get_map_process_packet_size)(void); -+}; -+ -+int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, -+ uint16_t fw_ver); - void pm_uninit(struct packet_manager *pm); - int pm_send_set_resources(struct packet_manager *pm, - struct scheduling_resources *res); -@@ -687,7 +847,7 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, - uint32_t fence_value); - - int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, -- enum kfd_preempt_type_filter mode, -+ enum kfd_unmap_queues_filter mode, - uint32_t filter_param, bool reset, - unsigned int sdma_engine); - -@@ -696,6 +856,9 @@ void pm_release_ib(struct packet_manager *pm); - uint64_t kfd_get_number_elems(struct kfd_dev *kfd); - phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, - struct kfd_process *process); -+int amdkfd_fence_wait_timeout(unsigned int *fence_addr, -+ unsigned int fence_value, -+ unsigned long timeout); - - /* Events */ - extern const struct kfd_event_interrupt_class event_interrupt_class_cik; -@@ -714,8 +877,7 @@ int kfd_wait_on_events(struct kfd_process *p, - uint32_t num_events, void __user *data, - bool all, uint32_t user_timeout_ms, - enum kfd_event_wait_result *wait_result); --void kfd_signal_event_interrupt(unsigned int pasid, uint32_t 
partial_id, -- uint32_t valid_id_bits); -+void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, uint32_t valid_id_bits); - void kfd_signal_iommu_event(struct kfd_dev *dev, - unsigned int pasid, unsigned long address, - bool is_write_requested, bool is_execute_requested); -@@ -723,11 +885,28 @@ void kfd_signal_hw_exception_event(unsigned int pasid); - int kfd_set_event(struct kfd_process *p, uint32_t event_id); - int kfd_reset_event(struct kfd_process *p, uint32_t event_id); - int kfd_event_create(struct file *devkfd, struct kfd_process *p, -- uint32_t event_type, bool auto_reset, uint32_t node_id, -- uint32_t *event_id, uint32_t *event_trigger_data, -- uint64_t *event_page_offset, uint32_t *event_slot_index); -+ uint32_t event_type, bool auto_reset, uint32_t node_id, -+ uint32_t *event_id, uint32_t *event_trigger_data, -+ uint64_t *event_page_offset, uint32_t *event_slot_index, -+ void *kern_addr); - int kfd_event_destroy(struct kfd_process *p, uint32_t event_id); -+void kfd_free_signal_page_dgpu(struct kfd_process *p, uint64_t handle); -+ -+void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, -+ struct kfd_vm_fault_info *info); -+ -+void radeon_flush_tlb(struct kfd_dev *dev, uint32_t pasid); - - int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); -+int kgd2kfd_evict_bo(struct kfd_dev *dev, void *mem); -+int kgd2kfd_restore(struct kfd_dev *kfd); -+int evict_size(struct kfd_process *p, int size, int type); -+int evict_bo(struct kfd_dev *dev, void *mem); -+int restore(struct kfd_dev *kfd); -+ -+#define KFD_SCRATCH_CZ_FW_VER 600 -+#define KFD_SCRATCH_KV_FW_VER 413 -+#define KFD_MULTI_PROC_MAPPING_HWS_SUPPORT 600 -+#define KFD_CWSR_CZ_FW_VER 625 - - #endif -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c -index 035bbc9..a069c3d 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c -@@ -28,6 +28,10 @@ - #include <linux/amd-iommu.h> - #include <linux/notifier.h> - #include <linux/compat.h> -+#include <linux/mm.h> -+#include <asm/tlb.h> -+#include <linux/highmem.h> -+#include <uapi/asm-generic/mman-common.h> - - struct mm_struct; - -@@ -41,6 +45,7 @@ struct mm_struct; - */ - #define INITIAL_QUEUE_ARRAY_SIZE 16 - -+static int evict_pdd(struct kfd_process_device *pdd); - /* - * List of struct kfd_process (field kfd_process). - * Unique/indexed by mm_struct* -@@ -58,8 +63,14 @@ struct kfd_process_release_work { - struct kfd_process *p; - }; - --static struct kfd_process *find_process(const struct task_struct *thread); -+#define MIN_IDR_ID 1 -+#define MAX_IDR_ID 0 /*0 - for unlimited*/ -+ -+static struct kfd_process *find_process(const struct task_struct *thread, -+ bool lock); - static struct kfd_process *create_process(const struct task_struct *thread); -+static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep); -+ - - void kfd_process_create_wq(void) - { -@@ -75,10 +86,12 @@ void kfd_process_destroy_wq(void) - } - } - --struct kfd_process *kfd_create_process(const struct task_struct *thread) -+struct kfd_process *kfd_create_process(struct file *filep) - { - struct kfd_process *process; - -+ struct task_struct *thread = current; -+ - BUG_ON(!kfd_process_wq); - - if (thread->mm == NULL) -@@ -99,7 +112,7 @@ struct kfd_process *kfd_create_process(const struct task_struct *thread) - mutex_lock(&kfd_processes_mutex); - - /* A prior open of /dev/kfd could have already created the process. 
*/ -- process = find_process(thread); -+ process = find_process(thread, false); - if (process) - pr_debug("kfd: process already found\n"); - -@@ -110,6 +123,8 @@ struct kfd_process *kfd_create_process(const struct task_struct *thread) - - up_write(&thread->mm->mmap_sem); - -+ kfd_process_init_cwsr(process, filep); -+ - return process; - } - -@@ -124,7 +139,7 @@ struct kfd_process *kfd_get_process(const struct task_struct *thread) - if (thread->group_leader->mm != thread->mm) - return ERR_PTR(-EINVAL); - -- process = find_process(thread); -+ process = find_process(thread, false); - - return process; - } -@@ -141,23 +156,164 @@ static struct kfd_process *find_process_by_mm(const struct mm_struct *mm) - return NULL; - } - --static struct kfd_process *find_process(const struct task_struct *thread) -+static struct kfd_process *find_process(const struct task_struct *thread, -+ bool lock) - { - struct kfd_process *p; - int idx; - - idx = srcu_read_lock(&kfd_processes_srcu); - p = find_process_by_mm(thread->mm); -+ if (p && lock) -+ down_read(&p->lock); - srcu_read_unlock(&kfd_processes_srcu, idx); - - return p; - } - -+/* This returns with process->lock read-locked. */ -+struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid) -+{ -+ struct task_struct *task = NULL; -+ struct kfd_process *p = NULL; -+ -+ if (!pid) -+ task = current; -+ else -+ task = get_pid_task(pid, PIDTYPE_PID); -+ -+ if (task) -+ p = find_process(task, true); -+ -+ return p; -+} -+ -+int evict_size(struct kfd_process *process, int size, int type) -+{ -+ struct kfd_process_device *pdd, *temp_pdd = NULL; -+ struct kfd_process *p = process; -+ int temp = 0; -+ -+ down_write(&p->lock); -+ -+ if (type == EVICT_FIRST_PDD) { -+ -+ list_for_each_entry(pdd, &p->per_device_data, per_device_list) { -+ pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", -+ pdd->dev->id, p->pasid); -+ if (pdd->mapped_size >= size) { -+ evict_pdd(pdd); -+ return 0; -+ } -+ -+ } -+ } else if (type == EVICT_BIGGEST_PDD) { -+ -+ list_for_each_entry(pdd, &p->per_device_data, per_device_list) { -+ pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", -+ pdd->dev->id, p->pasid); -+ if (pdd->mapped_size >= temp) { -+ temp = pdd->mapped_size; -+ temp_pdd = pdd; -+ } -+ -+ } -+ if (temp_pdd->mapped_size > size) { -+ evict_pdd(temp_pdd); -+ return 0; -+ } -+ -+ } -+ up_write(&p->lock); -+ return 0; -+ -+} -+ -+int evict_bo(struct kfd_dev *dev, void *mem) -+{ -+ struct kfd_process_device *pdd; -+ -+ pdd = dev->kfd2kgd->get_pdd_from_buffer_object(dev->kgd, -+ ((struct kgd_mem *)mem)); -+ -+ if (pdd) -+ evict_pdd(pdd); -+ -+ return 0; -+} -+ -+static int evict_pdd(struct kfd_process_device *pdd) -+{ -+ void *mem; -+ int id; -+ -+ /*process_evict_queues(struct device_queue_manager *dqm, pdd->qpd)*/ -+ /* -+ * Remove all handles from idr and release appropriate -+ * local memory object -+ */ -+ idr_for_each_entry(&pdd->alloc_idr, mem, id) { -+ pdd->dev->kfd2kgd->unmap_memory_to_gpu( -+ pdd->dev->kgd, mem, pdd->vm); -+ } -+ pdd->last_eviction = jiffies; -+ pdd->mapped_size = 0; -+ pdd->evicted = true; -+ -+ /*flush_tlb_all();*/ -+ -+ return 0; -+} -+ -+int restore(struct kfd_dev *kfd) -+{ -+ struct kfd_process *p = NULL; -+ /* TODO still working on how to get the process */ -+ struct kfd_process_device *pdd = kfd_get_process_device_data(kfd, p); -+ void *mem; -+ int id; -+ -+ /* need to run on all processes*/ -+ down_write(&p->lock); -+ -+ list_for_each_entry(pdd, &p->per_device_data, per_device_list) { -+ 
pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", -+ pdd->dev->id, p->pasid); -+ -+ /* -+ * Remove all handles from idr and release appropriate -+ * local memory object -+ */ -+ if (pdd->evicted) { -+ idr_for_each_entry(&pdd->alloc_idr, mem, id) { -+ pdd->dev->kfd2kgd->map_memory_to_gpu( -+ pdd->dev->kgd, -+ mem, pdd->vm); -+ pdd->last_eviction = 0; -+ pdd->mapped_size = 0; -+ } -+ -+ /*process_restore_queues -+ * (struct device_queue_manager *dqm, pdd->qpd)*/ -+ } else { -+ pdd->evicted = false; -+ } -+ } -+ up_write(&p->lock); -+ return 0; -+} -+ -+/* No process locking is needed in this function, because the process -+ * is not findable any more. We must assume that no other thread is -+ * using it any more, otherwise we couldn't safely free the process -+ * stucture in the end. */ - static void kfd_process_wq_release(struct work_struct *work) - { - struct kfd_process_release_work *my_work; -- struct kfd_process_device *pdd, *temp; -+ struct kfd_process_device *pdd, *temp, *peer_pdd; - struct kfd_process *p; -+ struct kfd_bo *buf_obj; -+ int id; - - my_work = (struct kfd_process_release_work *) work; - -@@ -166,19 +322,40 @@ static void kfd_process_wq_release(struct work_struct *work) - pr_debug("Releasing process (pasid %d) in workqueue\n", - p->pasid); - -- mutex_lock(&p->mutex); -- -- list_for_each_entry_safe(pdd, temp, &p->per_device_data, -- per_device_list) { -+ list_for_each_entry(pdd, &p->per_device_data, per_device_list) { - pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", - pdd->dev->id, p->pasid); - -- if (pdd->reset_wavefronts) -- dbgdev_wave_reset_wavefronts(pdd->dev, p); -+ if (pdd->dev->device_info->is_need_iommu_device) -+ amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); -+ -+ /* -+ * Remove all handles from idr and release appropriate -+ * local memory object -+ */ -+ idr_for_each_entry(&pdd->alloc_idr, buf_obj, id) { -+ list_for_each_entry(peer_pdd, -+ &p->per_device_data, per_device_list) { -+ pdd->dev->kfd2kgd->unmap_memory_to_gpu( -+ peer_pdd->dev->kgd, -+ buf_obj->mem, peer_pdd->vm); -+ } -+ -+ run_rdma_free_callback(buf_obj); -+ pdd->dev->kfd2kgd->free_memory_of_gpu( -+ pdd->dev->kgd, buf_obj->mem); -+ kfd_process_device_remove_obj_handle(pdd, id); -+ } -+ } - -- amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); -+ list_for_each_entry_safe(pdd, temp, &p->per_device_data, -+ per_device_list) { -+ radeon_flush_tlb(pdd->dev, p->pasid); -+ /* Destroy the GPUVM VM context */ -+ if (pdd->vm) -+ pdd->dev->kfd2kgd->destroy_process_vm( -+ pdd->dev->kgd, pdd->vm); - list_del(&pdd->per_device_list); -- - kfree(pdd); - } - -@@ -186,15 +363,11 @@ static void kfd_process_wq_release(struct work_struct *work) - - kfd_pasid_free(p->pasid); - -- mutex_unlock(&p->mutex); -- -- mutex_destroy(&p->mutex); -- - kfree(p->queues); - - kfree(p); - -- kfree(work); -+ kfree((void *)work); - } - - static void kfd_process_destroy_delayed(struct rcu_head *rcu) -@@ -223,6 +396,8 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, - { - struct kfd_process *p; - struct kfd_process_device *pdd = NULL; -+ struct kfd_dev *dev = NULL; -+ long status = -EFAULT; - - /* - * The kfd_process structure can not be free because the -@@ -236,9 +411,31 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, - mutex_unlock(&kfd_processes_mutex); - synchronize_srcu(&kfd_processes_srcu); - -- mutex_lock(&p->mutex); -+ down_write(&p->lock); -+ -+ /* Iterate over all process device data structures and if the pdd is in -+ * 
debug mode,we should first force unregistration, then we will be -+ * able to destroy the queues */ -+ list_for_each_entry(pdd, &p->per_device_data, per_device_list) { -+ dev = pdd->dev; -+ mutex_lock(get_dbgmgr_mutex()); -+ -+ if ((dev != NULL) && -+ (dev->dbgmgr) && -+ (dev->dbgmgr->pasid == p->pasid)) { -+ -+ status = kfd_dbgmgr_unregister(dev->dbgmgr, p); -+ if (status == 0) { -+ kfd_dbgmgr_destroy(dev->dbgmgr); -+ dev->dbgmgr = NULL; -+ } -+ } -+ mutex_unlock(get_dbgmgr_mutex()); -+ } -+ -+ -+ /* now we can uninit the pqm: */ - -- /* In case our notifier is called before IOMMU notifier */ - pqm_uninit(&p->pqm); - - /* Iterate over all process device data structure and check -@@ -256,7 +453,7 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, - } - } - -- mutex_unlock(&p->mutex); -+ up_write(&p->lock); - - /* - * Because we drop mm_count inside kfd_process_destroy_delayed -@@ -272,6 +469,94 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = { - .release = kfd_process_notifier_release, - }; - -+static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep) -+{ -+ int err; -+ unsigned long offset; -+ struct kfd_process_device *temp, *pdd = NULL; -+ void *mem = NULL; -+ struct kfd_dev *dev = NULL; -+ struct qcm_process_device *qpd = NULL; -+ -+ down_write(&p->lock); -+ list_for_each_entry_safe(pdd, temp, &p->per_device_data, -+ per_device_list) { -+ dev = pdd->dev; -+ qpd = &pdd->qpd; -+ if (!dev->cwsr_enabled || qpd->tba_addr) -+ continue; -+ if (qpd->cwsr_base) { -+ /* cwsr_base is only set for DGPU */ -+ -+ /* can't hold the process lock while -+ * allocating from KGD */ -+ up_write(&p->lock); -+ -+ err = dev->kfd2kgd->alloc_memory_of_gpu( -+ dev->kgd, qpd->cwsr_base, dev->cwsr_size, -+ pdd->vm, (struct kgd_mem **)&mem, -+ NULL, &qpd->cwsr_kaddr, pdd, -+ ALLOC_MEM_FLAGS_GTT | -+ ALLOC_MEM_FLAGS_NONPAGED | -+ ALLOC_MEM_FLAGS_EXECUTE_ACCESS | -+ ALLOC_MEM_FLAGS_NO_SUBSTITUTE); -+ if (err) -+ goto err_alloc_tba; -+ err = kfd_map_memory_to_gpu(dev, mem, p, pdd); -+ if (err) -+ goto err_map_tba; -+ -+ down_write(&p->lock); -+ /* Check if someone else allocated the memory -+ * while we weren't looking */ -+ if (qpd->tba_addr) { -+ up_write(&p->lock); -+ dev->kfd2kgd->unmap_memory_to_gpu(dev->kgd, -+ (struct kgd_mem *)mem, pdd->vm); -+ dev->kfd2kgd->free_memory_of_gpu(dev->kgd, mem); -+ down_write(&p->lock); -+ } else { -+ qpd->cwsr_mem_handle = -+ kfd_process_device_create_obj_handle( -+ pdd, mem, qpd->cwsr_base, -+ dev->cwsr_size); -+ if (qpd->cwsr_mem_handle < 0) -+ goto err_create_handle; -+ -+ memcpy(qpd->cwsr_kaddr, kmap(dev->cwsr_pages), -+ PAGE_SIZE); -+ kunmap(dev->cwsr_pages); -+ qpd->tba_addr = qpd->cwsr_base; -+ } -+ } else { -+ offset = (kfd_get_gpu_id(dev) | -+ KFD_MMAP_TYPE_RESERVED_MEM) << PAGE_SHIFT; -+ qpd->tba_addr = (uint64_t)vm_mmap(filep, 0, -+ dev->cwsr_size, PROT_READ | PROT_EXEC, -+ MAP_SHARED, offset); -+ qpd->cwsr_kaddr = (void *)qpd->tba_addr; -+ } -+ if (IS_ERR_VALUE(qpd->tba_addr)) { -+ pr_err("Failure to set tba address. 
error -%d.\n", -+ (int)qpd->tba_addr); -+ qpd->tba_addr = 0; -+ qpd->cwsr_kaddr = NULL; -+ } else -+ qpd->tma_addr = qpd->tba_addr + dev->tma_offset; -+ pr_debug("set tba :0x%llx, tma:0x%llx for pqm.\n", -+ qpd->tba_addr, qpd->tma_addr); -+ } -+ -+err_create_handle: -+ up_write(&p->lock); -+ return err; -+ -+err_map_tba: -+ dev->kfd2kgd->free_memory_of_gpu(dev->kgd, mem); -+err_alloc_tba: -+ return err; -+} -+ - static struct kfd_process *create_process(const struct task_struct *thread) - { - struct kfd_process *process; -@@ -282,6 +567,8 @@ static struct kfd_process *create_process(const struct task_struct *thread) - if (!process) - goto err_alloc_process; - -+ process->bo_interval_tree = RB_ROOT; -+ - process->queues = kmalloc_array(INITIAL_QUEUE_ARRAY_SIZE, - sizeof(process->queues[0]), GFP_KERNEL); - if (!process->queues) -@@ -291,7 +578,7 @@ static struct kfd_process *create_process(const struct task_struct *thread) - if (process->pasid == 0) - goto err_alloc_pasid; - -- mutex_init(&process->mutex); -+ init_rwsem(&process->lock); - - process->mm = thread->mm; - -@@ -364,8 +651,22 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, - INIT_LIST_HEAD(&pdd->qpd.queues_list); - INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); - pdd->qpd.dqm = dev->dqm; -+ pdd->qpd.pqm = &p->pqm; -+ pdd->qpd.evicted = 0; - pdd->reset_wavefronts = false; -+ pdd->process = p; - list_add(&pdd->per_device_list, &p->per_device_data); -+ -+ /* Init idr used for memory handle translation */ -+ idr_init(&pdd->alloc_idr); -+ -+ /* Create the GPUVM context for this specific device */ -+ if (dev->kfd2kgd->create_process_vm(dev->kgd, &pdd->vm)) { -+ pr_err("Failed to create process VM object\n"); -+ list_del(&pdd->per_device_list); -+ kfree(pdd); -+ pdd = NULL; -+ } - } - - return pdd; -@@ -393,9 +694,11 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, - if (pdd->bound) - return pdd; - -- err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread); -- if (err < 0) -- return ERR_PTR(err); -+ if (dev->device_info->is_need_iommu_device) { -+ err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread); -+ if (err < 0) -+ return ERR_PTR(err); -+ } - - pdd->bound = true; - -@@ -420,18 +723,21 @@ void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid) - - pr_debug("Unbinding process %d from IOMMU\n", pasid); - -- if ((dev->dbgmgr) && (dev->dbgmgr->pasid == p->pasid)) -- kfd_dbgmgr_destroy(dev->dbgmgr); -- -- pqm_uninit(&p->pqm); -+ mutex_lock(get_dbgmgr_mutex()); - -- pdd = kfd_get_process_device_data(dev, p); -+ if ((dev->dbgmgr) && (dev->dbgmgr->pasid == p->pasid)) { - -- if (!pdd) { -- mutex_unlock(&p->mutex); -- return; -+ if (kfd_dbgmgr_unregister(dev->dbgmgr, p) == 0) { -+ kfd_dbgmgr_destroy(dev->dbgmgr); -+ dev->dbgmgr = NULL; -+ } - } - -+ mutex_unlock(get_dbgmgr_mutex()); -+ -+ pqm_uninit(&p->pqm); -+ -+ pdd = kfd_get_process_device_data(dev, p); - if (pdd->reset_wavefronts) { - dbgdev_wave_reset_wavefronts(pdd->dev, p); - pdd->reset_wavefronts = false; -@@ -444,9 +750,10 @@ void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid) - * We don't call amd_iommu_unbind_pasid() here - * because the IOMMU called us. 
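kfd_create_process_device_data() above gives every pdd its own IDR (idr_init(&pdd->alloc_idr)) so that userspace memory handles can be translated back to buffer objects per device. A self-contained sketch of the handle life cycle used throughout this file; the MIN_IDR_ID of 1 and the unlimited upper bound follow the snippet's convention:

    #include <linux/idr.h>

    #define MIN_IDR_ID 1
    #define MAX_IDR_ID 0    /* 0 means "no upper bound" for idr_alloc() */

    static DEFINE_IDR(handle_idr);

    static int handle_create(void *bo)
    {
            int handle;

            idr_preload(GFP_KERNEL);        /* preallocate outside the lock */
            handle = idr_alloc(&handle_idr, bo, MIN_IDR_ID, MAX_IDR_ID,
                               GFP_NOWAIT);
            idr_preload_end();
            return handle;                  /* negative errno on failure */
    }

    static void *handle_translate(int handle)
    {
            return handle < 0 ? NULL : idr_find(&handle_idr, handle);
    }

    static void handle_remove(int handle)
    {
            if (handle >= 0)
                    idr_remove(&handle_idr, handle);
    }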
- */ -- pdd->bound = false; -+ if (pdd) -+ pdd->bound = false; - -- mutex_unlock(&p->mutex); -+ up_write(&p->lock); - } - - struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p) -@@ -469,7 +776,116 @@ bool kfd_has_process_device_data(struct kfd_process *p) - return !(list_empty(&p->per_device_data)); - } - --/* This returns with process->mutex locked. */ -+/* Create specific handle mapped to mem from process local memory idr -+ * Assumes that the process lock is held. */ -+int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, -+ void *mem, uint64_t start, -+ uint64_t length) -+{ -+ int handle; -+ struct kfd_bo *buf_obj; -+ struct kfd_process *p; -+ -+ BUG_ON(pdd == NULL); -+ BUG_ON(mem == NULL); -+ -+ p = pdd->process; -+ -+ buf_obj = kmalloc(sizeof(*buf_obj), GFP_KERNEL); -+ -+ if (!buf_obj) -+ return -ENOMEM; -+ -+ buf_obj->it.start = start; -+ buf_obj->it.last = start + length - 1; -+ interval_tree_insert(&buf_obj->it, &p->bo_interval_tree); -+ -+ buf_obj->mem = mem; -+ buf_obj->dev = pdd->dev; -+ -+ INIT_LIST_HEAD(&buf_obj->cb_data_head); -+ -+ idr_preload(GFP_KERNEL); -+ -+ handle = idr_alloc(&pdd->alloc_idr, buf_obj, MIN_IDR_ID, MAX_IDR_ID, -+ GFP_NOWAIT); -+ -+ idr_preload_end(); -+ -+ return handle; -+} -+ -+struct kfd_bo *kfd_process_device_find_bo(struct kfd_process_device *pdd, -+ int handle) -+{ -+ BUG_ON(pdd == NULL); -+ -+ if (handle < 0) -+ return NULL; -+ -+ return (struct kfd_bo *)idr_find(&pdd->alloc_idr, handle); -+} -+ -+/* Translate specific handle from process local memory idr -+ * Assumes that the process lock is held. */ -+void *kfd_process_device_translate_handle(struct kfd_process_device *pdd, -+ int handle) -+{ -+ struct kfd_bo *buf_obj; -+ -+ buf_obj = kfd_process_device_find_bo(pdd, handle); -+ -+ return buf_obj->mem; -+} -+ -+void *kfd_process_find_bo_from_interval(struct kfd_process *p, -+ uint64_t start_addr, -+ uint64_t last_addr) -+{ -+ struct interval_tree_node *it_node; -+ struct kfd_bo *buf_obj; -+ -+ it_node = interval_tree_iter_first(&p->bo_interval_tree, -+ start_addr, last_addr); -+ if (!it_node) { -+ pr_err("%llu - %llu does not relate to an existing buffer\n", -+ start_addr, last_addr); -+ return NULL; -+ } -+ -+ BUG_ON(NULL != interval_tree_iter_next(it_node, -+ start_addr, last_addr)); -+ -+ buf_obj = container_of(it_node, struct kfd_bo, it); -+ -+ return buf_obj; -+} -+ -+/* Remove specific handle from process local memory idr -+ * Assumes that the process lock is held. */ -+void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, -+ int handle) -+{ -+ struct kfd_bo *buf_obj; -+ struct kfd_process *p; -+ -+ BUG_ON(pdd == NULL); -+ -+ p = pdd->process; -+ -+ if (handle < 0) -+ return; -+ -+ buf_obj = kfd_process_device_find_bo(pdd, handle); -+ -+ idr_remove(&pdd->alloc_idr, handle); -+ -+ interval_tree_remove(&buf_obj->it, &p->bo_interval_tree); -+ -+ kfree(buf_obj); -+} -+ -+/* This returns with process->lock read-locked. */ - struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) - { - struct kfd_process *p; -@@ -479,7 +895,7 @@ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) - - hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { - if (p->pasid == pasid) { -- mutex_lock(&p->mutex); -+ down_read(&p->lock); - break; - } - } -@@ -488,3 +904,53 @@ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) - - return p; - } -+ -+/* This returns with process->lock read-locked. 
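The per-process interval tree shown above is what lets kfd_process_find_bo_from_interval() map an arbitrary GPU virtual address range back to the owning buffer object; the RDMA code later in this patch depends on that lookup. A sketch of the insert/lookup pair, assuming the rb_root based interval tree API of this kernel generation (bo_sketch is illustrative):

    #include <linux/interval_tree.h>

    struct bo_sketch {
            struct interval_tree_node it;   /* it.start and it.last are inclusive */
            void *mem;
    };

    static struct rb_root bo_tree = RB_ROOT;

    static void bo_insert(struct bo_sketch *bo, unsigned long start,
                          unsigned long length)
    {
            bo->it.start = start;
            bo->it.last = start + length - 1;
            interval_tree_insert(&bo->it, &bo_tree);
    }

    static struct bo_sketch *bo_find(unsigned long start, unsigned long last)
    {
            struct interval_tree_node *n =
                    interval_tree_iter_first(&bo_tree, start, last);

            return n ? container_of(n, struct bo_sketch, it) : NULL;
    }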
*/ -+struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm) -+{ -+ struct kfd_process *p; -+ -+ int idx = srcu_read_lock(&kfd_processes_srcu); -+ -+ p = find_process_by_mm(mm); -+ if (p != NULL) -+ down_read(&p->lock); -+ -+ srcu_read_unlock(&kfd_processes_srcu, idx); -+ -+ return p; -+} -+ -+int kfd_reserved_mem_mmap(struct kfd_process *process, struct vm_area_struct *vma) -+{ -+ unsigned long pfn, i; -+ int ret = 0; -+ struct kfd_dev *dev = kfd_device_by_id(vma->vm_pgoff); -+ -+ if (dev == NULL) -+ return -EINVAL; -+ if ((vma->vm_start & (PAGE_SIZE - 1)) || -+ (vma->vm_end & (PAGE_SIZE - 1))) { -+ pr_err("KFD only support page aligned memory map.\n"); -+ return -EINVAL; -+ } -+ -+ pr_debug("kfd reserved mem mmap been called.\n"); -+ /* We supported two reserved memory mmap in the future . -+ 1. Trap handler code and parameter (TBA and TMA , 2 pages total) -+ 2. Relaunch stack (control block, 1 page for Carrizo) -+ */ -+ -+ for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); ++i) { -+ pfn = page_to_pfn(&dev->cwsr_pages[i]); -+ vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND -+ | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP; -+ /* mapping the page to user process */ -+ ret = remap_pfn_range(vma, vma->vm_start + (i << PAGE_SHIFT), -+ pfn, PAGE_SIZE, vma->vm_page_prot); -+ if (ret) -+ break; -+ } -+ return ret; -+} -+ -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c -index 46f497e..e79cd42 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c -@@ -89,23 +89,36 @@ void pqm_uninit(struct process_queue_manager *pqm) - { - int retval; - struct process_queue_node *pqn, *next; -+ struct kfd_process_device *pdd; -+ struct kfd_dev *dev = NULL; - - BUG_ON(!pqm); - - pr_debug("In func %s\n", __func__); - - list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) { -- retval = pqm_destroy_queue( -- pqm, -- (pqn->q != NULL) ? 
-- pqn->q->properties.queue_id : -- pqn->kq->queue->properties.queue_id); -- -- if (retval != 0) { -- pr_err("kfd: failed to destroy queue\n"); -- return; -+ if (pqn->q) -+ dev = pqn->q->device; -+ else if (pqn->kq) -+ dev = pqn->kq->dev; -+ else -+ BUG(); -+ -+ pdd = kfd_get_process_device_data(dev, pqm->process); -+ if (pdd) { -+ retval = dev->dqm->ops.process_termination -+ (dev->dqm, &pdd->qpd); -+ if (retval != 0) -+ pdd->reset_wavefronts = true; - } - } -+ -+ list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) { -+ uninit_queue(pqn->q); -+ list_del(&pqn->process_queue_list); -+ kfree(pqn); -+ } -+ - kfree(pqm->queue_slot_bitmap); - pqm->queue_slot_bitmap = NULL; - } -@@ -148,23 +161,19 @@ int pqm_create_queue(struct process_queue_manager *pqm, - struct kfd_dev *dev, - struct file *f, - struct queue_properties *properties, -- unsigned int flags, -- enum kfd_queue_type type, - unsigned int *qid) - { - int retval; - struct kfd_process_device *pdd; -- struct queue_properties q_properties; - struct queue *q; - struct process_queue_node *pqn; - struct kernel_queue *kq; - int num_queues = 0; - struct queue *cur; -+ enum kfd_queue_type type = properties->type; - - BUG_ON(!pqm || !dev || !properties || !qid); - -- memset(&q_properties, 0, sizeof(struct queue_properties)); -- memcpy(&q_properties, properties, sizeof(struct queue_properties)); - q = NULL; - kq = NULL; - -@@ -192,10 +201,9 @@ int pqm_create_queue(struct process_queue_manager *pqm, - if (retval != 0) - return retval; - -- if (list_empty(&pqm->queues)) { -- pdd->qpd.pqm = pqm; -+ if (list_empty(&pdd->qpd.queues_list) && -+ list_empty(&pdd->qpd.priv_queue_list)) - dev->dqm->ops.register_process(dev->dqm, &pdd->qpd); -- } - - pqn = kzalloc(sizeof(struct process_queue_node), GFP_KERNEL); - if (!pqn) { -@@ -205,18 +213,34 @@ int pqm_create_queue(struct process_queue_manager *pqm, - - switch (type) { - case KFD_QUEUE_TYPE_SDMA: -+ if (dev->dqm->sdma_queue_count >= CIK_SDMA_QUEUES) { -+ pr_err("kfd: over-subscription is not allowed for SDMA.\n"); -+ retval = -EPERM; -+ goto err_create_queue; -+ } -+ -+ retval = create_cp_queue(pqm, dev, &q, properties, f, *qid); -+ if (retval != 0) -+ goto err_create_queue; -+ pqn->q = q; -+ pqn->kq = NULL; -+ retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd, -+ &q->properties.vmid); -+ pr_debug("DQM returned %d for create_queue\n", retval); -+ print_queue(q); -+ break; - - case KFD_QUEUE_TYPE_COMPUTE: - /* check if there is over subscription */ - if ((sched_policy == KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) && -- ((dev->dqm->processes_count >= VMID_PER_DEVICE) || -+ ((dev->dqm->processes_count >= dev->vm_info.vmid_num_kfd) || - (dev->dqm->queue_count >= get_queues_num(dev->dqm)))) { - pr_err("kfd: over-subscription is not allowed in radeon_kfd.sched_policy == 1\n"); - retval = -EPERM; - goto err_create_queue; - } - -- retval = create_cp_queue(pqm, dev, &q, &q_properties, f, *qid); -+ retval = create_cp_queue(pqm, dev, &q, properties, f, *qid); - if (retval != 0) - goto err_create_queue; - pqn->q = q; -@@ -253,9 +277,8 @@ int pqm_create_queue(struct process_queue_manager *pqm, - list_add(&pqn->process_queue_list, &pqm->queues); - - if (q) { -- *properties = q->properties; - pr_debug("kfd: PQM done creating queue\n"); -- print_queue_properties(properties); -+ print_queue_properties(&q->properties); - } - - return retval; -@@ -265,7 +288,8 @@ int pqm_create_queue(struct process_queue_manager *pqm, - err_allocate_pqn: - /* check if queues list is empty unregister process from 
device */ - clear_bit(*qid, pqm->queue_slot_bitmap); -- if (list_empty(&pqm->queues)) -+ if (list_empty(&pdd->qpd.queues_list) && -+ list_empty(&pdd->qpd.priv_queue_list)) - dev->dqm->ops.unregister_process(dev->dqm, &pdd->qpd); - return retval; - } -@@ -314,9 +338,11 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) - if (pqn->q) { - dqm = pqn->q->device->dqm; - retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); -- if (retval != 0) -+ if (retval != 0) { -+ if (retval == -ETIME) -+ pdd->reset_wavefronts = true; - return retval; -- -+ } - uninit_queue(pqn->q); - } - -@@ -324,7 +350,8 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) - kfree(pqn); - clear_bit(qid, pqm->queue_slot_bitmap); - -- if (list_empty(&pqm->queues)) -+ if (list_empty(&pdd->qpd.queues_list) && -+ list_empty(&pdd->qpd.priv_queue_list)) - dqm->ops.unregister_process(dqm, &pdd->qpd); - - return retval; -@@ -358,6 +385,31 @@ int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, - return 0; - } - -+int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, -+ struct queue_properties *p) -+{ -+ int retval; -+ struct process_queue_node *pqn; -+ -+ BUG_ON(!pqm); -+ -+ pqn = get_queue_by_qid(pqm, qid); -+ if (!pqn) { -+ pr_debug("amdkfd: No queue %d exists for update operation\n", -+ qid); -+ return -EFAULT; -+ } -+ -+ pqn->q->properties.cu_mask = p->cu_mask; -+ -+ retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm, -+ pqn->q); -+ if (retval != 0) -+ return retval; -+ -+ return 0; -+} -+ - struct kernel_queue *pqm_get_kernel_queue( - struct process_queue_manager *pqm, - unsigned int qid) -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c b/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c -new file mode 100644 -index 0000000..69bdaf1 ---- /dev/null -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c -@@ -0,0 +1,296 @@ -+/* -+ * Copyright 2015 Advanced Micro Devices, Inc. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. 
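The new kfd_rdma.c implements the amd_rdma_interface so other PCIe device drivers can pin GPU buffer objects and DMA to them directly. For orientation, a sketch of how a hypothetical peer driver would consume it; amdkfd_query_rdma_interface() and the ops signatures come from this file, everything else is illustrative:

    #include <linux/pid.h>
    #include "amd_rdma.h"

    static void p2p_dma_abort(void *client_priv)
    {
            /* stop using the pinned range immediately */
    }

    static int p2p_map_range(uint64_t gpu_va, uint64_t len, struct pid *pid)
    {
            const struct amd_rdma_interface *rdma;
            struct amd_p2p_info *info;
            int ret;

            ret = amdkfd_query_rdma_interface(&rdma);
            if (ret)
                    return ret;

            if (!rdma->is_gpu_address(gpu_va, pid))
                    return -EINVAL;

            /* Pins the pages; info->pages is an sg_table for DMA mapping. */
            ret = rdma->get_pages(gpu_va, len, pid, &info,
                                  p2p_dma_abort, NULL);
            if (ret)
                    return ret;

            /* ... program the peer device's DMA engine from info->pages ... */

            return rdma->put_pages(&info);  /* unpin once the transfer is done */
    }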
-+ */ -+ -+#include <linux/device.h> -+#include <linux/export.h> -+#include <linux/pid.h> -+#include <linux/err.h> -+#include <linux/slab.h> -+#include "amd_rdma.h" -+#include "kfd_priv.h" -+ -+ -+struct rdma_cb { -+ struct list_head node; -+ struct amd_p2p_info amd_p2p_data; -+ void (*free_callback)(void *client_priv); -+ void *client_priv; -+}; -+ -+/** -+ * This function makes the pages underlying a range of GPU virtual memory -+ * accessible for DMA operations from another PCIe device -+ * -+ * \param address - The start address in the Unified Virtual Address -+ * space in the specified process -+ * \param length - The length of requested mapping -+ * \param pid - Pointer to structure pid to which address belongs. -+ * Could be NULL for current process address space. -+ * \param p2p_data - On return: Pointer to structure describing -+ * underlying pages/locations -+ * \param free_callback - Pointer to callback which will be called when access -+ * to such memory must be stopped immediately: Memory -+ * was freed, GECC events, etc. -+ * Client should immediately stop any transfer -+ * operations and returned as soon as possible. -+ * After return all resources associated with address -+ * will be release and no access will be allowed. -+ * \param client_priv - Pointer to be passed as parameter on -+ * 'free_callback; -+ * -+ * \return 0 if operation was successful -+ */ -+static int get_pages(uint64_t address, uint64_t length, struct pid *pid, -+ struct amd_p2p_info **amd_p2p_data, -+ void (*free_callback)(void *client_priv), -+ void *client_priv) -+{ -+ struct kfd_bo *buf_obj; -+ struct kgd_mem *mem; -+ struct sg_table *sg_table_tmp; -+ struct kfd_dev *dev; -+ uint64_t last = address + length - 1; -+ uint64_t offset; -+ struct kfd_process *p; -+ struct rdma_cb *rdma_cb_data; -+ int ret = 0; -+ -+ p = kfd_lookup_process_by_pid(pid); -+ if (!p) { -+ pr_err("could not find the process in %s.\n", -+ __func__); -+ return -EINVAL; -+ } -+ -+ buf_obj = kfd_process_find_bo_from_interval(p, address, last); -+ if (!buf_obj) { -+ pr_err("can not find a kfd_bo for the range\n"); -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ rdma_cb_data = kmalloc(sizeof(*rdma_cb_data), GFP_KERNEL); -+ if (!rdma_cb_data) { -+ *amd_p2p_data = NULL; -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ mem = buf_obj->mem; -+ dev = buf_obj->dev; -+ offset = address - buf_obj->it.start; -+ -+ ret = dev->kfd2kgd->pin_get_sg_table_bo(dev->kgd, mem, -+ offset, length, &sg_table_tmp); -+ -+ if (ret) { -+ pr_err("pin_get_sg_table_bo failed.\n"); -+ *amd_p2p_data = NULL; -+ goto free_mem; -+ } -+ -+ rdma_cb_data->amd_p2p_data.va = address; -+ rdma_cb_data->amd_p2p_data.size = length; -+ rdma_cb_data->amd_p2p_data.pid = pid; -+ rdma_cb_data->amd_p2p_data.priv = buf_obj; -+ rdma_cb_data->amd_p2p_data.pages = sg_table_tmp; -+ -+ rdma_cb_data->free_callback = free_callback; -+ rdma_cb_data->client_priv = client_priv; -+ -+ list_add(&rdma_cb_data->node, &buf_obj->cb_data_head); -+ -+ *amd_p2p_data = &rdma_cb_data->amd_p2p_data; -+ -+ goto out; -+ -+free_mem: -+ kfree(rdma_cb_data); -+out: -+ up_read(&p->lock); -+ -+ return ret; -+} -+ -+static int put_pages_helper(struct amd_p2p_info *p2p_data) -+{ -+ struct kfd_bo *buf_obj; -+ struct kfd_dev *dev; -+ struct sg_table *sg_table_tmp; -+ struct rdma_cb *rdma_cb_data; -+ -+ if (!p2p_data) { -+ pr_err("amd_p2p_info pointer is invalid.\n"); -+ return -EINVAL; -+ } -+ -+ rdma_cb_data = container_of(p2p_data, struct rdma_cb, amd_p2p_data); -+ -+ buf_obj = p2p_data->priv; -+ dev = buf_obj->dev; -+ 
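Every successful get_pages() call queues an rdma_cb on the buffer object's cb_data_head list; run_rdma_free_callback() just below walks that list when the BO is torn down, so peers can abort in-flight DMA before the pages disappear. The revocation pattern in condensed form (struct rdma_cb and its fields are this file's):

    static void revoke_all(struct list_head *cb_data_head)
    {
            struct rdma_cb *cb, *tmp;

            /* _safe variant because each entry is freed as we go */
            list_for_each_entry_safe(cb, tmp, cb_data_head, node) {
                    if (cb->free_callback)
                            cb->free_callback(cb->client_priv);
                    list_del(&cb->node);
                    kfree(cb);
            }
    }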
sg_table_tmp = p2p_data->pages; -+ -+ list_del(&rdma_cb_data->node); -+ kfree(rdma_cb_data); -+ -+ dev->kfd2kgd->unpin_put_sg_table_bo(buf_obj->mem, sg_table_tmp); -+ -+ -+ return 0; -+} -+ -+void run_rdma_free_callback(struct kfd_bo *buf_obj) -+{ -+ struct rdma_cb *tmp, *rdma_cb_data; -+ -+ list_for_each_entry_safe(rdma_cb_data, tmp, -+ &buf_obj->cb_data_head, node) { -+ if (rdma_cb_data->free_callback) -+ rdma_cb_data->free_callback( -+ rdma_cb_data->client_priv); -+ -+ put_pages_helper(&rdma_cb_data->amd_p2p_data); -+ } -+} -+ -+/** -+ * -+ * This function release resources previously allocated by get_pages() call. -+ * -+ * \param p_p2p_data - A pointer to pointer to amd_p2p_info entries -+ * allocated by get_pages() call. -+ * -+ * \return 0 if operation was successful -+ */ -+static int put_pages(struct amd_p2p_info **p_p2p_data) -+{ -+ struct kfd_process *p = NULL; -+ int ret = 0; -+ -+ if (!(*p_p2p_data)) { -+ pr_err("amd_p2p_info pointer is invalid.\n"); -+ return -EINVAL; -+ } -+ -+ p = kfd_lookup_process_by_pid((*p_p2p_data)->pid); -+ if (!p) { -+ pr_err("could not find the process in %s\n", -+ __func__); -+ return -EINVAL; -+ } -+ -+ ret = put_pages_helper(*p_p2p_data); -+ -+ if (!ret) -+ *p_p2p_data = NULL; -+ -+ up_read(&p->lock); -+ -+ return ret; -+} -+ -+/** -+ * Check if given address belongs to GPU address space. -+ * -+ * \param address - Address to check -+ * \param pid - Process to which given address belongs. -+ * Could be NULL if current one. -+ * -+ * \return 0 - This is not GPU address managed by AMD driver -+ * 1 - This is GPU address managed by AMD driver -+ */ -+static int is_gpu_address(uint64_t address, struct pid *pid) -+{ -+ struct kfd_bo *buf_obj; -+ struct kfd_process *p; -+ -+ p = kfd_lookup_process_by_pid(pid); -+ if (!p) { -+ pr_err("could not find the process in %s.\n", -+ __func__); -+ return 0; -+ } -+ -+ buf_obj = kfd_process_find_bo_from_interval(p, address, address); -+ -+ up_read(&p->lock); -+ if (!buf_obj) -+ return 0; -+ else -+ return 1; -+} -+ -+/** -+ * Return the single page size to be used when building scatter/gather table -+ * for given range. -+ * -+ * \param address - Address -+ * \param length - Range length -+ * \param pid - Process id structure. Could be NULL if current one. -+ * \param page_size - On return: Page size -+ * -+ * \return 0 if operation was successful -+ */ -+static int get_page_size(uint64_t address, uint64_t length, struct pid *pid, -+ unsigned long *page_size) -+{ -+ /* -+ * As local memory is always consecutive, we can assume the local -+ * memory page size to be arbitrary. -+ * Currently we assume the local memory page size to be the same -+ * as system memory, which is 4KB. -+ */ -+ *page_size = PAGE_SIZE; -+ -+ return 0; -+} -+ -+ -+/** -+ * Singleton object: rdma interface function pointers -+ */ -+static const struct amd_rdma_interface rdma_ops = { -+ .get_pages = get_pages, -+ .put_pages = put_pages, -+ .is_gpu_address = is_gpu_address, -+ .get_page_size = get_page_size, -+}; -+ -+/** -+ * amdkfd_query_rdma_interface - Return interface (function pointers table) for -+ * rdma interface -+ * -+ * -+ * \param interace - OUT: Pointer to interface -+ * -+ * \return 0 if operation was successful. 
-+ */ -+int amdkfd_query_rdma_interface(const struct amd_rdma_interface **ops) -+{ -+ *ops = &rdma_ops; -+ -+ return 0; -+} -+EXPORT_SYMBOL(amdkfd_query_rdma_interface); -+ -+ -+ -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c -index 1e50647..ba1c61c 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c -@@ -28,16 +28,19 @@ - #include <linux/hash.h> - #include <linux/cpufreq.h> - #include <linux/log2.h> -+#include <linux/dmi.h> -+#include <linux/atomic.h> - - #include "kfd_priv.h" - #include "kfd_crat.h" - #include "kfd_topology.h" - --static struct list_head topology_device_list; --static int topology_crat_parsed; -+/* topology_device_list - Master list of all topology devices */ -+struct list_head topology_device_list; - static struct kfd_system_properties sys_props; - - static DECLARE_RWSEM(topology_lock); -+static atomic_t topology_crat_proximity_domain; - - struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) - { -@@ -57,311 +60,61 @@ struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) - return device; - } - --struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) -+uint32_t kfd_get_gpu_id(struct kfd_dev *dev) - { - struct kfd_topology_device *top_dev; -- struct kfd_dev *device = NULL; -+ uint32_t gpu_id = 0; - - down_read(&topology_lock); - - list_for_each_entry(top_dev, &topology_device_list, list) -- if (top_dev->gpu->pdev == pdev) { -- device = top_dev->gpu; -+ if (top_dev->gpu == dev) { -+ gpu_id = top_dev->gpu_id; - break; - } - - up_read(&topology_lock); - -- return device; --} -- --static int kfd_topology_get_crat_acpi(void *crat_image, size_t *size) --{ -- struct acpi_table_header *crat_table; -- acpi_status status; -- -- if (!size) -- return -EINVAL; -- -- /* -- * Fetch the CRAT table from ACPI -- */ -- status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); -- if (status == AE_NOT_FOUND) { -- pr_warn("CRAT table not found\n"); -- return -ENODATA; -- } else if (ACPI_FAILURE(status)) { -- const char *err = acpi_format_exception(status); -- -- pr_err("CRAT table error: %s\n", err); -- return -EINVAL; -- } -- -- if (*size >= crat_table->length && crat_image != NULL) -- memcpy(crat_image, crat_table, crat_table->length); -- -- *size = crat_table->length; -- -- return 0; -+ return gpu_id; - } - --static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, -- struct crat_subtype_computeunit *cu) --{ -- BUG_ON(!dev); -- BUG_ON(!cu); -- -- dev->node_props.cpu_cores_count = cu->num_cpu_cores; -- dev->node_props.cpu_core_id_base = cu->processor_id_low; -- if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) -- dev->node_props.capability |= HSA_CAP_ATS_PRESENT; -- -- pr_info("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, -- cu->processor_id_low); --} -- --static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, -- struct crat_subtype_computeunit *cu) --{ -- BUG_ON(!dev); -- BUG_ON(!cu); -- -- dev->node_props.simd_id_base = cu->processor_id_low; -- dev->node_props.simd_count = cu->num_simd_cores; -- dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; -- dev->node_props.max_waves_per_simd = cu->max_waves_simd; -- dev->node_props.wave_front_size = cu->wave_front_size; -- dev->node_props.mem_banks_count = cu->num_banks; -- dev->node_props.array_count = cu->num_arrays; -- dev->node_props.cu_per_simd_array = cu->num_cu_per_array; -- dev->node_props.simd_per_cu = cu->num_simd_per_cu; -- dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; -- 
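kfd_get_gpu_id() above and the lookup helpers around it all follow one shape: walk the master topology list under the read side of topology_lock and stop at the first match. As a generic sketch (only the predicate differs between the real helpers; topology_lock and topology_device_list are this file's globals):

    static struct kfd_topology_device *
    topology_find(bool (*match)(struct kfd_topology_device *, const void *),
                  const void *key)
    {
            struct kfd_topology_device *dev, *found = NULL;

            down_read(&topology_lock);
            list_for_each_entry(dev, &topology_device_list, list) {
                    if (match(dev, key)) {
                            found = dev;
                            break;
                    }
            }
            up_read(&topology_lock);

            return found;
    }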
if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) -- dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; -- pr_info("CU GPU: simds=%d id_base=%d\n", cu->num_simd_cores, -- cu->processor_id_low); --} -- --/* kfd_parse_subtype_cu is called when the topology mutex is already acquired */ --static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu) --{ -- struct kfd_topology_device *dev; -- int i = 0; -- -- BUG_ON(!cu); -- -- pr_info("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", -- cu->proximity_domain, cu->hsa_capability); -- list_for_each_entry(dev, &topology_device_list, list) { -- if (cu->proximity_domain == i) { -- if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) -- kfd_populated_cu_info_cpu(dev, cu); -- -- if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) -- kfd_populated_cu_info_gpu(dev, cu); -- break; -- } -- i++; -- } -- -- return 0; --} -- --/* -- * kfd_parse_subtype_mem is called when the topology mutex is -- * already acquired -- */ --static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem) -+struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) - { -- struct kfd_mem_properties *props; -- struct kfd_topology_device *dev; -- int i = 0; -- -- BUG_ON(!mem); -- -- pr_info("Found memory entry in CRAT table with proximity_domain=%d\n", -- mem->promixity_domain); -- list_for_each_entry(dev, &topology_device_list, list) { -- if (mem->promixity_domain == i) { -- props = kfd_alloc_struct(props); -- if (props == NULL) -- return -ENOMEM; -- -- if (dev->node_props.cpu_cores_count == 0) -- props->heap_type = HSA_MEM_HEAP_TYPE_FB_PRIVATE; -- else -- props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; -- -- if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) -- props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; -- if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) -- props->flags |= HSA_MEM_FLAGS_NON_VOLATILE; -- -- props->size_in_bytes = -- ((uint64_t)mem->length_high << 32) + -- mem->length_low; -- props->width = mem->width; -+ struct kfd_topology_device *top_dev; -+ struct kfd_dev *device = NULL; - -- dev->mem_bank_count++; -- list_add_tail(&props->list, &dev->mem_props); -+ down_read(&topology_lock); - -+ list_for_each_entry(top_dev, &topology_device_list, list) -+ if (top_dev->gpu && top_dev->gpu->pdev == pdev) { -+ device = top_dev->gpu; - break; - } -- i++; -- } -- -- return 0; --} -- --/* -- * kfd_parse_subtype_cache is called when the topology mutex -- * is already acquired -- */ --static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache) --{ -- struct kfd_cache_properties *props; -- struct kfd_topology_device *dev; -- uint32_t id; -- -- BUG_ON(!cache); -- -- id = cache->processor_id_low; - -- pr_info("Found cache entry in CRAT table with processor_id=%d\n", id); -- list_for_each_entry(dev, &topology_device_list, list) -- if (id == dev->node_props.cpu_core_id_base || -- id == dev->node_props.simd_id_base) { -- props = kfd_alloc_struct(props); -- if (props == NULL) -- return -ENOMEM; -- -- props->processor_id_low = id; -- props->cache_level = cache->cache_level; -- props->cache_size = cache->cache_size; -- props->cacheline_size = cache->cache_line_size; -- props->cachelines_per_tag = cache->lines_per_tag; -- props->cache_assoc = cache->associativity; -- props->cache_latency = cache->cache_latency; -- -- if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) -- props->cache_type |= HSA_CACHE_TYPE_DATA; -- if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) -- props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; -- if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) -- 
props->cache_type |= HSA_CACHE_TYPE_CPU; -- if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) -- props->cache_type |= HSA_CACHE_TYPE_HSACU; -- -- dev->cache_count++; -- dev->node_props.caches_count++; -- list_add_tail(&props->list, &dev->cache_props); -- -- break; -- } -+ up_read(&topology_lock); - -- return 0; -+ return device; - } - --/* -- * kfd_parse_subtype_iolink is called when the topology mutex -- * is already acquired -- */ --static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink) -+struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd) - { -- struct kfd_iolink_properties *props; -- struct kfd_topology_device *dev; -- uint32_t i = 0; -- uint32_t id_from; -- uint32_t id_to; -- -- BUG_ON(!iolink); -- -- id_from = iolink->proximity_domain_from; -- id_to = iolink->proximity_domain_to; -+ struct kfd_topology_device *top_dev; -+ struct kfd_dev *device = NULL; - -- pr_info("Found IO link entry in CRAT table with id_from=%d\n", id_from); -- list_for_each_entry(dev, &topology_device_list, list) { -- if (id_from == i) { -- props = kfd_alloc_struct(props); -- if (props == NULL) -- return -ENOMEM; -- -- props->node_from = id_from; -- props->node_to = id_to; -- props->ver_maj = iolink->version_major; -- props->ver_min = iolink->version_minor; -- -- /* -- * weight factor (derived from CDIR), currently always 1 -- */ -- props->weight = 1; -- -- props->min_latency = iolink->minimum_latency; -- props->max_latency = iolink->maximum_latency; -- props->min_bandwidth = iolink->minimum_bandwidth_mbs; -- props->max_bandwidth = iolink->maximum_bandwidth_mbs; -- props->rec_transfer_size = -- iolink->recommended_transfer_size; -- -- dev->io_link_count++; -- dev->node_props.io_links_count++; -- list_add_tail(&props->list, &dev->io_link_props); -+ down_read(&topology_lock); - -+ list_for_each_entry(top_dev, &topology_device_list, list) -+ if (top_dev->gpu && top_dev->gpu->kgd == kgd) { -+ device = top_dev->gpu; - break; - } -- i++; -- } - -- return 0; --} -- --static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr) --{ -- struct crat_subtype_computeunit *cu; -- struct crat_subtype_memory *mem; -- struct crat_subtype_cache *cache; -- struct crat_subtype_iolink *iolink; -- int ret = 0; -- -- BUG_ON(!sub_type_hdr); -- -- switch (sub_type_hdr->type) { -- case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: -- cu = (struct crat_subtype_computeunit *)sub_type_hdr; -- ret = kfd_parse_subtype_cu(cu); -- break; -- case CRAT_SUBTYPE_MEMORY_AFFINITY: -- mem = (struct crat_subtype_memory *)sub_type_hdr; -- ret = kfd_parse_subtype_mem(mem); -- break; -- case CRAT_SUBTYPE_CACHE_AFFINITY: -- cache = (struct crat_subtype_cache *)sub_type_hdr; -- ret = kfd_parse_subtype_cache(cache); -- break; -- case CRAT_SUBTYPE_TLB_AFFINITY: -- /* -- * For now, nothing to do here -- */ -- pr_info("Found TLB entry in CRAT table (not processing)\n"); -- break; -- case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: -- /* -- * For now, nothing to do here -- */ -- pr_info("Found CCOMPUTE entry in CRAT table (not processing)\n"); -- break; -- case CRAT_SUBTYPE_IOLINK_AFFINITY: -- iolink = (struct crat_subtype_iolink *)sub_type_hdr; -- ret = kfd_parse_subtype_iolink(iolink); -- break; -- default: -- pr_warn("Unknown subtype (%d) in CRAT\n", -- sub_type_hdr->type); -- } -+ up_read(&topology_lock); - -- return ret; -+ return device; - } - -+/* Called with write topology_lock acquired */ - static void kfd_release_topology_device(struct kfd_topology_device *dev) - { - struct kfd_mem_properties *mem; -@@ -398,20 +151,22 @@ static void 
kfd_release_topology_device(struct kfd_topology_device *dev) - sys_props.num_devices--; - } - --static void kfd_release_live_view(void) -+void kfd_release_live_view(void) - { - struct kfd_topology_device *dev; - -+ down_write(&topology_lock); - while (topology_device_list.next != &topology_device_list) { - dev = container_of(topology_device_list.next, - struct kfd_topology_device, list); - kfd_release_topology_device(dev); --} -- -+ } -+ up_write(&topology_lock); - memset(&sys_props, 0, sizeof(sys_props)); - } - --static struct kfd_topology_device *kfd_create_topology_device(void) -+struct kfd_topology_device *kfd_create_topology_device( -+ struct list_head *device_list) - { - struct kfd_topology_device *dev; - -@@ -425,65 +180,12 @@ static struct kfd_topology_device *kfd_create_topology_device(void) - INIT_LIST_HEAD(&dev->cache_props); - INIT_LIST_HEAD(&dev->io_link_props); - -- list_add_tail(&dev->list, &topology_device_list); -+ list_add_tail(&dev->list, device_list); - sys_props.num_devices++; - - return dev; - } - --static int kfd_parse_crat_table(void *crat_image) --{ -- struct kfd_topology_device *top_dev; -- struct crat_subtype_generic *sub_type_hdr; -- uint16_t node_id; -- int ret; -- struct crat_header *crat_table = (struct crat_header *)crat_image; -- uint16_t num_nodes; -- uint32_t image_len; -- -- if (!crat_image) -- return -EINVAL; -- -- num_nodes = crat_table->num_domains; -- image_len = crat_table->length; -- -- pr_info("Parsing CRAT table with %d nodes\n", num_nodes); -- -- for (node_id = 0; node_id < num_nodes; node_id++) { -- top_dev = kfd_create_topology_device(); -- if (!top_dev) { -- kfd_release_live_view(); -- return -ENOMEM; -- } -- } -- -- sys_props.platform_id = -- (*((uint64_t *)crat_table->oem_id)) & CRAT_OEMID_64BIT_MASK; -- sys_props.platform_oem = *((uint64_t *)crat_table->oem_table_id); -- sys_props.platform_rev = crat_table->revision; -- -- sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); -- while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) < -- ((char *)crat_image) + image_len) { -- if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) { -- ret = kfd_parse_subtype(sub_type_hdr); -- if (ret != 0) { -- kfd_release_live_view(); -- return ret; -- } -- } -- -- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + -- sub_type_hdr->length); -- } -- -- sys_props.generation_count++; -- topology_crat_parsed = 1; -- -- return 0; --} -- -- - #define sysfs_show_gen_prop(buffer, fmt, ...) \ - snprintf(buffer, PAGE_SIZE, "%s"fmt, buffer, __VA_ARGS__) - #define sysfs_show_32bit_prop(buffer, name, value) \ -@@ -593,7 +295,7 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, - char *buffer) - { - ssize_t ret; -- uint32_t i; -+ uint32_t i, j; - struct kfd_cache_properties *cache; - - /* Making sure that the buffer is an empty string */ -@@ -611,12 +313,18 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, - sysfs_show_32bit_prop(buffer, "latency", cache->cache_latency); - sysfs_show_32bit_prop(buffer, "type", cache->cache_type); - snprintf(buffer, PAGE_SIZE, "%ssibling_map ", buffer); -- for (i = 0; i < KFD_TOPOLOGY_CPU_SIBLINGS; i++) -- ret = snprintf(buffer, PAGE_SIZE, "%s%d%s", -- buffer, cache->sibling_map[i], -- (i == KFD_TOPOLOGY_CPU_SIBLINGS-1) ? 
-- "\n" : ","); -- -+ for (i = 0; i < CRAT_SIBLINGMAP_SIZE; i++) -+ for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++) { -+ /* Check each bit */ -+ if (cache->sibling_map[i] & (1 << j)) -+ ret = snprintf(buffer, PAGE_SIZE, -+ "%s%d%s", buffer, 1, ","); -+ else -+ ret = snprintf(buffer, PAGE_SIZE, -+ "%s%d%s", buffer, 0, ","); -+ } -+ /* Replace the last "," with end of line */ -+ *(buffer + strlen(buffer) - 1) = 0xA; - return ret; - } - -@@ -635,6 +343,7 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, - char public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; - uint32_t i; - uint32_t log_max_watch_addr; -+ struct kfd_local_mem_info local_mem_info; - - /* Making sure that the buffer is an empty string */ - buffer[0] = 0; -@@ -674,7 +383,6 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, - } else { - sysfs_show_32bit_prop(buffer, "mem_banks_count", - dev->node_props.mem_banks_count); -- } - - sysfs_show_32bit_prop(buffer, "caches_count", - dev->node_props.caches_count); -@@ -723,17 +431,30 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, - HSA_CAP_WATCH_POINTS_TOTALBITS_MASK); - } - -+ if (dev->gpu->device_info->asic_family == CHIP_TONGA) -+ dev->node_props.capability |= -+ HSA_CAP_AQL_QUEUE_DOUBLE_MAP; -+ - sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute", -- dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz( -- dev->gpu->kgd)); -+ dev->node_props.max_engine_clk_fcompute); - -- sysfs_show_64bit_prop(buffer, "local_mem_size", -- (unsigned long long int) 0); -+ /* -+ * If the ASIC is CZ, set local memory size to 0 to disable -+ * local memory support -+ */ -+ if (dev->gpu->device_info->asic_family != CHIP_CARRIZO) { -+ dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd, -+ &local_mem_info); -+ sysfs_show_64bit_prop(buffer, "local_mem_size", -+ local_mem_info.local_mem_size_private + -+ local_mem_info.local_mem_size_public); -+ } -+ else -+ sysfs_show_64bit_prop(buffer, "local_mem_size", -+ (unsigned long long int) 0); - - sysfs_show_32bit_prop(buffer, "fw_version", -- dev->gpu->kfd2kgd->get_fw_version( -- dev->gpu->kgd, -- KGD_ENGINE_MEC1)); -+ dev->gpu->mec_fw_version); - sysfs_show_32bit_prop(buffer, "capability", - dev->node_props.capability); - } -@@ -928,6 +649,7 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, - return 0; - } - -+/* Called with write topology lock acquired */ - static int kfd_build_sysfs_node_tree(void) - { - struct kfd_topology_device *dev; -@@ -944,6 +666,7 @@ static int kfd_build_sysfs_node_tree(void) - return 0; - } - -+/* Called with write topology lock acquired */ - static void kfd_remove_sysfs_node_tree(void) - { - struct kfd_topology_device *dev; -@@ -1015,88 +738,200 @@ static void kfd_topology_release_sysfs(void) - } - } - -+/* Called with write topology_lock acquired */ -+static int kfd_topology_update_device_list(struct list_head *temp_list, -+ struct list_head *master_list) -+{ -+ int num = 0; -+ -+ while (!list_empty(temp_list)) { -+ list_move_tail(temp_list->next, master_list); -+ num++; -+ } -+ return num; -+} -+ -+static void kfd_debug_print_topology(void) -+{ -+ struct kfd_topology_device *dev; -+ -+ down_read(&topology_lock); -+ -+ dev = list_last_entry(&topology_device_list, struct kfd_topology_device, list); -+ if (dev) { -+ if (dev->node_props.cpu_cores_count && dev->node_props.simd_count) { -+ pr_info("Topology: Add APU node [0x%0x:0x%0x]\n", -+ dev->node_props.device_id, dev->node_props.vendor_id); -+ } -+ else if (dev->node_props.cpu_cores_count) 
-+ pr_info("Topology: Add CPU node\n"); -+ else if (dev->node_props.simd_count) -+ pr_info("Topology: Add dGPU node [0x%0x:0x%0x]\n", -+ dev->node_props.device_id, dev->node_props.vendor_id); -+ } -+ up_read(&topology_lock); -+} -+ -+/* Helper function for intializing platform_xx members of kfd_system_properties -+ */ -+static void kfd_update_system_properties(void) -+{ -+ struct kfd_topology_device *dev; -+ -+ down_read(&topology_lock); -+ dev = list_last_entry(&topology_device_list, struct kfd_topology_device, list); -+ if (dev) { -+ sys_props.platform_id = -+ (*((uint64_t *)dev->oem_id)) & CRAT_OEMID_64BIT_MASK; -+ sys_props.platform_oem = *((uint64_t *)dev->oem_table_id); -+ sys_props.platform_rev = dev->oem_revision; -+ } -+ up_read(&topology_lock); -+} -+ -+static void find_system_memory(const struct dmi_header *dm, -+ void *private) -+{ -+ struct kfd_mem_properties *mem; -+ u16 mem_width, mem_clock; -+ struct kfd_topology_device *kdev = -+ (struct kfd_topology_device *)private; -+ const u8 *dmi_data = (const u8 *)(dm + 1); -+ -+ if (dm->type == DMI_ENTRY_MEM_DEVICE && dm->length >= 0x15) { -+ mem_width = (u16)(*(const u16 *)(dmi_data + 0x6)); -+ mem_clock = (u16)(*(const u16 *)(dmi_data + 0x11)); -+ list_for_each_entry(mem, &kdev->mem_props, list) { -+ if (mem_width != 0xFFFF && mem_width != 0) -+ mem->width = mem_width; -+ if (mem_clock != 0) -+ mem->mem_clk_max = mem_clock; -+ } -+ } -+} -+/* kfd_add_non_crat_information - Add information that is not currently -+ * defined in CRAT but is necessary for KFD topology -+ * @dev - topology device to which addition info is added -+ */ -+static void kfd_add_non_crat_information(struct kfd_topology_device *kdev) -+{ -+ /* Check if CPU only node. */ -+ if (kdev->gpu == NULL) { -+ /* Add system memory information */ -+ dmi_walk(find_system_memory, kdev); -+ } -+ /* TODO: For GPU node, rearrange code from kfd_topology_add_device */ -+} -+ - int kfd_topology_init(void) - { - void *crat_image = NULL; - size_t image_size = 0; - int ret; -+ struct list_head temp_topology_device_list; -+ int cpu_only_node = 0; -+ struct kfd_topology_device *kdev; -+ int proximity_domain; -+ int num_nodes; -+ -+ /* topology_device_list - Master list of all topology devices -+ * temp_topology_device_list - temporary list created while parsing CRAT -+ * or VCRAT. Once parsing is complete the contents of list is moved to -+ * topology_device_list -+ */ - -- /* -- * Initialize the head for the topology device list -+ /* Initialize the head for the both the lists - */ - INIT_LIST_HEAD(&topology_device_list); -+ INIT_LIST_HEAD(&temp_topology_device_list); - init_rwsem(&topology_lock); -- topology_crat_parsed = 0; - - memset(&sys_props, 0, sizeof(sys_props)); - -+ /* Proximity domains in ACPI CRAT tables start counting at -+ * 0. The same should be true for virtual CRAT tables created -+ * at this stage. GPUs added later in kfd_topology_add_device -+ * use a counter. */ -+ proximity_domain = 0; -+ - /* -- * Get the CRAT image from the ACPI -+ * Get the CRAT image from the ACPI. If ACPI doesn't have one -+ * create a virtual CRAT. -+ * NOTE: The current implementation expects all AMD APUs to have -+ * CRAT. 
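For CPU-only nodes the memory width and clock are not in CRAT at all; find_system_memory() above pulls them from SMBIOS via dmi_walk(), decoding Type 17 (Memory Device) entries. The decoder shape in isolation, assuming the same 0x15-byte minimum record length and field offsets as the snippet (mem_info_sketch is illustrative):

    #include <linux/dmi.h>

    struct mem_info_sketch {
            u16 width, clock;
    };

    static void decode_mem_device(const struct dmi_header *dm, void *priv)
    {
            struct mem_info_sketch *out = priv;
            const u8 *data = (const u8 *)(dm + 1); /* skip the 4-byte header */

            if (dm->type != DMI_ENTRY_MEM_DEVICE || dm->length < 0x15)
                    return;

            out->width = *(const u16 *)(data + 0x6);  /* bus width, bits */
            out->clock = *(const u16 *)(data + 0x11); /* speed, MT/s */
            /* 0xFFFF and 0 mean "unknown"; the real caller skips those. */
    }

    /* invoked as: dmi_walk(decode_mem_device, &info); */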
If no CRAT is available, it is assumed to be a CPU - */ -- ret = kfd_topology_get_crat_acpi(crat_image, &image_size); -- if (ret == 0 && image_size > 0) { -- pr_info("Found CRAT image with size=%zd\n", image_size); -- crat_image = kmalloc(image_size, GFP_KERNEL); -- if (!crat_image) { -- ret = -ENOMEM; -- pr_err("No memory for allocating CRAT image\n"); -- goto err; -- } -- ret = kfd_topology_get_crat_acpi(crat_image, &image_size); -- -- if (ret == 0) { -- down_write(&topology_lock); -- ret = kfd_parse_crat_table(crat_image); -- if (ret == 0) -- ret = kfd_topology_update_sysfs(); -- up_write(&topology_lock); -- } else { -- pr_err("Couldn't get CRAT table size from ACPI\n"); -- } -- kfree(crat_image); -- } else if (ret == -ENODATA) { -- ret = 0; -- } else { -- pr_err("Couldn't get CRAT table size from ACPI\n"); -+ ret = kfd_create_crat_image_acpi(&crat_image, &image_size); -+ if (ret != 0) { -+ ret = kfd_create_crat_image_virtual(&crat_image, &image_size, -+ COMPUTE_UNIT_CPU, NULL, -+ proximity_domain); -+ cpu_only_node = 1; -+ } -+ -+ if (ret == 0) -+ ret = kfd_parse_crat_table(crat_image, -+ &temp_topology_device_list, -+ proximity_domain); -+ else { -+ pr_err("Error getting/creating CRAT table\n"); -+ goto err; -+ } -+ -+ down_write(&topology_lock); -+ num_nodes = kfd_topology_update_device_list(&temp_topology_device_list, -+ &topology_device_list); -+ atomic_set(&topology_crat_proximity_domain, num_nodes-1); -+ ret = kfd_topology_update_sysfs(); -+ up_write(&topology_lock); -+ -+ if (ret == 0) { -+ sys_props.generation_count++; -+ kfd_update_system_properties(); -+ kfd_debug_print_topology(); -+ pr_info("Finished initializing topology\n"); -+ } -+ else -+ pr_err("Failed to update topology in sysfs ret=%d\n", ret); -+ -+ /* For nodes with GPU, this information gets added -+ * when GPU is detected (kfd_topology_add_device). */ -+ if (cpu_only_node) { -+ /* Add additional information to CPU only node created above */ -+ down_write(&topology_lock); -+ kdev = list_first_entry(&topology_device_list, -+ struct kfd_topology_device, list); -+ up_write(&topology_lock); -+ kfd_add_non_crat_information(kdev); - } - - err: -- pr_info("Finished initializing topology ret=%d\n", ret); -+ kfd_destroy_crat_image(crat_image); - return ret; - } - - void kfd_topology_shutdown(void) - { -+ down_write(&topology_lock); - kfd_topology_release_sysfs(); -+ up_write(&topology_lock); - kfd_release_live_view(); - } - --static void kfd_debug_print_topology(void) --{ -- struct kfd_topology_device *dev; -- uint32_t i = 0; -- -- pr_info("DEBUG PRINT OF TOPOLOGY:"); -- list_for_each_entry(dev, &topology_device_list, list) { -- pr_info("Node: %d\n", i); -- pr_info("\tGPU assigned: %s\n", (dev->gpu ? 
"yes" : "no")); -- pr_info("\tCPU count: %d\n", dev->node_props.cpu_cores_count); -- pr_info("\tSIMD count: %d", dev->node_props.simd_count); -- i++; -- } --} -- - static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) - { - uint32_t hashout; - uint32_t buf[7]; - uint64_t local_mem_size; - int i; -+ struct kfd_local_mem_info local_mem_info; - - if (!gpu) - return 0; - -- local_mem_size = gpu->kfd2kgd->get_vmem_size(gpu->kgd); -+ gpu->kfd2kgd->get_local_mem_info(gpu->kgd, &local_mem_info); -+ -+ local_mem_size = local_mem_info.local_mem_size_private + -+ local_mem_info.local_mem_size_public; - - buf[0] = gpu->pdev->devfn; - buf[1] = gpu->pdev->subsystem_vendor; -@@ -1111,7 +946,13 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) - - return hashout; - } -- -+/* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If -+ * the GPU device is not already present in the topology device list -+ * then return NULL. This means a new topology device has to be -+ * created for this GPU. -+ * TODO: Rather than assiging @gpu to first topology device withtout -+ * gpu attached, it will better to have more stringent check. -+ */ - static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu) - { - struct kfd_topology_device *dev; -@@ -1119,13 +960,14 @@ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu) - - BUG_ON(!gpu); - -+ down_write(&topology_lock); - list_for_each_entry(dev, &topology_device_list, list) - if (dev->gpu == NULL && dev->node_props.simd_count > 0) { - dev->gpu = gpu; - out_dev = dev; - break; - } -- -+ up_write(&topology_lock); - return out_dev; - } - -@@ -1137,70 +979,146 @@ static void kfd_notify_gpu_change(uint32_t gpu_id, int arrival) - */ - } - -+/* kfd_fill_mem_clk_max_info - Since CRAT doesn't have memory clock info, -+ * patch this after CRAT parsing. -+ */ -+static void kfd_fill_mem_clk_max_info(struct kfd_topology_device *dev) -+{ -+ struct kfd_mem_properties *mem; -+ struct kfd_local_mem_info local_mem_info; -+ -+ if (dev == NULL) -+ return; -+ -+ /* Currently, amdgpu driver (amdgpu_mc) deals only with GPUs with -+ * single bank of VRAM local memory. -+ * for dGPUs - VCRAT reports only one bank of Local Memory -+ * for APUs - If CRAT from ACPI reports more than one bank, then -+ * all the banks will report the same mem_clk_max information -+ */ -+ dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd, -+ &local_mem_info); -+ -+ list_for_each_entry(mem, &dev->mem_props, list) -+ mem->mem_clk_max = local_mem_info.mem_clk_max; -+} -+ - int kfd_topology_add_device(struct kfd_dev *gpu) - { - uint32_t gpu_id; - struct kfd_topology_device *dev; -- int res; -+ struct kfd_cu_info cu_info; -+ int res = 0; -+ struct list_head temp_topology_device_list; -+ void *crat_image = NULL; -+ size_t image_size = 0; -+ int proximity_domain; - - BUG_ON(!gpu); - -+ INIT_LIST_HEAD(&temp_topology_device_list); -+ - gpu_id = kfd_generate_gpu_id(gpu); - - pr_debug("kfd: Adding new GPU (ID: 0x%x) to topology\n", gpu_id); - -- down_write(&topology_lock); -- /* -- * Try to assign the GPU to existing topology device (generated from -- * CRAT table -+ proximity_domain = atomic_inc_return(& -+ topology_crat_proximity_domain); -+ -+ /* Check to see if this gpu device exists in the topology_device_list. -+ * If so, assign the gpu to that device, -+ * else create a Virtual CRAT for this gpu device and then parse that CRAT -+ * to create a new topology device. 
Once created assign the gpu to that -+ * topology device - */ - dev = kfd_assign_gpu(gpu); - if (!dev) { -- pr_info("GPU was not found in the current topology. Extending.\n"); -- kfd_debug_print_topology(); -- dev = kfd_create_topology_device(); -- if (!dev) { -- res = -ENOMEM; -+ res = kfd_create_crat_image_virtual(&crat_image, &image_size, -+ COMPUTE_UNIT_GPU, -+ gpu, proximity_domain); -+ if (res == 0) -+ res = kfd_parse_crat_table(crat_image, -+ &temp_topology_device_list, proximity_domain); -+ else { -+ pr_err("Error in VCRAT for GPU (ID: 0x%x)\n", gpu_id); - goto err; - } -- dev->gpu = gpu; - -- /* -- * TODO: Make a call to retrieve topology information from the -- * GPU vBIOS -- */ -+ down_write(&topology_lock); -+ kfd_topology_update_device_list(&temp_topology_device_list, -+ &topology_device_list); - - /* - * Update the SYSFS tree, since we added another topology device - */ -- if (kfd_topology_update_sysfs() < 0) -- kfd_topology_release_sysfs(); -- -+ res = kfd_topology_update_sysfs(); -+ up_write(&topology_lock); -+ -+ if (res == 0) -+ sys_props.generation_count++; -+ else -+ pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n", -+ gpu_id, res); -+ dev = kfd_assign_gpu(gpu); -+ BUG_ON(!dev); - } - - dev->gpu_id = gpu_id; - gpu->id = gpu_id; -+ -+ /* TODO: Move the following lines to function -+ * kfd_add_non_crat_information */ -+ -+ /* Fill-in additional information that is not available in CRAT but -+ * needed for the topology */ -+ -+ dev->gpu->kfd2kgd->get_cu_info(dev->gpu->kgd, &cu_info); -+ dev->node_props.simd_arrays_per_engine = cu_info.num_shader_arrays_per_engine; -+ - dev->node_props.vendor_id = gpu->pdev->vendor; - dev->node_props.device_id = gpu->pdev->device; -- dev->node_props.location_id = (gpu->pdev->bus->number << 24) + -- (gpu->pdev->devfn & 0xffffff); -- /* -- * TODO: Retrieve max engine clock values from KGD -- */ -- -- if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) { -- dev->node_props.capability |= HSA_CAP_DOORBELL_PACKET_TYPE; -- pr_info("amdkfd: adding doorbell packet type capability\n"); -+ dev->node_props.location_id = PCI_DEVID(gpu->pdev->bus->number, -+ gpu->pdev->devfn); -+ dev->node_props.max_engine_clk_fcompute = -+ dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz(dev->gpu->kgd); -+ dev->node_props.max_engine_clk_ccompute = -+ cpufreq_quick_get_max(0) / 1000; -+ -+ kfd_fill_mem_clk_max_info(dev); -+ -+ switch (dev->gpu->device_info->asic_family) { -+ case CHIP_KAVERI: -+ dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_PRE_1_0 << -+ HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & -+ HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); -+ break; -+ case CHIP_CARRIZO: -+ case CHIP_TONGA: -+ case CHIP_FIJI: -+ pr_debug("amdkfd: adding doorbell packet type capability\n"); -+ dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_1_0 << -+ HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & -+ HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); -+ break; - } - -- res = 0; -+ /* Fix errors in CZ CRAT. -+ * simd_count: Carrizo CRAT reports wrong simd_count, probably because it -+ * doesn't consider masked out CUs -+ * capability flag: Carrizo CRAT doesn't report IOMMU flags. 
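The doorbell type programmed above is a two-bit field packed into the capability word; the mask and shift are defined in the kfd_topology.h hunk further below. Encode and decode, as a sketch:

    static inline u32 cap_set_doorbell_type(u32 cap, u32 type)
    {
            cap &= ~HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK;
            return cap | ((type << HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
                          HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
    }

    static inline u32 cap_get_doorbell_type(u32 cap)
    {
            return (cap & HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK) >>
                   HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT;
    }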
-+ */ -+ if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) { -+ dev->node_props.simd_count = -+ cu_info.simd_per_cu * cu_info.cu_active_number; -+ dev->node_props.capability |= HSA_CAP_ATS_PRESENT; -+ } - -+ kfd_debug_print_topology(); - err: -- up_write(&topology_lock); -- - if (res == 0) - kfd_notify_gpu_change(gpu_id, 1); - -+ kfd_destroy_crat_image(crat_image); - return res; - } - -@@ -1233,22 +1151,26 @@ int kfd_topology_remove_device(struct kfd_dev *gpu) - return res; - } - --/* -- * When idx is out of bounds, the function will return NULL -+/* kfd_topology_enum_kfd_devices - Enumerate through all devices in KFD -+ * topology. If a GPU device is found at @idx, then a valid kfd_dev pointer is -+ * returned through @kdev -+ * Return - 0: On success (@kdev will be NULL for non-GPU nodes) -+ * -1: If end of list - */ --struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx) -+int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev) - { - - struct kfd_topology_device *top_dev; -- struct kfd_dev *device = NULL; - uint8_t device_idx = 0; - -+ *kdev = NULL; - down_read(&topology_lock); - - list_for_each_entry(top_dev, &topology_device_list, list) { - if (device_idx == idx) { -- device = top_dev->gpu; -- break; -+ *kdev = top_dev->gpu; -+ up_read(&topology_lock); -+ return 0; - } - - device_idx++; -@@ -1256,6 +1178,57 @@ struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx) - - up_read(&topology_lock); - -- return device; -+ return -1; -+ -+} -+ -+static int kfd_cpumask_to_apic_id(const struct cpumask *cpumask) -+{ -+ const struct cpuinfo_x86 *cpuinfo; -+ int first_cpu_of_numa_node; -+ -+ if (cpumask == NULL || cpumask == cpu_none_mask) -+ return -1; -+ first_cpu_of_numa_node = cpumask_first(cpumask); -+ cpuinfo = &cpu_data(first_cpu_of_numa_node); -+ -+ return cpuinfo->apicid; -+} -+ -+/* kfd_numa_node_to_apic_id - Returns the APIC ID of the first logical processor -+ * of the given NUMA node (numa_node_id) -+ * Return -1 on failure -+ */ -+int kfd_numa_node_to_apic_id(int numa_node_id) -+{ -+ if (numa_node_id == -1) { -+ pr_warn("Invalid NUMA Node. Use online CPU mask\n"); -+ return kfd_cpumask_to_apic_id(cpu_online_mask); -+ } -+ return kfd_cpumask_to_apic_id(cpumask_of_node(numa_node_id)); -+} -+ -+/* kfd_get_proximity_domain - Find the proximity_domain (node id) to which -+ * the given PCI bus belongs. CRAT table contains only the APIC ID -+ * of the parent NUMA node. So use that as the search parameter.
-+ * Return -1 on failure -+ */ -+int kfd_get_proximity_domain(const struct pci_bus *bus) -+{ -+ struct kfd_topology_device *dev; -+ int proximity_domain = -1; -+ -+ down_read(&topology_lock); -+ -+ list_for_each_entry(dev, &topology_device_list, list) -+ if (dev->node_props.cpu_cores_count && -+ dev->node_props.cpu_core_id_base == -+ kfd_cpumask_to_apic_id(cpumask_of_pcibus(bus))) { -+ proximity_domain = dev->proximity_domain; -+ break; -+ } -+ -+ up_read(&topology_lock); - -+ return proximity_domain; - } -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h -index c3ddb9b..ab28188 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h -@@ -39,8 +39,16 @@ - #define HSA_CAP_WATCH_POINTS_SUPPORTED 0x00000080 - #define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00 - #define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8 --#define HSA_CAP_RESERVED 0xfffff000 -+#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK 0x00003000 -+#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT 12 -+#define HSA_CAP_RESERVED 0xffffc000 -+ -+#define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0 -+#define HSA_CAP_DOORBELL_TYPE_1_0 0x1 -+#define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00 -+#define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8 - #define HSA_CAP_DOORBELL_PACKET_TYPE 0x00001000 -+#define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000 - - struct kfd_node_properties { - uint32_t cpu_cores_count; -@@ -91,8 +99,6 @@ struct kfd_mem_properties { - struct attribute attr; - }; - --#define KFD_TOPOLOGY_CPU_SIBLINGS 256 -- - #define HSA_CACHE_TYPE_DATA 0x00000001 - #define HSA_CACHE_TYPE_INSTRUCTION 0x00000002 - #define HSA_CACHE_TYPE_CPU 0x00000004 -@@ -109,7 +115,7 @@ struct kfd_cache_properties { - uint32_t cache_assoc; - uint32_t cache_latency; - uint32_t cache_type; -- uint8_t sibling_map[KFD_TOPOLOGY_CPU_SIBLINGS]; -+ uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE]; - struct kobject *kobj; - struct attribute attr; - }; -@@ -135,8 +141,8 @@ struct kfd_iolink_properties { - struct kfd_topology_device { - struct list_head list; - uint32_t gpu_id; -+ uint32_t proximity_domain; - struct kfd_node_properties node_props; -- uint32_t mem_bank_count; - struct list_head mem_props; - uint32_t cache_count; - struct list_head cache_props; -@@ -150,6 +156,9 @@ struct kfd_topology_device { - struct attribute attr_gpuid; - struct attribute attr_name; - struct attribute attr_props; -+ uint8_t oem_id[CRAT_OEMID_LENGTH]; -+ uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH]; -+ uint32_t oem_revision; - }; - - struct kfd_system_properties { -@@ -164,6 +173,8 @@ struct kfd_system_properties { - struct attribute attr_props; - }; - -- -+struct kfd_topology_device *kfd_create_topology_device( -+ struct list_head *device_list); -+void kfd_release_live_view(void); - - #endif /* __KFD_TOPOLOGY_H__ */ -diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h -index 36f3766..5403164 100644 ---- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h -+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h -@@ -40,6 +40,41 @@ struct kfd_dev; - struct kgd_dev; - - struct kgd_mem; -+struct kfd_process_device; -+struct amdgpu_bo; -+ -+struct kfd_vm_fault_info { -+ uint64_t page_addr; -+ uint32_t vmid; -+ uint32_t mc_id; -+ uint32_t status; -+ bool prot_valid; -+ bool prot_read; -+ bool prot_write; -+ bool prot_exec; -+}; -+ -+struct kfd_cu_info { -+ uint32_t num_shader_engines; -+ uint32_t num_shader_arrays_per_engine; -+ uint32_t num_cu_per_sh; -+ 
uint32_t cu_active_number; -+ uint32_t cu_ao_mask; -+ uint32_t simd_per_cu; -+ uint32_t max_waves_per_simd; -+ uint32_t wave_front_size; -+ uint32_t max_scratch_slots_per_cu; -+ uint32_t lds_size; -+ uint32_t cu_bitmap[4][4]; -+}; -+ -+/* For getting GPU local memory information from KGD */ -+struct kfd_local_mem_info { -+ uint64_t local_mem_size_private; -+ uint64_t local_mem_size_public; -+ uint32_t vram_width; -+ uint32_t mem_clk_max; -+}; - - enum kgd_memory_pool { - KGD_POOL_SYSTEM_CACHEABLE = 1, -@@ -80,8 +115,28 @@ struct kgd2kfd_shared_resources { - - /* Number of bytes at start of aperture reserved for KGD. */ - size_t doorbell_start_offset; -+ -+ /* GPUVM address space size in bytes */ -+ uint64_t gpuvm_size; - }; - -+/* -+ * Allocation flag domains currently only VRAM and GTT domain supported -+ */ -+#define ALLOC_MEM_FLAGS_VRAM (1 << 0) -+#define ALLOC_MEM_FLAGS_GTT (1 << 1) -+#define ALLOC_MEM_FLAGS_USERPTR (1 << 2) -+ -+/* -+ * Allocation flags attributes/access options. -+ */ -+#define ALLOC_MEM_FLAGS_NONPAGED (1 << 31) -+#define ALLOC_MEM_FLAGS_READONLY (1 << 30) -+#define ALLOC_MEM_FLAGS_PUBLIC (1 << 29) -+#define ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28) -+#define ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27) -+#define ALLOC_MEM_FLAGS_EXECUTE_ACCESS (1 << 26) -+ - /** - * struct kfd2kgd_calls - * -@@ -90,7 +145,7 @@ struct kgd2kfd_shared_resources { - * - * @free_gtt_mem: Frees a buffer that was allocated on the gart aperture - * -- * @get_vmem_size: Retrieves (physical) size of VRAM -+ * @get_local_mem_info: Retrieves information about GPU local memory - * - * @get_gpu_clock_counter: Retrieves GPU clock counter - * -@@ -121,8 +176,23 @@ struct kgd2kfd_shared_resources { - * @hqd_sdma_destroy: Destructs and preempts the SDMA queue assigned to that - * SDMA hqd slot. - * -+ * @map_memory_to_gpu: Allocates and pins BO, PD and all related PTs -+ * -+ * @unmap_memory_to_gpu: Releases and unpins BO, PD and all related PTs -+ * - * @get_fw_version: Returns FW versions from the header - * -+ * @set_num_of_requests: Sets number of Peripheral Page Request (PPR) sent to -+ * IOMMU when address translation failed -+ * -+ * @get_cu_info: Retrieves activated cu info -+ * -+ * @get_dmabuf_info: Returns information about a dmabuf if it was -+ * created by the GPU driver -+ * -+ * @import_dmabuf: Imports a DMA buffer, creating a new kgd_mem object -+ * Supports only DMA buffers created by GPU driver on the same GPU -+ * - * This structure contains function pointers to services that the kgd driver - * provides to amdkfd driver. 
- * -@@ -134,11 +204,23 @@ struct kfd2kgd_calls { - - void (*free_gtt_mem)(struct kgd_dev *kgd, void *mem_obj); - -+ void(*get_local_mem_info)(struct kgd_dev *kgd, -+ struct kfd_local_mem_info *mem_info); - uint64_t (*get_vmem_size)(struct kgd_dev *kgd); - uint64_t (*get_gpu_clock_counter)(struct kgd_dev *kgd); - - uint32_t (*get_max_engine_clock_in_mhz)(struct kgd_dev *kgd); - -+ int (*create_process_vm)(struct kgd_dev *kgd, void **vm); -+ void (*destroy_process_vm)(struct kgd_dev *kgd, void *vm); -+ -+ int (*create_process_gpumem)(struct kgd_dev *kgd, uint64_t va, size_t size, void *vm, struct kgd_mem **mem); -+ void (*destroy_process_gpumem)(struct kgd_dev *kgd, struct kgd_mem *mem); -+ -+ uint32_t (*get_process_page_dir)(void *vm); -+ -+ int (*open_graphic_handle)(struct kgd_dev *kgd, uint64_t va, void *vm, int fd, uint32_t handle, struct kgd_mem **mem); -+ - /* Register access functions */ - void (*program_sh_mem_settings)(struct kgd_dev *kgd, uint32_t vmid, - uint32_t sh_mem_config, uint32_t sh_mem_ape1_base, -@@ -151,9 +233,11 @@ struct kfd2kgd_calls { - uint32_t hpd_size, uint64_t hpd_gpu_addr); - - int (*init_interrupts)(struct kgd_dev *kgd, uint32_t pipe_id); -+ - - int (*hqd_load)(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, -- uint32_t queue_id, uint32_t __user *wptr); -+ uint32_t queue_id, uint32_t __user *wptr, -+ uint32_t page_table_base); - - int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd); - -@@ -168,7 +252,7 @@ struct kfd2kgd_calls { - - int (*hqd_sdma_destroy)(struct kgd_dev *kgd, void *mqd, - unsigned int timeout); -- -+ - int (*address_watch_disable)(struct kgd_dev *kgd); - int (*address_watch_execute)(struct kgd_dev *kgd, - unsigned int watch_point_id, -@@ -189,9 +273,53 @@ struct kfd2kgd_calls { - uint8_t vmid); - void (*write_vmid_invalidate_request)(struct kgd_dev *kgd, - uint8_t vmid); -+ int (*alloc_memory_of_gpu)(struct kgd_dev *kgd, uint64_t va, -+ size_t size, void *vm, -+ struct kgd_mem **mem, uint64_t *offset, -+ void **kptr, struct kfd_process_device *pdd, -+ uint32_t flags); -+ int (*free_memory_of_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem); -+ int (*map_memory_to_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem, -+ void *vm); -+ int (*unmap_memory_to_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem, -+ void *vm); - - uint16_t (*get_fw_version)(struct kgd_dev *kgd, - enum kgd_engine_type type); -+ -+ void (*set_num_of_requests)(struct kgd_dev *kgd, -+ uint8_t num_of_requests); -+ int (*alloc_memory_of_scratch)(struct kgd_dev *kgd, -+ uint64_t va, uint32_t vmid); -+ int (*write_config_static_mem)(struct kgd_dev *kgd, bool swizzle_enable, -+ uint8_t element_size, uint8_t index_stride, uint8_t mtype); -+ void (*get_cu_info)(struct kgd_dev *kgd, -+ struct kfd_cu_info *cu_info); -+ int (*mmap_bo)(struct kgd_dev *kgd, struct vm_area_struct *vma); -+ int (*map_gtt_bo_to_kernel)(struct kgd_dev *kgd, -+ struct kgd_mem *mem, void **kptr); -+ void (*set_vm_context_page_table_base)(struct kgd_dev *kgd, uint32_t vmid, -+ uint32_t page_table_base); -+ struct kfd_process_device* (*get_pdd_from_buffer_object) -+ (struct kgd_dev *kgd, struct kgd_mem *mem); -+ int (*return_bo_size)(struct kgd_dev *kgd, struct kgd_mem *mem); -+ -+ int (*pin_get_sg_table_bo)(struct kgd_dev *kgd, -+ struct kgd_mem *mem, uint64_t offset, -+ uint64_t size, struct sg_table **ret_sg); -+ void (*unpin_put_sg_table_bo)(struct kgd_mem *mem, -+ struct sg_table *sg); -+ -+ int (*get_dmabuf_info)(struct kgd_dev *kgd, int dma_buf_fd, -+ struct kgd_dev **dma_buf_kgd, uint64_t *bo_size, -+ void 
*metadata_buffer, size_t buffer_size, -+ uint32_t *metadata_size, uint32_t *flags); -+ int (*import_dmabuf)(struct kgd_dev *kgd, int dma_buf_fd, uint64_t va, -+ void *vm, struct kgd_mem **mem, uint64_t *size); -+ -+ int (*get_vm_fault_info)(struct kgd_dev *kgd, -+ struct kfd_vm_fault_info *info); -+ - }; - - /** -@@ -210,6 +338,10 @@ struct kfd2kgd_calls { - * - * @resume: Notifies amdkfd about a resume action done to a kgd device - * -+ * @quiesce_mm: Quiesce all user queue access to specified MM address space -+ * -+ * @resume_mm: Resume user queue access to specified MM address space -+ * - * This structure contains function callback pointers so the kgd driver - * will notify the amdkfd about certain status changes. - * -@@ -224,9 +356,13 @@ struct kgd2kfd_calls { - void (*interrupt)(struct kfd_dev *kfd, const void *ih_ring_entry); - void (*suspend)(struct kfd_dev *kfd); - int (*resume)(struct kfd_dev *kfd); -+ int (*evict_bo)(struct kfd_dev *dev, void *ptr); -+ int (*restore)(struct kfd_dev *kfd); -+ int (*quiesce_mm)(struct kfd_dev *kfd, struct mm_struct *mm); -+ int (*resume_mm)(struct kfd_dev *kfd, struct mm_struct *mm); - }; - - int kgd2kfd_init(unsigned interface_version, - const struct kgd2kfd_calls **g2f); - --#endif /* KGD_KFD_INTERFACE_H_INCLUDED */ -+#endif /* KGD_KFD_INTERFACE_H_INCLUDED */ --- -2.7.4 -
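For reference, the doorbell-type capability introduced by this patch is a two-bit field packed into node_props.capability: bits 12-13 (HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK/SHIFT in the kfd_topology.h hunk above) carry HSA_CAP_DOORBELL_TYPE_PRE_1_0 on Kaveri and HSA_CAP_DOORBELL_TYPE_1_0 on Carrizo/Tonga/Fiji. Below is a minimal userspace sketch of the encode/decode round trip, assuming only the constants defined in that hunk; the helper decode_doorbell_type() and the main() harness are illustrative, not part of the patch.

#include <stdint.h>
#include <stdio.h>

/* Constants copied from the kfd_topology.h hunk in this patch */
#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK	0x00003000
#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT	12
#define HSA_CAP_DOORBELL_TYPE_PRE_1_0		0x0
#define HSA_CAP_DOORBELL_TYPE_1_0		0x1

/* Hypothetical helper: recover the doorbell type from a node's
 * capability word (the value a consumer would read back from the
 * topology sysfs "capability" property). */
static uint32_t decode_doorbell_type(uint32_t capability)
{
	return (capability & HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK) >>
	       HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT;
}

int main(void)
{
	/* Encode the way kfd_topology_add_device() does for Carrizo */
	uint32_t cap = (HSA_CAP_DOORBELL_TYPE_1_0 <<
			HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
			HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK;

	printf("doorbell type: %s\n",
	       decode_doorbell_type(cap) == HSA_CAP_DOORBELL_TYPE_1_0 ?
	       "1.0" : "pre-1.0");
	return 0;
}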