Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1370-port-in-all-files-amdkfd-source-files-snapshot-at.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1370-port-in-all-files-amdkfd-source-files-snapshot-at.patch | 16828
1 file changed, 16828 insertions(+), 0 deletions(-)
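Most of the patch below is a snapshot of the amdkfd sources, but the one genuinely new mechanism is the eviction bookkeeping added to amdgpu_amdkfd.c: a per-buffer counter (mem->data2.evicted) where the first eviction jumps the count to 2, so that a value of exactly 1 can mean "restore scheduled but not yet run". A minimal user-space sketch of that counting protocol follows, with the locking, delayed work and real GPUVM calls stubbed out; every name besides "evicted" is illustrative, not taken from the patch.

/*
 * evicted == 0  buffer is resident
 * evicted == 1  restore is scheduled but has not run yet
 * evicted >= 2  one or more evictions are outstanding
 */
#include <stdio.h>

struct mem { int evicted; };

static int do_evict(struct mem *m)    { printf("evict %p\n", (void *)m); return 0; }
static void do_restore(struct mem *m) { printf("restore %p\n", (void *)m); }

/* First eviction counts as 2 so that "1" stays free to mean "restore pending". */
static int evict(struct mem *m)
{
	if (++m->evicted > 1)
		return 0;		/* already evicted, just count it */
	if (do_evict(m)) {
		m->evicted = 0;		/* failed: let the matching restore fail gracefully */
		return -1;
	}
	m->evicted = 2;
	return 0;
}

/* Stands in for the delayed-work handler in the patch. */
static void restore_worker(struct mem *m)
{
	if (m->evicted == 1) {		/* not re-evicted in the meantime */
		do_restore(m);
		m->evicted = 0;
	}
}

static int schedule_restore(struct mem *m)
{
	if (m->evicted <= 1)
		return -1;		/* unbalanced restore of a resident buffer */
	if (--m->evicted > 1)
		return 0;		/* other evictions still outstanding */
	restore_worker(m);		/* the real code schedules delayed work here */
	return 0;
}

int main(void)
{
	struct mem m = { 0 };

	evict(&m);			/* first eviction: counter goes to 2 */
	evict(&m);			/* nested eviction: counter goes to 3 */
	schedule_restore(&m);		/* back to 2, nothing restored yet */
	schedule_restore(&m);		/* reaches 1: restore runs, counter returns to 0 */
	return 0;
}

The same rules can be read directly out of amdgpu_amdkfd_evict_mem() and amdgpu_amdkfd_schedule_restore_mem() in the hunks below.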
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1370-port-in-all-files-amdkfd-source-files-snapshot-at.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1370-port-in-all-files-amdkfd-source-files-snapshot-at.patch new file mode 100644 index 00000000..c037b8f2 --- /dev/null +++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1370-port-in-all-files-amdkfd-source-files-snapshot-at.patch @@ -0,0 +1,16828 @@ +From 879030b8b91026fde404c0ab73293655d0684333 Mon Sep 17 00:00:00 2001 +From: Chaudhary Amit Kumar <chaudharyamit.kumar@amd.com> +Date: Thu, 18 Oct 2018 18:30:48 +0530 +Subject: [PATCH 1370/4131] port in all files amdkfd source files snapshot at + +commit 9918a8f15a957dff68d8bb7d88a2e6485368b626 +Author: shaoyunl <Shaoyun.Liu@amd.com> +Date: Mon Mar 28 16:13:27 2016 -0400 + + drm/amdkfd: Assign SDMA engine in an alternative order when creating + sdma queues + +Change-Id: I705be5e2d78cfe8c4035eb9493432f466aefb007 +Signed-off-by: Chaudhary Amit Kumar <chaudharyamit.kumar@amd.com> +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 271 +++- + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 104 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 307 +++- + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 241 ++- + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 1619 ++++++++++++++++++++ + drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 2 +- + drivers/gpu/drm/amd/amdkfd/Kconfig | 1 + + drivers/gpu/drm/amd/amdkfd/Makefile | 2 +- + drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 43 +- + drivers/gpu/drm/amd/amdkfd/cik_int.h | 22 +- + drivers/gpu/drm/amd/amdkfd/cik_regs.h | 175 ++- + .../gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h | 1377 +++++++++++++++++ + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1357 +++++++++++++--- + drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 1163 ++++++++++++++ + drivers/gpu/drm/amd/amdkfd/kfd_crat.h | 40 +- + drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c | 972 ++++++------ + drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h | 66 +- + drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c | 247 ++- + drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h | 313 ++-- + drivers/gpu/drm/amd/amdkfd/kfd_device.c | 283 +++- + .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 497 +++++- + .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 29 +- + .../drm/amd/amdkfd/kfd_device_queue_manager_cik.c | 2 + + .../drm/amd/amdkfd/kfd_device_queue_manager_vi.c | 106 ++ + drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 3 +- + drivers/gpu/drm/amd/amdkfd/kfd_events.c | 522 ++++--- + drivers/gpu/drm/amd/amdkfd/kfd_events.h | 3 +- + drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 79 +- + drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 6 +- + drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 26 +- + drivers/gpu/drm/amd/amdkfd/kfd_module.c | 30 +- + drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 3 + + drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h | 3 +- + drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 92 +- + drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 227 ++- + drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 212 ++- + drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h | 120 +- + drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 269 +++- + drivers/gpu/drm/amd/amdkfd/kfd_process.c | 542 ++++++- + .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 102 +- + drivers/gpu/drm/amd/amdkfd/kfd_rdma.c | 296 ++++ + drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 841 +++++----- + drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 23 +- + drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 144 +- + 44 files changed, 10790 
insertions(+), 1992 deletions(-) + create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c + create mode 100644 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h + create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_crat.c + create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_rdma.c + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +index ef56352..daeb85f 100755 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +@@ -21,12 +21,14 @@ + */ + + #include "amdgpu_amdkfd.h" +-#include "amd_shared.h" ++#include <linux/dma-buf.h> + #include <drm/drmP.h> + #include "amdgpu.h" + #include "amdgpu_gfx.h" + #include <linux/module.h> + ++#define AMDKFD_SKIP_UNCOMPILED_CODE 1 ++ + const struct kfd2kgd_calls *kfd2kgd; + const struct kgd2kfd_calls *kgd2kfd; + bool (*kgd2kfd_init_p)(unsigned, const struct kgd2kfd_calls**); +@@ -64,12 +66,12 @@ int amdgpu_amdkfd_init(void) + bool amdgpu_amdkfd_load_interface(struct amdgpu_device *adev) + { + switch (adev->asic_type) { +-#ifdef CONFIG_DRM_AMDGPU_CIK + case CHIP_KAVERI: + kfd2kgd = amdgpu_amdkfd_gfx_7_get_functions(); + break; +-#endif + case CHIP_CARRIZO: ++ case CHIP_TONGA: ++ case CHIP_FIJI: + kfd2kgd = amdgpu_amdkfd_gfx_8_0_get_functions(); + break; + default: +@@ -102,7 +104,8 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev) + struct kgd2kfd_shared_resources gpu_resources = { + .compute_vmid_bitmap = 0xFF00, + .num_pipe_per_mec = adev->gfx.mec.num_pipe_per_mec, +- .num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe ++ .num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe, ++ .gpuvm_size = (uint64_t)amdgpu_vm_size << 30 + }; + + /* this is going to have a few of the MSBs set that we need to +@@ -167,6 +170,115 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev) + return r; + } + ++int amdgpu_amdkfd_evict_mem(struct amdgpu_device *adev, struct kgd_mem *mem, ++ struct mm_struct *mm) ++{ ++ int r; ++ ++ if (!adev->kfd) ++ return -ENODEV; ++ ++ mutex_lock(&mem->data2.lock); ++ ++ if (mem->data2.evicted == 1 && delayed_work_pending(&mem->data2.work)) ++ /* Cancelling a scheduled restoration */ ++ cancel_delayed_work(&mem->data2.work); ++ ++ if (++mem->data2.evicted > 1) { ++ mutex_unlock(&mem->data2.lock); ++ return 0; ++ } ++ ++ r = amdgpu_amdkfd_gpuvm_evict_mem(mem, mm); ++ ++ if (r != 0) ++ /* First eviction failed, setting count back to 0 will ++ * make the corresponding restore fail gracefully */ ++ mem->data2.evicted = 0; ++ else ++ /* First eviction counts as 2. Eviction counter == 1 ++ * means that restoration is scheduled. 
*/ ++ mem->data2.evicted = 2; ++ ++ mutex_unlock(&mem->data2.lock); ++ ++ return r; ++} ++ ++static void amdgdu_amdkfd_restore_mem_worker(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct kgd_mem *mem = container_of(dwork, struct kgd_mem, data2.work); ++ struct amdgpu_device *adev; ++ struct mm_struct *mm; ++ ++ mutex_lock(&mem->data2.lock); ++ ++ adev = mem->data2.bo->adev; ++ mm = mem->data2.mm; ++ ++ /* Restoration may have been canceled by another eviction or ++ * could already be done by a restore scheduled earlier */ ++ if (mem->data2.evicted == 1) { ++ amdgpu_amdkfd_gpuvm_restore_mem(mem, mm); ++ mem->data2.evicted = 0; ++ } ++ ++ mutex_unlock(&mem->data2.lock); ++} ++ ++int amdgpu_amdkfd_schedule_restore_mem(struct amdgpu_device *adev, ++ struct kgd_mem *mem, ++ struct mm_struct *mm, ++ unsigned long delay) ++{ ++ int r = 0; ++ ++ if (!adev->kfd) ++ return -ENODEV; ++ ++ mutex_lock(&mem->data2.lock); ++ ++ if (mem->data2.evicted <= 1) { ++ /* Buffer is not evicted (== 0) or its restoration is ++ * already scheduled (== 1) */ ++ pr_err("Unbalanced restore of evicted buffer %p\n", mem); ++ mutex_unlock(&mem->data2.lock); ++ return -EFAULT; ++ } else if (--mem->data2.evicted > 1) { ++ mutex_unlock(&mem->data2.lock); ++ return 0; ++ } ++ ++ /* mem->data2.evicted is 1 after decrememting. Schedule ++ * restoration. */ ++ if (delayed_work_pending(&mem->data2.work)) ++ cancel_delayed_work(&mem->data2.work); ++ mem->data2.mm = mm; ++ INIT_DELAYED_WORK(&mem->data2.work, ++ amdgdu_amdkfd_restore_mem_worker); ++ schedule_delayed_work(&mem->data2.work, delay); ++ ++ mutex_unlock(&mem->data2.lock); ++ ++ return r; ++} ++ ++void amdgpu_amdkfd_cancel_restore_mem(struct amdgpu_device *adev, ++ struct kgd_mem *mem) ++{ ++ if (delayed_work_pending(&mem->data2.work)) ++ cancel_delayed_work_sync(&mem->data2.work); ++} ++ ++u32 pool_to_domain(enum kgd_memory_pool p) ++{ ++ switch (p) { ++ case KGD_POOL_FRAMEBUFFER: return AMDGPU_GEM_DOMAIN_VRAM; ++ default: return AMDGPU_GEM_DOMAIN_GTT; ++ } ++} ++ + int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, + void **mem_obj, uint64_t *gpu_addr, + void **cpu_ptr) +@@ -192,38 +304,38 @@ int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, + } + + /* map the buffer */ +- r = amdgpu_bo_reserve((*mem)->bo, true); ++ r = amdgpu_bo_reserve((*mem)->data1.bo, true); + if (r) { + dev_err(adev->dev, "(%d) failed to reserve bo for amdkfd\n", r); + goto allocate_mem_reserve_bo_failed; + } + +- r = amdgpu_bo_pin((*mem)->bo, AMDGPU_GEM_DOMAIN_GTT, +- &(*mem)->gpu_addr); ++ r = amdgpu_bo_pin((*mem)->data1.bo, AMDGPU_GEM_DOMAIN_GTT, ++ &(*mem)->data1.gpu_addr); + if (r) { + dev_err(adev->dev, "(%d) failed to pin bo for amdkfd\n", r); + goto allocate_mem_pin_bo_failed; + } +- *gpu_addr = (*mem)->gpu_addr; ++ *gpu_addr = (*mem)->data1.gpu_addr; + +- r = amdgpu_bo_kmap((*mem)->bo, &(*mem)->cpu_ptr); ++ r = amdgpu_bo_kmap((*mem)->data1.bo, &(*mem)->data1.cpu_ptr); + if (r) { + dev_err(adev->dev, + "(%d) failed to map bo to kernel for amdkfd\n", r); + goto allocate_mem_kmap_bo_failed; + } +- *cpu_ptr = (*mem)->cpu_ptr; ++ *cpu_ptr = (*mem)->data1.cpu_ptr; + +- amdgpu_bo_unreserve((*mem)->bo); ++ amdgpu_bo_unreserve((*mem)->data1.bo); + + return 0; + + allocate_mem_kmap_bo_failed: +- amdgpu_bo_unpin((*mem)->bo); ++ amdgpu_bo_unpin((*mem)->data1.bo); + allocate_mem_pin_bo_failed: +- amdgpu_bo_unreserve((*mem)->bo); ++ amdgpu_bo_unreserve((*mem)->data1.bo); + allocate_mem_reserve_bo_failed: +- amdgpu_bo_unref(&(*mem)->bo); ++ 
amdgpu_bo_unref(&(*mem)->data1.bo); + + return r; + } +@@ -234,22 +346,44 @@ void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj) + + BUG_ON(mem == NULL); + +- amdgpu_bo_reserve(mem->bo, true); +- amdgpu_bo_kunmap(mem->bo); +- amdgpu_bo_unpin(mem->bo); +- amdgpu_bo_unreserve(mem->bo); +- amdgpu_bo_unref(&(mem->bo)); ++ amdgpu_bo_reserve(mem->data1.bo, true); ++ amdgpu_bo_kunmap(mem->data1.bo); ++ amdgpu_bo_unpin(mem->data1.bo); ++ amdgpu_bo_unreserve(mem->data1.bo); ++ amdgpu_bo_unref(&(mem->data1.bo)); + kfree(mem); + } + +-uint64_t get_vmem_size(struct kgd_dev *kgd) ++void get_local_mem_info(struct kgd_dev *kgd, ++ struct kfd_local_mem_info *mem_info) + { +- struct amdgpu_device *adev = +- (struct amdgpu_device *)kgd; ++ uint64_t address_mask; ++ resource_size_t aper_limit; ++ struct amdgpu_device *adev = (struct amdgpu_device *)kgd; + + BUG_ON(kgd == NULL); + +- return adev->mc.real_vram_size; ++ address_mask = ~((1UL << 40) - 1); ++ aper_limit = adev->mc.aper_base + adev->mc.aper_size; ++ memset(mem_info, 0, sizeof(*mem_info)); ++ if (!(adev->mc.aper_base & address_mask || ++ aper_limit & address_mask)) { ++ mem_info->local_mem_size_public = adev->mc.visible_vram_size; ++ mem_info->local_mem_size_private = adev->mc.real_vram_size - ++ adev->mc.visible_vram_size; ++ mem_info->vram_width = adev->mc.vram_width; ++ } else { ++ pr_err("amdgpu: vram aperture is out of 40bit address base: 0x%llx limit 0x%llx\n", ++ adev->mc.aper_base, aper_limit); ++ } ++ ++ pr_debug("amdgpu: address base: 0x%llx limit 0x%llx public 0x%llx private 0x%llx\n", ++ adev->mc.aper_base, aper_limit, ++ mem_info->local_mem_size_public, ++ mem_info->local_mem_size_private); ++ ++ if (amdgpu_powerplay || adev->pm.funcs->get_mclk) ++ mem_info->mem_clk_max = amdgpu_dpm_get_mclk(adev, false) / 100; + } + + uint64_t get_gpu_clock_counter(struct kgd_dev *kgd) +@@ -265,5 +399,94 @@ uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd) + { + struct amdgpu_device *adev = (struct amdgpu_device *)kgd; + /* The sclk is in quantas of 10kHz */ +- return adev->pm.dpm.dyn_state.max_clock_voltage_on_ac.sclk / 100; ++ if (amdgpu_powerplay) ++ return amdgpu_dpm_get_sclk(adev, false) / 100; ++ else ++ return adev->pm.dpm.dyn_state.max_clock_voltage_on_ac.sclk / 100; ++} ++ ++void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info) ++{ ++ struct amdgpu_device *adev = (struct amdgpu_device *)kgd; ++ struct amdgpu_cu_info acu_info; ++ ++ memset(cu_info, 0, sizeof(*cu_info)); ++ if (sizeof(cu_info->cu_bitmap) != sizeof(acu_info.bitmap)) ++ return; ++ ++ memset(&acu_info, 0, sizeof(acu_info)); ++ ++ cu_info->cu_active_number = acu_info.number; ++ cu_info->cu_ao_mask = acu_info.ao_cu_mask; ++ memcpy(&cu_info->cu_bitmap[0], &acu_info.bitmap[0], sizeof(acu_info.bitmap)); ++ cu_info->num_shader_engines = adev->gfx.config.max_shader_engines; ++ cu_info->num_shader_arrays_per_engine = adev->gfx.config.max_sh_per_se; ++ cu_info->num_cu_per_sh = adev->gfx.config.max_cu_per_sh; ++ cu_info->simd_per_cu = acu_info.simd_per_cu; ++ cu_info->max_waves_per_simd = acu_info.max_waves_per_simd; ++ cu_info->wave_front_size = acu_info.wave_front_size; ++ cu_info->max_scratch_slots_per_cu = acu_info.max_scratch_slots_per_cu; ++ cu_info->lds_size = acu_info.lds_size; ++} ++ ++int map_gtt_bo_to_kernel(struct kgd_dev *kgd, ++ struct kgd_mem *mem, void **kptr) ++{ ++ return 0; ++} ++ ++int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd, ++ struct kgd_dev **dma_buf_kgd, ++ uint64_t *bo_size, void *metadata_buffer, ++ size_t 
buffer_size, uint32_t *metadata_size, ++ uint32_t *flags) ++{ ++ struct amdgpu_device *adev = (struct amdgpu_device *)kgd; ++ struct dma_buf *dma_buf; ++ struct drm_gem_object *obj; ++ struct amdgpu_bo *bo; ++ uint64_t metadata_flags; ++ int r = -EINVAL; ++ ++ dma_buf = dma_buf_get(dma_buf_fd); ++ if (IS_ERR(dma_buf)) ++ return PTR_ERR(dma_buf); ++ ++ if (dma_buf->ops != &drm_gem_prime_dmabuf_ops) ++ /* Can't handle non-graphics buffers */ ++ goto out_put; ++ ++ obj = dma_buf->priv; ++ if (obj->dev->driver != adev->ddev->driver) ++ /* Can't handle buffers from different drivers */ ++ goto out_put; ++ ++ adev = obj->dev->dev_private; ++ bo = gem_to_amdgpu_bo(obj); ++ if (!(bo->initial_domain & (AMDGPU_GEM_DOMAIN_VRAM | ++ AMDGPU_GEM_DOMAIN_GTT))) ++ /* Only VRAM and GTT BOs are supported */ ++ goto out_put; ++ ++ r = 0; ++ if (dma_buf_kgd) ++ *dma_buf_kgd = (struct kgd_dev *)adev; ++ if (bo_size) ++ *bo_size = amdgpu_bo_size(bo); ++ if (metadata_size) ++ *metadata_size = bo->metadata_size; ++ if (metadata_buffer) ++ r = amdgpu_bo_get_metadata(bo, metadata_buffer, buffer_size, ++ metadata_size, &metadata_flags); ++ if (flags) { ++ *flags = (bo->initial_domain & AMDGPU_GEM_DOMAIN_VRAM) ? ++ ALLOC_MEM_FLAGS_VRAM : ALLOC_MEM_FLAGS_GTT; ++ ++ if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) ++ *flags |= ALLOC_MEM_FLAGS_PUBLIC; ++ } ++ ++out_put: ++ dma_buf_put(dma_buf); ++ return r; + } +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +index 8e8c10e..5fa506d 100755 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +@@ -27,14 +27,46 @@ + + #include <linux/types.h> + #include <linux/mm.h> ++#include <linux/workqueue.h> + #include <kgd_kfd_interface.h> + ++extern const struct kgd2kfd_calls *kgd2kfd; ++ + struct amdgpu_device; + ++struct kfd_bo_va_list { ++ struct list_head bo_list; ++ struct amdgpu_bo_va *bo_va; ++ void *kgd_dev; ++ bool is_mapped; ++}; ++ + struct kgd_mem { +- struct amdgpu_bo *bo; +- uint64_t gpu_addr; +- void *cpu_ptr; ++ union { ++ struct { ++ struct amdgpu_bo *bo; ++ uint64_t gpu_addr; ++ void *cpu_ptr; ++ } data1; ++ struct { ++ struct mutex lock; ++ struct amdgpu_bo *bo; ++ struct list_head bo_va_list; ++ uint32_t domain; ++ unsigned int mapped_to_gpu_memory; ++ void *kptr; ++ uint64_t va; ++ unsigned evicted; /* eviction counter */ ++ struct delayed_work work; /* for restore evicted mem */ ++ struct mm_struct *mm; /* for restore */ ++ /* flags bitfield */ ++ bool readonly : 1; ++ bool execute : 1; ++ bool no_substitute : 1; ++ bool aql_queue : 1; ++ } data2; ++ }; ++ + }; + + +@@ -51,17 +83,81 @@ void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev); + void amdgpu_amdkfd_device_init(struct amdgpu_device *adev); + void amdgpu_amdkfd_device_fini(struct amdgpu_device *adev); + ++int amdgpu_amdkfd_evict_mem(struct amdgpu_device *adev, struct kgd_mem *mem, ++ struct mm_struct *mm); ++int amdgpu_amdkfd_schedule_restore_mem(struct amdgpu_device *adev, ++ struct kgd_mem *mem, ++ struct mm_struct *mm, ++ unsigned long delay); ++void amdgpu_amdkfd_cancel_restore_mem(struct amdgpu_device *adev, ++ struct kgd_mem *mem); ++ + struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void); + struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void); + + /* Shared API */ ++int map_bo(struct amdgpu_device *rdev, uint64_t va, void *vm, ++ struct amdgpu_bo *bo, struct amdgpu_bo_va **bo_va); + int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, + void **mem_obj, uint64_t 
*gpu_addr, + void **cpu_ptr); + void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj); +-uint64_t get_vmem_size(struct kgd_dev *kgd); ++void get_local_mem_info(struct kgd_dev *kgd, ++ struct kfd_local_mem_info *mem_info); + uint64_t get_gpu_clock_counter(struct kgd_dev *kgd); + + uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd); ++void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info); ++int map_gtt_bo_to_kernel(struct kgd_dev *kgd, ++ struct kgd_mem *mem, void **kptr); ++int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd, ++ struct kgd_dev **dmabuf_kgd, ++ uint64_t *bo_size, void *metadata_buffer, ++ size_t buffer_size, uint32_t *metadata_size, ++ uint32_t *flags); ++ ++/* GPUVM API */ ++int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( ++ struct kgd_dev *kgd, uint64_t va, size_t size, ++ void *vm, struct kgd_mem **mem, ++ uint64_t *offset, void **kptr, ++ struct kfd_process_device *pdd, uint32_t flags); ++int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( ++ struct kgd_dev *kgd, struct kgd_mem *mem); ++int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( ++ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm); ++int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( ++ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm); ++ ++int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm); ++void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm); ++ ++uint32_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm); ++ ++int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd, ++ struct kfd_vm_fault_info *info); ++ ++int amdgpu_amdkfd_gpuvm_mmap_bo( ++ struct kgd_dev *kgd, struct vm_area_struct *vma); ++ ++int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd, ++ struct kgd_mem *mem, void **kptr); ++ ++struct kfd_process_device *amdgpu_amdkfd_gpuvm_get_pdd_from_buffer_object( ++ struct kgd_dev *kgd, struct kgd_mem *mem); ++int amdgpu_amdkfd_gpuvm_return_bo_size( ++ struct kgd_dev *kgd, struct kgd_mem *mem); ++ ++int amdgpu_amdkfd_gpuvm_pin_get_sg_table(struct kgd_dev *kgd, ++ struct kgd_mem *mem, uint64_t offset, ++ uint64_t size, struct sg_table **ret_sg); ++void amdgpu_amdkfd_gpuvm_unpin_put_sg_table( ++ struct kgd_mem *mem, struct sg_table *sg); ++int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd, int dma_buf_fd, ++ uint64_t va, void *vm, ++ struct kgd_mem **mem, uint64_t *size); ++int amdgpu_amdkfd_gpuvm_evict_mem(struct kgd_mem *mem, struct mm_struct *mm); ++int amdgpu_amdkfd_gpuvm_restore_mem(struct kgd_mem *mem, struct mm_struct *mm); + + #endif /* AMDGPU_AMDKFD_H_INCLUDED */ ++ +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +index e283d31..873e2b7 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +@@ -38,6 +38,9 @@ + #include "gmc/gmc_7_1_sh_mask.h" + #include "cik_structs.h" + ++ ++#define AMDKFD_SKIP_UNCOMPILED_CODE 1 ++ + enum { + MAX_TRAPID = 8, /* 3 bits in the bitfield. */ + MAX_WATCH_ADDRESSES = 4 +@@ -54,8 +57,8 @@ enum { + enum { + ADDRESS_WATCH_REG_CNTL_ATC_BIT = 0x10000000UL, + ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK = 0x00FFFFFF, +- ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENSION = 0x03000000, +- /* extend the mask to 26 bits to match the low address field */ ++ ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENTION = 0x03000000, ++ /* extend the mask to 26 bits in order to match the low address field. 
*/ + ADDRESS_WATCH_REG_ADDLOW_SHIFT = 6, + ADDRESS_WATCH_REG_ADDHIGH_MASK = 0xFFFF + }; +@@ -80,30 +83,43 @@ union TCP_WATCH_CNTL_BITS { + float f32All; + }; + ++static int create_process_vm(struct kgd_dev *kgd, void **vm); ++static void destroy_process_vm(struct kgd_dev *kgd, void *vm); ++ ++static uint32_t get_process_page_dir(void *vm); ++ ++static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, int fd, uint32_t handle, struct kgd_mem **mem); ++static int map_memory_to_gpu(struct kgd_dev *kgd, struct kgd_mem *mem, ++ void *vm); ++static int unmap_memory_from_gpu(struct kgd_dev *kgd, struct kgd_mem *mem, ++ void *vm); ++static int alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va, size_t size, ++ void *vm, struct kgd_mem **mem, ++ uint64_t *offset, void **kptr, struct kfd_process_device *pdd, ++ uint32_t flags); ++static int free_memory_of_gpu(struct kgd_dev *kgd, struct kgd_mem *mem); ++ ++static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); ++ + /* + * Register access functions + */ + +-static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, +- uint32_t sh_mem_config, uint32_t sh_mem_ape1_base, +- uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases); +- +-static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, +- unsigned int vmid); +- +-static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, +- uint32_t hpd_size, uint64_t hpd_gpu_addr); ++static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, uint32_t sh_mem_config, ++ uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases); ++static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, unsigned int vmid); ++static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, uint32_t hpd_size, uint64_t hpd_gpu_addr); + static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); + static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, +- uint32_t queue_id, uint32_t __user *wptr); ++ uint32_t queue_id, uint32_t __user *wptr, ++ uint32_t page_table_base); + static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd); + static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, +- uint32_t pipe_id, uint32_t queue_id); +- ++ uint32_t pipe_id, uint32_t queue_id); ++static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); + static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, + unsigned int utimeout, uint32_t pipe_id, + uint32_t queue_id); +-static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); + static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, + unsigned int utimeout); + static int kgd_address_watch_disable(struct kgd_dev *kgd); +@@ -123,15 +139,25 @@ static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, uint8_t vmid); + static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, + uint8_t vmid); + static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); +- +-static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); ++static void set_num_of_requests(struct kgd_dev *dev, uint8_t num_of_req); ++static int alloc_memory_of_scratch(struct kgd_dev *kgd, ++ uint64_t va, uint32_t vmid); ++static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, ++ uint8_t element_size, uint8_t index_stride, uint8_t mtype); ++static int mmap_bo(struct kgd_dev *kgd, struct vm_area_struct *vma); ++static void 
set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, ++ uint32_t page_table_base); + + static const struct kfd2kgd_calls kfd2kgd = { + .init_gtt_mem_allocation = alloc_gtt_mem, + .free_gtt_mem = free_gtt_mem, +- .get_vmem_size = get_vmem_size, ++ .get_local_mem_info = get_local_mem_info, + .get_gpu_clock_counter = get_gpu_clock_counter, + .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, ++ .create_process_vm = create_process_vm, ++ .destroy_process_vm = destroy_process_vm, ++ .get_process_page_dir = get_process_page_dir, ++ .open_graphic_handle = open_graphic_handle, + .program_sh_mem_settings = kgd_program_sh_mem_settings, + .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, + .init_pipeline = kgd_init_pipeline, +@@ -149,14 +175,103 @@ static const struct kfd2kgd_calls kfd2kgd = { + .get_atc_vmid_pasid_mapping_pasid = get_atc_vmid_pasid_mapping_pasid, + .get_atc_vmid_pasid_mapping_valid = get_atc_vmid_pasid_mapping_valid, + .write_vmid_invalidate_request = write_vmid_invalidate_request, +- .get_fw_version = get_fw_version ++ .alloc_memory_of_gpu = alloc_memory_of_gpu, ++ .free_memory_of_gpu = free_memory_of_gpu, ++ .map_memory_to_gpu = map_memory_to_gpu, ++ .unmap_memory_to_gpu = unmap_memory_from_gpu, ++ .get_fw_version = get_fw_version, ++ .set_num_of_requests = set_num_of_requests, ++ .get_cu_info = get_cu_info, ++ .alloc_memory_of_scratch = alloc_memory_of_scratch, ++ .write_config_static_mem = write_config_static_mem, ++ .mmap_bo = mmap_bo, ++ .map_gtt_bo_to_kernel = map_gtt_bo_to_kernel, ++ .set_vm_context_page_table_base = set_vm_context_page_table_base, ++ .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info, ++ .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info + }; + +-struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void) ++struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions() + { + return (struct kfd2kgd_calls *)&kfd2kgd; + } + ++/* ++ * Creates a VM context for HSA process ++ */ ++static int create_process_vm(struct kgd_dev *kgd, void **vm) ++{ ++ int ret; ++ struct amdgpu_vm *new_vm; ++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; ++ ++ BUG_ON(kgd == NULL); ++ BUG_ON(vm == NULL); ++ ++ new_vm = kzalloc(sizeof(struct amdgpu_vm), GFP_KERNEL); ++ if (new_vm == NULL) ++ return -ENOMEM; ++ ++ /* Initialize the VM context, allocate the page directory and zero it */ ++ ret = amdgpu_vm_init(adev, new_vm); ++ if (ret != 0) { ++ /* Undo everything related to the new VM context */ ++ amdgpu_vm_fini(adev, new_vm); ++ kfree(new_vm); ++ new_vm = NULL; ++ } ++ ++ /* Pin the PD directory*/ ++ amdgpu_bo_reserve(new_vm->page_directory, true); ++ amdgpu_bo_pin(new_vm->page_directory, AMDGPU_GEM_DOMAIN_VRAM, NULL); ++ amdgpu_bo_unreserve(new_vm->page_directory); ++#if 0 ++ new_vm->pd_gpu_addr = amdgpu_bo_gpu_offset(new_vm->page_directory); ++#endif ++ *vm = (void *) new_vm; ++ ++ return ret; ++} ++ ++/* ++ * Destroys a VM context of HSA process ++ */ ++static void destroy_process_vm(struct kgd_dev *kgd, void *vm) ++{ ++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; ++ struct amdgpu_vm *rvm = (struct amdgpu_vm *) vm; ++ ++ BUG_ON(kgd == NULL); ++ BUG_ON(vm == NULL); ++ ++ /* Unpin the PD directory*/ ++ amdgpu_bo_reserve(rvm->page_directory, true); ++ amdgpu_bo_unpin(rvm->page_directory); ++ amdgpu_bo_unreserve(rvm->page_directory); ++ ++ /* Release the VM context */ ++ amdgpu_vm_fini(adev, rvm); ++ kfree(vm); ++} ++ ++static uint32_t get_process_page_dir(void *vm) ++{ ++#if 0 ++ struct amdgpu_vm *rvm = (struct amdgpu_vm *) vm; 
++ ++ BUG_ON(vm == NULL); ++ ++ return rvm->pd_gpu_addr >> AMDGPU_GPU_PAGE_SHIFT; ++#endif ++ return 0; ++} ++ ++static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, ++ int fd, uint32_t handle, struct kgd_mem **mem) ++{ ++ return 0; ++} ++ + static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) + { + return (struct amdgpu_device *)kgd; +@@ -221,12 +336,11 @@ static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, + + /* + * We have to assume that there is no outstanding mapping. +- * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because +- * a mapping is in progress or because a mapping finished and the +- * SW cleared it. So the protocol is to always wait & clear. ++ * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because a mapping ++ * is in progress or because a mapping finished and the SW cleared it. ++ * So the protocol is to always wait & clear. + */ +- uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | +- ATC_VMID0_PASID_MAPPING__VALID_MASK; ++ uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | ATC_VMID0_PASID_MAPPING__VALID_MASK; + + WREG32(mmATC_VMID0_PASID_MAPPING + vmid, pasid_mapping); + +@@ -253,7 +367,7 @@ static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) + uint32_t mec; + uint32_t pipe; + +- mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; ++ mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; + pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); + + lock_srbm(kgd, mec, pipe, 0, 0); +@@ -272,8 +386,7 @@ static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m) + + retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET + + m->sdma_queue_id * KFD_CIK_SDMA_QUEUE_OFFSET; +- +- pr_debug("kfd: sdma base address: 0x%x\n", retval); ++ pr_err("kfd: sdma base address: 0x%x\n", retval); + + return retval; + } +@@ -289,7 +402,8 @@ static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) + } + + static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, +- uint32_t queue_id, uint32_t __user *wptr) ++ uint32_t queue_id, uint32_t __user *wptr, ++ uint32_t page_table_base) + { + struct amdgpu_device *adev = get_amdgpu_device(kgd); + uint32_t wptr_shadow, is_wptr_shadow_valid; +@@ -363,24 +477,13 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) + m = get_sdma_mqd(mqd); + sdma_base_addr = get_sdma_base_addr(m); + +- WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, +- m->sdma_rlc_virtual_addr); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, +- m->sdma_rlc_rb_base); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, +- m->sdma_rlc_virtual_addr); +- +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, +- m->sdma_rlc_rb_base_hi); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, +- m->sdma_rlc_rb_rptr_addr_lo); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, +- m->sdma_rlc_rb_rptr_addr_hi); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, +- m->sdma_rlc_doorbell); +- +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, +- m->sdma_rlc_rb_cntl); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, m->sdma_rlc_virtual_addr); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdma_rlc_rb_base); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, m->sdma_rlc_rb_base_hi); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, m->sdma_rlc_rb_rptr_addr_lo); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, m->sdma_rlc_rb_rptr_addr_hi); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 
m->sdma_rlc_doorbell); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, m->sdma_rlc_rb_cntl); + + return 0; + } +@@ -440,10 +543,11 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, + + while (true) { + temp = RREG32(mmCP_HQD_ACTIVE); +- if (temp & CP_HQD_ACTIVE__ACTIVE_MASK) ++ if (temp & CP_HQD_ACTIVE__ACTIVE__SHIFT) + break; + if (timeout <= 0) { +- pr_err("kfd: cp queue preemption time out.\n"); ++ pr_err("kfd: cp queue preemption time out (%dms)\n", ++ temp); + release_queue(kgd); + return -ETIME; + } +@@ -503,8 +607,8 @@ static int kgd_address_watch_disable(struct kgd_dev *kgd) + + /* Turning off this address until we set all the registers */ + for (i = 0; i < MAX_WATCH_ADDRESSES; i++) +- WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX + +- ADDRESS_WATCH_REG_CNTL], cntl.u32All); ++ WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], ++ cntl.u32All); + + return 0; + } +@@ -522,20 +626,20 @@ static int kgd_address_watch_execute(struct kgd_dev *kgd, + + /* Turning off this watch point until we set all the registers */ + cntl.bitfields.valid = 0; +- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + +- ADDRESS_WATCH_REG_CNTL], cntl.u32All); ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], ++ cntl.u32All); + +- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + +- ADDRESS_WATCH_REG_ADDR_HI], addr_hi); ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_HI], ++ addr_hi); + +- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + +- ADDRESS_WATCH_REG_ADDR_LO], addr_lo); ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_LO], ++ addr_lo); + + /* Enable the watch point */ + cntl.bitfields.valid = 1; + +- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + +- ADDRESS_WATCH_REG_CNTL], cntl.u32All); ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], ++ cntl.u32All); + + return 0; + } +@@ -589,7 +693,7 @@ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + + reg = RREG32(mmATC_VMID0_PASID_MAPPING + vmid); +- return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; ++ return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; + } + + static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) +@@ -599,6 +703,56 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) + WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); + } + ++static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, ++ uint8_t element_size, uint8_t index_stride, uint8_t mtype) ++{ ++ uint32_t reg; ++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; ++ ++ reg = swizzle_enable << SH_STATIC_MEM_CONFIG__SWIZZLE_ENABLE__SHIFT | ++ element_size << SH_STATIC_MEM_CONFIG__ELEMENT_SIZE__SHIFT | ++ index_stride << SH_STATIC_MEM_CONFIG__INDEX_STRIDE__SHIFT | ++ mtype << SH_STATIC_MEM_CONFIG__PRIVATE_MTYPE__SHIFT; ++ ++ WREG32(mmSH_STATIC_MEM_CONFIG, reg); ++ return 0; ++} ++static int alloc_memory_of_scratch(struct kgd_dev *kgd, ++ uint64_t va, uint32_t vmid) ++{ ++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; ++ ++ lock_srbm(kgd, 0, 0, 0, vmid); ++ WREG32(mmSH_HIDDEN_PRIVATE_BASE_VMID, va); ++ unlock_srbm(kgd); ++ ++ return 0; ++} ++ ++ ++static int alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va, size_t size, ++ void *vm, struct kgd_mem **mem, uint64_t *offset, ++ void **kptr, struct kfd_process_device 
*pdd, uint32_t flags) ++{ ++ return -EFAULT; ++} ++ ++static int free_memory_of_gpu(struct kgd_dev *kgd, struct kgd_mem *mem) ++{ ++ return -EFAULT; ++} ++ ++static int map_memory_to_gpu(struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) ++{ ++ return -EFAULT; ++} ++ ++static int unmap_memory_from_gpu(struct kgd_dev *kgd, struct kgd_mem *mem, ++ void *vm) ++{ ++ return -EFAULT; ++} ++ + static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) + { + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; +@@ -639,12 +793,12 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) + + case KGD_ENGINE_SDMA1: + hdr = (const union amdgpu_firmware_header *) +- adev->sdma.instance[0].fw->data; ++ adev->sdma[0].fw->data; + break; + + case KGD_ENGINE_SDMA2: + hdr = (const union amdgpu_firmware_header *) +- adev->sdma.instance[1].fw->data; ++ adev->sdma[1].fw->data; + break; + + default: +@@ -658,3 +812,32 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) + return hdr->common.ucode_version; + } + ++static void set_num_of_requests(struct kgd_dev *dev, uint8_t num_of_req) ++{ ++ uint32_t value; ++ struct amdgpu_device *adev = get_amdgpu_device(dev); ++ ++ value = RREG32(mmATC_ATS_DEBUG); ++ value &= ~ATC_ATS_DEBUG__NUM_REQUESTS_AT_ERR_MASK; ++ value |= (num_of_req << ATC_ATS_DEBUG__NUM_REQUESTS_AT_ERR__SHIFT); ++ ++ WREG32(mmATC_ATS_DEBUG, value); ++} ++ ++static int mmap_bo(struct kgd_dev *kgd, struct vm_area_struct *vma) ++{ ++ return 0; ++} ++ ++static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, ++ uint32_t page_table_base) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ /* TODO: Don't use hardcoded VMIDs */ ++ if (vmid < 8 || vmid > 15) { ++ pr_err("amdkfd: trying to set page table base for wrong VMID\n"); ++ return; ++ } ++ WREG32(mmVM_CONTEXT8_PAGE_TABLE_BASE_ADDR + vmid - 8, page_table_base); ++} ++ +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +index e00fadd..aeca2b6 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +@@ -28,6 +28,7 @@ + #include "amdgpu.h" + #include "amdgpu_amdkfd.h" + #include "amdgpu_ucode.h" ++#include "amdgpu_amdkfd_gfx_v8.h" + #include "gca/gfx_8_0_sh_mask.h" + #include "gca/gfx_8_0_d.h" + #include "gca/gfx_8_0_enum.h" +@@ -38,7 +39,24 @@ + #include "vi_structs.h" + #include "vid.h" + +-struct cik_sdma_rlc_registers; ++static const uint32_t watchRegs[MAX_WATCH_ADDRESSES * ADDRESS_WATCH_REG_MAX] = { ++ mmTCP_WATCH0_ADDR_H, mmTCP_WATCH0_ADDR_L, mmTCP_WATCH0_CNTL, ++ mmTCP_WATCH1_ADDR_H, mmTCP_WATCH1_ADDR_L, mmTCP_WATCH1_CNTL, ++ mmTCP_WATCH2_ADDR_H, mmTCP_WATCH2_ADDR_L, mmTCP_WATCH2_CNTL, ++ mmTCP_WATCH3_ADDR_H, mmTCP_WATCH3_ADDR_L, mmTCP_WATCH3_CNTL ++}; ++ ++ ++struct vi_sdma_mqd; ++ ++static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, ++ void *vm, struct kgd_mem **mem); ++static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem); ++ ++static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, ++ int fd, uint32_t handle, struct kgd_mem **mem); ++ ++static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); + + /* + * Register access functions +@@ -54,7 +72,8 @@ static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, + uint32_t hpd_size, uint64_t hpd_gpu_addr); + static int kgd_init_interrupts(struct kgd_dev *kgd, 
uint32_t pipe_id); + static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, +- uint32_t queue_id, uint32_t __user *wptr); ++ uint32_t queue_id, uint32_t __user *wptr, ++ uint32_t page_table_base); + static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd); + static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, + uint32_t pipe_id, uint32_t queue_id); +@@ -83,14 +102,27 @@ static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, + static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, + uint8_t vmid); + static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); +-static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); ++static void set_num_of_requests(struct kgd_dev *kgd, ++ uint8_t num_of_requests); ++static int alloc_memory_of_scratch(struct kgd_dev *kgd, ++ uint64_t va, uint32_t vmid); ++static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, ++ uint8_t element_size, uint8_t index_stride, uint8_t mtype); ++static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, ++ uint32_t page_table_base); + + static const struct kfd2kgd_calls kfd2kgd = { + .init_gtt_mem_allocation = alloc_gtt_mem, + .free_gtt_mem = free_gtt_mem, +- .get_vmem_size = get_vmem_size, ++ .get_local_mem_info = get_local_mem_info, + .get_gpu_clock_counter = get_gpu_clock_counter, + .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, ++ .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm, ++ .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm, ++ .create_process_gpumem = create_process_gpumem, ++ .destroy_process_gpumem = destroy_process_gpumem, ++ .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir, ++ .open_graphic_handle = open_graphic_handle, + .program_sh_mem_settings = kgd_program_sh_mem_settings, + .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, + .init_pipeline = kgd_init_pipeline, +@@ -110,14 +142,52 @@ static const struct kfd2kgd_calls kfd2kgd = { + .get_atc_vmid_pasid_mapping_valid = + get_atc_vmid_pasid_mapping_valid, + .write_vmid_invalidate_request = write_vmid_invalidate_request, +- .get_fw_version = get_fw_version ++ .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, ++ .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, ++ .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, ++ .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu, ++ .get_fw_version = get_fw_version, ++ .set_num_of_requests = set_num_of_requests, ++ .get_cu_info = get_cu_info, ++ .set_num_of_requests = set_num_of_requests, ++ .alloc_memory_of_scratch = alloc_memory_of_scratch, ++ .write_config_static_mem = write_config_static_mem, ++ .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo, ++ .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel, ++ .set_vm_context_page_table_base = set_vm_context_page_table_base, ++ .get_pdd_from_buffer_object = ++ amdgpu_amdkfd_gpuvm_get_pdd_from_buffer_object, ++ .return_bo_size = amdgpu_amdkfd_gpuvm_return_bo_size, ++ .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table, ++ .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table, ++ .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info, ++ .import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf, ++ .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info + }; + +-struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void) ++struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions() + { + return (struct 
kfd2kgd_calls *)&kfd2kgd; + } + ++static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, ++ void *vm, struct kgd_mem **mem) ++{ ++ return 0; ++} ++ ++/* Destroys the GPU allocation and frees the kgd_mem structure */ ++static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem) ++{ ++ ++} ++ ++static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, ++ int fd, uint32_t handle, struct kgd_mem **mem) ++{ ++ return 0; ++} ++ + static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) + { + return (struct amdgpu_device *)kgd; +@@ -227,9 +297,15 @@ static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) + return 0; + } + +-static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m) ++static inline uint32_t get_sdma_base_addr(struct vi_sdma_mqd *m) + { +- return 0; ++ uint32_t retval; ++ ++ retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET + ++ m->sdma_queue_id * KFD_VI_SDMA_QUEUE_OFFSET; ++ pr_debug("kfd: sdma base address: 0x%x\n", retval); ++ ++ return retval; + } + + static inline struct vi_mqd *get_mqd(void *mqd) +@@ -237,13 +313,14 @@ static inline struct vi_mqd *get_mqd(void *mqd) + return (struct vi_mqd *)mqd; + } + +-static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) ++static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) + { +- return (struct cik_sdma_rlc_registers *)mqd; ++ return (struct vi_sdma_mqd *)mqd; + } + + static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, +- uint32_t queue_id, uint32_t __user *wptr) ++ uint32_t queue_id, uint32_t __user *wptr, ++ uint32_t page_table_base) + { + struct vi_mqd *m; + uint32_t shadow_wptr, valid_wptr; +@@ -306,6 +383,49 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, + + static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) + { ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ struct vi_sdma_mqd *m; ++ uint32_t sdma_base_addr; ++ uint32_t temp, timeout = 2000; ++ uint32_t data; ++ ++ ++ m = get_sdma_mqd(mqd); ++ sdma_base_addr = get_sdma_base_addr(m); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, ++ m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); ++ ++ while (true) { ++ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); ++ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) ++ break; ++ if (timeout == 0) ++ return -ETIME; ++ msleep(10); ++ timeout -= 10; ++ } ++ if (m->sdma_engine_id) { ++ data = RREG32(mmSDMA1_GFX_CONTEXT_CNTL); ++ data = REG_SET_FIELD(data, SDMA1_GFX_CONTEXT_CNTL, ++ RESUME_CTX, 0); ++ WREG32(mmSDMA1_GFX_CONTEXT_CNTL, data); ++ } else { ++ data = RREG32(mmSDMA0_GFX_CONTEXT_CNTL); ++ data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, ++ RESUME_CTX, 0); ++ WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data); ++ } ++ ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, m->sdmax_rlcx_doorbell); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, m->sdmax_rlcx_virtual_addr); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, m->sdmax_rlcx_rb_base_hi); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, m->sdmax_rlcx_rb_rptr_addr_lo); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, m->sdmax_rlcx_rb_rptr_addr_hi); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, m->sdmax_rlcx_rb_cntl); ++ + return 0; + } + +@@ -334,7 +454,7 @@ static bool 
kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, + static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) + { + struct amdgpu_device *adev = get_amdgpu_device(kgd); +- struct cik_sdma_rlc_registers *m; ++ struct vi_sdma_mqd *m; + uint32_t sdma_base_addr; + uint32_t sdma_rlc_rb_cntl; + +@@ -382,7 +502,7 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, + unsigned int utimeout) + { + struct amdgpu_device *adev = get_amdgpu_device(kgd); +- struct cik_sdma_rlc_registers *m; ++ struct vi_sdma_mqd *m; + uint32_t sdma_base_addr; + uint32_t temp; + int timeout = utimeout; +@@ -396,7 +516,7 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, + + while (true) { + temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); +- if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT) ++ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) + break; + if (timeout <= 0) + return -ETIME; +@@ -405,9 +525,9 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, + } + + WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, 0); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, ++ RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | ++ SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); + + return 0; + } +@@ -429,7 +549,7 @@ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + + reg = RREG32(mmATC_VMID0_PASID_MAPPING + vmid); +- return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; ++ return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; + } + + static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) +@@ -441,6 +561,21 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) + + static int kgd_address_watch_disable(struct kgd_dev *kgd) + { ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ union TCP_WATCH_CNTL_BITS cntl; ++ unsigned int i; ++ ++ cntl.u32All = 0; ++ ++ cntl.bitfields.valid = 0; ++ cntl.bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK; ++ cntl.bitfields.atc = 1; ++ ++ /* Turning off this address until we set all the registers */ ++ for (i = 0; i < MAX_WATCH_ADDRESSES; i++) ++ WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], ++ cntl.u32All); ++ + return 0; + } + +@@ -450,6 +585,28 @@ static int kgd_address_watch_execute(struct kgd_dev *kgd, + uint32_t addr_hi, + uint32_t addr_lo) + { ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ union TCP_WATCH_CNTL_BITS cntl; ++ ++ cntl.u32All = cntl_val; ++ ++ /* Turning off this watch point until we set all the registers */ ++ cntl.bitfields.valid = 0; ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], ++ cntl.u32All); ++ ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_HI], ++ addr_hi); ++ ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_LO], ++ addr_lo); ++ ++ /* Enable the watch point */ ++ cntl.bitfields.valid = 1; ++ ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], ++ cntl.u32All); ++ + return 0; + } + +@@ -482,6 +639,32 @@ static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, + unsigned int watch_point_id, + unsigned int reg_offset) + { ++ return watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + reg_offset]; ++} ++ ++static int 
write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, ++ uint8_t element_size, uint8_t index_stride, uint8_t mtype) ++{ ++ uint32_t reg; ++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; ++ ++ reg = swizzle_enable << SH_STATIC_MEM_CONFIG__SWIZZLE_ENABLE__SHIFT | ++ element_size << SH_STATIC_MEM_CONFIG__ELEMENT_SIZE__SHIFT | ++ index_stride << SH_STATIC_MEM_CONFIG__INDEX_STRIDE__SHIFT | ++ mtype << SH_STATIC_MEM_CONFIG__PRIVATE_MTYPE__SHIFT; ++ ++ WREG32(mmSH_STATIC_MEM_CONFIG, reg); ++ return 0; ++} ++static int alloc_memory_of_scratch(struct kgd_dev *kgd, ++ uint64_t va, uint32_t vmid) ++{ ++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; ++ ++ lock_srbm(kgd, 0, 0, 0, vmid); ++ WREG32(mmSH_HIDDEN_PRIVATE_BASE_VMID, va); ++ unlock_srbm(kgd); ++ + return 0; + } + +@@ -525,12 +708,12 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) + + case KGD_ENGINE_SDMA1: + hdr = (const union amdgpu_firmware_header *) +- adev->sdma.instance[0].fw->data; ++ adev->sdma[0].fw->data; + break; + + case KGD_ENGINE_SDMA2: + hdr = (const union amdgpu_firmware_header *) +- adev->sdma.instance[1].fw->data; ++ adev->sdma[1].fw->data; + break; + + default: +@@ -543,3 +726,21 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) + /* Only 12 bit in use*/ + return hdr->common.ucode_version; + } ++ ++static void set_num_of_requests(struct kgd_dev *kgd, ++ uint8_t num_of_requests) ++{ ++ pr_debug("in %s this is a stub\n", __func__); ++} ++ ++static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, ++ uint32_t page_table_base) ++{ ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ /* TODO: Don't use hardcoded VMIDs */ ++ if (vmid < 8 || vmid > 15) { ++ pr_err("amdkfd: trying to set page table base for wrong VMID\n"); ++ return; ++ } ++ WREG32(mmVM_CONTEXT8_PAGE_TABLE_BASE_ADDR + vmid - 8, page_table_base); ++} +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +new file mode 100644 +index 0000000..454c247 +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +@@ -0,0 +1,1619 @@ ++/* ++ * Copyright 2014 Advanced Micro Devices, Inc. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. 
++ */ ++ ++#include <linux/module.h> ++#include <linux/fdtable.h> ++#include <linux/uaccess.h> ++#include <linux/firmware.h> ++#include <linux/list.h> ++#include <drm/drmP.h> ++#include <linux/dma-buf.h> ++#include "amdgpu.h" ++#include "amdgpu_amdkfd.h" ++#include "amdgpu_ucode.h" ++#include "gca/gfx_8_0_sh_mask.h" ++#include "gca/gfx_8_0_d.h" ++#include "gca/gfx_8_0_enum.h" ++#include "oss/oss_3_0_sh_mask.h" ++#include "oss/oss_3_0_d.h" ++#include "gmc/gmc_8_1_sh_mask.h" ++#include "gmc/gmc_8_1_d.h" ++#include "vi_structs.h" ++#include "vid.h" ++ ++/* Special VM and GART address alignment needed for VI pre-Fiji due to ++ * a HW bug. */ ++#define VI_BO_SIZE_ALIGN (0x8000) ++ ++static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) ++{ ++ return (struct amdgpu_device *)kgd; ++} ++ ++struct kfd_process_device *amdgpu_amdkfd_gpuvm_get_pdd_from_buffer_object( ++ struct kgd_dev *kgd, struct kgd_mem *mem) ++{ ++ return mem->data2.bo->pdd; ++} ++ ++static bool check_if_add_bo_to_vm(struct amdgpu_vm *avm, ++ struct list_head *list_bo_va) ++{ ++ struct kfd_bo_va_list *entry; ++ ++ list_for_each_entry(entry, list_bo_va, bo_list) ++ if (entry->bo_va->vm == avm) ++ return false; ++ ++ return true; ++} ++ ++static int add_bo_to_vm(struct amdgpu_device *adev, uint64_t va, ++ struct amdgpu_vm *avm, struct amdgpu_bo *bo, ++ struct list_head *list_bo_va, ++ bool readonly, bool execute) ++{ ++ int ret; ++ struct kfd_bo_va_list *bo_va_entry; ++ uint32_t flags; ++ ++ bo_va_entry = kzalloc(sizeof(*bo_va_entry), GFP_KERNEL); ++ if (!bo_va_entry) ++ return -ENOMEM; ++ ++ BUG_ON(va == 0); ++ ++ pr_debug("amdkfd: adding bo_va to bo %p and va 0x%llx id 0x%x\n", ++ bo, va, adev->dev->id); ++ ++ amdgpu_bo_reserve(bo, true); ++ ++ /* Add BO to VM internal data structures*/ ++ bo_va_entry->bo_va = amdgpu_vm_bo_add(adev, avm, bo); ++ if (bo_va_entry->bo_va == NULL) { ++ ret = -EINVAL; ++ pr_err("amdkfd: Failed to add BO object to VM. ret == %d\n", ++ ret); ++ goto err_vmadd; ++ } ++ ++ flags = AMDGPU_PTE_READABLE | AMDGPU_PTE_WRITEABLE; ++ if (readonly) ++ flags = AMDGPU_PTE_READABLE; ++ if (execute) ++ flags |= AMDGPU_PTE_EXECUTABLE; ++ ++ /* Set virtual address for the allocation, allocate PTs, ++ * if needed, and zero them */ ++ ret = amdgpu_vm_bo_map(adev, bo_va_entry->bo_va, ++ va, 0, amdgpu_bo_size(bo), ++ flags | AMDGPU_PTE_VALID); ++ if (ret != 0) { ++ pr_err("amdkfd: Failed to set virtual address for BO. ret == %d (0x%llx)\n", ++ ret, va); ++ goto err_vmsetaddr; ++ } ++ ++ bo_va_entry->kgd_dev = (void *)adev; ++ bo_va_entry->is_mapped = false; ++ list_add(&bo_va_entry->bo_list, list_bo_va); ++ ++ return 0; ++ ++err_vmsetaddr: ++ amdgpu_bo_reserve(bo, true); ++ amdgpu_vm_bo_rmv(adev, bo_va_entry->bo_va); ++ /* This will put the bo_va_mapping on the vm->freed ++ * list. amdgpu_vm_clear_freed needs the PTs to be reserved so ++ * we don't call it here. That can wait until the next time ++ * the page tables are updated for a map or unmap. 
*/ ++ kfree(bo_va_entry); ++err_vmadd: ++ amdgpu_bo_unreserve(bo); ++ return ret; ++} ++ ++static void remove_bo_from_vm(struct amdgpu_device *adev, ++ struct amdgpu_bo *bo, struct amdgpu_bo_va *bo_va) ++{ ++ amdgpu_bo_reserve(bo, true); ++ amdgpu_vm_bo_rmv(adev, bo_va); ++ amdgpu_bo_unreserve(bo); ++} ++ ++ ++static int try_pin_bo(struct amdgpu_bo *bo, uint64_t *mc_address, bool resv, ++ uint32_t domain) ++{ ++ int ret = 0; ++ uint64_t temp; ++ ++ if (resv) { ++ ret = amdgpu_bo_reserve(bo, true); ++ if (ret != 0) ++ return ret; ++ } ++ ++ if (!amdgpu_ttm_tt_has_userptr(bo->tbo.ttm)) { ++ ret = amdgpu_bo_pin(bo, domain, &temp); ++ if (mc_address) ++ *mc_address = temp; ++ if (ret != 0) ++ goto error; ++ if (domain == AMDGPU_GEM_DOMAIN_GTT) { ++ ret = amdgpu_bo_kmap(bo, NULL); ++ if (ret != 0) { ++ pr_err("amdgpu: failed kmap GTT BO\n"); ++ goto error; ++ } ++ } ++ } else { ++ /* amdgpu_bo_pin doesn't support userptr. Therefore we ++ * can use the bo->pin_count for our version of ++ * pinning without conflict. */ ++ if (bo->pin_count == 0) { ++ amdgpu_ttm_placement_from_domain(bo, domain); ++ ret = ttm_bo_validate(&bo->tbo, &bo->placement, ++ true, false); ++ if (ret != 0) { ++ pr_err("amdgpu: failed to validate BO\n"); ++ goto error; ++ } ++ } ++ bo->pin_count++; ++ } ++ ++error: ++ if (resv) ++ amdgpu_bo_unreserve(bo); ++ ++ return ret; ++} ++ ++static int unpin_bo(struct amdgpu_bo *bo, bool resv) ++{ ++ int ret = 0; ++ ++ if (resv) { ++ ret = amdgpu_bo_reserve(bo, true); ++ if (ret != 0) ++ return ret; ++ } ++ ++ amdgpu_bo_kunmap(bo); ++ ++ if (!amdgpu_ttm_tt_has_userptr(bo->tbo.ttm)) { ++ ret = amdgpu_bo_unpin(bo); ++ if (ret != 0) ++ goto error; ++ } else if (--bo->pin_count == 0) { ++ amdgpu_ttm_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU); ++ ret = ttm_bo_validate(&bo->tbo, &bo->placement, true, false); ++ if (ret != 0) { ++ pr_err("amdgpu: failed to validate BO\n"); ++ goto error; ++ } ++ } ++ ++error: ++ if (resv) ++ amdgpu_bo_unreserve(bo); ++ ++ return ret; ++} ++ ++ ++static int try_pin_pts(struct amdgpu_bo_va *bo_va, bool resv) ++{ ++ int ret; ++ uint64_t pt_idx, start, last, failed; ++ struct amdgpu_vm *vm; ++ struct amdgpu_bo_va_mapping *mapping; ++ ++ vm = bo_va->vm; ++ list_for_each_entry(mapping, &bo_va->valids, list) { ++ start = mapping->it.start >> amdgpu_vm_block_size; ++ last = mapping->it.last >> amdgpu_vm_block_size; ++ ++ pr_debug("start PT index %llu last PT index %llu\n", start, last); ++ ++ /* walk over the address space and pin the page tables BOs*/ ++ for (pt_idx = start; pt_idx <= last; pt_idx++) { ++ ret = try_pin_bo(vm->page_tables[pt_idx].bo, NULL, resv, ++ AMDGPU_GEM_DOMAIN_VRAM); ++ if (ret != 0) { ++ failed = pt_idx; ++ goto err; ++ } ++ } ++ } ++ ++ list_for_each_entry(mapping, &bo_va->invalids, list) { ++ start = mapping->it.start >> amdgpu_vm_block_size; ++ last = mapping->it.last >> amdgpu_vm_block_size; ++ ++ pr_debug("start PT index %llu last PT index %llu\n", start, last); ++ ++ /* walk over the address space and pin the page tables BOs*/ ++ for (pt_idx = start; pt_idx <= last; pt_idx++) { ++ ret = try_pin_bo(vm->page_tables[pt_idx].bo, NULL, resv, ++ AMDGPU_GEM_DOMAIN_VRAM); ++ if (ret != 0) { ++ failed = pt_idx; ++ goto err; ++ } ++ } ++ } ++ ++ return 0; ++ ++err: ++ pr_err("amdgpu: Failed to pin BO's PTEs\n"); ++ /* Unpin all already pinned BOs*/ ++ if (failed > 0) { ++ for (pt_idx = start; pt_idx <= failed - 1; pt_idx++) ++ unpin_bo(vm->page_tables[pt_idx].bo, resv); ++ } ++ return ret; ++} ++ ++static void unpin_pts(struct amdgpu_bo_va 
*bo_va, struct amdgpu_vm *vm, ++ bool resv) ++{ ++ uint64_t pt_idx, start, last; ++ struct amdgpu_bo_va_mapping *mapping; ++ ++ list_for_each_entry(mapping, &bo_va->valids, list) { ++ start = mapping->it.start >> amdgpu_vm_block_size; ++ last = mapping->it.last >> amdgpu_vm_block_size; ++ ++ pr_debug("start PT index %llu last PT index %llu\n", start, last); ++ ++ /* walk over the address space and unpin the page tables BOs*/ ++ for (pt_idx = start; pt_idx <= last; pt_idx++) ++ unpin_bo(vm->page_tables[pt_idx].bo, resv); ++ } ++ ++ list_for_each_entry(mapping, &bo_va->invalids, list) { ++ start = mapping->it.start >> amdgpu_vm_block_size; ++ last = mapping->it.last >> amdgpu_vm_block_size; ++ ++ pr_debug("start PT index %llu last PT index %llu\n", start, last); ++ ++ /* walk over the address space and unpin the page tables BOs*/ ++ for (pt_idx = start; pt_idx <= last; pt_idx++) ++ unpin_bo(vm->page_tables[pt_idx].bo, resv); ++ } ++} ++ ++static int __alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va, ++ size_t size, void *vm, struct kgd_mem **mem, ++ uint64_t *offset, void **kptr, struct kfd_process_device *pdd, ++ u32 domain, u64 flags, bool aql_queue, ++ bool readonly, bool execute, bool no_sub, bool userptr) ++{ ++ struct amdgpu_device *adev; ++ int ret; ++ struct amdgpu_bo *bo; ++ uint64_t user_addr = 0; ++ int byte_align; ++ ++ BUG_ON(kgd == NULL); ++ BUG_ON(size == 0); ++ BUG_ON(mem == NULL); ++ BUG_ON(vm == NULL); ++ ++ if (aql_queue) ++ size = size >> 1; ++ if (userptr) { ++ if (!offset || !*offset) ++ return -EINVAL; ++ user_addr = *offset; ++ } ++ ++ adev = get_amdgpu_device(kgd); ++ byte_align = adev->asic_type != CHIP_FIJI ? VI_BO_SIZE_ALIGN : 1; ++ ++ *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); ++ if (*mem == NULL) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ INIT_LIST_HEAD(&(*mem)->data2.bo_va_list); ++ mutex_init(&(*mem)->data2.lock); ++ (*mem)->data2.readonly = readonly; ++ (*mem)->data2.execute = execute; ++ (*mem)->data2.no_substitute = no_sub; ++ (*mem)->data2.aql_queue = aql_queue; ++ ++ pr_debug("amdkfd: allocating GTT BO size %lu\n", size); ++ ++ /* Allocate buffer object. Userptr objects need to start out ++ * in the CPU domain, get moved to GTT when pinned. */ ++ ret = amdgpu_bo_create(adev, size, byte_align, false, ++ userptr ? AMDGPU_GEM_DOMAIN_CPU : domain, ++ flags, NULL, NULL, &bo); ++ if (ret != 0) { ++ pr_err("amdkfd: Failed to create BO object on GTT. 
ret == %d\n", ++ ret); ++ goto err_bo_create; ++ } ++ bo->kfd_bo = *mem; ++ bo->pdd = pdd; ++ (*mem)->data2.bo = bo; ++ ++ pr_debug("Created BO on GTT with size %zu bytes\n", size); ++ ++ if (userptr) { ++ ret = amdgpu_ttm_tt_set_userptr(bo->tbo.ttm, user_addr, ++ AMDGPU_GEM_USERPTR_ANONONLY); ++ if (ret) { ++ dev_err(adev->dev, ++ "(%d) failed to set userptr\n", ret); ++ goto allocate_mem_set_userptr_failed; ++ } ++ ++ ret = amdgpu_mn_register(bo, user_addr); ++ if (ret) { ++ dev_err(adev->dev, ++ "(%d) failed to register MMU notifier\n", ret); ++ goto allocate_mem_set_userptr_failed; ++ } ++ } ++ ++ ret = add_bo_to_vm(adev, va, vm, bo, &(*mem)->data2.bo_va_list, ++ (*mem)->data2.readonly, (*mem)->data2.execute); ++ if (ret != 0) ++ goto err_map; ++ ++ if (aql_queue) { ++ ret = add_bo_to_vm(adev, va + size, ++ vm, bo, &(*mem)->data2.bo_va_list, ++ (*mem)->data2.readonly, (*mem)->data2.execute); ++ if (ret != 0) ++ goto err_map; ++ } ++ ++ pr_debug("Set BO to VA %p\n", (void *) va); ++ ++ if (kptr) { ++ ret = amdgpu_bo_reserve(bo, true); ++ if (ret) { ++ dev_err(adev->dev, "(%d) failed to reserve bo for amdkfd\n", ret); ++ goto allocate_mem_reserve_bo_failed; ++ } ++ ++ ret = amdgpu_bo_pin(bo, domain, ++ NULL); ++ if (ret) { ++ dev_err(adev->dev, "(%d) failed to pin bo for amdkfd\n", ret); ++ goto allocate_mem_pin_bo_failed; ++ } ++ ++ ret = amdgpu_bo_kmap(bo, kptr); ++ if (ret) { ++ dev_err(adev->dev, ++ "(%d) failed to map bo to kernel for amdkfd\n", ret); ++ goto allocate_mem_kmap_bo_failed; ++ } ++ (*mem)->data2.kptr = *kptr; ++ ++ amdgpu_bo_unreserve(bo); ++ } ++ ++ (*mem)->data2.va = va; ++ (*mem)->data2.domain = domain; ++ (*mem)->data2.mapped_to_gpu_memory = 0; ++ ++ if (offset) ++ *offset = amdgpu_bo_mmap_offset(bo); ++ ++ return 0; ++ ++allocate_mem_kmap_bo_failed: ++ amdgpu_bo_unpin(bo); ++allocate_mem_pin_bo_failed: ++ amdgpu_bo_unreserve(bo); ++allocate_mem_reserve_bo_failed: ++err_map: ++ if (userptr) ++ amdgpu_mn_unregister(bo); ++allocate_mem_set_userptr_failed: ++ amdgpu_bo_unref(&bo); ++err_bo_create: ++ kfree(*mem); ++err: ++ return ret; ++} ++ ++/* Reserving a BO and its page table BOs must happen atomically to ++ * avoid deadlocks. When updating userptrs we need to temporarily ++ * back-off the reservation and then reacquire it. Track all the ++ * reservation info in a context structure. Buffers can be mapped to ++ * multiple VMs simultaneously (buffers being restored on multiple ++ * GPUs). 
*/ ++struct bo_vm_reservation_context { ++ struct amdgpu_bo_list_entry kfd_bo; ++ unsigned n_vms; ++ struct amdgpu_bo_list_entry **vm_bos; ++ struct ww_acquire_ctx ticket; ++ struct list_head list, duplicates; ++ bool reserved; ++}; ++ ++static int reserve_bo_and_vms(struct amdgpu_device *adev, struct amdgpu_bo *bo, ++ struct list_head *bo_va_list, ++ struct amdgpu_vm *vm, bool is_mapped, ++ struct bo_vm_reservation_context *ctx) ++{ ++ struct kfd_bo_va_list *entry; ++ unsigned i; ++ int ret; ++ ++ INIT_LIST_HEAD(&ctx->list); ++ INIT_LIST_HEAD(&ctx->duplicates); ++ ++ ctx->kfd_bo.robj = bo; ++ ctx->kfd_bo.prefered_domains = bo->initial_domain; ++ ctx->kfd_bo.allowed_domains = bo->initial_domain; ++ ctx->kfd_bo.priority = 0; ++ ctx->kfd_bo.tv.bo = &bo->tbo; ++ ctx->kfd_bo.tv.shared = true; ++ ctx->kfd_bo.user_pages = NULL; ++ list_add(&ctx->kfd_bo.tv.head, &ctx->list); ++ ++ ctx->reserved = false; ++ ++ ctx->n_vms = 0; ++ list_for_each_entry(entry, bo_va_list, bo_list) { ++ if ((vm && vm != entry->bo_va->vm) || ++ entry->is_mapped != is_mapped) ++ continue; ++ ctx->n_vms++; ++ } ++ if (ctx->n_vms == 0) ++ ctx->vm_bos = NULL; ++ else { ++ ctx->vm_bos = kzalloc(sizeof(struct amdgpu_bo_list_entry *) ++ * ctx->n_vms, GFP_KERNEL); ++ if (ctx->vm_bos == NULL) ++ return -ENOMEM; ++ } ++ ++ i = 0; ++ list_for_each_entry(entry, bo_va_list, bo_list) { ++ if ((vm && vm != entry->bo_va->vm) || ++ entry->is_mapped != is_mapped) ++ continue; ++ ++ ctx->vm_bos[i] = amdgpu_vm_get_bos(adev, entry->bo_va->vm, ++ &ctx->list); ++ if (!ctx->vm_bos[i]) { ++ pr_err("amdkfd: Failed to get bos from vm\n"); ++ ret = -ENOMEM; ++ goto out; ++ } ++ i++; ++ } ++ ++ ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list, ++ false, &ctx->duplicates); ++ if (!ret) ++ ctx->reserved = true; ++ else ++ pr_err("amdkfd: Failed to reserve buffers in ttm\n"); ++ ++out: ++ if (ret) { ++ for (i = 0; i < ctx->n_vms; i++) { ++ if (ctx->vm_bos[i]) ++ drm_free_large(ctx->vm_bos[i]); ++ } ++ kfree(ctx->vm_bos); ++ ctx->vm_bos = NULL; ++ } ++ ++ return ret; ++} ++ ++static void unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx, ++ bool wait) ++{ ++ if (wait) { ++ struct ttm_validate_buffer *entry; ++ int ret; ++ ++ list_for_each_entry(entry, &ctx->list, head) { ++ ret = ttm_bo_wait(entry->bo, false, false, false); ++ if (ret != 0) ++ pr_err("amdkfd: Failed to wait for PT/PD update (err == %d)\n", ++ ret); ++ } ++ } ++ if (ctx->reserved) ++ ttm_eu_backoff_reservation(&ctx->ticket, &ctx->list); ++ if (ctx->vm_bos) { ++ unsigned i; ++ ++ for (i = 0; i < ctx->n_vms; i++) { ++ if (ctx->vm_bos[i]) ++ drm_free_large(ctx->vm_bos[i]); ++ } ++ kfree(ctx->vm_bos); ++ } ++ ctx->reserved = false; ++ ctx->vm_bos = NULL; ++} ++ ++/* Must be called with mem->data2.lock held and a BO/VM reservation ++ * context. Temporarily drops the lock and reservation for updating ++ * user pointers, to avoid circular lock dependencies between MM locks ++ * and buffer reservations. If user pages are invalidated while the ++ * lock and reservation are dropped, try again. */ ++static int update_user_pages(struct kgd_mem *mem, struct mm_struct *mm, ++ struct bo_vm_reservation_context *ctx) ++{ ++ struct amdgpu_bo *bo; ++ unsigned tries = 10; ++ int ret; ++ ++ bo = mem->data2.bo; ++ if (!amdgpu_ttm_tt_has_userptr(bo->tbo.ttm)) ++ return 0; ++ ++ if (bo->tbo.ttm->state != tt_bound) { ++ struct page **pages; ++ int invalidated; ++ ++ /* get user pages without locking the BO to avoid ++ * circular lock dependency with MMU notifier. 
Retry ++ * until we have the current version. */ ++ ttm_eu_backoff_reservation(&ctx->ticket, &ctx->list); ++ ctx->reserved = false; ++ pages = drm_calloc_large(bo->tbo.ttm->num_pages, ++ sizeof(struct page *)); ++ if (!pages) ++ return -ENOMEM; ++ ++ mutex_unlock(&mem->data2.lock); ++ ++ while (true) { ++ down_read(&mm->mmap_sem); ++ ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, pages); ++ up_read(&mm->mmap_sem); ++ ++ mutex_lock(&mem->data2.lock); ++ if (ret != 0) ++ return ret; ++ ++ BUG_ON(bo != mem->data2.bo); ++ ++ ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list, ++ false, &ctx->duplicates); ++ if (unlikely(ret != 0)) { ++ release_pages(pages, bo->tbo.ttm->num_pages, 0); ++ drm_free_large(pages); ++ return ret; ++ } ++ ctx->reserved = true; ++ if (!amdgpu_ttm_tt_userptr_invalidated(bo->tbo.ttm, ++ &invalidated) || ++ bo->tbo.ttm->state == tt_bound || ++ --tries == 0) ++ break; ++ ++ release_pages(pages, bo->tbo.ttm->num_pages, 0); ++ ttm_eu_backoff_reservation(&ctx->ticket, &ctx->list); ++ ctx->reserved = false; ++ mutex_unlock(&mem->data2.lock); ++ } ++ ++ /* If someone else already bound it, release our pages ++ * array, otherwise copy it into the ttm BO. */ ++ if (bo->tbo.ttm->state == tt_bound || tries == 0) ++ release_pages(pages, bo->tbo.ttm->num_pages, 0); ++ else ++ memcpy(bo->tbo.ttm->pages, pages, ++ sizeof(struct page *) * bo->tbo.ttm->num_pages); ++ drm_free_large(pages); ++ } ++ ++ if (tries == 0) { ++ pr_err("Gave up trying to update user pages\n"); ++ return -EDEADLK; ++ } ++ ++ return 0; ++} ++ ++static int map_bo_to_gpuvm(struct amdgpu_device *adev, struct amdgpu_bo *bo, ++ struct amdgpu_bo_va *bo_va) ++{ ++ struct amdgpu_vm_id *vm_id; ++ struct amdgpu_vm *vm; ++ int ret; ++ ++ /* Pin PTs */ ++ ret = try_pin_pts(bo_va, false); ++ if (ret != 0) { ++ pr_err("amdkfd: Failed to pin PTs\n"); ++ goto err_failed_to_pin_pts; ++ } ++ ++ /* Pin the page directory */ ++ vm = bo_va->vm; ++ vm_id = &vm->ids[7]; ++ ret = try_pin_bo(vm->page_directory, &vm_id->pd_gpu_addr, false, ++ AMDGPU_GEM_DOMAIN_VRAM); ++ if (ret != 0) { ++ pr_err("amdkfd: Failed to pin PD\n"); ++ goto err_failed_to_pin_pd; ++ } ++ ++ mutex_lock(&vm->mutex); ++ ++ /* Update the page directory */ ++ ret = amdgpu_vm_update_page_directory(adev, vm); ++ if (ret != 0) { ++ pr_err("amdkfd: Failed to amdgpu_vm_update_page_directory\n"); ++ goto err_failed_to_update_pd; ++ } ++ ++ /* ++ * The previously "released" BOs are really released and their VAs are ++ * removed from PT. 
This function is called here because it requires ++ * the amdgpu_vm::mutex to be locked and PT to be reserved ++ */ ++ ret = amdgpu_vm_clear_freed(adev, vm); ++ if (ret != 0) { ++ pr_err("amdkfd: Failed to amdgpu_vm_clear_freed\n"); ++ goto err_failed_vm_clear_freed; ++ } ++ ++ /* Update the page tables */ ++ ret = amdgpu_vm_bo_update(adev, bo_va, &bo->tbo.mem); ++ if (ret != 0) { ++ pr_err("amdkfd: Failed to amdgpu_vm_bo_update\n"); ++ goto err_failed_to_update_pts; ++ } ++ ++ ret = amdgpu_vm_clear_invalids(adev, vm, NULL); ++ if (ret != 0) { ++ pr_err("amdkfd: Failed to amdgpu_vm_clear_invalids\n"); ++ goto err_failed_to_vm_clear_invalids; ++ } ++ ++ mutex_unlock(&vm->mutex); ++ ++ return 0; ++ ++err_failed_to_vm_clear_invalids: ++ amdgpu_vm_bo_update(adev, bo_va, NULL); ++err_failed_to_update_pts: ++err_failed_vm_clear_freed: ++err_failed_to_update_pd: ++ mutex_unlock(&vm->mutex); ++ unpin_bo(vm->page_directory, false); ++err_failed_to_pin_pd: ++ unpin_pts(bo_va, vm, false); ++err_failed_to_pin_pts: ++ ++ return ret; ++} ++ ++#define BOOL_TO_STR(b) (b == true) ? "true" : "false" ++ ++int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( ++ struct kgd_dev *kgd, uint64_t va, size_t size, ++ void *vm, struct kgd_mem **mem, ++ uint64_t *offset, void **kptr, ++ struct kfd_process_device *pdd, uint32_t flags) ++{ ++ bool aql_queue, public, readonly, execute, no_sub, userptr; ++ u64 alloc_flag; ++ uint32_t domain; ++ uint64_t *temp_offset; ++ ++ if (!(flags & ALLOC_MEM_FLAGS_NONPAGED)) { ++ pr_err("amdgpu: current hw doesn't support paged memory\n"); ++ return -EINVAL; ++ } ++ ++ domain = 0; ++ alloc_flag = 0; ++ temp_offset = NULL; ++ ++ aql_queue = (flags & ALLOC_MEM_FLAGS_AQL_QUEUE_MEM) ? true : false; ++ public = (flags & ALLOC_MEM_FLAGS_PUBLIC) ? true : false; ++ readonly = (flags & ALLOC_MEM_FLAGS_READONLY) ? true : false; ++ execute = (flags & ALLOC_MEM_FLAGS_EXECUTE_ACCESS) ? true : false; ++ no_sub = (flags & ALLOC_MEM_FLAGS_NO_SUBSTITUTE) ? true : false; ++ userptr = (flags & ALLOC_MEM_FLAGS_USERPTR) ? true : false; ++ ++ if (userptr && kptr) { ++ pr_err("amdgpu: userptr can't be mapped to kernel\n"); ++ return -EINVAL; ++ } ++ ++ /* ++ * Check on which domain to allocate BO ++ */ ++ if (offset && !userptr) ++ *offset = 0; ++ if (flags & ALLOC_MEM_FLAGS_VRAM) { ++ domain = AMDGPU_GEM_DOMAIN_VRAM; ++ alloc_flag = AMDGPU_GEM_CREATE_NO_CPU_ACCESS; ++ if (public) { ++ alloc_flag = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; ++ temp_offset = offset; ++ } ++ } else if (flags & (ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_USERPTR)) { ++ domain = AMDGPU_GEM_DOMAIN_GTT; ++ alloc_flag = 0; ++ temp_offset = offset; ++ } ++ ++ pr_debug("amdgpu: allocating BO domain %d alloc_flag 0x%llx public %s readonly %s execute %s no substitute %s va 0x%llx\n", ++ domain, ++ alloc_flag, ++ BOOL_TO_STR(public), ++ BOOL_TO_STR(readonly), ++ BOOL_TO_STR(execute), ++ BOOL_TO_STR(no_sub), ++ va); ++ ++ return __alloc_memory_of_gpu(kgd, va, size, vm, mem, ++ temp_offset, kptr, pdd, domain, ++ alloc_flag, ++ aql_queue, readonly, execute, ++ no_sub, userptr); ++} ++ ++int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( ++ struct kgd_dev *kgd, struct kgd_mem *mem) ++{ ++ struct amdgpu_device *adev; ++ struct kfd_bo_va_list *entry, *tmp; ++ ++ BUG_ON(kgd == NULL); ++ BUG_ON(mem == NULL); ++ ++ adev = get_amdgpu_device(kgd); ++ ++ mutex_lock(&mem->data2.lock); ++ ++ if (mem->data2.mapped_to_gpu_memory > 0) { ++ pr_err("BO with size %lu bytes is mapped to GPU. 
Need to unmap it before releasing va 0x%llx\n", ++ mem->data2.bo->tbo.mem.size, mem->data2.va); ++ mutex_unlock(&mem->data2.lock); ++ return -EBUSY; ++ } ++ ++ mutex_unlock(&mem->data2.lock); ++ /* lock is not needed after this, since mem is unused and will ++ * be freed anyway */ ++ ++ amdgpu_mn_unregister(mem->data2.bo); ++ if (mem->data2.work.work.func) ++ cancel_delayed_work_sync(&mem->data2.work); ++ ++ /* Remove from VM internal data structures */ ++ list_for_each_entry_safe(entry, tmp, &mem->data2.bo_va_list, bo_list) { ++ pr_debug("Releasing BO with VA %p, size %lu bytes\n", ++ entry->bo_va, ++ mem->data2.bo->tbo.mem.size); ++ if (entry->bo_va->vm != NULL) ++ remove_bo_from_vm( ++ (struct amdgpu_device *)entry->kgd_dev, ++ mem->data2.bo, entry->bo_va); ++ list_del(&entry->bo_list); ++ kfree(entry); ++ } ++ ++ /* Free the BO */ ++ amdgpu_bo_unref(&mem->data2.bo); ++ kfree(mem); ++ ++ return 0; ++} ++int amdgpu_amdkfd_gpuvm_return_bo_size(struct kgd_dev *kgd, struct kgd_mem *mem) ++{ ++ struct amdgpu_bo *bo; ++ ++ BUG_ON(mem == NULL); ++ ++ bo = mem->data2.bo; ++ return bo->tbo.mem.size; ++ ++} ++int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( ++ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) ++{ ++ struct amdgpu_device *adev; ++ int ret; ++ struct amdgpu_bo *bo; ++ uint32_t domain; ++ struct kfd_bo_va_list *entry; ++ struct bo_vm_reservation_context ctx; ++ ++ BUG_ON(kgd == NULL); ++ BUG_ON(mem == NULL); ++ ++ adev = get_amdgpu_device(kgd); ++ ++ mutex_lock(&mem->data2.lock); ++ ++ bo = mem->data2.bo; ++ ++ BUG_ON(bo == NULL); ++ ++ domain = mem->data2.domain; ++ ++ pr_debug("amdgpu: try to map VA 0x%llx domain %d\n", ++ mem->data2.va, domain); ++ ++ if (check_if_add_bo_to_vm((struct amdgpu_vm *)vm, ++ &mem->data2.bo_va_list)) { ++ pr_debug("amdkfd: add new BO_VA to list 0x%llx\n", ++ mem->data2.va); ++ ret = add_bo_to_vm(adev, mem->data2.va, (struct amdgpu_vm *)vm, ++ bo, &mem->data2.bo_va_list, ++ mem->data2.readonly, mem->data2.execute); ++ if (ret != 0) ++ goto add_bo_to_vm_failed; ++ if (mem->data2.aql_queue) { ++ ret = add_bo_to_vm(adev, ++ mem->data2.va + bo->tbo.mem.size, ++ (struct amdgpu_vm *)vm, ++ bo, &mem->data2.bo_va_list, ++ mem->data2.readonly, ++ mem->data2.execute); ++ if (ret != 0) ++ goto add_bo_to_vm_failed; ++ } ++ } ++ ++ if (!mem->data2.evicted) { ++ ret = reserve_bo_and_vms(adev, bo, &mem->data2.bo_va_list, ++ vm, false, &ctx); ++ if (unlikely(ret != 0)) ++ goto bo_reserve_failed; ++ ++ ret = update_user_pages(mem, current->mm, &ctx); ++ if (ret != 0) ++ goto update_user_pages_failed; ++ } ++ ++ list_for_each_entry(entry, &mem->data2.bo_va_list, bo_list) { ++ if (entry->bo_va->vm == vm && entry->is_mapped == false) { ++ if (mem->data2.evicted) { ++ /* If the BO is evicted, just mark the ++ * mapping as mapped and stop the GPU's ++ * queues until the BO is restored. 
*/ ++ ret = kgd2kfd->quiesce_mm(adev->kfd, ++ current->mm); ++ if (ret != 0) ++ goto quiesce_failed; ++ entry->is_mapped = true; ++ mem->data2.mapped_to_gpu_memory++; ++ continue; ++ } ++ ++ pr_debug("amdkfd: Trying to map VA 0x%llx to vm %p\n", ++ mem->data2.va, vm); ++ /* ++ * We need to pin the allocated BO, PD and appropriate PTs and to ++ * create a mapping of virtual to MC address ++ */ ++ /* Pin BO */ ++ ret = try_pin_bo(bo, NULL, false, domain); ++ if (ret != 0) { ++ pr_err("amdkfd: Failed to pin BO\n"); ++ goto pin_bo_failed; ++ } ++ ++ ret = map_bo_to_gpuvm(adev, bo, entry->bo_va); ++ if (ret != 0) { ++ pr_err("amdkfd: Failed to map bo to gpuvm\n"); ++ goto map_bo_to_gpuvm_failed; ++ } ++ entry->is_mapped = true; ++ mem->data2.mapped_to_gpu_memory++; ++ pr_debug("amdgpu: INC mapping count %d\n", ++ mem->data2.mapped_to_gpu_memory); ++ } ++ } ++ ++ if (!mem->data2.evicted) ++ unreserve_bo_and_vms(&ctx, true); ++ mutex_unlock(&mem->data2.lock); ++ return 0; ++ ++map_bo_to_gpuvm_failed: ++ unpin_bo(bo, false); ++pin_bo_failed: ++quiesce_failed: ++update_user_pages_failed: ++ if (!mem->data2.evicted) ++ unreserve_bo_and_vms(&ctx, false); ++bo_reserve_failed: ++add_bo_to_vm_failed: ++ mutex_unlock(&mem->data2.lock); ++ return ret; ++} ++ ++int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm) ++{ ++ int ret; ++ struct amdgpu_vm *new_vm; ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); ++ ++ BUG_ON(kgd == NULL); ++ BUG_ON(vm == NULL); ++ ++ new_vm = kzalloc(sizeof(struct amdgpu_vm), GFP_KERNEL); ++ if (new_vm == NULL) ++ return -ENOMEM; ++ ++ /* Initialize the VM context, allocate the page directory and zero it */ ++ ret = amdgpu_vm_init(adev, new_vm); ++ if (ret != 0) { ++ pr_err("amdgpu: failed to init vm, ret %d\n", ret); ++ /* Undo everything related to the new VM context */ ++ amdgpu_vm_fini(adev, new_vm); ++ kfree(new_vm); ++ return ret; ++ } ++ ++ *vm = (void *) new_vm; ++ ++ /* ++ * The previously "released" BOs are really released and their VAs are ++ * removed from PT. 
This function is called here because it requires ++ * the amdgpu_vm::mutex to be locked and PT to be reserved ++ */ ++ ret = amdgpu_vm_clear_freed(adev, new_vm); ++ if (ret != 0) ++ pr_err("amdgpu: Failed to amdgpu_vm_clear_freed\n"); ++ ++ pr_debug("amdgpu: created process vm with address 0x%llx\n", ++ new_vm->ids[7].pd_gpu_addr); ++ ++ return ret; ++} ++ ++void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm) ++{ ++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd; ++ struct amdgpu_vm *avm = (struct amdgpu_vm *) vm; ++ ++ BUG_ON(kgd == NULL); ++ BUG_ON(vm == NULL); ++ ++ pr_debug("Destroying process vm with address %p\n", vm); ++ ++ /* Release the VM context */ ++ amdgpu_vm_fini(adev, avm); ++ kfree(vm); ++} ++ ++uint32_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm) ++{ ++ struct amdgpu_vm *avm = (struct amdgpu_vm *) vm; ++ struct amdgpu_vm_id *vm_id; ++ ++ BUG_ON(avm == NULL); ++ ++ vm_id = &avm->ids[7]; ++ return vm_id->pd_gpu_addr >> AMDGPU_GPU_PAGE_SHIFT; ++} ++ ++int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd, ++ struct kfd_vm_fault_info *mem) ++{ ++ struct amdgpu_device *adev; ++ ++ BUG_ON(kgd == NULL); ++ adev = (struct amdgpu_device *) kgd; ++ if (atomic_read(&adev->mc.vm_fault_info_updated) == 1) { ++ *mem = *adev->mc.vm_fault_info; ++ mb(); ++ atomic_set(&adev->mc.vm_fault_info_updated, 0); ++ } ++ return 0; ++} ++ ++static int unmap_bo_from_gpuvm(struct amdgpu_device *adev, ++ struct amdgpu_bo_va *bo_va) ++{ ++ struct amdgpu_vm *vm; ++ int ret; ++ struct ttm_validate_buffer tv; ++ struct amdgpu_bo_list_entry *vm_bos; ++ struct ww_acquire_ctx ticket; ++ struct list_head list, duplicates; ++ ++ INIT_LIST_HEAD(&list); ++ INIT_LIST_HEAD(&duplicates); ++ ++ vm = bo_va->vm; ++ tv.bo = &bo_va->bo->tbo; ++ tv.shared = true; ++ list_add(&tv.head, &list); ++ ++ vm_bos = amdgpu_vm_get_bos(adev, vm, &list); ++ if (!vm_bos) { ++ pr_err("amdkfd: Failed to get bos from vm\n"); ++ ret = -ENOMEM; ++ goto err_failed_to_get_bos; ++ } ++ ++ ret = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates); ++ if (ret) { ++ pr_err("amdkfd: Failed to reserve buffers in ttm\n"); ++ goto err_failed_to_ttm_reserve; ++ } ++ ++ mutex_lock(&vm->mutex); ++ ++ /* ++ * The previously "released" BOs are really released and their VAs are ++ * removed from PT. 
This function is called here because it requires ++ * the amdgpu_vm::mutex to be locked and PT to be reserved ++ */ ++ amdgpu_vm_clear_freed(adev, vm); ++ ++ /* Update the page tables - Remove the mapping from bo_va */ ++ amdgpu_vm_bo_update(adev, bo_va, NULL); ++ ++ amdgpu_vm_clear_invalids(adev, vm, NULL); ++ ++ mutex_unlock(&vm->mutex); ++ ++ ttm_eu_backoff_reservation(&ticket, &list); ++ drm_free_large(vm_bos); ++ ++ return 0; ++err_failed_to_ttm_reserve: ++ drm_free_large(vm_bos); ++err_failed_to_get_bos: ++ return ret; ++} ++ ++static bool is_mem_on_local_device(struct kgd_dev *kgd, ++ struct list_head *bo_va_list, void *vm) ++{ ++ struct kfd_bo_va_list *entry; ++ ++ list_for_each_entry(entry, bo_va_list, bo_list) { ++ if (entry->kgd_dev == kgd && entry->bo_va->vm == vm) ++ return true; ++ } ++ ++ return false; ++} ++ ++int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( ++ struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) ++{ ++ struct kfd_bo_va_list *entry; ++ struct amdgpu_device *adev; ++ unsigned mapped_before; ++ int ret = 0; ++ ++ BUG_ON(kgd == NULL); ++ BUG_ON(mem == NULL); ++ ++ adev = (struct amdgpu_device *) kgd; ++ ++ mutex_lock(&mem->data2.lock); ++ ++ /* ++ * Make sure that this BO is mapped on KGD before unmapping it ++ */ ++ if (!is_mem_on_local_device(kgd, &mem->data2.bo_va_list, vm)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (mem->data2.mapped_to_gpu_memory == 0) { ++ pr_debug("BO size %lu bytes at va 0x%llx is not mapped\n", ++ mem->data2.bo->tbo.mem.size, mem->data2.va); ++ ret = -EINVAL; ++ goto out; ++ } ++ mapped_before = mem->data2.mapped_to_gpu_memory; ++ ++ list_for_each_entry(entry, &mem->data2.bo_va_list, bo_list) { ++ if (entry->kgd_dev == kgd && ++ entry->bo_va->vm == vm && ++ entry->is_mapped) { ++ if (mem->data2.evicted) { ++ /* If the BO is evicted, just mark the ++ * mapping as unmapped and allow the ++ * GPU's queues to resume. 
*/ ++ ret = kgd2kfd->resume_mm(adev->kfd, ++ current->mm); ++ if (ret != 0) ++ goto out; ++ entry->is_mapped = false; ++ mem->data2.mapped_to_gpu_memory--; ++ continue; ++ } ++ ++ pr_debug("unmapping BO with VA 0x%llx, size %lu bytes from GPU memory\n", ++ mem->data2.va, ++ mem->data2.bo->tbo.mem.size); ++ /* Unpin the PD directory*/ ++ unpin_bo(entry->bo_va->vm->page_directory, true); ++ /* Unpin PTs */ ++ unpin_pts(entry->bo_va, entry->bo_va->vm, true); ++ ++ /* Unpin BO*/ ++ unpin_bo(mem->data2.bo, true); ++ ret = unmap_bo_from_gpuvm(adev, entry->bo_va); ++ if (ret == 0) { ++ entry->is_mapped = false; ++ } else { ++ pr_err("amdgpu: failed unmap va 0x%llx\n", ++ mem->data2.va); ++ goto out; ++ } ++ mem->data2.mapped_to_gpu_memory--; ++ pr_debug("amdgpu: DEC mapping count %d\n", ++ mem->data2.mapped_to_gpu_memory); ++ } ++ } ++ if (mapped_before == mem->data2.mapped_to_gpu_memory) { ++ pr_debug("BO size %lu bytes at va 0x%llx is not mapped on GPU %x:%x.%x\n", ++ mem->data2.bo->tbo.mem.size, mem->data2.va, ++ adev->pdev->bus->number, PCI_SLOT(adev->pdev->devfn), ++ PCI_FUNC(adev->pdev->devfn)); ++ ret = -EINVAL; ++ } ++ ++out: ++ mutex_unlock(&mem->data2.lock); ++ return ret; ++} ++ ++int amdgpu_amdkfd_gpuvm_mmap_bo(struct kgd_dev *kgd, struct vm_area_struct *vma) ++{ ++ struct amdgpu_device *adev; ++ ++ adev = get_amdgpu_device(kgd); ++ BUG_ON(!adev); ++ ++ return amdgpu_bo_mmap(NULL, vma, &adev->mman.bdev); ++} ++ ++int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd, ++ struct kgd_mem *mem, void **kptr) ++{ ++ int ret; ++ struct amdgpu_device *adev; ++ struct amdgpu_bo *bo; ++ ++ adev = get_amdgpu_device(kgd); ++ ++ mutex_lock(&mem->data2.lock); ++ ++ bo = mem->data2.bo; ++ /* map the buffer */ ++ ret = amdgpu_bo_reserve(bo, true); ++ if (ret) { ++ dev_err(adev->dev, "(%d) failed to reserve bo for amdkfd\n", ret); ++ mutex_unlock(&mem->data2.lock); ++ return ret; ++ } ++ ++ ret = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT, ++ NULL); ++ if (ret) { ++ dev_err(adev->dev, "(%d) failed to pin bo for amdkfd\n", ret); ++ amdgpu_bo_unreserve(bo); ++ mutex_unlock(&mem->data2.lock); ++ return ret; ++ } ++ ++ ret = amdgpu_bo_kmap(bo, kptr); ++ if (ret) { ++ dev_err(adev->dev, ++ "(%d) failed to map bo to kernel for amdkfd\n", ret); ++ amdgpu_bo_unpin(bo); ++ amdgpu_bo_unreserve(bo); ++ mutex_unlock(&mem->data2.lock); ++ return ret; ++ } ++ ++ mem->data2.kptr = *kptr; ++ ++ amdgpu_bo_unreserve(bo); ++ mutex_unlock(&mem->data2.lock); ++ ++ return 0; ++} ++ ++static int pin_bo_wo_map(struct kgd_mem *mem) ++{ ++ struct amdgpu_bo *bo = mem->data2.bo; ++ int ret = 0; ++ ++ ret = amdgpu_bo_reserve(bo, false); ++ if (unlikely(ret != 0)) ++ return ret; ++ ++ ret = amdgpu_bo_pin(bo, mem->data2.domain, NULL); ++ amdgpu_bo_unreserve(bo); ++ ++ return ret; ++} ++ ++static void unpin_bo_wo_map(struct kgd_mem *mem) ++{ ++ struct amdgpu_bo *bo = mem->data2.bo; ++ int ret = 0; ++ ++ ret = amdgpu_bo_reserve(bo, false); ++ if (unlikely(ret != 0)) ++ return; ++ ++ amdgpu_bo_unpin(bo); ++ amdgpu_bo_unreserve(bo); ++} ++ ++#define AMD_GPU_PAGE_SHIFT PAGE_SHIFT ++#define AMD_GPU_PAGE_SIZE (_AC(1, UL) << AMD_GPU_PAGE_SHIFT) ++ ++static int get_sg_table(struct amdgpu_device *adev, ++ struct kgd_mem *mem, uint64_t offset, ++ uint64_t size, struct sg_table **ret_sg) ++{ ++ struct amdgpu_bo *bo = mem->data2.bo; ++ struct sg_table *sg = NULL; ++ unsigned long bus_addr; ++ unsigned int chunks; ++ unsigned int i; ++ struct scatterlist *s; ++ uint64_t offset_in_page; ++ unsigned int page_size; ++ int ret; ++ ++ sg = 
kmalloc(sizeof(struct sg_table), GFP_KERNEL); ++ if (!sg) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ if (bo->initial_domain == AMDGPU_GEM_DOMAIN_VRAM) ++ page_size = AMD_GPU_PAGE_SIZE; ++ else ++ page_size = PAGE_SIZE; ++ ++ ++ offset_in_page = offset & (page_size - 1); ++ chunks = (size + offset_in_page + page_size - 1) ++ / page_size; ++ ++ ret = sg_alloc_table(sg, chunks, GFP_KERNEL); ++ if (unlikely(ret)) ++ goto out; ++ ++ if (bo->initial_domain == AMDGPU_GEM_DOMAIN_VRAM) { ++ bus_addr = bo->tbo.offset + adev->mc.aper_base + offset; ++ ++ for_each_sg(sg->sgl, s, sg->orig_nents, i) { ++ uint64_t chunk_size, length; ++ ++ chunk_size = page_size - offset_in_page; ++ length = min(size, chunk_size); ++ ++ sg_set_page(s, NULL, length, offset_in_page); ++ s->dma_address = bus_addr; ++ s->dma_length = length; ++ ++ size -= length; ++ offset_in_page = 0; ++ bus_addr += length; ++ } ++ } else { ++ struct page **pages; ++ unsigned int cur_page; ++ ++ pages = bo->tbo.ttm->pages; ++ ++ cur_page = offset / page_size; ++ for_each_sg(sg->sgl, s, sg->orig_nents, i) { ++ uint64_t chunk_size, length; ++ ++ chunk_size = page_size - offset_in_page; ++ length = min(size, chunk_size); ++ ++ sg_set_page(s, pages[cur_page], length, offset_in_page); ++ s->dma_address = page_to_phys(pages[cur_page]); ++ s->dma_length = length; ++ ++ size -= length; ++ offset_in_page = 0; ++ cur_page++; ++ } ++ } ++ ++ *ret_sg = sg; ++ return 0; ++out: ++ kfree(sg); ++ *ret_sg = NULL; ++ return ret; ++} ++ ++int amdgpu_amdkfd_gpuvm_pin_get_sg_table(struct kgd_dev *kgd, ++ struct kgd_mem *mem, uint64_t offset, ++ uint64_t size, struct sg_table **ret_sg) ++{ ++ int ret; ++ struct amdgpu_device *adev; ++ ++ ret = pin_bo_wo_map(mem); ++ if (unlikely(ret != 0)) ++ return ret; ++ ++ adev = get_amdgpu_device(kgd); ++ ++ ret = get_sg_table(adev, mem, offset, size, ret_sg); ++ if (ret) ++ unpin_bo_wo_map(mem); ++ ++ return ret; ++} ++ ++void amdgpu_amdkfd_gpuvm_unpin_put_sg_table( ++ struct kgd_mem *mem, struct sg_table *sg) ++{ ++ sg_free_table(sg); ++ kfree(sg); ++ ++ unpin_bo_wo_map(mem); ++} ++ ++int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd, int dma_buf_fd, ++ uint64_t va, void *vm, ++ struct kgd_mem **mem, uint64_t *size) ++{ ++ struct amdgpu_device *adev = (struct amdgpu_device *)kgd; ++ struct dma_buf *dma_buf; ++ struct drm_gem_object *obj; ++ struct amdgpu_bo *bo; ++ int r = -EINVAL; ++ ++ dma_buf = dma_buf_get(dma_buf_fd); ++ if (IS_ERR(dma_buf)) ++ return PTR_ERR(dma_buf); ++ ++ if (dma_buf->ops != &drm_gem_prime_dmabuf_ops) ++ /* Can't handle non-graphics buffers */ ++ goto out_put; ++ ++ obj = dma_buf->priv; ++ if (obj->dev->dev_private != adev) ++ /* Can't handle buffers from other devices */ ++ goto out_put; ++ ++ bo = gem_to_amdgpu_bo(obj); ++ if (!(bo->initial_domain & (AMDGPU_GEM_DOMAIN_VRAM | ++ AMDGPU_GEM_DOMAIN_GTT))) ++ /* Only VRAM and GTT BOs are supported */ ++ goto out_put; ++ ++ if (size) ++ *size = amdgpu_bo_size(bo); ++ ++ *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); ++ if (*mem == NULL) { ++ r = -ENOMEM; ++ goto out_put; ++ } ++ ++ INIT_LIST_HEAD(&(*mem)->data2.bo_va_list); ++ mutex_init(&(*mem)->data2.lock); ++ (*mem)->data2.execute = true; /* executable by default */ ++ ++ (*mem)->data2.bo = amdgpu_bo_ref(bo); ++ (*mem)->data2.va = va; ++ (*mem)->data2.domain = (bo->initial_domain & AMDGPU_GEM_DOMAIN_VRAM) ? 
++ AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT; ++ (*mem)->data2.mapped_to_gpu_memory = 0; ++ ++ r = add_bo_to_vm(adev, va, vm, bo, &(*mem)->data2.bo_va_list, ++ false, true); ++ ++ if (r) { ++ amdgpu_bo_unref(&bo); ++ kfree(*mem); ++ *mem = NULL; ++ } ++ ++out_put: ++ dma_buf_put(dma_buf); ++ return r; ++} ++ ++/* Runs out of process context. mem->data2.lock must be held. */ ++int amdgpu_amdkfd_gpuvm_evict_mem(struct kgd_mem *mem, struct mm_struct *mm) ++{ ++ struct kfd_bo_va_list *entry; ++ unsigned n_evicted; ++ int r = 0; ++ ++ pr_debug("Evicting buffer %p\n", mem); ++ ++ if (mem->data2.mapped_to_gpu_memory == 0) ++ return 0; ++ ++ /* Remove all GPU mappings of the buffer, but don't change any ++ * of the is_mapped flags so we can restore it later. The ++ * queues of the affected GPUs are quiesced first. Count the ++ * number of evicted mappings so we can roll back if something ++ * goes wrong. */ ++ n_evicted = 0; ++ list_for_each_entry(entry, &mem->data2.bo_va_list, bo_list) { ++ struct amdgpu_device *adev; ++ ++ if (!entry->is_mapped) ++ continue; ++ ++ adev = (struct amdgpu_device *)entry->kgd_dev; ++ ++ r = kgd2kfd->quiesce_mm(adev->kfd, mm); ++ if (r != 0) { ++ pr_err("failed to quiesce KFD\n"); ++ goto fail; ++ } ++ ++ r = unmap_bo_from_gpuvm(adev, entry->bo_va); ++ if (r != 0) { ++ pr_err("failed unmap va 0x%llx\n", ++ mem->data2.va); ++ kgd2kfd->resume_mm(adev->kfd, mm); ++ goto fail; ++ } ++ ++ /* Unpin the PD directory*/ ++ unpin_bo(entry->bo_va->vm->page_directory, true); ++ /* Unpin PTs */ ++ unpin_pts(entry->bo_va, entry->bo_va->vm, true); ++ ++ /* Unpin BO*/ ++ unpin_bo(mem->data2.bo, true); ++ ++ n_evicted++; ++ } ++ ++ return 0; ++ ++fail: ++ /* To avoid hangs and keep state consistent, roll back partial ++ * eviction by restoring queues and marking mappings as ++ * unmapped. Access to now unmapped buffers will fault. */ ++ list_for_each_entry(entry, &mem->data2.bo_va_list, bo_list) { ++ struct amdgpu_device *adev; ++ ++ if (n_evicted == 0) ++ break; ++ if (!entry->is_mapped) ++ continue; ++ ++ entry->is_mapped = false; ++ ++ adev = (struct amdgpu_device *)entry->kgd_dev; ++ if (kgd2kfd->resume_mm(adev->kfd, mm)) ++ pr_err("Failed to resume KFD\n"); ++ ++ n_evicted--; ++ } ++ ++ return r; ++} ++ ++/* Runs out of process context. mem->data2.lock must be held. */ ++int amdgpu_amdkfd_gpuvm_restore_mem(struct kgd_mem *mem, struct mm_struct *mm) ++{ ++ struct bo_vm_reservation_context ctx; ++ struct kfd_bo_va_list *entry; ++ uint32_t domain; ++ int r, ret = 0; ++ bool have_pages = false; ++ ++ pr_debug("Restoring buffer %p\n", mem); ++ ++ if (mem->data2.mapped_to_gpu_memory == 0) ++ return 0; ++ ++ domain = mem->data2.domain; ++ ++ ret = reserve_bo_and_vms(mem->data2.bo->adev, mem->data2.bo, ++ &mem->data2.bo_va_list, NULL, true, &ctx); ++ if (likely(ret == 0)) { ++ ret = update_user_pages(mem, mm, &ctx); ++ have_pages = !ret; ++ if (!have_pages) ++ unreserve_bo_and_vms(&ctx, false); ++ } ++ ++ /* update_user_pages drops the lock briefly. Check if someone ++ * else evicted or restored the buffer in the mean time */ ++ if (mem->data2.evicted != 1) { ++ unreserve_bo_and_vms(&ctx, false); ++ return 0; ++ } ++ ++ /* Try to restore all mappings. Mappings that fail to restore ++ * will be marked as unmapped. If we failed to get the user ++ * pages, all mappings will be marked as unmapped. 
*/ ++ list_for_each_entry(entry, &mem->data2.bo_va_list, bo_list) { ++ struct amdgpu_device *adev; ++ ++ if (!entry->is_mapped) ++ continue; ++ ++ adev = (struct amdgpu_device *)entry->kgd_dev; ++ ++ if (unlikely(!have_pages)) { ++ entry->is_mapped = false; ++ goto resume_kfd; ++ } ++ ++ r = try_pin_bo(mem->data2.bo, NULL, false, domain); ++ if (unlikely(r != 0)) { ++ pr_err("Failed to pin BO\n"); ++ entry->is_mapped = false; ++ if (ret == 0) ++ ret = r; ++ goto resume_kfd; ++ } ++ ++ r = map_bo_to_gpuvm(adev, mem->data2.bo, entry->bo_va); ++ if (unlikely(r != 0)) { ++ pr_err("Failed to map BO to gpuvm\n"); ++ entry->is_mapped = false; ++ unpin_bo(mem->data2.bo, true); ++ if (ret == 0) ++ ret = r; ++ } ++ ++ /* Resume queues even if restore failed. Worst case ++ * the app will get a GPUVM fault. That's better than ++ * hanging the queues indefinitely. */ ++resume_kfd: ++ r = kgd2kfd->resume_mm(adev->kfd, mm); ++ if (r != 0) { ++ pr_err("Failed to resume KFD\n"); ++ if (ret == 0) ++ ret = r; ++ } ++ } ++ ++ if (have_pages) ++ unreserve_bo_and_vms(&ctx, true); ++ ++ return ret; ++} +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +index 06b824c..5ce6528 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +@@ -381,7 +381,7 @@ void amdgpu_ring_lru_touch(struct amdgpu_device *adev, struct amdgpu_ring *ring) + static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf, + size_t size, loff_t *pos) + { +- struct amdgpu_ring *ring = (struct amdgpu_ring*)kcl_file_private(f); ++ struct amdgpu_ring *ring = file_inode(f)->i_private; + int r, i; + uint32_t value, result, early[3]; + +diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig +index e13c67c..ac49532 100644 +--- a/drivers/gpu/drm/amd/amdkfd/Kconfig ++++ b/drivers/gpu/drm/amd/amdkfd/Kconfig +@@ -5,5 +5,6 @@ + config HSA_AMD + tristate "HSA kernel driver for AMD GPU devices" + depends on (DRM_RADEON || DRM_AMDGPU) && AMD_IOMMU_V2 && X86_64 ++ select DRM_AMDGPU_USERPTR + help + Enable this if you want to use HSA features on AMD GPU devices. 
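The Kconfig change above makes the KFD build select amdgpu's userptr support, which the new ALLOC_MEM_FLAGS_USERPTR path in __alloc_memory_of_gpu() depends on. As a reading aid, here is a minimal plain-C sketch of the pin-count scheme that try_pin_bo()/unpin_bo() earlier in this patch apply to userptr BOs: amdgpu_bo_pin() cannot handle userptr pages, so bo->pin_count is reused as a private counter, validating the BO into GTT on the first pin and back to the CPU domain on the last unpin. Everything below (model_bo, validate(), the domain enum) is an illustrative stand-in, not an amdgpu API, and is not part of the patch.

/* Simplified model of the userptr branch of try_pin_bo()/unpin_bo().
 * All names here are hypothetical; validate() stands in for
 * ttm_bo_validate() after amdgpu_ttm_placement_from_domain(). */
#include <stdio.h>

enum domain { DOMAIN_CPU, DOMAIN_GTT };

struct model_bo {
	int pin_count;           /* reused as the private userptr pin counter */
	enum domain placement;
};

static int validate(struct model_bo *bo, enum domain d)
{
	bo->placement = d;       /* pretend the migration always succeeds */
	return 0;
}

static int userptr_pin(struct model_bo *bo)
{
	int ret = 0;

	if (bo->pin_count == 0)  /* only the first pin migrates to GTT */
		ret = validate(bo, DOMAIN_GTT);
	if (ret == 0)
		bo->pin_count++;
	return ret;
}

static int userptr_unpin(struct model_bo *bo)
{
	if (--bo->pin_count == 0) /* last unpin releases back to CPU */
		return validate(bo, DOMAIN_CPU);
	return 0;
}

int main(void)
{
	struct model_bo bo = { 0, DOMAIN_CPU };

	userptr_pin(&bo);        /* placement -> GTT */
	userptr_pin(&bo);        /* nested pin: counter only */
	userptr_unpin(&bo);
	userptr_unpin(&bo);      /* placement -> CPU */
	printf("pin_count=%d placement=%d\n", bo.pin_count, (int)bo.placement);
	return 0;
}

The point of the counter is the same as in the patch: repeated map calls from multiple VMs must not re-validate an already-pinned userptr BO, and only the final unmap may release its pages.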
+diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile +index b400d56..60c60c0 100644 +--- a/drivers/gpu/drm/amd/amdkfd/Makefile ++++ b/drivers/gpu/drm/amd/amdkfd/Makefile +@@ -14,6 +14,6 @@ amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \ + kfd_process_queue_manager.o kfd_device_queue_manager.o \ + kfd_device_queue_manager_cik.o kfd_device_queue_manager_vi.o \ + kfd_interrupt.o kfd_events.o cik_event_interrupt.o \ +- kfd_dbgdev.o kfd_dbgmgr.o ++ kfd_dbgdev.o kfd_dbgmgr.o kfd_flat_memory.o kfd_crat.o kfd_rdma.o + + obj-$(CONFIG_HSA_AMD) += amdkfd.o +diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +index 211fc48..02a9082 100644 +--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c ++++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +@@ -24,40 +24,59 @@ + #include "kfd_events.h" + #include "cik_int.h" + +-static bool cik_event_interrupt_isr(struct kfd_dev *dev, ++static bool is_cpc_vm_fault(struct kfd_dev *dev, + const uint32_t *ih_ring_entry) + { +- unsigned int pasid; + const struct cik_ih_ring_entry *ihre = + (const struct cik_ih_ring_entry *)ih_ring_entry; + +- pasid = (ihre->ring_id & 0xffff0000) >> 16; ++ if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || ++ ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) && ++ ihre->vmid >= dev->vm_info.first_vmid_kfd && ++ ihre->vmid <= dev->vm_info.last_vmid_kfd) ++ return true; ++ return false; ++} ++static bool cik_event_interrupt_isr(struct kfd_dev *dev, ++ const uint32_t *ih_ring_entry) ++{ ++ const struct cik_ih_ring_entry *ihre = ++ (const struct cik_ih_ring_entry *)ih_ring_entry; + + /* Do not process in ISR, just request it to be forwarded to WQ. */ +- return (pasid != 0) && ++ return (ihre->pasid != 0) && + (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || + ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG || +- ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE); ++ ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE || ++ is_cpc_vm_fault(dev, ih_ring_entry)); + } + + static void cik_event_interrupt_wq(struct kfd_dev *dev, + const uint32_t *ih_ring_entry) + { +- unsigned int pasid; + const struct cik_ih_ring_entry *ihre = + (const struct cik_ih_ring_entry *)ih_ring_entry; + +- pasid = (ihre->ring_id & 0xffff0000) >> 16; +- +- if (pasid == 0) ++ if (ihre->pasid == 0) + return; + + if (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE) +- kfd_signal_event_interrupt(pasid, 0, 0); ++ kfd_signal_event_interrupt(ihre->pasid, 0, 0); + else if (ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG) +- kfd_signal_event_interrupt(pasid, ihre->data & 0xFF, 8); ++ kfd_signal_event_interrupt(ihre->pasid, ihre->data & 0xFF, 8); + else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE) +- kfd_signal_hw_exception_event(pasid); ++ kfd_signal_hw_exception_event(ihre->pasid); ++ else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || ++ ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { ++ struct kfd_vm_fault_info info; ++ ++ dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info); ++ kfd_process_vm_fault(dev->dqm, ihre->pasid); ++ if (info.vmid == ihre->vmid) ++ kfd_signal_vm_fault_event(dev, ihre->pasid, &info); ++ else ++ kfd_signal_vm_fault_event(dev, ihre->pasid, NULL); ++ } + } + + const struct kfd_event_interrupt_class event_interrupt_class_cik = { +diff --git a/drivers/gpu/drm/amd/amdkfd/cik_int.h b/drivers/gpu/drm/amd/amdkfd/cik_int.h +index 79a16d2..feb3c24 100644 +--- a/drivers/gpu/drm/amd/amdkfd/cik_int.h ++++ b/drivers/gpu/drm/amd/amdkfd/cik_int.h 
+@@ -26,16 +26,30 @@ + #include <linux/types.h> + + struct cik_ih_ring_entry { +- uint32_t source_id; +- uint32_t data; +- uint32_t ring_id; +- uint32_t reserved; ++ uint32_t source_id:8; ++ uint32_t reserved1:8; ++ uint32_t reserved2:16; ++ ++ uint32_t data:28; ++ uint32_t reserved3:4; ++ ++ /* pipeid, meid and unused3 are officially called RINGID, ++ * but for our purposes, they always decode into pipe and ME. */ ++ uint32_t pipeid:2; ++ uint32_t meid:2; ++ uint32_t reserved4:4; ++ uint32_t vmid:8; ++ uint32_t pasid:16; ++ ++ uint32_t reserved5; + }; + + #define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6 + #define CIK_INTSRC_CP_END_OF_PIPE 0xB5 + #define CIK_INTSRC_CP_BAD_OPCODE 0xB7 + #define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF ++#define CIK_INTSRC_GFX_PAGE_INV_FAULT 0x92 ++#define CIK_INTSRC_GFX_MEM_PROT_FAULT 0x93 + + #endif + +diff --git a/drivers/gpu/drm/amd/amdkfd/cik_regs.h b/drivers/gpu/drm/amd/amdkfd/cik_regs.h +index 48769d1..607fc5c 100644 +--- a/drivers/gpu/drm/amd/amdkfd/cik_regs.h ++++ b/drivers/gpu/drm/amd/amdkfd/cik_regs.h +@@ -23,11 +23,33 @@ + #ifndef CIK_REGS_H + #define CIK_REGS_H + ++#define IH_VMID_0_LUT 0x3D40u ++ ++#define BIF_DOORBELL_CNTL 0x530Cu ++ ++#define SRBM_GFX_CNTL 0xE44 ++#define PIPEID(x) ((x) << 0) ++#define MEID(x) ((x) << 2) ++#define VMID(x) ((x) << 4) ++#define QUEUEID(x) ((x) << 8) ++ ++#define SQ_CONFIG 0x8C00 ++ ++#define SH_MEM_BASES 0x8C28 + /* if PTR32, these are the bases for scratch and lds */ + #define PRIVATE_BASE(x) ((x) << 0) /* scratch */ + #define SHARED_BASE(x) ((x) << 16) /* LDS */ ++#define SH_MEM_APE1_BASE 0x8C2C ++/* if PTR32, this is the base location of GPUVM */ ++#define SH_MEM_APE1_LIMIT 0x8C30 ++/* if PTR32, this is the upper limit of GPUVM */ ++#define SH_MEM_CONFIG 0x8C34 + #define PTR32 (1 << 0) ++#define PRIVATE_ATC (1 << 1) + #define ALIGNMENT_MODE(x) ((x) << 2) ++#define SH_MEM_ALIGNMENT_MODE_DWORD 0 ++#define SH_MEM_ALIGNMENT_MODE_DWORD_STRICT 1 ++#define SH_MEM_ALIGNMENT_MODE_STRICT 2 + #define SH_MEM_ALIGNMENT_MODE_UNALIGNED 3 + #define DEFAULT_MTYPE(x) ((x) << 4) + #define APE1_MTYPE(x) ((x) << 7) +@@ -36,37 +58,164 @@ + #define MTYPE_CACHED 0 + #define MTYPE_NONCACHED 3 + ++ ++#define SH_STATIC_MEM_CONFIG 0x9604u ++ ++#define TC_CFG_L1_LOAD_POLICY0 0xAC68 ++#define TC_CFG_L1_LOAD_POLICY1 0xAC6C ++#define TC_CFG_L1_STORE_POLICY 0xAC70 ++#define TC_CFG_L2_LOAD_POLICY0 0xAC74 ++#define TC_CFG_L2_LOAD_POLICY1 0xAC78 ++#define TC_CFG_L2_STORE_POLICY0 0xAC7C ++#define TC_CFG_L2_STORE_POLICY1 0xAC80 ++#define TC_CFG_L2_ATOMIC_POLICY 0xAC84 ++#define TC_CFG_L1_VOLATILE 0xAC88 ++#define TC_CFG_L2_VOLATILE 0xAC8C ++ ++#define CP_PQ_WPTR_POLL_CNTL 0xC20C ++#define WPTR_POLL_EN (1 << 31) ++ ++#define CPC_INT_CNTL 0xC2D0 ++#define CP_ME1_PIPE0_INT_CNTL 0xC214 ++#define CP_ME1_PIPE1_INT_CNTL 0xC218 ++#define CP_ME1_PIPE2_INT_CNTL 0xC21C ++#define CP_ME1_PIPE3_INT_CNTL 0xC220 ++#define CP_ME2_PIPE0_INT_CNTL 0xC224 ++#define CP_ME2_PIPE1_INT_CNTL 0xC228 ++#define CP_ME2_PIPE2_INT_CNTL 0xC22C ++#define CP_ME2_PIPE3_INT_CNTL 0xC230 ++#define DEQUEUE_REQUEST_INT_ENABLE (1 << 13) ++#define WRM_POLL_TIMEOUT_INT_ENABLE (1 << 17) ++#define PRIV_REG_INT_ENABLE (1 << 23) ++#define TIME_STAMP_INT_ENABLE (1 << 26) ++#define GENERIC2_INT_ENABLE (1 << 29) ++#define GENERIC1_INT_ENABLE (1 << 30) ++#define GENERIC0_INT_ENABLE (1 << 31) ++#define CP_ME1_PIPE0_INT_STATUS 0xC214 ++#define CP_ME1_PIPE1_INT_STATUS 0xC218 ++#define CP_ME1_PIPE2_INT_STATUS 0xC21C ++#define CP_ME1_PIPE3_INT_STATUS 0xC220 ++#define CP_ME2_PIPE0_INT_STATUS 0xC224 ++#define 
CP_ME2_PIPE1_INT_STATUS 0xC228 ++#define CP_ME2_PIPE2_INT_STATUS 0xC22C ++#define CP_ME2_PIPE3_INT_STATUS 0xC230 ++#define DEQUEUE_REQUEST_INT_STATUS (1 << 13) ++#define WRM_POLL_TIMEOUT_INT_STATUS (1 << 17) ++#define PRIV_REG_INT_STATUS (1 << 23) ++#define TIME_STAMP_INT_STATUS (1 << 26) ++#define GENERIC2_INT_STATUS (1 << 29) ++#define GENERIC1_INT_STATUS (1 << 30) ++#define GENERIC0_INT_STATUS (1 << 31) ++ ++#define CP_HPD_EOP_BASE_ADDR 0xC904 ++#define CP_HPD_EOP_BASE_ADDR_HI 0xC908 ++#define CP_HPD_EOP_VMID 0xC90C ++#define CP_HPD_EOP_CONTROL 0xC910 ++#define EOP_SIZE(x) ((x) << 0) ++#define EOP_SIZE_MASK (0x3f << 0) ++#define CP_MQD_BASE_ADDR 0xC914 ++#define CP_MQD_BASE_ADDR_HI 0xC918 ++#define CP_HQD_ACTIVE 0xC91C ++#define CP_HQD_VMID 0xC920 ++ ++#define CP_HQD_PERSISTENT_STATE 0xC924u + #define DEFAULT_CP_HQD_PERSISTENT_STATE (0x33U << 8) + #define PRELOAD_REQ (1 << 0) + +-#define MQD_CONTROL_PRIV_STATE_EN (1U << 8) +- +-#define DEFAULT_MIN_IB_AVAIL_SIZE (3U << 20) +- +-#define IB_ATC_EN (1U << 23) +- ++#define CP_HQD_PIPE_PRIORITY 0xC928u ++#define CP_HQD_QUEUE_PRIORITY 0xC92Cu ++#define CP_HQD_QUANTUM 0xC930u + #define QUANTUM_EN 1U + #define QUANTUM_SCALE_1MS (1U << 4) + #define QUANTUM_DURATION(x) ((x) << 8) + ++#define CP_HQD_PQ_BASE 0xC934 ++#define CP_HQD_PQ_BASE_HI 0xC938 ++#define CP_HQD_PQ_RPTR 0xC93C ++#define CP_HQD_PQ_RPTR_REPORT_ADDR 0xC940 ++#define CP_HQD_PQ_RPTR_REPORT_ADDR_HI 0xC944 ++#define CP_HQD_PQ_WPTR_POLL_ADDR 0xC948 ++#define CP_HQD_PQ_WPTR_POLL_ADDR_HI 0xC94C ++#define CP_HQD_PQ_DOORBELL_CONTROL 0xC950 ++#define DOORBELL_OFFSET(x) ((x) << 2) ++#define DOORBELL_OFFSET_MASK (0x1fffff << 2) ++#define DOORBELL_SOURCE (1 << 28) ++#define DOORBELL_SCHD_HIT (1 << 29) ++#define DOORBELL_EN (1 << 30) ++#define DOORBELL_HIT (1 << 31) ++#define CP_HQD_PQ_WPTR 0xC954 ++#define CP_HQD_PQ_CONTROL 0xC958 ++#define QUEUE_SIZE(x) ((x) << 0) ++#define QUEUE_SIZE_MASK (0x3f << 0) + #define RPTR_BLOCK_SIZE(x) ((x) << 8) ++#define RPTR_BLOCK_SIZE_MASK (0x3f << 8) + #define MIN_AVAIL_SIZE(x) ((x) << 20) ++#define PQ_ATC_EN (1 << 23) ++#define PQ_VOLATILE (1 << 26) ++#define NO_UPDATE_RPTR (1 << 27) ++#define UNORD_DISPATCH (1 << 28) ++#define ROQ_PQ_IB_FLIP (1 << 29) ++#define PRIV_STATE (1 << 30) ++#define KMD_QUEUE (1 << 31) ++ + #define DEFAULT_RPTR_BLOCK_SIZE RPTR_BLOCK_SIZE(5) + #define DEFAULT_MIN_AVAIL_SIZE MIN_AVAIL_SIZE(3) + +-#define PQ_ATC_EN (1 << 23) +-#define NO_UPDATE_RPTR (1 << 27) ++#define CP_HQD_IB_BASE_ADDR 0xC95Cu ++#define CP_HQD_IB_BASE_ADDR_HI 0xC960u ++#define CP_HQD_IB_RPTR 0xC964u ++#define CP_HQD_IB_CONTROL 0xC968u ++#define IB_ATC_EN (1U << 23) ++#define DEFAULT_MIN_IB_AVAIL_SIZE (3U << 20) + +-#define DOORBELL_OFFSET(x) ((x) << 2) +-#define DOORBELL_EN (1 << 30) ++#define CP_HQD_DEQUEUE_REQUEST 0xC974 ++#define DEQUEUE_REQUEST_DRAIN 1 ++#define DEQUEUE_REQUEST_RESET 2 ++#define DEQUEUE_INT (1U << 8) + +-#define PRIV_STATE (1 << 30) +-#define KMD_QUEUE (1 << 31) ++#define CP_HQD_SEMA_CMD 0xC97Cu ++#define CP_HQD_MSG_TYPE 0xC980u ++#define CP_HQD_ATOMIC0_PREOP_LO 0xC984u ++#define CP_HQD_ATOMIC0_PREOP_HI 0xC988u ++#define CP_HQD_ATOMIC1_PREOP_LO 0xC98Cu ++#define CP_HQD_ATOMIC1_PREOP_HI 0xC990u ++#define CP_HQD_HQ_SCHEDULER0 0xC994u ++#define CP_HQD_HQ_SCHEDULER1 0xC998u + +-#define AQL_ENABLE 1 ++ ++#define CP_MQD_CONTROL 0xC99C ++#define MQD_VMID(x) ((x) << 0) ++#define MQD_VMID_MASK (0xf << 0) ++#define MQD_CONTROL_PRIV_STATE_EN (1U << 8) + + #define GRBM_GFX_INDEX 0x30800 ++#define INSTANCE_INDEX(x) ((x) << 0) ++#define SH_INDEX(x) ((x) << 8) 
++#define SE_INDEX(x) ((x) << 16) ++#define SH_BROADCAST_WRITES (1 << 29) ++#define INSTANCE_BROADCAST_WRITES (1 << 30) ++#define SE_BROADCAST_WRITES (1 << 31) + ++#define SQC_CACHES 0x30d20 ++#define SQC_POLICY 0x8C38u ++#define SQC_VOLATILE 0x8C3Cu ++ ++#define CP_PERFMON_CNTL 0x36020 ++ ++#define ATC_VMID0_PASID_MAPPING 0x339Cu ++#define ATC_VMID_PASID_MAPPING_UPDATE_STATUS 0x3398u + #define ATC_VMID_PASID_MAPPING_VALID (1U << 31) + ++#define ATC_VM_APERTURE0_CNTL 0x3310u ++#define ATS_ACCESS_MODE_NEVER 0 ++#define ATS_ACCESS_MODE_ALWAYS 1 ++ ++#define ATC_VM_APERTURE0_CNTL2 0x3318u ++#define ATC_VM_APERTURE0_HIGH_ADDR 0x3308u ++#define ATC_VM_APERTURE0_LOW_ADDR 0x3300u ++#define ATC_VM_APERTURE1_CNTL 0x3314u ++#define ATC_VM_APERTURE1_CNTL2 0x331Cu ++#define ATC_VM_APERTURE1_HIGH_ADDR 0x330Cu ++#define ATC_VM_APERTURE1_LOW_ADDR 0x3304u ++ + #endif +diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h +new file mode 100644 +index 0000000..1880dc0 +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h +@@ -0,0 +1,1377 @@ ++/* ++ * Copyright 2015 Advanced Micro Devices, Inc. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#if 0 ++ HW (CARRIZO) source code for CWSR trap handler ++ ++var G8SR_WDMEM_HWREG_OFFSET = 0 ++var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes ++ ++// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore. 
++ ++var G8SR_DEBUG_TIMESTAMP = 0 ++var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset ++var s_g8sr_ts_save_s = s[34:35] // save start ++var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader sends the SAVEWAVE msg to spi ++var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI writes the sr address to SQ ++var s_g8sr_ts_save_d = s[40:41] // save end ++var s_g8sr_ts_restore_s = s[42:43] // restore start ++var s_g8sr_ts_restore_d = s[44:45] // restore end ++ ++var G8SR_VGPR_SR_IN_DWX4 = 0 ++var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes ++var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 ++ ++ ++/*************************************************************************/ ++/* control on how to run the shader */ ++/*************************************************************************/ ++//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run) ++var EMU_RUN_HACK = 0 ++var EMU_RUN_HACK_RESTORE_NORMAL = 0 ++var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 ++var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 ++var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK ++var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK ++var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK ++var SAVE_LDS = 1 ++var WG_BASE_ADDR_LO = 0x9000a000 ++var WG_BASE_ADDR_HI = 0x0 ++var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem ++var CTX_SAVE_CONTROL = 0x0 ++var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL ++var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run) ++var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write ++var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes ++var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing ++ ++/**************************************************************************/ ++/* variables */ ++/**************************************************************************/ ++var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 ++var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 ++ ++var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 ++var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 ++var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 ++var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 ++var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 ++var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits ++ ++var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 ++var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask ++var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 ++var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 ++var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 ++var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF ++var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 ++var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 ++var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 ++var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 ++var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 ++ ++var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME ++var SQ_WAVE_IB_STS_RCNT_SIZE = 4 //FIXME ++var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME ++var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1 //FIXME ++var 
SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME ++ ++var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 ++var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 ++ ++ ++/* Save */ ++var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes ++var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE ++ ++var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit ++var S_SAVE_SPI_INIT_ATC_SHIFT = 27 ++var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype ++var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 ++var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG ++var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 ++ ++var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used ++var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME ++var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME ++var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME ++ ++var s_save_spi_init_lo = exec_lo ++var s_save_spi_init_hi = exec_hi ++ ++ //tba_lo and tba_hi need to be saved/restored ++var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} ++var s_save_pc_hi = ttmp1 ++var s_save_exec_lo = ttmp2 ++var s_save_exec_hi = ttmp3 ++var s_save_status = ttmp4 ++var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine ++var s_save_xnack_mask_lo = ttmp6 ++var s_save_xnack_mask_hi = ttmp7 ++var s_save_buf_rsrc0 = ttmp8 ++var s_save_buf_rsrc1 = ttmp9 ++var s_save_buf_rsrc2 = ttmp10 ++var s_save_buf_rsrc3 = ttmp11 ++ ++var s_save_mem_offset = tma_lo ++var s_save_alloc_size = s_save_trapsts //conflict ++var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time) ++var s_save_m0 = tma_hi ++ ++/* Restore */ ++var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE ++var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC ++ ++var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit ++var S_RESTORE_SPI_INIT_ATC_SHIFT = 27 ++var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype ++var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28 ++var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG ++var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 ++ ++var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT ++var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK ++var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT ++var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK ++ ++var s_restore_spi_init_lo = exec_lo ++var s_restore_spi_init_hi = exec_hi ++ ++var s_restore_mem_offset = ttmp2 ++var s_restore_alloc_size = ttmp3 ++var s_restore_tmp = ttmp6 //tba_lo/hi need to be restored ++var s_restore_mem_offset_save = s_restore_tmp //no conflict ++ ++var s_restore_m0 = s_restore_alloc_size //no conflict ++ ++var s_restore_mode = ttmp7 ++ ++var s_restore_pc_lo = ttmp0 ++var s_restore_pc_hi = ttmp1 ++var s_restore_exec_lo = tma_lo //no conflict ++var s_restore_exec_hi = tma_hi //no conflict ++var s_restore_status = ttmp4 ++var s_restore_trapsts = ttmp5 ++var s_restore_xnack_mask_lo = xnack_mask_lo ++var s_restore_xnack_mask_hi = xnack_mask_hi ++var s_restore_buf_rsrc0 = ttmp8 ++var s_restore_buf_rsrc1 = ttmp9 ++var s_restore_buf_rsrc2 = ttmp10 ++var s_restore_buf_rsrc3 = ttmp11 ++ ++/**************************************************************************/ ++/* 
trap handler entry points */ ++/**************************************************************************/ ++/* Shader Main*/ ++ ++shader main ++ asic(CARRIZO) ++ type(CS) ++ ++ ++ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore ++ //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC ++ s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC ++ s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. ++ s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE ++ //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE ++ s_branch L_SKIP_RESTORE //NOT restore, SAVE actually ++ else ++ s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save ++ end ++ ++L_JUMP_TO_RESTORE: ++ s_branch L_RESTORE //restore ++ ++L_SKIP_RESTORE: ++ ++ s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC ++ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) ++ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save ++ s_cbranch_scc1 L_SAVE //this is the operation for save ++ ++ // ********* Handle non-CWSR traps ******************* ++if (!EMU_RUN_HACK) ++ /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */ ++ s_load_dwordx4 [ttmp8,ttmp9,ttmp10, ttmp11], [tma_lo,tma_hi], 0 ++ s_waitcnt lgkmcnt(0) ++ s_or_b32 ttmp7, ttmp8, ttmp9 ++ s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set ++ s_mov_b32 tma_lo, ttmp10 //set tma_lo/hi for next level trap handler ++ s_mov_b32 tma_hi, ttmp11 ++ s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) ++ s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler ++ ++L_NO_NEXT_TRAP: ++ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) ++ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception ++ s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. ++ s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 ++ s_addc_u32 ttmp1, ttmp1, 0 ++L_EXCP_CASE: ++ s_and_b32 ttmp1, ttmp1, 0xFFFF ++ s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) ++ s_rfe_b64 [ttmp0, ttmp1] ++end ++ // ********* End handling of non-CWSR traps ******************* ++ ++/**************************************************************************/ ++/* save routine */ ++/**************************************************************************/ ++ ++L_SAVE: ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_save_s ++ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? 
++end ++ ++ //check whether there is mem_viol ++ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) ++ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK ++ s_cbranch_scc0 L_NO_PC_REWIND ++ ++ //if so, need to rewind the PC, assuming the GDS operation gets NACKed ++ s_mov_b32 s_save_tmp, 0 //clear mem_viol bit ++ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit ++ s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] ++ s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8 ++ s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc ++ ++L_NO_PC_REWIND: ++ s_mov_b32 s_save_tmp, 0 //clear saveCtx bit ++ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit ++ ++ s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK ++ s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //XNACK_MASK must be saved before any memory operation ++ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT ++ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT ++ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp ++ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY ++ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT ++ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp ++ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS ++ s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG ++ ++ s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp ++ ++ /* inform SPI of readiness and wait for SPI's go signal */ ++ s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI ++ s_mov_b32 s_save_exec_hi, exec_hi ++ s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_sq_save_msg ++ s_waitcnt lgkmcnt(0) ++end ++ ++ if (EMU_RUN_HACK) ++ ++ else ++ s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC ++ end ++ ++ L_SLEEP: ++ s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD and would hang SQ, since the 7th/8th wave could not win arbitration to execute an instruction while the other waves are stuck in this sleep loop waiting for wrexec!=0 ++ ++ if (EMU_RUN_HACK) ++ ++ else ++ s_cbranch_execz L_SLEEP ++ end ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_spi_wrexec ++ s_waitcnt lgkmcnt(0) ++end ++ ++ /* setup Resource Constants */ ++ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) ++ //calculate wd_addr using absolute thread id ++ v_readlane_b32 s_save_tmp, v9, 0 ++ s_lshr_b32 s_save_tmp, s_save_tmp, 6 ++ s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE ++ s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO ++ s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI ++ s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL ++ else ++ end ++ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) ++ s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO ++ s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI ++ s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL ++ else ++ end ++ ++ ++ s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo ++ s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi ++ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE ++ s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) 
although not necessarily initialized ++ s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC ++ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK ++ s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position ++ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC ++ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK ++ s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position ++ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE ++ ++ //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?) ++ s_mov_b32 s_save_m0, m0 //save M0 ++ ++ /* global mem offset */ ++ s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 ++ ++ ++ ++ ++ /* save HW registers */ ++ ////////////////////////////// ++ ++ L_SAVE_HWREG: ++ // HWREG SR memory offset : size(VGPR)+size(SGPR) ++ get_vgpr_size_bytes(s_save_mem_offset) ++ get_sgpr_size_bytes(s_save_tmp) ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp ++ ++ ++ s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes ++ if (SWIZZLE_EN) ++ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ ++ write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0 ++ ++ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) ++ s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 ++ s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over ++ s_mov_b32 tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO ++ s_mov_b32 tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI ++ end ++ ++ write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC ++ write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) ++ write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC ++ write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) ++ // Save the tma_lo and tma_hi content from exec_lo and ttmp5 ++ s_mov_b32 s_save_exec_lo, exec_lo ++ s_mov_b32 s_save_exec_hi, ttmp5 ++ write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS ++ ++ //s_save_trapsts conflicts with s_save_alloc_size ++ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) ++ write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS ++ ++ write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO ++ write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI ++ ++ //using s_save_tmp here would introduce a conflict between s_save_tmp and s_save_buf_rsrc2 ++ s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE ++ write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) ++ write_hwreg_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset) //TBA_LO ++ write_hwreg_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset) //TBA_HI ++ write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //TMA_LO ++ write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) //TMA_HI ++ ++ /* the first wave in the threadgroup */ ++ // save first_wave bit in unused bit.26 ++ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract first wave bit ++ //s_or_b32 tba_hi, s_save_tmp, tba_hi // save first wave bit to tba_hi.bits[26] ++ 
s_mov_b32 s_save_exec_hi, 0x0 ++ s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26] ++ ++ ++ /* save SGPRs */ ++ // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... ++ ////////////////////////////// ++ ++ // SGPR SR memory offset : size(VGPR) ++ get_vgpr_size_bytes(s_save_mem_offset) ++ // TODO, change RSRC word to rearrange memory layout for SGPRS ++ ++ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size ++ s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 ++ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) ++ ++ if (SGPR_SAVE_USE_SQC) ++ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes ++ else ++ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) ++ end ++ ++ if (SWIZZLE_EN) ++ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ ++ // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 ++ //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0 ++ s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0 ++ s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset ++ ++ s_mov_b32 m0, 0x0 //SGPR initial index value =0 ++ L_SAVE_SGPR_LOOP: ++ // SGPR is allocated in 16 SGPR granularity ++ s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] ++ s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] ++ s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] ++ s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] ++ s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] ++ s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] ++ s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] ++ s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] ++ ++ write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4 ++ s_add_u32 m0, m0, 16 //next sgpr index ++ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 ++ s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete? ++ // restore s_save_buf_rsrc0,1 ++ //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo ++ s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo ++ ++ ++ ++ ++ /* save first 4 VGPR, then LDS save could use */ ++ // each wave will alloc 4 vgprs at least... ++ ///////////////////////////////////////////////////////////////////////////////////// ++ ++ s_mov_b32 s_save_mem_offset, 0 ++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on ++ s_mov_b32 exec_hi, 0xFFFFFFFF ++ ++ if (SWIZZLE_EN) ++ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
++ else ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ ++ // VGPR Allocated in 4-GPR granularity ++ ++if G8SR_VGPR_SR_IN_DWX4 ++ // the const stride for DWx4 is 4*4 bytes ++ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes ++ ++ buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 ++ ++ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes ++else ++ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 ++ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 ++ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 ++ buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 ++end ++ ++ ++ ++ /* save LDS */ ++ ////////////////////////////// ++ ++ L_SAVE_LDS: ++ ++ // Change EXEC to all threads... ++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on ++ s_mov_b32 exec_hi, 0xFFFFFFFF ++ ++ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size ++ s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? ++ s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE ++ ++ s_barrier //LDS is used? wait for other waves in the same TG ++ //s_and_b32 s_save_tmp, tba_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here ++ s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here ++ s_cbranch_scc0 L_SAVE_LDS_DONE ++ ++ // first wave do LDS save; ++ ++ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw ++ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes ++ s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes ++ ++ // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) ++ // ++ get_vgpr_size_bytes(s_save_mem_offset) ++ get_sgpr_size_bytes(s_save_tmp) ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() ++ ++ ++ if (SWIZZLE_EN) ++ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ s_mov_b32 m0, 0x0 //lds_offset initial value = 0 ++ ++ ++var LDS_DMA_ENABLE = 0 ++var UNROLL = 0 ++if UNROLL==0 && LDS_DMA_ENABLE==1 ++ s_mov_b32 s3, 256*2 ++ s_nop 0 ++ s_nop 0 ++ s_nop 0 ++ L_SAVE_LDS_LOOP: ++ //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.??? ++ if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity ++ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW ++ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW ++ end ++ ++ s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes ++ s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 ++ s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete? 
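++ //note: each pass issues two buffer_store_lds_dword ops (2 x 64DW), so m0 and the mem offset actually advance by 512 bytes per pass (s3 = 256*2)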
++ ++elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROLL, has icache miss ++ // store from highest LDS address to lowest ++ s_mov_b32 s3, 256*2 ++ s_sub_u32 m0, s_save_alloc_size, s3 ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, m0 ++ s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128DW chunks... ++ s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from highest addr to lowest ++ s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block costs 6*4 bytes of instructions ++ s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //3*4 covers the 3 insts below: s_add, s_addc and s_setpc ++ s_nop 0 ++ s_nop 0 ++ s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes ++ s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved ++ s_add_u32 s0, s0,s_save_alloc_size ++ s_addc_u32 s1, s1, 0 ++ s_setpc_b64 s[0:1] ++ ++ ++ for var i =0; i< 128; i++ ++ // be careful to make this a 64-byte aligned address, which could improve performance... ++ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW ++ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW ++ ++ if i!=127 ++ s_sub_u32 m0, m0, s3 // use an sgpr to shrink the 2DW inst to a 1DW inst to improve performance, i.e. pack more LDS_DMA inst into one cacheline ++ s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3 ++ end ++ end ++ ++else // BUFFER_STORE ++ v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 ++ v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid ++ v_mul_i32_i24 v2, v3, 8 // tid*8 ++ v_mov_b32 v3, 256*2 ++ s_mov_b32 m0, 0x10000 ++ s_mov_b32 s0, s_save_buf_rsrc3 ++ s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid ++ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT ++ ++L_SAVE_LDS_LOOP_VECTOR: ++ ds_read_b64 v[0:1], v2 //x = LDS[a], byte address ++ s_waitcnt lgkmcnt(0) ++ buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1 ++// s_waitcnt vmcnt(0) ++ v_add_u32 v2, vcc[0:1], v2, v3 ++ v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size ++ s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR ++ ++ // restore rsrc3 ++ s_mov_b32 s_save_buf_rsrc3, s0 ++ ++end ++ ++L_SAVE_LDS_DONE: ++ ++ ++ /* save VGPRs - save the rest of the VGPRs */ ++ ////////////////////////////////////////////////////////////////////////////////////// ++ L_SAVE_VGPR: ++ // VGPR SR memory offset: 0 ++ // TODO rearrange the RSRC words to use swizzle for VGPR save... ++ ++ s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs ++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on ++ s_mov_b32 exec_hi, 0xFFFFFFFF ++ ++ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vgpr_size ++ s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 ++ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible ++ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) ++ if (SWIZZLE_EN) ++ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
++ else ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ ++ // VGPR Allocated in 4-GPR granularity ++ ++if G8SR_VGPR_SR_IN_DWX4 ++ // the const stride for DWx4 is 4*4 bytes ++ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes ++ ++ s_mov_b32 m0, 4 // skip first 4 VGPRs ++ s_cmp_lt_u32 m0, s_save_alloc_size ++ s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs ++ ++ s_set_gpr_idx_on m0, 0x1 // This will change M0 ++ s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 ++L_SAVE_VGPR_LOOP: ++ v_mov_b32 v0, v0 // v0 = v[0+m0] ++ v_mov_b32 v1, v1 ++ v_mov_b32 v2, v2 ++ v_mov_b32 v3, v3 ++ ++ ++ buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 ++ s_add_u32 m0, m0, 4 ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 ++ s_cmp_lt_u32 m0, s_save_alloc_size ++ s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? ++ s_set_gpr_idx_off ++L_SAVE_VGPR_LOOP_END: ++ ++ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes ++else ++ // VGPR store using dw burst ++ s_mov_b32 m0, 0x4 //VGPR initial index value =0 ++ s_cmp_lt_u32 m0, s_save_alloc_size ++ s_cbranch_scc0 L_SAVE_VGPR_END ++ ++ ++ s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 ++ s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later ++ ++ L_SAVE_VGPR_LOOP: ++ v_mov_b32 v0, v0 //v0 = v[0+m0] ++ v_mov_b32 v1, v1 //v0 = v[0+m0] ++ v_mov_b32 v2, v2 //v0 = v[0+m0] ++ v_mov_b32 v3, v3 //v0 = v[0+m0] ++ ++ if(USE_MTBUF_INSTEAD_OF_MUBUF) ++ tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 ++ else ++ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 ++ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 ++ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 ++ buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 ++ end ++ ++ s_add_u32 m0, m0, 4 //next vgpr index ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes ++ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 ++ s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? ++ s_set_gpr_idx_off ++end ++ ++L_SAVE_VGPR_END: ++ ++ ++ ++ ++ ++ ++ /* S_PGM_END_SAVED */ //FIXME graphics ONLY ++ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) ++ s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] ++ s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 ++ s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over ++ s_rfe_b64 s_save_pc_lo //Return to the main shader program ++ else ++ end ++ ++// Save Done timestamp ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_save_d ++ // SGPR SR memory offset : size(VGPR) ++ get_vgpr_size_bytes(s_save_mem_offset) ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET ++ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? ++ // Need reset rsrc2?? 
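++ //rsrc2 is reset to the maximum NUM_RECORDS below so the SQC store of the timestamp is not limited by a stale record count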
++ s_mov_b32 m0, s_save_mem_offset ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1 ++end ++ ++ ++ s_branch L_END_PGM ++ ++ ++ ++/**************************************************************************/ ++/* restore routine */ ++/**************************************************************************/ ++ ++L_RESTORE: ++ /* Setup Resource Constants */ ++ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) ++ //calculate wd_addr using absolute thread id ++ v_readlane_b32 s_restore_tmp, v9, 0 ++ s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 ++ s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE ++ s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO ++ s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI ++ s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL ++ else ++ end ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_restore_s ++ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? ++ // tma_lo/hi are sgpr 110, 111, which will not be used for the 112-SGPR-allocated case... ++ s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0] ++ s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, since exec will be restored at the end.. ++end ++ ++ ++ ++ s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo ++ s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi ++ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE ++ s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) ++ s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC ++ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK ++ s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position ++ s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC ++ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK ++ s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position ++ s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE ++ ++ /* global mem offset */ ++// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0 ++ ++ /* the first wave in the threadgroup */ ++ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK ++ s_cbranch_scc0 L_RESTORE_VGPR ++ ++ /* restore LDS */ ++ ////////////////////////////// ++ L_RESTORE_LDS: ++ ++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead ++ s_mov_b32 exec_hi, 0xFFFFFFFF ++ ++ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size ++ s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? ++ s_cbranch_scc0 L_RESTORE_VGPR //no lds used? 
jump to L_RESTORE_VGPR ++ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw ++ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes ++ s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes ++ ++ // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) ++ // ++ get_vgpr_size_bytes(s_restore_mem_offset) ++ get_sgpr_size_bytes(s_restore_tmp) ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? ++ ++ ++ if (SWIZZLE_EN) ++ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ s_mov_b32 m0, 0x0 //lds_offset initial value = 0 ++ ++ L_RESTORE_LDS_LOOP: ++ if (SAVE_LDS) ++ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW ++ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW ++ end ++ s_add_u32 m0, m0, 256*2 // 128 DW ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW ++ s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 ++ s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? ++ ++ ++ /* restore VGPRs */ ++ ////////////////////////////// ++ L_RESTORE_VGPR: ++ // VGPR SR memory offset : 0 ++ s_mov_b32 s_restore_mem_offset, 0x0 ++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead ++ s_mov_b32 exec_hi, 0xFFFFFFFF ++ ++ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size ++ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 ++ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) ++ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) ++ if (SWIZZLE_EN) ++ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++if G8SR_VGPR_SR_IN_DWX4 ++ get_vgpr_size_bytes(s_restore_mem_offset) ++ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 ++ ++ // the const stride for DWx4 is 4*4 bytes ++ s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes ++ ++ s_mov_b32 m0, s_restore_alloc_size ++ s_set_gpr_idx_on m0, 0x8 // Note.. 
This will change m0 ++ ++L_RESTORE_VGPR_LOOP: ++ buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 ++ s_waitcnt vmcnt(0) ++ s_sub_u32 m0, m0, 4 ++ v_mov_b32 v0, v0 // v[0+m0] = v0 ++ v_mov_b32 v1, v1 ++ v_mov_b32 v2, v2 ++ v_mov_b32 v3, v3 ++ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 ++ s_cmp_eq_u32 m0, 0x8000 ++ s_cbranch_scc0 L_RESTORE_VGPR_LOOP ++ s_set_gpr_idx_off ++ ++ s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes ++ ++else ++ // VGPR load using dw burst ++ s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 ++ s_mov_b32 m0, 4 //VGPR initial index value = 1 ++ s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 ++ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later ++ ++ L_RESTORE_VGPR_LOOP: ++ if(USE_MTBUF_INSTEAD_OF_MUBUF) ++ tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 ++ else ++ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 ++ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 ++ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 ++ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 ++ end ++ s_waitcnt vmcnt(0) //ensure data ready ++ v_mov_b32 v0, v0 //v[0+m0] = v0 ++ v_mov_b32 v1, v1 ++ v_mov_b32 v2, v2 ++ v_mov_b32 v3, v3 ++ s_add_u32 m0, m0, 4 //next vgpr index ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes ++ s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 ++ s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? 
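++ //v0..v3 were used as staging registers for the loads above, so the first 4-VGPR group is reloaded last (below) from s_restore_mem_offset_save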
++ s_set_gpr_idx_off ++ /* VGPR restore on v0 */ ++ if(USE_MTBUF_INSTEAD_OF_MUBUF) ++ tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 ++ else ++ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 ++ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 ++ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 ++ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 ++ end ++ ++end ++ ++ /* restore SGPRs */ ++ ////////////////////////////// ++ ++ // SGPR SR memory offset : size(VGPR) ++ get_vgpr_size_bytes(s_restore_mem_offset) ++ get_sgpr_size_bytes(s_restore_tmp) ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp ++ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPRs from S[n] down to S[0], in groups of 16 sgprs ++ // TODO, change RSRC word to rearrange memory layout for SGPRS ++ ++ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //sgpr_size ++ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 ++ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) ++ ++ if (SGPR_SAVE_USE_SQC) ++ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes ++ else ++ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) ++ end ++ if (SWIZZLE_EN) ++ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ /* If 112 SGPRs are allocated, 4 sgprs are not used: TBA(108,109), TMA(110,111). ++ However, it is safe to restore these 4 SGPRs anyway, since TBA,TMA will later be restored by HWREG ++ */ ++ s_mov_b32 m0, s_restore_alloc_size ++ ++ L_RESTORE_SGPR_LOOP: ++ read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made ++ s_waitcnt lgkmcnt(0) //ensure data ready ++ ++ s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] ++ ++ s_movreld_b64 s0, s0 //s[0+m0] = s0 ++ s_movreld_b64 s2, s2 ++ s_movreld_b64 s4, s4 ++ s_movreld_b64 s6, s6 ++ s_movreld_b64 s8, s8 ++ s_movreld_b64 s10, s10 ++ s_movreld_b64 s12, s12 ++ s_movreld_b64 s14, s14 ++ ++ s_cmp_eq_u32 m0, 0 //scc = (m0 == 0) ? 1 : 0 ++ s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete? ++ ++ /* restore HW registers */ ++ ////////////////////////////// ++ L_RESTORE_HWREG: ++ ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo ++ s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi ++end ++ ++ // HWREG SR memory offset : size(VGPR)+size(SGPR) ++ get_vgpr_size_bytes(s_restore_mem_offset) ++ get_sgpr_size_bytes(s_restore_tmp) ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp ++ ++ ++ s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes ++ if (SWIZZLE_EN) ++ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
++ else ++ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 ++ read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC ++ read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) ++ read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC ++ read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) ++ read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS ++ read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS ++ read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO ++ read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI ++ read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE ++ read_hwreg_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_LO ++ read_hwreg_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_HI ++ ++ s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS ++ ++ s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS ++ ++ //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: ++ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) ++ s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) ++ s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over ++ end ++ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) ++ s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal ++ s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over ++ end ++ ++ s_mov_b32 m0, s_restore_m0 ++ s_mov_b32 exec_lo, s_restore_exec_lo ++ s_mov_b32 exec_hi, s_restore_exec_hi ++ ++ read_hwreg_from_mem(tma_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //tma_lo ++ read_hwreg_from_mem(tma_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //tma_hi ++ s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS ++ s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts ++ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 ++ s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts ++ s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT ++ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 ++ //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore ++ s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode ++ //reuse s_restore_m0 as a temp register ++ s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK ++ s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT ++ s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT ++ s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero ++ s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 ++ s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK ++ s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT ++ s_lshl_b32 
s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT ++ s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 ++ s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK ++ s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT ++ s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp ++ ++ s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 ++ s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 ++ s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu ++ ++ s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_restore_d ++ s_waitcnt lgkmcnt(0) ++end ++ ++// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution ++ s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc ++ ++ ++/**************************************************************************/ ++/* the END */ ++/**************************************************************************/ ++L_END_PGM: ++ s_endpgm ++ ++end ++ ++ ++/**************************************************************************/ ++/* the helper functions */ ++/**************************************************************************/ ++ ++//Only for saving hwreg to mem ++function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) ++ s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on ++ s_mov_b32 m0, s_mem_offset ++ s_buffer_store_dword s, s_rsrc, m0 glc:0 ++ s_add_u32 s_mem_offset, s_mem_offset, 4 ++ s_mov_b32 m0, exec_lo ++end ++ ++//Only for saving hwreg to mem ++function write_tma_to_mem(s, s_rsrc, offset_imm) ++ s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on ++ s_mov_b32 m0, offset_imm ++ s_buffer_store_dword s, s_rsrc, m0 glc:0 ++ s_mov_b32 m0, exec_lo ++end ++ ++// HWREGs are saved before SGPRs, so all HWREGs can be used. 
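++// write_16sgpr_to_mem stores s[0:15] as four dwordx4 SQC writes at immediate offsets 0/16/32/48, then advances the rsrc base address itself by 64 bytes instead of using an offset register.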
++function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) ++ ++ s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:0 ++ s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:0 ++ s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:0 ++ s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:0 ++ s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 ++ s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc ++end ++ ++ ++function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) ++ s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 ++ s_add_u32 s_mem_offset, s_mem_offset, 4 ++end ++ ++function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) ++ s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 ++ s_sub_u32 s_mem_offset, s_mem_offset, 4*16 ++end ++ ++ ++ ++function get_lds_size_bytes(s_lds_size_byte) ++ // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW ++ s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size ++ s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW ++end ++ ++function get_vgpr_size_bytes(s_vgpr_size_byte) ++ s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size ++ s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 ++ s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible ++end ++ ++function get_sgpr_size_bytes(s_sgpr_size_byte) ++ s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size ++ s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1 ++ s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value) ++end ++ ++function get_hwreg_size_bytes ++ return 128 //HWREG size 128 bytes ++end ++ ++#endif ++ ++static const uint32_t cwsr_trap_carrizo_hex[] = { ++ 0xbf820001, 0xbf820131, ++ 0xb8f4f802, 0xb8f5f803, ++ 0x8675ff75, 0x00000400, ++ 0xbf850013, 0xc00a1e37, ++ 0x00000000, 0xbf8c007f, ++ 0x87777978, 0xbf840004, ++ 0xbeee007a, 0xbeef007b, ++ 0xb974f802, 0xbe801d78, ++ 0xb8f5f803, 0x8675ff75, ++ 0x000001ff, 0xbf850002, ++ 0x80708470, 0x82718071, ++ 0x8671ff71, 0x0000ffff, ++ 0xb974f802, 0xbe801f70, ++ 0xb8f5f803, 0x8675ff75, ++ 0x00000100, 0xbf840006, ++ 0xbefa0080, 0xb97a0203, ++ 0x8671ff71, 0x0000ffff, ++ 0x80f08870, 0x82f18071, ++ 0xbefa0080, 0xb97a0283, ++ 0xbef60068, 0xbef70069, ++ 0xb8fa1c07, 0x8e7a9c7a, ++ 0x87717a71, 0xb8fa03c7, ++ 0x8e7a9b7a, 0x87717a71, ++ 0xb8faf807, 0x867aff7a, ++ 0x00007fff, 0xb97af807, ++ 0xbef2007e, 0xbef3007f, ++ 0xbefe0180, 0xbf900004, ++ 0xbf8e0002, 0xbf88fffe, ++ 0xbef8007e, 0x8679ff7f, ++ 0x0000ffff, 0x8779ff79, ++ 0x00040000, 0xbefa0080, ++ 0xbefb00ff, 0x00807fac, ++ 0x867aff7f, 0x08000000, ++ 0x8f7a837a, 0x877b7a7b, ++ 0x867aff7f, 0x70000000, ++ 0x8f7a817a, 0x877b7a7b, ++ 0xbeef007c, 0xbeee0080, ++ 0xb8ee2a05, 0x806e816e, ++ 0x8e6e8a6e, 0xb8fa1605, ++ 0x807a817a, 0x8e7a867a, ++ 0x806e7a6e, 0xbefa0084, ++ 0xbefa00ff, 0x01000000, ++ 0xbefe007c, 0xbefc006e, ++ 0xc0601bfc, 0x0000007c, ++ 0x806e846e, 0xbefc007e, ++ 0xbefe007c, 0xbefc006e, ++ 0xc0601c3c, 0x0000007c, ++ 0x806e846e, 0xbefc007e, ++ 0xbefe007c, 0xbefc006e, ++ 0xc0601c7c, 0x0000007c, ++ 0x806e846e, 0xbefc007e, ++ 0xbefe007c, 0xbefc006e, ++ 0xc0601cbc, 0x0000007c, ++ 0x806e846e, 0xbefc007e, ++ 0xbefe007c, 0xbefc006e, ++ 0xc0601cfc, 0x0000007c, ++ 0x806e846e, 0xbefc007e, ++ 0xbef2007e, 0xbef30075, ++ 
0xbefe007c, 0xbefc006e, ++ 0xc0601d3c, 0x0000007c, ++ 0x806e846e, 0xbefc007e, ++ 0xb8f5f803, 0xbefe007c, ++ 0xbefc006e, 0xc0601d7c, ++ 0x0000007c, 0x806e846e, ++ 0xbefc007e, 0xbefe007c, ++ 0xbefc006e, 0xc0601dbc, ++ 0x0000007c, 0x806e846e, ++ 0xbefc007e, 0xbefe007c, ++ 0xbefc006e, 0xc0601dfc, ++ 0x0000007c, 0x806e846e, ++ 0xbefc007e, 0xb8eff801, ++ 0xbefe007c, 0xbefc006e, ++ 0xc0601bfc, 0x0000007c, ++ 0x806e846e, 0xbefc007e, ++ 0xbefe007c, 0xbefc006e, ++ 0xc0601b3c, 0x0000007c, ++ 0x806e846e, 0xbefc007e, ++ 0xbefe007c, 0xbefc006e, ++ 0xc0601b7c, 0x0000007c, ++ 0x806e846e, 0xbefc007e, ++ 0xbefe007c, 0xbefc006e, ++ 0xc0601cbc, 0x0000007c, ++ 0x806e846e, 0xbefc007e, ++ 0xbefe007c, 0xbefc006e, ++ 0xc0601cfc, 0x0000007c, ++ 0x806e846e, 0xbefc007e, ++ 0x867aff7f, 0x04000000, ++ 0xbef30080, 0x8773737a, ++ 0xb8ee2a05, 0x806e816e, ++ 0x8e6e8a6e, 0xb8f51605, ++ 0x80758175, 0x8e758475, ++ 0x8e7a8275, 0xbefa00ff, ++ 0x01000000, 0xbef60178, ++ 0x80786e78, 0xbefc0080, ++ 0xbe802b00, 0xbe822b02, ++ 0xbe842b04, 0xbe862b06, ++ 0xbe882b08, 0xbe8a2b0a, ++ 0xbe8c2b0c, 0xbe8e2b0e, ++ 0xc06a003c, 0x00000000, ++ 0xc06a013c, 0x00000010, ++ 0xc06a023c, 0x00000020, ++ 0xc06a033c, 0x00000030, ++ 0x8078c078, 0x82798079, ++ 0x807c907c, 0xbf0a757c, ++ 0xbf85ffeb, 0xbef80176, ++ 0xbeee0080, 0xbefe00c1, ++ 0xbeff00c1, 0xbefa00ff, ++ 0x01000000, 0xe0724000, ++ 0x6e1e0000, 0xe0724100, ++ 0x6e1e0100, 0xe0724200, ++ 0x6e1e0200, 0xe0724300, ++ 0x6e1e0300, 0xbefe00c1, ++ 0xbeff00c1, 0xb8f54306, ++ 0x8675c175, 0xbf84002c, ++ 0xbf8a0000, 0x867aff73, ++ 0x04000000, 0xbf840028, ++ 0x8e758675, 0x8e758275, ++ 0xbefa0075, 0xb8ee2a05, ++ 0x806e816e, 0x8e6e8a6e, ++ 0xb8fa1605, 0x807a817a, ++ 0x8e7a867a, 0x806e7a6e, ++ 0x806eff6e, 0x00000080, ++ 0xbefa00ff, 0x01000000, ++ 0xbefc0080, 0xd28c0002, ++ 0x000100c1, 0xd28d0003, ++ 0x000204c1, 0xd1060002, ++ 0x00011103, 0x7e0602ff, ++ 0x00000200, 0xbefc00ff, ++ 0x00010000, 0xbe80007b, ++ 0x867bff7b, 0xff7fffff, ++ 0x877bff7b, 0x00058000, ++ 0xd8ec0000, 0x00000002, ++ 0xbf8c007f, 0xe0765000, ++ 0x6e1e0002, 0x32040702, ++ 0xd0c9006a, 0x0000eb02, ++ 0xbf87fff7, 0xbefb0000, ++ 0xbeee00ff, 0x00000400, ++ 0xbefe00c1, 0xbeff00c1, ++ 0xb8f52a05, 0x80758175, ++ 0x8e758275, 0x8e7a8875, ++ 0xbefa00ff, 0x01000000, ++ 0xbefc0084, 0xbf0a757c, ++ 0xbf840015, 0xbf11017c, ++ 0x8075ff75, 0x00001000, ++ 0x7e000300, 0x7e020301, ++ 0x7e040302, 0x7e060303, ++ 0xe0724000, 0x6e1e0000, ++ 0xe0724100, 0x6e1e0100, ++ 0xe0724200, 0x6e1e0200, ++ 0xe0724300, 0x6e1e0300, ++ 0x807c847c, 0x806eff6e, ++ 0x00000400, 0xbf0a757c, ++ 0xbf85ffef, 0xbf9c0000, ++ 0xbf8200d1, 0xbef8007e, ++ 0x8679ff7f, 0x0000ffff, ++ 0x8779ff79, 0x00040000, ++ 0xbefa0080, 0xbefb00ff, ++ 0x00807fac, 0x8676ff7f, ++ 0x08000000, 0x8f768376, ++ 0x877b767b, 0x8676ff7f, ++ 0x70000000, 0x8f768176, ++ 0x877b767b, 0x8676ff7f, ++ 0x04000000, 0xbf84001e, ++ 0xbefe00c1, 0xbeff00c1, ++ 0xb8f34306, 0x8673c173, ++ 0xbf840019, 0x8e738673, ++ 0x8e738273, 0xbefa0073, ++ 0xb8f22a05, 0x80728172, ++ 0x8e728a72, 0xb8f61605, ++ 0x80768176, 0x8e768676, ++ 0x80727672, 0x8072ff72, ++ 0x00000080, 0xbefa00ff, ++ 0x01000000, 0xbefc0080, ++ 0xe0510000, 0x721e0000, ++ 0xe0510100, 0x721e0000, ++ 0x807cff7c, 0x00000200, ++ 0x8072ff72, 0x00000200, ++ 0xbf0a737c, 0xbf85fff6, ++ 0xbef20080, 0xbefe00c1, ++ 0xbeff00c1, 0xb8f32a05, ++ 0x80738173, 0x8e738273, ++ 0x8e7a8873, 0xbefa00ff, ++ 0x01000000, 0xbef60072, ++ 0x8072ff72, 0x00000400, ++ 0xbefc0084, 0xbf11087c, ++ 0x8073ff73, 0x00008000, ++ 0xe0524000, 0x721e0000, ++ 0xe0524100, 0x721e0100, ++ 0xe0524200, 0x721e0200, ++ 0xe0524300, 
0x721e0300, ++ 0xbf8c0f70, 0x7e000300, ++ 0x7e020301, 0x7e040302, ++ 0x7e060303, 0x807c847c, ++ 0x8072ff72, 0x00000400, ++ 0xbf0a737c, 0xbf85ffee, ++ 0xbf9c0000, 0xe0524000, ++ 0x761e0000, 0xe0524100, ++ 0x761e0100, 0xe0524200, ++ 0x761e0200, 0xe0524300, ++ 0x761e0300, 0xb8f22a05, ++ 0x80728172, 0x8e728a72, ++ 0xb8f61605, 0x80768176, ++ 0x8e768676, 0x80727672, ++ 0x80f2c072, 0xb8f31605, ++ 0x80738173, 0x8e738473, ++ 0x8e7a8273, 0xbefa00ff, ++ 0x01000000, 0xbefc0073, ++ 0xc031003c, 0x00000072, ++ 0x80f2c072, 0xbf8c007f, ++ 0x80fc907c, 0xbe802d00, ++ 0xbe822d02, 0xbe842d04, ++ 0xbe862d06, 0xbe882d08, ++ 0xbe8a2d0a, 0xbe8c2d0c, ++ 0xbe8e2d0e, 0xbf06807c, ++ 0xbf84fff1, 0xb8f22a05, ++ 0x80728172, 0x8e728a72, ++ 0xb8f61605, 0x80768176, ++ 0x8e768676, 0x80727672, ++ 0xbefa0084, 0xbefa00ff, ++ 0x01000000, 0xc0211cfc, ++ 0x00000072, 0x80728472, ++ 0xc0211c3c, 0x00000072, ++ 0x80728472, 0xc0211c7c, ++ 0x00000072, 0x80728472, ++ 0xc0211bbc, 0x00000072, ++ 0x80728472, 0xc0211bfc, ++ 0x00000072, 0x80728472, ++ 0xc0211d3c, 0x00000072, ++ 0x80728472, 0xc0211d7c, ++ 0x00000072, 0x80728472, ++ 0xc0211a3c, 0x00000072, ++ 0x80728472, 0xc0211a7c, ++ 0x00000072, 0x80728472, ++ 0xc0211dfc, 0x00000072, ++ 0x80728472, 0xc0211b3c, ++ 0x00000072, 0x80728472, ++ 0xc0211b7c, 0x00000072, ++ 0x80728472, 0xbf8c007f, ++ 0x8671ff71, 0x0000ffff, ++ 0xbefc0073, 0xbefe006e, ++ 0xbeff006f, 0xc0211bbc, ++ 0x00000072, 0x80728472, ++ 0xc0211bfc, 0x00000072, ++ 0x80728472, 0xbf8c007f, ++ 0x867375ff, 0x000003ff, ++ 0xb9734803, 0x867375ff, ++ 0xfffff800, 0x8f738b73, ++ 0xb973a2c3, 0xb977f801, ++ 0x8673ff71, 0xf0000000, ++ 0x8f739c73, 0x8e739073, ++ 0xbef60080, 0x87767376, ++ 0x8673ff71, 0x08000000, ++ 0x8f739b73, 0x8e738f73, ++ 0x87767376, 0x8673ff74, ++ 0x00800000, 0x8f739773, ++ 0xb976f807, 0x86fe7e7e, ++ 0x86ea6a6a, 0xb974f802, ++ 0xbf8a0000, 0x95807370, ++ 0xbf810000, 0x00000000, ++}; ++ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +index 6316aad..595640a 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +@@ -31,16 +31,23 @@ + #include <uapi/linux/kfd_ioctl.h> + #include <linux/time.h> + #include <linux/mm.h> +-#include <linux/mman.h> ++#include <uapi/asm-generic/mman-common.h> + #include <asm/processor.h> ++ + #include "kfd_priv.h" + #include "kfd_device_queue_manager.h" + #include "kfd_dbgmgr.h" ++#include "cik_regs.h" + + static long kfd_ioctl(struct file *, unsigned int, unsigned long); + static int kfd_open(struct inode *, struct file *); + static int kfd_mmap(struct file *, struct vm_area_struct *); ++static uint32_t kfd_convert_user_mem_alloction_flags( ++ struct kfd_dev *dev, ++ uint32_t userspace_flags); ++static bool kfd_is_large_bar(struct kfd_dev *dev); + ++static int kfd_evict(struct file *filep, struct kfd_process *p, void *data); + static const char kfd_dev_name[] = "kfd"; + + static const struct file_operations kfd_fops = { +@@ -117,7 +124,7 @@ static int kfd_open(struct inode *inode, struct file *filep) + return -EPERM; + } + +- process = kfd_create_process(current); ++ process = kfd_create_process(filep); + if (IS_ERR(process)) + return PTR_ERR(process); + +@@ -206,6 +213,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, + q_properties->ctx_save_restore_area_address = + args->ctx_save_restore_address; + q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size; ++ q_properties->ctl_stack_size = args->ctl_stack_size; + if (args->queue_type == 
KFD_IOC_QUEUE_TYPE_COMPUTE || + args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) + q_properties->type = KFD_QUEUE_TYPE_COMPUTE; +@@ -270,7 +278,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, + return -EINVAL; + } + +- mutex_lock(&p->mutex); ++ down_write(&p->lock); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { +@@ -282,8 +290,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, + p->pasid, + dev->id); + +- err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, +- 0, q_properties.type, &queue_id); ++ err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id); + if (err != 0) + goto err_create_queue; + +@@ -291,10 +298,10 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, + + + /* Return gpu_id as doorbell offset for mmap usage */ +- args->doorbell_offset = (KFD_MMAP_DOORBELL_MASK | args->gpu_id); ++ args->doorbell_offset = (KFD_MMAP_TYPE_DOORBELL | args->gpu_id); + args->doorbell_offset <<= PAGE_SHIFT; + +- mutex_unlock(&p->mutex); ++ up_write(&p->lock); + + pr_debug("kfd: queue id %d was created successfully\n", args->queue_id); + +@@ -311,7 +318,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, + + err_create_queue: + err_bind_process: +- mutex_unlock(&p->mutex); ++ up_write(&p->lock); + return err; + } + +@@ -325,11 +332,11 @@ static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p, + args->queue_id, + p->pasid); + +- mutex_lock(&p->mutex); ++ down_write(&p->lock); + + retval = pqm_destroy_queue(&p->pqm, args->queue_id); + +- mutex_unlock(&p->mutex); ++ up_write(&p->lock); + return retval; + } + +@@ -371,11 +378,33 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, + pr_debug("kfd: updating queue id %d for PASID %d\n", + args->queue_id, p->pasid); + +- mutex_lock(&p->mutex); ++ down_write(&p->lock); + + retval = pqm_update_queue(&p->pqm, args->queue_id, &properties); + +- mutex_unlock(&p->mutex); ++ up_write(&p->lock); ++ ++ return retval; ++} ++ ++static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p, ++ void *data) ++{ ++ int retval; ++ struct kfd_ioctl_set_cu_mask_args *args = data; ++ struct queue_properties properties; ++ uint32_t __user *cu_mask_ptr = (uint32_t __user *)args->cu_mask_ptr; ++ ++ if (get_user(properties.cu_mask, cu_mask_ptr)) ++ return -EFAULT; ++ if (properties.cu_mask == 0) ++ return 0; ++ ++ down_write(&p->lock); ++ ++ retval = pqm_set_cu_mask(&p->pqm, args->queue_id, &properties); ++ ++ up_write(&p->lock); + + return retval; + } +@@ -403,7 +432,7 @@ static int kfd_ioctl_set_memory_policy(struct file *filep, + if (dev == NULL) + return -EINVAL; + +- mutex_lock(&p->mutex); ++ down_write(&p->lock); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { +@@ -427,46 +456,80 @@ static int kfd_ioctl_set_memory_policy(struct file *filep, + err = -EINVAL; + + out: +- mutex_unlock(&p->mutex); ++ up_write(&p->lock); + + return err; + } + +-static int kfd_ioctl_dbg_register(struct file *filep, +- struct kfd_process *p, void *data) ++static int kfd_ioctl_set_trap_handler(struct file *filep, ++ struct kfd_process *p, void *data) + { +- struct kfd_ioctl_dbg_register_args *args = data; ++ struct kfd_ioctl_set_trap_handler_args *args = data; + struct kfd_dev *dev; +- struct kfd_dbgmgr *dbgmgr_ptr; ++ int err = 0; + struct kfd_process_device *pdd; +- bool create_ok; +- long status = 0; + + dev = kfd_device_by_id(args->gpu_id); + if (dev == 
NULL) + return -EINVAL; + +- if (dev->device_info->asic_family == CHIP_CARRIZO) { +- pr_debug("kfd_ioctl_dbg_register not supported on CZ\n"); +- return -EINVAL; ++ down_write(&p->lock); ++ ++ pdd = kfd_bind_process_to_device(dev, p); ++ if (IS_ERR(pdd)) { ++ err = -ESRCH; ++ goto out; ++ } ++ if (!dev->cwsr_enabled || !pdd->qpd.cwsr_kaddr) { ++ pr_err("kfd: CWSR is not enabled, can't set trap handler.\n"); ++ err = -EINVAL; ++ goto out; + } + +- mutex_lock(kfd_get_dbgmgr_mutex()); +- mutex_lock(&p->mutex); ++ if (dev->dqm->ops.set_trap_handler(dev->dqm, ++ &pdd->qpd, ++ args->tba_addr, ++ args->tma_addr)) ++ err = -EINVAL; + +- /* +- * make sure that we have pdd, if this the first queue created for +- * this process +- */ ++out: ++ up_write(&p->lock); ++ ++ return err; ++} ++ ++static int ++kfd_ioctl_dbg_register(struct file *filep, struct kfd_process *p, void *data) ++{ ++ long status = -EFAULT; ++ struct kfd_ioctl_dbg_register_args *args = data; ++ struct kfd_dev *dev; ++ struct kfd_dbgmgr *dbgmgr_ptr; ++ struct kfd_process_device *pdd; ++ bool create_ok = false; ++ ++ pr_debug("kfd:dbg: %s\n", __func__); ++ ++ dev = kfd_device_by_id(args->gpu_id); ++ if (!dev) { ++ dev_info(NULL, "Error! kfd: In func %s >> getting device by id failed\n", __func__); ++ return status; ++ } ++ ++ down_write(&p->lock); ++ mutex_lock(get_dbgmgr_mutex()); ++ ++ /* make sure that we have pdd, if this the first queue created for this process */ + pdd = kfd_bind_process_to_device(dev, p); +- if (IS_ERR(pdd)) { +- mutex_unlock(&p->mutex); +- mutex_unlock(kfd_get_dbgmgr_mutex()); ++ if (IS_ERR(pdd) < 0) { ++ mutex_unlock(get_dbgmgr_mutex()); ++ up_write(&p->lock); + return PTR_ERR(pdd); + } + + if (dev->dbgmgr == NULL) { + /* In case of a legal call, we have no dbgmgr yet */ ++ + create_ok = kfd_dbgmgr_create(&dbgmgr_ptr, dev); + if (create_ok) { + status = kfd_dbgmgr_register(dbgmgr_ptr, p); +@@ -475,13 +538,10 @@ static int kfd_ioctl_dbg_register(struct file *filep, + else + dev->dbgmgr = dbgmgr_ptr; + } +- } else { +- pr_debug("debugger already registered\n"); +- status = -EINVAL; + } + +- mutex_unlock(&p->mutex); +- mutex_unlock(kfd_get_dbgmgr_mutex()); ++ mutex_unlock(get_dbgmgr_mutex()); ++ up_write(&p->lock); + + return status; + } +@@ -489,9 +549,9 @@ static int kfd_ioctl_dbg_register(struct file *filep, + static int kfd_ioctl_dbg_unregister(struct file *filep, + struct kfd_process *p, void *data) + { ++ long status = -EFAULT; + struct kfd_ioctl_dbg_unregister_args *args = data; + struct kfd_dev *dev; +- long status; + + dev = kfd_device_by_id(args->gpu_id); + if (dev == NULL) +@@ -502,7 +562,7 @@ static int kfd_ioctl_dbg_unregister(struct file *filep, + return -EINVAL; + } + +- mutex_lock(kfd_get_dbgmgr_mutex()); ++ mutex_lock(get_dbgmgr_mutex()); + + status = kfd_dbgmgr_unregister(dev->dbgmgr, p); + if (status == 0) { +@@ -510,7 +570,7 @@ static int kfd_ioctl_dbg_unregister(struct file *filep, + dev->dbgmgr = NULL; + } + +- mutex_unlock(kfd_get_dbgmgr_mutex()); ++ mutex_unlock(get_dbgmgr_mutex()); + + return status; + } +@@ -519,125 +579,144 @@ static int kfd_ioctl_dbg_unregister(struct file *filep, + * Parse and generate variable size data structure for address watch. + * Total size of the buffer and # watch points is limited in order + * to prevent kernel abuse. (no bearing to the much smaller HW limitation +- * which is enforced by dbgdev module) ++ * which is enforced by dbgdev module. 
+ * please also note that the watch address itself are not "copied from user", + * since it be set into the HW in user mode values. + * + */ +-static int kfd_ioctl_dbg_address_watch(struct file *filep, +- struct kfd_process *p, void *data) ++ ++static int ++kfd_ioctl_dbg_address_watch(struct file *filep, ++ struct kfd_process *p, ++ void *data) + { ++ long status = -EFAULT; + struct kfd_ioctl_dbg_address_watch_args *args = data; + struct kfd_dev *dev; + struct dbg_address_watch_info aw_info; +- unsigned char *args_buff; +- long status; +- void __user *cmd_from_user; +- uint64_t watch_mask_value = 0; ++ unsigned char *args_buff = NULL; + unsigned int args_idx = 0; ++ uint64_t watch_mask_value = 0; + + memset((void *) &aw_info, 0, sizeof(struct dbg_address_watch_info)); + +- dev = kfd_device_by_id(args->gpu_id); +- if (dev == NULL) +- return -EINVAL; ++ do { ++ dev = kfd_device_by_id(args->gpu_id); ++ if (!dev) { ++ dev_info(NULL, ++ "Error! kfd: In func %s >> get device by id failed\n", ++ __func__); ++ break; ++ } + +- if (dev->device_info->asic_family == CHIP_CARRIZO) { +- pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); +- return -EINVAL; +- } ++ if (args->buf_size_in_bytes > MAX_ALLOWED_AW_BUFF_SIZE) { ++ status = -EINVAL; ++ break; ++ } + +- cmd_from_user = (void __user *) args->content_ptr; ++ if (args->buf_size_in_bytes <= sizeof(*args)) { ++ status = -EINVAL; ++ break; ++ } + +- /* Validate arguments */ ++ /* this is the actual buffer to work with */ + +- if ((args->buf_size_in_bytes > MAX_ALLOWED_AW_BUFF_SIZE) || +- (args->buf_size_in_bytes <= sizeof(*args) + sizeof(int) * 2) || +- (cmd_from_user == NULL)) +- return -EINVAL; ++ args_buff = kzalloc(args->buf_size_in_bytes - ++ sizeof(*args), GFP_KERNEL); ++ if (args_buff == NULL) { ++ status = -ENOMEM; ++ break; ++ } + +- /* this is the actual buffer to work with */ +- args_buff = memdup_user(cmd_from_user, +- args->buf_size_in_bytes - sizeof(*args)); +- if (IS_ERR(args_buff)) +- return PTR_ERR(args_buff); ++ /* this is the actual buffer to work with */ ++ args_buff = memdup_user(cmd_from_user, ++ args->buf_size_in_bytes - sizeof(*args)); ++ if (IS_ERR(args_buff)) ++ return PTR_ERR(args_buff); + +- aw_info.process = p; ++ aw_info.process = p; + +- aw_info.num_watch_points = *((uint32_t *)(&args_buff[args_idx])); +- args_idx += sizeof(aw_info.num_watch_points); ++ aw_info.num_watch_points = *((uint32_t *)(&args_buff[args_idx])); ++ args_idx += sizeof(aw_info.num_watch_points); + +- aw_info.watch_mode = (enum HSA_DBG_WATCH_MODE *) &args_buff[args_idx]; +- args_idx += sizeof(enum HSA_DBG_WATCH_MODE) * aw_info.num_watch_points; ++ aw_info.watch_mode = (HSA_DBG_WATCH_MODE *) &args_buff[args_idx]; ++ args_idx += sizeof(HSA_DBG_WATCH_MODE) * aw_info.num_watch_points; + +- /* +- * set watch address base pointer to point on the array base +- * within args_buff +- */ +- aw_info.watch_address = (uint64_t *) &args_buff[args_idx]; ++ /* set watch address base pointer to point on the array base within args_buff */ + +- /* skip over the addresses buffer */ +- args_idx += sizeof(aw_info.watch_address) * aw_info.num_watch_points; ++ aw_info.watch_address = (uint64_t *) &args_buff[args_idx]; + +- if (args_idx >= args->buf_size_in_bytes - sizeof(*args)) { +- kfree(args_buff); +- return -EINVAL; +- } ++ /*skip over the addresses buffer */ ++ args_idx += sizeof(aw_info.watch_address) * aw_info.num_watch_points; + +- watch_mask_value = (uint64_t) args_buff[args_idx]; ++ if (args_idx >= args->buf_size_in_bytes) { ++ status = -EINVAL; ++ 
break; ++ } + +- if (watch_mask_value > 0) { +- /* +- * There is an array of masks. +- * set watch mask base pointer to point on the array base +- * within args_buff +- */ +- aw_info.watch_mask = (uint64_t *) &args_buff[args_idx]; ++ watch_mask_value = (uint64_t) args_buff[args_idx]; + +- /* skip over the masks buffer */ +- args_idx += sizeof(aw_info.watch_mask) * +- aw_info.num_watch_points; +- } else { +- /* just the NULL mask, set to NULL and skip over it */ +- aw_info.watch_mask = NULL; +- args_idx += sizeof(aw_info.watch_mask); +- } ++ if (watch_mask_value > 0) { ++ /* there is an array of masks */ + +- if (args_idx >= args->buf_size_in_bytes - sizeof(args)) { +- kfree(args_buff); +- return -EINVAL; +- } ++ /* set watch mask base pointer to point on the array base within args_buff */ ++ aw_info.watch_mask = (uint64_t *) &args_buff[args_idx]; ++ ++ /*skip over the masks buffer */ ++ args_idx += sizeof(aw_info.watch_mask) * aw_info.num_watch_points; ++ } ++ ++ else ++ /* just the NULL mask, set to NULL and skip over it */ ++ { ++ aw_info.watch_mask = NULL; ++ args_idx += sizeof(aw_info.watch_mask); ++ } ++ ++ if (args_idx > args->buf_size_in_bytes) { ++ status = -EINVAL; ++ break; ++ } ++ ++ aw_info.watch_event = NULL; /* Currently HSA Event is not supported for DBG */ ++ status = 0; ++ ++ } while (0); + +- /* Currently HSA Event is not supported for DBG */ +- aw_info.watch_event = NULL; ++ if (status == 0) { ++ mutex_lock(get_dbgmgr_mutex()); + +- mutex_lock(kfd_get_dbgmgr_mutex()); ++ status = kfd_dbgmgr_address_watch(dev->dbgmgr, &aw_info); + +- status = kfd_dbgmgr_address_watch(dev->dbgmgr, &aw_info); ++ mutex_unlock(get_dbgmgr_mutex()); + +- mutex_unlock(kfd_get_dbgmgr_mutex()); ++ } + + kfree(args_buff); + + return status; + } + +-/* Parse and generate fixed size data structure for wave control */ +-static int kfd_ioctl_dbg_wave_control(struct file *filep, +- struct kfd_process *p, void *data) ++/* ++ * Parse and generate fixed size data structure for wave control. ++ * Buffer is generated in a "packed" form, for avoiding structure packing/pending dependencies. 
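
/*
 * Illustrative sketch, not from the patch: the "compact" payload that
 * the wave-control handler below expects, written as a packed struct.
 * Field order follows the parser (operand, mode, trap id, wave-message
 * value); the 32-bit enum widths and the trailing 64-bit MemoryVA slot
 * are assumptions.  The handler rejects the ioctl unless
 * buf_size_in_bytes equals sizeof(*args) plus the sum of these fields.
 */
#include <stdint.h>

struct wave_control_compact {
	uint32_t operand;	/* HSA_DBG_WAVEOP */
	uint32_t mode;		/* HSA_DBG_WAVEMODE */
	uint32_t trap_id;
	uint32_t wave_msg_value;	/* DbgWaveMsg.WaveMsgInfoGen2.Value */
	uint64_t memory_va;	/* sized into the buffer, forced to NULL by the kernel */
} __attribute__((packed));
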
++ */ ++ ++static int ++kfd_ioctl_dbg_wave_control(struct file *filep, struct kfd_process *p, void *data) + { ++ long status = -EFAULT; + struct kfd_ioctl_dbg_wave_control_args *args = data; + struct kfd_dev *dev; + struct dbg_wave_control_info wac_info; +- unsigned char *args_buff; +- uint32_t computed_buff_size; +- long status; +- void __user *cmd_from_user; ++ unsigned char *args_buff = NULL; + unsigned int args_idx = 0; ++ uint32_t computed_buff_size; + + memset((void *) &wac_info, 0, sizeof(struct dbg_wave_control_info)); + + /* we use compact form, independent of the packing attribute value */ ++ + computed_buff_size = sizeof(*args) + + sizeof(wac_info.mode) + + sizeof(wac_info.operand) + +@@ -645,26 +724,25 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep, + sizeof(wac_info.dbgWave_msg.MemoryVA) + + sizeof(wac_info.trapId); + +- dev = kfd_device_by_id(args->gpu_id); +- if (dev == NULL) +- return -EINVAL; + +- if (dev->device_info->asic_family == CHIP_CARRIZO) { +- pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); +- return -EINVAL; +- } ++ dev_info(NULL, "kfd: In func %s - start\n", __func__); + +- /* input size must match the computed "compact" size */ +- if (args->buf_size_in_bytes != computed_buff_size) { +- pr_debug("size mismatch, computed : actual %u : %u\n", +- args->buf_size_in_bytes, computed_buff_size); +- return -EINVAL; +- } ++ do { ++ dev = kfd_device_by_id(args->gpu_id); ++ if (!dev) { ++ dev_info(NULL, "Error! kfd: In func %s >> getting device by id failed\n", __func__); ++ break; ++ } + +- cmd_from_user = (void __user *) args->content_ptr; ++ /* input size must match the computed "compact" size */ + +- if (cmd_from_user == NULL) +- return -EINVAL; ++ if (args->buf_size_in_bytes != computed_buff_size) { ++ dev_info(NULL, ++ "Error! kfd: In func %s >> size mismatch, computed : actual %u : %u\n", ++ __func__, args->buf_size_in_bytes, computed_buff_size); ++ status = -EINVAL; ++ break; ++ } + + /* copy the entire buffer from user */ + +@@ -673,34 +751,51 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep, + if (IS_ERR(args_buff)) + return PTR_ERR(args_buff); + +- /* move ptr to the start of the "pay-load" area */ +- wac_info.process = p; ++ if (copy_from_user(args_buff, ++ (void __user *) args->content_ptr, ++ args->buf_size_in_bytes - sizeof(*args))) { ++ dev_info(NULL, ++ "Error! 
kfd: In func %s >> copy_from_user failed\n", ++ __func__); ++ break; ++ } ++ ++ /* move ptr to the start of the "pay-load" area */ + +- wac_info.operand = *((enum HSA_DBG_WAVEOP *)(&args_buff[args_idx])); +- args_idx += sizeof(wac_info.operand); + +- wac_info.mode = *((enum HSA_DBG_WAVEMODE *)(&args_buff[args_idx])); +- args_idx += sizeof(wac_info.mode); ++ wac_info.process = p; + +- wac_info.trapId = *((uint32_t *)(&args_buff[args_idx])); +- args_idx += sizeof(wac_info.trapId); ++ wac_info.operand = (HSA_DBG_WAVEOP) *((HSA_DBG_WAVEOP *)(&args_buff[args_idx])); ++ args_idx += sizeof(wac_info.operand); + +- wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value = +- *((uint32_t *)(&args_buff[args_idx])); +- wac_info.dbgWave_msg.MemoryVA = NULL; ++ wac_info.mode = (HSA_DBG_WAVEMODE) *((HSA_DBG_WAVEMODE *)(&args_buff[args_idx])); ++ args_idx += sizeof(wac_info.mode); + +- mutex_lock(kfd_get_dbgmgr_mutex()); ++ wac_info.trapId = (uint32_t) *((uint32_t *)(&args_buff[args_idx])); ++ args_idx += sizeof(wac_info.trapId); + +- pr_debug("Calling dbg manager process %p, operand %u, mode %u, trapId %u, message %u\n", +- wac_info.process, wac_info.operand, +- wac_info.mode, wac_info.trapId, +- wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); ++ wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value = *((uint32_t *)(&args_buff[args_idx])); ++ wac_info.dbgWave_msg.MemoryVA = NULL; + +- status = kfd_dbgmgr_wave_control(dev->dbgmgr, &wac_info); + +- pr_debug("Returned status of dbg manager is %ld\n", status); ++ status = 0; + +- mutex_unlock(kfd_get_dbgmgr_mutex()); ++ } while (0); ++ if (status == 0) { ++ mutex_lock(get_dbgmgr_mutex()); ++ ++ dev_info(NULL, ++ "kfd: In func %s >> calling dbg manager process %p, operand %u, mode %u, trapId %u, message %u\n", ++ __func__, wac_info.process, wac_info.operand, wac_info.mode, wac_info.trapId, ++ wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); ++ ++ status = kfd_dbgmgr_wave_control(dev->dbgmgr, &wac_info); ++ ++ dev_info(NULL, "kfd: In func %s >> returned status of dbg manager is %ld\n", __func__, status); ++ ++ mutex_unlock(get_dbgmgr_mutex()); ++ ++ } + + kfree(args_buff); + +@@ -715,12 +810,13 @@ static int kfd_ioctl_get_clock_counters(struct file *filep, + struct timespec64 time; + + dev = kfd_device_by_id(args->gpu_id); +- if (dev == NULL) +- return -EINVAL; +- +- /* Reading GPU clock counter from KGD */ +- args->gpu_clock_counter = +- dev->kfd2kgd->get_gpu_clock_counter(dev->kgd); ++ if (dev) ++ /* Reading GPU clock counter from KGD */ ++ args->gpu_clock_counter = ++ dev->kfd2kgd->get_gpu_clock_counter(dev->kgd); ++ else ++ /* Node without GPU resource */ ++ args->gpu_clock_counter = 0; + + /* No access to rdtsc. 
Using raw monotonic time */ + getrawmonotonic64(&time); +@@ -747,7 +843,7 @@ static int kfd_ioctl_get_process_apertures(struct file *filp, + + args->num_of_nodes = 0; + +- mutex_lock(&p->mutex); ++ down_write(&p->lock); + + /*if the process-device list isn't empty*/ + if (kfd_has_process_device_data(p)) { +@@ -786,52 +882,180 @@ static int kfd_ioctl_get_process_apertures(struct file *filp, + (args->num_of_nodes < NUM_OF_SUPPORTED_GPUS)); + } + +- mutex_unlock(&p->mutex); ++ up_write(&p->lock); + + return 0; + } + +-static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p, +- void *data) ++static int kfd_ioctl_get_process_apertures_new(struct file *filp, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_get_process_apertures_new_args *args = data; ++ struct kfd_process_device_apertures *pa; ++ struct kfd_process_device *pdd; ++ uint32_t nodes = 0; ++ int ret; ++ ++ dev_dbg(kfd_device, "get apertures for PASID %d", p->pasid); ++ ++ if (args->num_of_nodes == 0) { ++ /* Return number of nodes, so that user space can alloacate ++ * sufficient memory */ ++ down_write(&p->lock); ++ ++ if (!kfd_has_process_device_data(p)) { ++ up_write(&p->lock); ++ return 0; ++ } ++ ++ /* Run over all pdd of the process */ ++ pdd = kfd_get_first_process_device_data(p); ++ do { ++ args->num_of_nodes++; ++ } while ((pdd = ++ kfd_get_next_process_device_data(p, pdd)) != NULL); ++ ++ up_write(&p->lock); ++ return 0; ++ } ++ ++ /* Fill in process-aperture information for all available ++ * nodes, but not more than args->num_of_nodes as that is ++ * the amount of memory allocated by user */ ++ pa = kzalloc((sizeof(struct kfd_process_device_apertures) * ++ args->num_of_nodes), GFP_KERNEL); ++ if (!pa) ++ return -ENOMEM; ++ ++ down_write(&p->lock); ++ ++ if (!kfd_has_process_device_data(p)) { ++ up_write(&p->lock); ++ args->num_of_nodes = 0; ++ kfree(pa); ++ return 0; ++ } ++ ++ /* Run over all pdd of the process */ ++ pdd = kfd_get_first_process_device_data(p); ++ do { ++ pa[nodes].gpu_id = pdd->dev->id; ++ pa[nodes].lds_base = pdd->lds_base; ++ pa[nodes].lds_limit = pdd->lds_limit; ++ pa[nodes].gpuvm_base = pdd->gpuvm_base; ++ pa[nodes].gpuvm_limit = pdd->gpuvm_limit; ++ pa[nodes].scratch_base = pdd->scratch_base; ++ pa[nodes].scratch_limit = pdd->scratch_limit; ++ ++ dev_dbg(kfd_device, ++ "gpu id %u\n", pdd->dev->id); ++ dev_dbg(kfd_device, ++ "lds_base %llX\n", pdd->lds_base); ++ dev_dbg(kfd_device, ++ "lds_limit %llX\n", pdd->lds_limit); ++ dev_dbg(kfd_device, ++ "gpuvm_base %llX\n", pdd->gpuvm_base); ++ dev_dbg(kfd_device, ++ "gpuvm_limit %llX\n", pdd->gpuvm_limit); ++ dev_dbg(kfd_device, ++ "scratch_base %llX\n", pdd->scratch_base); ++ dev_dbg(kfd_device, ++ "scratch_limit %llX\n", pdd->scratch_limit); ++ nodes++; ++ } while ( ++ (pdd = kfd_get_next_process_device_data(p, pdd)) != NULL && ++ (nodes < args->num_of_nodes)); ++ up_write(&p->lock); ++ ++ args->num_of_nodes = nodes; ++ ret = copy_to_user( ++ (void __user *)args->kfd_process_device_apertures_ptr, ++ pa, ++ (nodes * sizeof(struct kfd_process_device_apertures))); ++ kfree(pa); ++ return ret ? 
-EFAULT : 0; ++} ++ ++static int ++kfd_ioctl_create_event(struct file *filp, struct kfd_process *p, void *data) + { + struct kfd_ioctl_create_event_args *args = data; +- int err; ++ struct kfd_dev *kfd; ++ struct kfd_process_device *pdd; ++ int err = -EINVAL; ++ void *mem, *kern_addr = NULL; ++ ++ pr_debug("amdkfd: Event page offset 0x%llx\n", args->event_page_offset); ++ ++ if (args->event_page_offset) { ++ kfd = kfd_device_by_id(GET_GPU_ID(args->event_page_offset)); ++ if (!kfd) { ++ pr_err("amdkfd: can't find kfd device\n"); ++ return -EFAULT; ++ } ++ if (KFD_IS_DGPU(kfd->device_info->asic_family)) { ++ down_write(&p->lock); ++ pdd = kfd_bind_process_to_device(kfd, p); ++ if (IS_ERR(pdd) < 0) { ++ err = PTR_ERR(pdd); ++ up_write(&p->lock); ++ return -EFAULT; ++ } ++ mem = kfd_process_device_translate_handle(pdd, ++ GET_IDR_HANDLE(args->event_page_offset)); ++ if (!mem) { ++ pr_err("amdkfd: can't find BO offset is 0x%llx\n", ++ args->event_page_offset); ++ up_write(&p->lock); ++ return -EFAULT; ++ } ++ up_write(&p->lock); ++ ++ /* Map dGPU gtt BO to kernel */ ++ kfd->kfd2kgd->map_gtt_bo_to_kernel(kfd->kgd, ++ mem, &kern_addr); ++ } ++ } + +- err = kfd_event_create(filp, p, args->event_type, +- args->auto_reset != 0, args->node_id, +- &args->event_id, &args->event_trigger_data, +- &args->event_page_offset, +- &args->event_slot_index); ++ err = kfd_event_create(filp, p, ++ args->event_type, ++ args->auto_reset != 0, ++ args->node_id, ++ &args->event_id, ++ &args->event_trigger_data, ++ &args->event_page_offset, ++ &args->event_slot_index, ++ kern_addr); + + return err; + } + +-static int kfd_ioctl_destroy_event(struct file *filp, struct kfd_process *p, +- void *data) ++static int ++kfd_ioctl_destroy_event(struct file *filp, struct kfd_process *p, void *data) + { + struct kfd_ioctl_destroy_event_args *args = data; + + return kfd_event_destroy(p, args->event_id); + } + +-static int kfd_ioctl_set_event(struct file *filp, struct kfd_process *p, +- void *data) ++static int ++kfd_ioctl_set_event(struct file *filp, struct kfd_process *p, void *data) + { + struct kfd_ioctl_set_event_args *args = data; + + return kfd_set_event(p, args->event_id); + } + +-static int kfd_ioctl_reset_event(struct file *filp, struct kfd_process *p, +- void *data) ++static int ++kfd_ioctl_reset_event(struct file *filp, struct kfd_process *p, void *data) + { + struct kfd_ioctl_reset_event_args *args = data; + + return kfd_reset_event(p, args->event_id); + } + +-static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, +- void *data) ++static int ++kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, void *data) + { + struct kfd_ioctl_wait_events_args *args = data; + enum kfd_event_wait_result wait_result; +@@ -846,6 +1070,711 @@ static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, + + return err; + } ++static int kfd_ioctl_alloc_scratch_memory(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_alloc_memory_of_gpu_args *args = ++ (struct kfd_ioctl_alloc_memory_of_gpu_args *)data; ++ struct kfd_process_device *pdd; ++ struct kfd_dev *dev; ++ long err; ++ ++ if (args->size == 0) ++ return -EINVAL; ++ ++ dev = kfd_device_by_id(args->gpu_id); ++ if (dev == NULL) ++ return -EINVAL; ++ ++ down_write(&p->lock); ++ ++ pdd = kfd_bind_process_to_device(dev, p); ++ if (IS_ERR(pdd) < 0) { ++ err = PTR_ERR(pdd); ++ goto bind_process_to_device_fail; ++ } ++ ++ pdd->sh_hidden_private_base_vmid = args->va_addr; ++ pdd->qpd.sh_hidden_private_base = 
args->va_addr; ++ ++ up_write(&p->lock); ++ ++ if (sched_policy == KFD_SCHED_POLICY_NO_HWS && pdd->qpd.vmid != 0) { ++ err = dev->kfd2kgd->alloc_memory_of_scratch( ++ dev->kgd, args->va_addr, pdd->qpd.vmid); ++ if (err != 0) ++ goto alloc_memory_of_scratch_failed; ++ } ++ ++ return 0; ++ ++bind_process_to_device_fail: ++ up_write(&p->lock); ++alloc_memory_of_scratch_failed: ++ return -EFAULT; ++} ++ ++static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_alloc_memory_of_gpu_args *args = data; ++ struct kfd_process_device *pdd; ++ void *mem; ++ struct kfd_dev *dev; ++ int idr_handle; ++ long err; ++ ++ if (args->size == 0) ++ return -EINVAL; ++ ++ dev = kfd_device_by_id(args->gpu_id); ++ if (dev == NULL) ++ return -EINVAL; ++ ++ down_write(&p->lock); ++ pdd = kfd_bind_process_to_device(dev, p); ++ up_write(&p->lock); ++ if (IS_ERR(pdd) < 0) ++ return PTR_ERR(pdd); ++ ++ err = dev->kfd2kgd->alloc_memory_of_gpu( ++ dev->kgd, args->va_addr, args->size, ++ pdd->vm, (struct kgd_mem **) &mem, NULL, NULL, pdd, 0); ++ ++ if (err != 0) ++ return err; ++ ++ down_write(&p->lock); ++ idr_handle = kfd_process_device_create_obj_handle(pdd, mem, ++ args->va_addr, args->size); ++ up_write(&p->lock); ++ if (idr_handle < 0) { ++ dev->kfd2kgd->free_memory_of_gpu(dev->kgd, ++ (struct kgd_mem *) mem); ++ return -EFAULT; ++ } ++ ++ args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); ++ ++ return 0; ++} ++ ++bool kfd_is_large_bar(struct kfd_dev *dev) ++{ ++ struct kfd_local_mem_info mem_info; ++ ++ if (debug_largebar) { ++ pr_debug("amdkfd: simulate large-bar allocation on non large-bar machine\n"); ++ return true; ++ } ++ ++ if (!KFD_IS_DGPU(dev->device_info->asic_family)) ++ return false; ++ ++ dev->kfd2kgd->get_local_mem_info(dev->kgd, &mem_info); ++ if (mem_info.local_mem_size_private == 0 && ++ mem_info.local_mem_size_public > 0) ++ return true; ++ return false; ++} ++ ++static uint32_t kfd_convert_user_mem_alloction_flags( ++ struct kfd_dev *dev, ++ uint32_t userspace_flags) ++{ ++ uint32_t kernel_allocation_flags; ++ ++ kernel_allocation_flags = 0; ++ ++ /* Allocate VRAM bo */ ++ if ((userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE) || ++ (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_APU_DEVICE)) { ++ kernel_allocation_flags = ALLOC_MEM_FLAGS_VRAM; ++ if ((userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE) && ++ kfd_is_large_bar(dev)) ++ kernel_allocation_flags |= ALLOC_MEM_FLAGS_PUBLIC; ++ goto out; ++ } ++ /* ++ * Since currently user space library doesn't uses scratch ++ * allocation flag I route it to VRAM ++ */ ++ if ((userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_SCRATCH) || ++ (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_APU_SCRATCH)) { ++ kernel_allocation_flags = ALLOC_MEM_FLAGS_VRAM; ++ goto out; ++ } ++ /* ++ * The current usage for *_HOST allocation flags are for GTT memory ++ * Need to verify if we're node zero or we want to allocate bo on ++ * public domain for P2P buffers. 
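
/*
 * Illustrative sketch, not from the patch: the 64-bit handle returned
 * by the allocation path above packs the GPU id together with the
 * per-process IDR handle, so GET_GPU_ID()/GET_IDR_HANDLE() can recover
 * both on free/map.  The 32/32 bit split shown here is an assumption
 * consistent with how MAKE_HANDLE is used in this file.
 */
#include <stdint.h>

static inline uint64_t make_handle(uint32_t gpu_id, uint32_t idr_handle)
{
	return ((uint64_t)gpu_id << 32) | idr_handle;
}

static inline uint32_t handle_gpu_id(uint64_t h) { return (uint32_t)(h >> 32); }
static inline uint32_t handle_idr(uint64_t h)    { return (uint32_t)h; }
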
++ */ ++ if (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_HOST) { ++ kernel_allocation_flags = ALLOC_MEM_FLAGS_GTT; ++ goto out; ++ } ++ /* Allocate userptr BO */ ++ if (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) { ++ kernel_allocation_flags = ALLOC_MEM_FLAGS_USERPTR; ++ goto out; ++ } ++ ++out: ++ if (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_AQL_QUEUE_MEM) ++ kernel_allocation_flags |= ALLOC_MEM_FLAGS_AQL_QUEUE_MEM; ++ /* Current HW doesn't support non paged memory */ ++ kernel_allocation_flags |= ALLOC_MEM_FLAGS_NONPAGED; ++ /* ++ * Set by default execute access as this buffer might be allocated ++ * for CP's ring buffer ++ */ ++ kernel_allocation_flags |= ALLOC_MEM_FLAGS_EXECUTE_ACCESS; ++ kernel_allocation_flags |= ALLOC_MEM_FLAGS_NO_SUBSTITUTE; ++ ++ pr_debug("amdkfd: user allocation flags 0x%x kernel allocation flags: 0x%x\n", ++ userspace_flags, kernel_allocation_flags); ++ ++ return kernel_allocation_flags; ++} ++ ++static int kfd_ioctl_alloc_memory_of_gpu_new(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_alloc_memory_of_gpu_new_args *args = data; ++ struct kfd_process_device *pdd; ++ void *mem; ++ struct kfd_dev *dev; ++ int idr_handle; ++ long err; ++ uint64_t offset; ++ ++ if (args->size == 0) ++ return -EINVAL; ++ ++ dev = kfd_device_by_id(args->gpu_id); ++ if (dev == NULL) ++ return -EINVAL; ++ ++ down_write(&p->lock); ++ pdd = kfd_bind_process_to_device(dev, p); ++ up_write(&p->lock); ++ if (IS_ERR(pdd) < 0) ++ return PTR_ERR(pdd); ++ ++ offset = args->mmap_offset; ++ err = dev->kfd2kgd->alloc_memory_of_gpu( ++ dev->kgd, args->va_addr, args->size, ++ pdd->vm, (struct kgd_mem **) &mem, &offset, ++ NULL, pdd, ++ kfd_convert_user_mem_alloction_flags(dev, args->flags)); ++ ++ if (err != 0) ++ return err; ++ ++ down_write(&p->lock); ++ idr_handle = kfd_process_device_create_obj_handle(pdd, mem, ++ args->va_addr, args->size); ++ up_write(&p->lock); ++ if (idr_handle < 0) { ++ dev->kfd2kgd->free_memory_of_gpu(dev->kgd, ++ (struct kgd_mem *) mem); ++ return -EFAULT; ++ } ++ ++ args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); ++ if ((args->flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE) != 0 && ++ !kfd_is_large_bar(dev)) { ++ args->mmap_offset = 0; ++ } else { ++ args->mmap_offset = KFD_MMAP_TYPE_MAP_BO; ++ args->mmap_offset |= KFD_MMAP_GPU_ID(args->gpu_id); ++ args->mmap_offset <<= PAGE_SHIFT; ++ args->mmap_offset |= offset; ++ } ++ ++ return 0; ++} ++ ++static int kfd_ioctl_free_memory_of_gpu(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_free_memory_of_gpu_args *args = data; ++ struct kfd_process_device *pdd; ++ struct kfd_bo *buf_obj; ++ struct kfd_dev *dev; ++ int ret; ++ ++ dev = kfd_device_by_id(GET_GPU_ID(args->handle)); ++ if (dev == NULL) ++ return -EINVAL; ++ ++ down_write(&p->lock); ++ ++ pdd = kfd_get_process_device_data(dev, p); ++ if (!pdd) { ++ pr_err("Process device data doesn't exist\n"); ++ ret = -EINVAL; ++ goto err_unlock; ++ } ++ ++ buf_obj = kfd_process_device_find_bo(pdd, ++ GET_IDR_HANDLE(args->handle)); ++ if (buf_obj == NULL) { ++ ret = -EINVAL; ++ goto err_unlock; ++ } ++ run_rdma_free_callback(buf_obj); ++ ++ up_write(&p->lock); ++ ++ ret = dev->kfd2kgd->free_memory_of_gpu(dev->kgd, buf_obj->mem); ++ ++ /* If freeing the buffer failed, leave the handle in place for ++ * clean-up during process tear-down. 
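
/*
 * Illustrative user-space usage, not from the patch: the mmap_offset
 * cookie assembled above (type and GPU id shifted above PAGE_SHIFT,
 * OR'ed with the driver offset) is handed straight back to mmap() on
 * the /dev/kfd file descriptor; kfd_mmap(), later in this patch,
 * decodes the type and GPU id from vma->vm_pgoff.  Error handling
 * elided; the helper name is hypothetical.
 */
#include <stdint.h>
#include <sys/mman.h>

static void *map_kfd_bo(int kfd_fd, uint64_t size, uint64_t mmap_offset)
{
	return mmap(NULL, size, PROT_READ | PROT_WRITE,
		    MAP_SHARED, kfd_fd, (off_t)mmap_offset);
}
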
*/ ++ if (ret == 0) { ++ down_write(&p->lock); ++ kfd_process_device_remove_obj_handle( ++ pdd, GET_IDR_HANDLE(args->handle)); ++ up_write(&p->lock); ++ } ++ ++ return ret; ++ ++err_unlock: ++ up_write(&p->lock); ++ return ret; ++} ++ ++int kfd_map_memory_to_gpu(struct kfd_dev *dev, void *mem, ++ struct kfd_process *p, struct kfd_process_device *pdd) ++{ ++ int err; ++ ++ BUG_ON(!dev); ++ BUG_ON(!pdd); ++ ++ err = dev->kfd2kgd->map_memory_to_gpu( ++ dev->kgd, (struct kgd_mem *) mem, pdd->vm); ++ ++ if (err != 0) ++ return err; ++ ++ radeon_flush_tlb(dev, p->pasid); ++ ++ err = dev->dqm->ops.set_page_directory_base(dev->dqm, &pdd->qpd); ++ if (err != 0) { ++ dev->kfd2kgd->unmap_memory_to_gpu(dev->kgd, ++ (struct kgd_mem *) mem, pdd->vm); ++ return err; ++ } ++ ++ return 0; ++} ++ ++static int kfd_ioctl_map_memory_to_gpu(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_map_memory_to_gpu_new_args *args = data; ++ struct kfd_process_device *pdd, *peer_pdd; ++ void *mem; ++ struct kfd_dev *dev, *peer; ++ long err = 0; ++ int i, num_dev; ++ uint32_t *devices_arr = NULL; ++ int bo_size; ++ ++ dev = kfd_device_by_id(GET_GPU_ID(args->handle)); ++ if (dev == NULL) ++ return -EINVAL; ++ ++ if (args->device_ids_array_size > 0 && ++ (args->device_ids_array_size < sizeof(uint32_t))) { ++ pr_err("amdkfd: err node IDs array size %u\n", ++ args->device_ids_array_size); ++ return -EFAULT; ++ } ++ ++ if (args->device_ids_array_size > 0) { ++ devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL); ++ if (!devices_arr) ++ return -ENOMEM; ++ ++ err = copy_from_user(devices_arr, ++ (void __user *)args->device_ids_array, ++ args->device_ids_array_size); ++ if (err != 0) { ++ err = -EFAULT; ++ goto copy_from_user_failed; ++ } ++ } ++ ++ down_write(&p->lock); ++ ++ pdd = kfd_bind_process_to_device(dev, p); ++ if (IS_ERR(pdd) < 0) { ++ err = PTR_ERR(pdd); ++ goto bind_process_to_device_failed; ++ } ++ ++ mem = kfd_process_device_translate_handle(pdd, ++ GET_IDR_HANDLE(args->handle)); ++ up_write(&p->lock); ++ ++ if (mem == NULL) { ++ err = PTR_ERR(mem); ++ goto get_mem_obj_from_handle_failed; ++ } ++ ++ if (args->device_ids_array_size > 0) { ++ num_dev = args->device_ids_array_size / sizeof(uint32_t); ++ for (i = 0 ; i < num_dev; i++) { ++ peer = kfd_device_by_id(devices_arr[i]); ++ if (!peer) { ++ pr_err("amdkfd: didn't found kfd-dev for 0x%x\n", ++ devices_arr[i]); ++ err = -EFAULT; ++ goto get_mem_obj_from_handle_failed; ++ } ++ down_write(&p->lock); ++ peer_pdd = kfd_bind_process_to_device(peer, p); ++ up_write(&p->lock); ++ if (!peer_pdd) { ++ err = -EFAULT; ++ goto get_mem_obj_from_handle_failed; ++ } ++ err = kfd_map_memory_to_gpu(peer, mem, p, peer_pdd); ++ if (err != 0) ++ pr_err("amdkfd: failed to map\n"); ++ } ++ } else { ++ err = kfd_map_memory_to_gpu(dev, mem, p, pdd); ++ if (err != 0) ++ pr_err("amdkfd: failed to map\n"); ++ } ++ ++ bo_size = dev->kfd2kgd->return_bo_size(dev->kgd, mem); ++ down_write(&p->lock); ++ pdd->mapped_size += bo_size; ++ up_write(&p->lock); ++ ++ if (args->device_ids_array_size > 0 && devices_arr) ++ kfree(devices_arr); ++ ++ return err; ++ ++bind_process_to_device_failed: ++ up_write(&p->lock); ++get_mem_obj_from_handle_failed: ++copy_from_user_failed: ++ kfree(devices_arr); ++ return err; ++} ++ ++static int kfd_ioctl_map_memory_to_gpu_wrapper(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_map_memory_to_gpu_args *args = data; ++ struct kfd_ioctl_map_memory_to_gpu_new_args new_args; ++ ++ 
new_args.handle = args->handle; ++ new_args.device_ids_array = NULL; ++ new_args.device_ids_array_size = 0; ++ ++ return kfd_ioctl_map_memory_to_gpu(filep, p, &new_args); ++} ++ ++static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_unmap_memory_from_gpu_new_args *args = data; ++ struct kfd_process_device *pdd, *peer_pdd; ++ void *mem; ++ struct kfd_dev *dev, *peer; ++ long err = 0; ++ uint32_t *devices_arr = NULL, num_dev, i; ++ int bo_size; ++ ++ dev = kfd_device_by_id(GET_GPU_ID(args->handle)); ++ if (dev == NULL) ++ return -EINVAL; ++ ++ if (args->device_ids_array_size > 0 && ++ (args->device_ids_array_size < sizeof(uint32_t))) { ++ pr_err("amdkfd: err node IDs array size %u\n", ++ args->device_ids_array_size); ++ return -EFAULT; ++ } ++ ++ if (args->device_ids_array_size > 0) { ++ devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL); ++ if (!devices_arr) ++ return -ENOMEM; ++ ++ err = copy_from_user(devices_arr, ++ (void __user *)args->device_ids_array, ++ args->device_ids_array_size); ++ if (err != 0) { ++ err = -EFAULT; ++ goto copy_from_user_failed; ++ } ++ } ++ ++ down_write(&p->lock); ++ ++ pdd = kfd_get_process_device_data(dev, p); ++ if (!pdd) { ++ pr_err("Process device data doesn't exist\n"); ++ err = PTR_ERR(pdd); ++ goto bind_process_to_device_failed; ++ } ++ ++ mem = kfd_process_device_translate_handle(pdd, ++ GET_IDR_HANDLE(args->handle)); ++ up_write(&p->lock); ++ ++ if (mem == NULL) { ++ err = PTR_ERR(mem); ++ goto get_mem_obj_from_handle_failed; ++ } ++ ++ if (args->device_ids_array_size > 0) { ++ num_dev = args->device_ids_array_size / sizeof(uint32_t); ++ for (i = 0 ; i < num_dev; i++) { ++ peer = kfd_device_by_id(devices_arr[i]); ++ if (!peer) { ++ err = -EFAULT; ++ goto get_mem_obj_from_handle_failed; ++ } ++ down_write(&p->lock); ++ peer_pdd = kfd_get_process_device_data(peer, p); ++ up_write(&p->lock); ++ if (!peer_pdd) { ++ err = -EFAULT; ++ goto get_mem_obj_from_handle_failed; ++ } ++ peer->kfd2kgd->unmap_memory_to_gpu(peer->kgd, ++ mem, peer_pdd->vm); ++ radeon_flush_tlb(peer, p->pasid); ++ } ++ } else { ++ dev->kfd2kgd->unmap_memory_to_gpu(dev->kgd, mem, pdd->vm); ++ radeon_flush_tlb(dev, p->pasid); ++ } ++ ++ bo_size = dev->kfd2kgd->return_bo_size(dev->kgd, mem); ++ down_write(&p->lock); ++ pdd->mapped_size -= bo_size; ++ up_write(&p->lock); ++ ++ return 0; ++ ++bind_process_to_device_failed: ++ up_write(&p->lock); ++get_mem_obj_from_handle_failed: ++copy_from_user_failed: ++ kfree(devices_arr); ++ return err; ++} ++ ++static int kfd_ioctl_unmap_memory_from_gpu_wrapper(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_unmap_memory_from_gpu_args *args = data; ++ struct kfd_ioctl_unmap_memory_from_gpu_new_args new_args; ++ ++ new_args.handle = args->handle; ++ new_args.device_ids_array = NULL; ++ new_args.device_ids_array_size = 0; ++ ++ return kfd_ioctl_unmap_memory_from_gpu(filep, p, &new_args); ++} ++ ++static int kfd_ioctl_open_graphic_handle(struct file *filep, ++ struct kfd_process *p, ++ void *data) ++{ ++ struct kfd_ioctl_open_graphic_handle_args *args = data; ++ struct kfd_dev *dev; ++ struct kfd_process_device *pdd; ++ void *mem; ++ int idr_handle; ++ long err; ++ ++ dev = kfd_device_by_id(args->gpu_id); ++ if (dev == NULL) ++ return -EINVAL; ++ ++ if (dev->device_info->asic_family != CHIP_KAVERI) { ++ pr_debug("kfd_ioctl_open_graphic_handle only supported on KV\n"); ++ return -EINVAL; ++ } ++ ++ down_write(&p->lock); ++ pdd = 
kfd_bind_process_to_device(dev, p); ++ up_write(&p->lock); ++ if (IS_ERR(pdd) < 0) ++ return PTR_ERR(pdd); ++ ++ err = dev->kfd2kgd->open_graphic_handle(dev->kgd, ++ args->va_addr, ++ (struct kgd_vm *) pdd->vm, ++ args->graphic_device_fd, ++ args->graphic_handle, ++ (struct kgd_mem **) &mem); ++ ++ if (err != 0) ++ return err; ++ ++ down_write(&p->lock); ++ /*TODO: When open_graphic_handle is implemented, we need to create ++ * the corresponding interval tree. We need to know the size of ++ * the buffer through open_graphic_handle(). We use 1 for now.*/ ++ idr_handle = kfd_process_device_create_obj_handle(pdd, mem, ++ args->va_addr, 1); ++ up_write(&p->lock); ++ if (idr_handle < 0) { ++ /* FIXME: destroy_process_gpumem doesn't seem to be ++ * implemented anywhere */ ++ dev->kfd2kgd->destroy_process_gpumem(dev->kgd, mem); ++ return -EFAULT; ++ } ++ ++ args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); ++ ++ return 0; ++} ++ ++static int kfd_ioctl_set_process_dgpu_aperture(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_set_process_dgpu_aperture_args *args = data; ++ struct kfd_dev *dev; ++ struct kfd_process_device *pdd; ++ long err; ++ ++ dev = kfd_device_by_id(args->gpu_id); ++ if (dev == NULL) ++ return -EINVAL; ++ ++ down_write(&p->lock); ++ ++ pdd = kfd_bind_process_to_device(dev, p); ++ if (IS_ERR(pdd) < 0) { ++ err = PTR_ERR(pdd); ++ goto exit; ++ } ++ ++ err = kfd_set_process_dgpu_aperture(pdd, args->dgpu_base, ++ args->dgpu_limit); ++ ++exit: ++ up_write(&p->lock); ++ return err; ++} ++ ++static int kfd_ioctl_get_dmabuf_info(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_get_dmabuf_info_args *args = data; ++ struct kfd_dev *dev = NULL; ++ struct kgd_dev *dma_buf_kgd; ++ void *metadata_buffer = NULL; ++ uint32_t flags; ++ unsigned i; ++ int r; ++ ++ /* Find a KFD GPU device that supports the get_dmabuf_info query */ ++ for (i = 0; kfd_topology_enum_kfd_devices(i, &dev) == 0; i++) ++ if (dev && dev->kfd2kgd->get_dmabuf_info) ++ break; ++ if (!dev) ++ return -EINVAL; ++ ++ if (args->metadata_ptr) { ++ metadata_buffer = kzalloc(args->metadata_size, GFP_KERNEL); ++ if (!metadata_buffer) ++ return -ENOMEM; ++ } ++ ++ /* Get dmabuf info from KGD */ ++ r = dev->kfd2kgd->get_dmabuf_info(dev->kgd, args->dmabuf_fd, ++ &dma_buf_kgd, &args->size, ++ metadata_buffer, args->metadata_size, ++ &args->metadata_size, &flags); ++ if (r) ++ goto exit; ++ ++ /* Reverse-lookup gpu_id from kgd pointer */ ++ dev = kfd_device_by_kgd(dma_buf_kgd); ++ if (!dev) { ++ r = -EINVAL; ++ goto exit; ++ } ++ args->gpu_id = kfd_get_gpu_id(dev); ++ ++ /* Translate flags */ ++ if (flags & ALLOC_MEM_FLAGS_VRAM) { ++ args->flags = KFD_IS_DGPU(dev->device_info->asic_family) ? 
++ KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE : ++ KFD_IOC_ALLOC_MEM_FLAGS_APU_DEVICE; ++ } else ++ args->flags = KFD_IOC_ALLOC_MEM_FLAGS_DGPU_HOST; ++ ++ /* Copy metadata buffer to user mode */ ++ if (metadata_buffer) { ++ r = copy_to_user((void __user *)args->metadata_ptr, ++ metadata_buffer, args->metadata_size); ++ if (r != 0) ++ r = -EFAULT; ++ } ++ ++exit: ++ kfree(metadata_buffer); ++ ++ return r; ++} ++ ++static int kfd_ioctl_import_dmabuf(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_import_dmabuf_args *args = data; ++ struct kfd_dev *dev; ++ struct kfd_process_device *pdd; ++ void *mem; ++ uint64_t size; ++ int idr_handle; ++ int r; ++ ++ dev = kfd_device_by_id(args->gpu_id); ++ if (!dev || !dev->kfd2kgd->import_dmabuf) ++ return -EINVAL; ++ ++ down_write(&p->lock); ++ pdd = kfd_bind_process_to_device(dev, p); ++ up_write(&p->lock); ++ if (IS_ERR(pdd) < 0) ++ return PTR_ERR(pdd); ++ ++ r = dev->kfd2kgd->import_dmabuf(dev->kgd, args->dmabuf_fd, ++ args->va_addr, pdd->vm, ++ (struct kgd_mem **)&mem, &size); ++ if (r) ++ return r; ++ ++ down_write(&p->lock); ++ idr_handle = kfd_process_device_create_obj_handle(pdd, mem, ++ args->va_addr, size); ++ up_write(&p->lock); ++ if (idr_handle < 0) { ++ dev->kfd2kgd->free_memory_of_gpu(dev->kgd, ++ (struct kgd_mem *)mem); ++ return -EFAULT; ++ } ++ ++ args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); ++ ++ return 0; ++} + + #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ + [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, .cmd_drv = 0, .name = #ioctl} +@@ -899,10 +1828,65 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_WAVE_CONTROL, + kfd_ioctl_dbg_wave_control, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, ++ kfd_ioctl_alloc_memory_of_gpu, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_FREE_MEMORY_OF_GPU, ++ kfd_ioctl_free_memory_of_gpu, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_MAP_MEMORY_TO_GPU, ++ kfd_ioctl_map_memory_to_gpu_wrapper, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, ++ kfd_ioctl_unmap_memory_from_gpu_wrapper, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_OPEN_GRAPHIC_HANDLE, ++ kfd_ioctl_open_graphic_handle, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH, ++ kfd_ioctl_alloc_scratch_memory, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_CU_MASK, ++ kfd_ioctl_set_cu_mask, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE, ++ kfd_ioctl_set_process_dgpu_aperture, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_TRAP_HANDLER, ++ kfd_ioctl_set_trap_handler, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_GPU_NEW, ++ kfd_ioctl_alloc_memory_of_gpu_new, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_MAP_MEMORY_TO_GPU_NEW, ++ kfd_ioctl_map_memory_to_gpu, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU_NEW, ++ kfd_ioctl_unmap_memory_from_gpu, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_PROCESS_APERTURES_NEW, ++ kfd_ioctl_get_process_apertures_new, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_EVICT_MEMORY, ++ kfd_evict, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_DMABUF_INFO, ++ kfd_ioctl_get_dmabuf_info, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_IMPORT_DMABUF, ++ kfd_ioctl_import_dmabuf, 0) + }; + + #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) + ++static int kfd_evict(struct file *filep, struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_eviction_args *args = data; ++ ++ return evict_size(p, args->size, args->type); ++ ++} + static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long 
arg) + { + struct kfd_process *process; +@@ -994,20 +1978,37 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) + static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) + { + struct kfd_process *process; ++ struct kfd_dev *kfd; ++ unsigned long vm_pgoff; ++ int retval; + + process = kfd_get_process(current); + if (IS_ERR(process)) + return PTR_ERR(process); + +- if ((vma->vm_pgoff & KFD_MMAP_DOORBELL_MASK) == +- KFD_MMAP_DOORBELL_MASK) { +- vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_DOORBELL_MASK; ++ vm_pgoff = vma->vm_pgoff; ++ vma->vm_pgoff = KFD_MMAP_OFFSET_VALUE_GET(vma->vm_pgoff); ++ ++ switch (vm_pgoff & KFD_MMAP_TYPE_MASK) { ++ case KFD_MMAP_TYPE_DOORBELL: + return kfd_doorbell_mmap(process, vma); +- } else if ((vma->vm_pgoff & KFD_MMAP_EVENTS_MASK) == +- KFD_MMAP_EVENTS_MASK) { +- vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_EVENTS_MASK; ++ ++ case KFD_MMAP_TYPE_EVENTS: + return kfd_event_mmap(process, vma); ++ ++ case KFD_MMAP_TYPE_MAP_BO: ++ kfd = kfd_device_by_id(KFD_MMAP_GPU_ID_GET(vm_pgoff)); ++ if (!kfd) ++ return -EFAULT; ++ retval = kfd->kfd2kgd->mmap_bo(kfd->kgd, vma); ++ return retval; ++ ++ case KFD_MMAP_TYPE_RESERVED_MEM: ++ return kfd_reserved_mem_mmap(process, vma); ++ + } + + return -EFAULT; + } ++ ++ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +new file mode 100644 +index 0000000..b3d4a50 +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +@@ -0,0 +1,1163 @@ ++#include <linux/kernel.h> ++#include <linux/acpi.h> ++#include <linux/mm.h> ++#include <linux/amd-iommu.h> ++#include <linux/pci.h> ++#include "kfd_crat.h" ++#include "kfd_priv.h" ++#include "kfd_topology.h" ++ ++/* GPU Processor ID base for dGPUs for which VCRAT needs to be created. ++ * GPU processor ID are expressed with Bit[31]=1. ++ * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs ++ * used in the CRAT. 
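
/*
 * Worked example, not from the patch: get_and_inc_gpu_processor_id(),
 * defined just below, hands out monotonically increasing ID bases.
 * With two dGPUs of 64 CUs each (masked-off CUs included in
 * total_cu_count):
 *
 *   GPU0: base = 0x80001000, next base becomes 0x80001040
 *   GPU1: base = 0x80001040, next base becomes 0x80001080
 */
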
*/ ++static uint32_t gpu_processor_id_low = 0x80001000; ++ ++/* Return the next available gpu_processor_id and increment it for next GPU ++ * @total_cu_count - Total CUs present in the GPU including ones masked off ++ */ ++static inline unsigned int get_and_inc_gpu_processor_id( ++ unsigned int total_cu_count) ++{ ++ int current_id = gpu_processor_id_low; ++ ++ gpu_processor_id_low += total_cu_count; ++ return current_id; ++} ++ ++/* Static table to describe GPU Cache information */ ++struct kfd_gpu_cache_info { ++ uint32_t cache_size; ++ uint32_t cache_level; ++ uint32_t flags; ++ /* Indicates how many Compute Units share this cache ++ * Value = 1 indicates the cache is not shared */ ++ uint32_t num_cu_shared; ++}; ++ ++static struct kfd_gpu_cache_info kaveri_cache_info[] = { ++ { ++ /* TCP L1 Cache per CU */ ++ .cache_size = 16, ++ .cache_level = 1, ++ .flags = (CRAT_CACHE_FLAGS_ENABLED | ++ CRAT_CACHE_FLAGS_DATA_CACHE | ++ CRAT_CACHE_FLAGS_SIMD_CACHE), ++ .num_cu_shared = 1, ++ ++ }, ++ { ++ /* Scalar L1 Instruction Cache (in SQC module) per bank */ ++ .cache_size = 16, ++ .cache_level = 1, ++ .flags = (CRAT_CACHE_FLAGS_ENABLED | ++ CRAT_CACHE_FLAGS_INST_CACHE | ++ CRAT_CACHE_FLAGS_SIMD_CACHE), ++ .num_cu_shared = 2, ++ }, ++ { ++ /* Scalar L1 Data Cache (in SQC module) per bank */ ++ .cache_size = 8, ++ .cache_level = 1, ++ .flags = (CRAT_CACHE_FLAGS_ENABLED | ++ CRAT_CACHE_FLAGS_DATA_CACHE | ++ CRAT_CACHE_FLAGS_SIMD_CACHE), ++ .num_cu_shared = 2, ++ }, ++ ++ /* TODO: Add L2 Cache information */ ++}; ++ ++ ++static struct kfd_gpu_cache_info carrizo_cache_info[] = { ++ { ++ /* TCP L1 Cache per CU */ ++ .cache_size = 16, ++ .cache_level = 1, ++ .flags = (CRAT_CACHE_FLAGS_ENABLED | ++ CRAT_CACHE_FLAGS_DATA_CACHE | ++ CRAT_CACHE_FLAGS_SIMD_CACHE), ++ .num_cu_shared = 1, ++ }, ++ { ++ /* Scalar L1 Instruction Cache (in SQC module) per bank */ ++ .cache_size = 8, ++ .cache_level = 1, ++ .flags = (CRAT_CACHE_FLAGS_ENABLED | ++ CRAT_CACHE_FLAGS_INST_CACHE | ++ CRAT_CACHE_FLAGS_SIMD_CACHE), ++ .num_cu_shared = 4, ++ }, ++ { ++ /* Scalar L1 Data Cache (in SQC module) per bank. */ ++ .cache_size = 4, ++ .cache_level = 1, ++ .flags = (CRAT_CACHE_FLAGS_ENABLED | ++ CRAT_CACHE_FLAGS_DATA_CACHE | ++ CRAT_CACHE_FLAGS_SIMD_CACHE), ++ .num_cu_shared = 4, ++ }, ++ ++ /* TODO: Add L2 Cache information */ ++}; ++ ++/* NOTE: In future if more information is added to struct kfd_gpu_cache_info ++ * the following ASICs may need a separate table. 
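
/*
 * Worked example, not from the patch: with the Carrizo table above, a
 * shader array whose cu_bitmap enables 8 CUs contributes
 *   8 TCP L1 entries      (num_cu_shared = 1, one per CU)
 *   2 SQC I-cache entries (num_cu_shared = 4, one per 4-CU bank)
 *   2 SQC D-cache entries (num_cu_shared = 4)
 * i.e. 12 crat_subtype_cache records emitted by the fill loop in
 * kfd_fill_gpu_cache_info() further below.
 */
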
*/ ++#define tonga_cache_info carrizo_cache_info ++#define fiji_cache_info carrizo_cache_info ++ ++static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, ++ struct crat_subtype_computeunit *cu) ++{ ++ BUG_ON(!dev); ++ BUG_ON(!cu); ++ ++ dev->node_props.cpu_cores_count = cu->num_cpu_cores; ++ dev->node_props.cpu_core_id_base = cu->processor_id_low; ++ if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) ++ dev->node_props.capability |= HSA_CAP_ATS_PRESENT; ++ ++ pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, ++ cu->processor_id_low); ++} ++ ++static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, ++ struct crat_subtype_computeunit *cu) ++{ ++ BUG_ON(!dev); ++ BUG_ON(!cu); ++ ++ dev->node_props.simd_id_base = cu->processor_id_low; ++ dev->node_props.simd_count = cu->num_simd_cores; ++ dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; ++ dev->node_props.max_waves_per_simd = cu->max_waves_simd; ++ dev->node_props.wave_front_size = cu->wave_front_size; ++ dev->node_props.array_count = cu->array_count; ++ dev->node_props.cu_per_simd_array = cu->num_cu_per_array; ++ dev->node_props.simd_per_cu = cu->num_simd_per_cu; ++ dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; ++ if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) ++ dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; ++ pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low); ++} ++ ++/* kfd_parse_subtype_cu - parse compute unit subtypes and attach it to correct ++ * topology device present in the device_list ++ */ ++static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu, ++ struct list_head *device_list) ++{ ++ struct kfd_topology_device *dev; ++ ++ BUG_ON(!cu); ++ ++ pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", ++ cu->proximity_domain, cu->hsa_capability); ++ list_for_each_entry(dev, device_list, list) { ++ if (cu->proximity_domain == dev->proximity_domain) { ++ if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) ++ kfd_populated_cu_info_cpu(dev, cu); ++ ++ if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) ++ kfd_populated_cu_info_gpu(dev, cu); ++ break; ++ } ++ } ++ ++ return 0; ++} ++ ++/* kfd_parse_subtype_mem - parse memory subtypes and attach it to correct ++ * topology device present in the device_list ++ */ ++static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem, ++ struct list_head *device_list) ++{ ++ struct kfd_mem_properties *props; ++ struct kfd_topology_device *dev; ++ ++ BUG_ON(!mem); ++ ++ pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n", ++ mem->proximity_domain); ++ list_for_each_entry(dev, device_list, list) { ++ if (mem->proximity_domain == dev->proximity_domain) { ++ props = kfd_alloc_struct(props); ++ if (props == NULL) ++ return -ENOMEM; ++ ++ /* ++ * We're on GPU node ++ */ ++ if (dev->node_props.cpu_cores_count == 0) { ++ /* APU */ ++ if (mem->visibility_type == 0) ++ props->heap_type = ++ HSA_MEM_HEAP_TYPE_FB_PRIVATE; ++ /* dGPU */ ++ else ++ props->heap_type = mem->visibility_type; ++ } ++ else ++ props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; ++ ++ if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) ++ props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; ++ if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) ++ props->flags |= HSA_MEM_FLAGS_NON_VOLATILE; ++ ++ props->size_in_bytes = ++ ((uint64_t)mem->length_high << 32) + ++ mem->length_low; ++ props->width = mem->width; ++ ++ dev->node_props.mem_banks_count++; ++ list_add_tail(&props->list, &dev->mem_props); ++ ++ break; ++ } ++ } ++ ++ return 0; ++} ++ 
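
/*
 * Worked example, not from the patch, of the split-field arithmetic
 * used just above for the memory bank size:
 *   length_high = 0x00000001, length_low = 0x80000000
 *   size_in_bytes = (0x1ULL << 32) + 0x80000000 = 0x180000000 (6 GiB)
 */
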
++/* kfd_parse_subtype_cache - parse cache subtypes and attach it to correct ++ * topology device present in the device_list ++ */ ++static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache, ++ struct list_head *device_list) ++{ ++ struct kfd_cache_properties *props; ++ struct kfd_topology_device *dev; ++ uint32_t id; ++ uint32_t total_num_of_cu; ++ ++ BUG_ON(!cache); ++ ++ id = cache->processor_id_low; ++ ++ list_for_each_entry(dev, device_list, list) { ++ total_num_of_cu = (dev->node_props.array_count * ++ dev->node_props.cu_per_simd_array); ++ ++ /* Cache infomration in CRAT doesn't have proximity_domain ++ * information as it is associated with a CPU core or GPU ++ * Compute Unit. So map the cache using CPU core Id or SIMD ++ * (GPU) ID. ++ * TODO: This works because currently we can safely assume that ++ * Compute Units are parsed before caches are parsed. In future ++ * remove this dependency ++ */ ++ if ((id >= dev->node_props.cpu_core_id_base && ++ id <= dev->node_props.cpu_core_id_base + ++ dev->node_props.cpu_cores_count) || ++ (id >= dev->node_props.simd_id_base && ++ id < dev->node_props.simd_id_base + ++ total_num_of_cu)) { ++ props = kfd_alloc_struct(props); ++ if (props == NULL) ++ return -ENOMEM; ++ ++ props->processor_id_low = id; ++ props->cache_level = cache->cache_level; ++ props->cache_size = cache->cache_size; ++ props->cacheline_size = cache->cache_line_size; ++ props->cachelines_per_tag = cache->lines_per_tag; ++ props->cache_assoc = cache->associativity; ++ props->cache_latency = cache->cache_latency; ++ memcpy(props->sibling_map, cache->sibling_map, ++ sizeof(props->sibling_map)); ++ ++ if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) ++ props->cache_type |= HSA_CACHE_TYPE_DATA; ++ if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) ++ props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; ++ if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) ++ props->cache_type |= HSA_CACHE_TYPE_CPU; ++ if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) ++ props->cache_type |= HSA_CACHE_TYPE_HSACU; ++ ++ dev->cache_count++; ++ dev->node_props.caches_count++; ++ list_add_tail(&props->list, &dev->cache_props); ++ ++ break; ++ } ++ } ++ ++ return 0; ++} ++ ++/* kfd_parse_subtype_iolink - parse iolink subtypes and attach it to correct ++ * topology device present in the device_list ++ */ ++static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink, ++ struct list_head *device_list) ++{ ++ struct kfd_iolink_properties *props; ++ struct kfd_topology_device *dev; ++ uint32_t i = 0; ++ uint32_t id_from; ++ uint32_t id_to; ++ ++ BUG_ON(!iolink); ++ ++ id_from = iolink->proximity_domain_from; ++ id_to = iolink->proximity_domain_to; ++ ++ pr_debug("Found IO link entry in CRAT table with id_from=%d\n", id_from); ++ list_for_each_entry(dev, device_list, list) { ++ if (id_from == dev->proximity_domain) { ++ props = kfd_alloc_struct(props); ++ if (props == NULL) ++ return -ENOMEM; ++ ++ props->node_from = id_from; ++ props->node_to = id_to; ++ props->ver_maj = iolink->version_major; ++ props->ver_min = iolink->version_minor; ++ props->iolink_type = iolink->io_interface_type; ++ ++ /* ++ * weight factor (derived from CDIR), currently always 1 ++ */ ++ props->weight = 1; ++ ++ props->min_latency = iolink->minimum_latency; ++ props->max_latency = iolink->maximum_latency; ++ props->min_bandwidth = iolink->minimum_bandwidth_mbs; ++ props->max_bandwidth = iolink->maximum_bandwidth_mbs; ++ props->rec_transfer_size = ++ iolink->recommended_transfer_size; ++ ++ dev->io_link_count++; ++ 
dev->node_props.io_links_count++; ++ list_add_tail(&props->list, &dev->io_link_props); ++ ++ break; ++ } ++ i++; ++ } ++ ++ return 0; ++} ++ ++/* kfd_parse_subtype - parse subtypes and attach it to correct topology device ++ * present in the device_list ++ * @sub_type_hdr - subtype section of crat_image ++ * @device_list - list of topology devices present in this crat_image ++ */ ++static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr, ++ struct list_head *device_list) ++{ ++ struct crat_subtype_computeunit *cu; ++ struct crat_subtype_memory *mem; ++ struct crat_subtype_cache *cache; ++ struct crat_subtype_iolink *iolink; ++ int ret = 0; ++ ++ BUG_ON(!sub_type_hdr); ++ ++ switch (sub_type_hdr->type) { ++ case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: ++ cu = (struct crat_subtype_computeunit *)sub_type_hdr; ++ ret = kfd_parse_subtype_cu(cu, device_list); ++ break; ++ case CRAT_SUBTYPE_MEMORY_AFFINITY: ++ mem = (struct crat_subtype_memory *)sub_type_hdr; ++ ret = kfd_parse_subtype_mem(mem, device_list); ++ break; ++ case CRAT_SUBTYPE_CACHE_AFFINITY: ++ cache = (struct crat_subtype_cache *)sub_type_hdr; ++ ret = kfd_parse_subtype_cache(cache, device_list); ++ break; ++ case CRAT_SUBTYPE_TLB_AFFINITY: ++ /* ++ * For now, nothing to do here ++ */ ++ pr_debug("Found TLB entry in CRAT table (not processing)\n"); ++ break; ++ case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: ++ /* ++ * For now, nothing to do here ++ */ ++ pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n"); ++ break; ++ case CRAT_SUBTYPE_IOLINK_AFFINITY: ++ iolink = (struct crat_subtype_iolink *)sub_type_hdr; ++ ret = kfd_parse_subtype_iolink(iolink, device_list); ++ break; ++ default: ++ pr_warn("Unknown subtype (%d) in CRAT\n", ++ sub_type_hdr->type); ++ } ++ ++ return ret; ++} ++ ++/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT ++ * create a kfd_topology_device and add in to device_list. 
Also parse
++ * CRAT subtypes and attach it to appropriate kfd_topology_device
++ * @crat_image - input image containing CRAT
++ * @device_list - [OUT] list of kfd_topology_device generated after parsing
++ * crat_image
++ * @proximity_domain - Proximity domain of the first device in the table
++ * Return - 0 if successful else -ve value
++ */
++int kfd_parse_crat_table(void *crat_image,
++		struct list_head *device_list,
++		uint32_t proximity_domain)
++{
++	struct kfd_topology_device *top_dev = NULL;
++	struct crat_subtype_generic *sub_type_hdr;
++	uint16_t node_id;
++	int ret;
++	struct crat_header *crat_table = (struct crat_header *)crat_image;
++	uint16_t num_nodes;
++	uint32_t image_len;
++	uint32_t last_header_type, last_header_length;
++
++	if (!crat_image)
++		return -EINVAL;
++
++	if (!list_empty(device_list))
++		pr_warn("Error: device list should be empty\n");
++
++	num_nodes = crat_table->num_domains;
++	image_len = crat_table->length;
++
++	pr_info("Parsing CRAT table with %d nodes\n", num_nodes);
++
++	for (node_id = 0; node_id < num_nodes; node_id++) {
++		top_dev = kfd_create_topology_device(device_list);
++		if (!top_dev)
++			break;
++		top_dev->proximity_domain = proximity_domain++;
++	}
++
++	if (!top_dev)
++		return -ENOMEM;
++
++	memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH);
++	memcpy(top_dev->oem_table_id, crat_table->oem_table_id, CRAT_OEMTABLEID_LENGTH);
++	top_dev->oem_revision = crat_table->oem_revision;
++
++	last_header_type = last_header_length = 0;
++	sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);
++	while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) <
++			((char *)crat_image) + image_len) {
++		pr_debug("kfd parsing crat sub type header %p enabled: %s type: 0x%x length %d\n",
++			sub_type_hdr,
++			(sub_type_hdr->flags &
++				CRAT_SUBTYPE_FLAGS_ENABLED)
++				? "true" : "false",
++			sub_type_hdr->type,
++			sub_type_hdr->length);
++
++		if (sub_type_hdr->length == 0) {
++			pr_err("amdkfd: Parsing wrong CRAT's sub header last header type: %d last header len %d\n",
++				last_header_type, last_header_length);
++			pr_err("amdkfd: Current header type %d length %d\n",
++				sub_type_hdr->type, sub_type_hdr->length);
++			break;
++		}
++
++		if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) {
++			ret = kfd_parse_subtype(sub_type_hdr, device_list);
++			if (ret != 0)
++				return ret;
++		}
++
++		last_header_type = sub_type_hdr->type;
++		last_header_length = sub_type_hdr->length;
++		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
++				sub_type_hdr->length);
++	}
++
++	return 0;
++}
++
++/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
++static int fill_in_pcache(struct crat_subtype_cache *pcache,
++		struct kfd_gpu_cache_info *pcache_info,
++		struct kfd_cu_info *cu_info,
++		int mem_available,
++		int cu_bitmask,
++		int cache_type, unsigned int cu_processor_id,
++		int cu_block)
++{
++	unsigned int cu_sibling_map_mask;
++	int first_active_cu;
++
++	/* First check if enough memory is available */
++	if (mem_available - sizeof(struct crat_subtype_cache) < 0)
++		return -ENOMEM;
++
++	cu_sibling_map_mask = cu_bitmask;
++	cu_sibling_map_mask >>= cu_block;
++	cu_sibling_map_mask &=
++		((1 << pcache_info[cache_type].num_cu_shared) - 1);
++	first_active_cu = ffs(cu_sibling_map_mask);
++
++	/* CU could be inactive. In case of a shared cache, find the first
++	 * active CU; in case of a non-shared cache, check whether the CU
++	 * is inactive. If it is inactive, skip it. */
++	if (first_active_cu) {
++		memset(pcache, 0, sizeof(struct crat_subtype_cache));
++		pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY;
++		pcache->length = sizeof(struct crat_subtype_cache);
++		pcache->flags = pcache_info[cache_type].flags;
++		pcache->processor_id_low = cu_processor_id
++					+ (first_active_cu - 1);
++		pcache->cache_level = pcache_info[cache_type].cache_level;
++		pcache->cache_size = pcache_info[cache_type].cache_size;
++
++		/* Sibling map is w.r.t processor_id_low, so shift out
++		 * inactive CU */
++		cu_sibling_map_mask =
++			cu_sibling_map_mask >> (first_active_cu - 1);
++
++		pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF);
++		pcache->sibling_map[1] =
++			(uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
++		pcache->sibling_map[2] =
++			(uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
++		pcache->sibling_map[3] =
++			(uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
++		return 0;
++	}
++	return 1;
++}
++
++/* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info tables
++ * @kdev - [IN] GPU device
++ * @gpu_processor_id - [IN] GPU processor ID to which these caches associate
++ * @available_size - [IN] Amount of memory available in pcache
++ * @cu_info - [IN] Compute Unit info obtained from KGD
++ * @pcache - [OUT] memory into which cache data is to be filled in.
++ * @size_filled - [OUT] amount of data used up in pcache.
++ * @num_of_entries - [OUT] number of caches added
++ */
++static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
++		int gpu_processor_id,
++		int available_size,
++		struct kfd_cu_info *cu_info,
++		struct crat_subtype_cache *pcache,
++		int *size_filled,
++		int *num_of_entries)
++{
++	struct kfd_gpu_cache_info *pcache_info;
++	int num_of_cache_types = 0;
++	int i, j, k;
++	int ct = 0;
++	int mem_available = available_size;
++	unsigned int cu_processor_id;
++	int ret;
++
++	switch (kdev->device_info->asic_family) {
++	case CHIP_KAVERI:
++		pcache_info = kaveri_cache_info;
++		num_of_cache_types = ARRAY_SIZE(kaveri_cache_info);
++		break;
++	case CHIP_CARRIZO:
++		pcache_info = carrizo_cache_info;
++		num_of_cache_types = ARRAY_SIZE(carrizo_cache_info);
++		break;
++	case CHIP_TONGA:
++		pcache_info = tonga_cache_info;
++		num_of_cache_types = ARRAY_SIZE(tonga_cache_info);
++		break;
++	case CHIP_FIJI:
++		pcache_info = fiji_cache_info;
++		num_of_cache_types = ARRAY_SIZE(fiji_cache_info);
++		break;
++	default:
++		return -EINVAL;
++	}
++
++	*size_filled = 0;
++	*num_of_entries = 0;
++
++	/* For each type of cache listed in the kfd_gpu_cache_info table,
++	 * go through all available Compute Units.
++ * The [i,j,k] loop will ++ * if kfd_gpu_cache_info.num_cu_shared = 1 ++ * will parse through all available CU ++ * If (kfd_gpu_cache_info.num_cu_shared != 1) ++ * then it will consider only one CU from ++ * the shared unit ++ */ ++ ++ for (ct = 0; ct < num_of_cache_types; ct++) { ++ cu_processor_id = gpu_processor_id; ++ for (i = 0; i < cu_info->num_shader_engines; i++) { ++ for (j = 0; j < cu_info->num_shader_arrays_per_engine; ++ j++) { ++ for (k = 0; k < cu_info->num_cu_per_sh; ++ k += pcache_info[ct].num_cu_shared) { ++ ++ ret = fill_in_pcache(pcache, ++ pcache_info, ++ cu_info, ++ mem_available, ++ cu_info->cu_bitmap[i][j], ++ ct, ++ cu_processor_id, ++ k); ++ ++ if (ret < 0) ++ break; ++ ++ if (!ret) { ++ pcache++; ++ (*num_of_entries)++; ++ mem_available -= ++ sizeof(*pcache); ++ (*size_filled) += ++ sizeof(*pcache); ++ } ++ ++ /* Move to next CU block */ ++ cu_processor_id += ++ pcache_info[ct].num_cu_shared; ++ } ++ } ++ } ++ } ++ ++ pr_debug("Added [%d] GPU cache entries\n", *num_of_entries); ++ ++ return 0; ++} ++ ++/* ++ * kfd_create_crat_image_acpi - Allocates memory for CRAT image and ++ * copies CRAT from ACPI (if available). ++ * ++ * NOTE: Call kfd_destroy_crat_image to free CRAT image memory ++ * ++ * @crat_image: CRAT read from ACPI. If no CRAT in ACPI then ++ * *crat_image will be NULL ++ * @size: [OUT] size of crat_image ++ * ++ * Return 0 if successful else return -ve value ++ */ ++int kfd_create_crat_image_acpi(void **crat_image, size_t *size) ++{ ++ struct acpi_table_header *crat_table; ++ acpi_status status; ++ void *pcrat_image; ++ ++ if (!crat_image) ++ return -EINVAL; ++ ++ *crat_image = NULL; ++ ++ /* ++ * Fetch the CRAT table from ACPI ++ */ ++ status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); ++ if (status == AE_NOT_FOUND) { ++ pr_warn("CRAT table not found\n"); ++ return -ENODATA; ++ } else if (ACPI_FAILURE(status)) { ++ const char *err = acpi_format_exception(status); ++ pr_err("CRAT table error: %s\n", err); ++ return -EINVAL; ++ } ++ ++ pcrat_image = kmalloc(crat_table->length, GFP_KERNEL); ++ if (!pcrat_image) { ++ pr_err("No memory for allocating CRAT image\n"); ++ return -ENOMEM; ++ } ++ ++ memcpy(pcrat_image, crat_table, crat_table->length); ++ ++ *crat_image = pcrat_image; ++ *size = crat_table->length; ++ ++ return 0; ++} ++ ++/* Memory required to create Virtual CRAT. ++ * Since there is no easy way to predict the amount of memory required, the ++ * following amount are allocated for CPU and GPU Virtual CRAT. This is ++ * expected to cover all known conditions. But to be safe additional check ++ * is put in the code to ensure we don't overwrite. 
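
/*
 * Worked example, not from the patch, for fill_in_pcache() above, with
 * num_cu_shared = 4, cu_block = 0 and cu_bitmap = 0b1110 (CU0 fused off):
 *   mask = (0b1110 >> 0) & 0b1111 = 0b1110
 *   ffs(mask) = 2, so processor_id_low = cu_processor_id + 1
 *   sibling_map[0] = 0b1110 >> 1 = 0b0111
 * If the whole 4-CU block were masked off, ffs() would return 0 and the
 * entry would be skipped (fill_in_pcache() returns 1).
 */
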
++ */ ++#define VCRAT_SIZE_FOR_CPU PAGE_SIZE ++#define VCRAT_SIZE_FOR_GPU (3 * PAGE_SIZE) ++ ++/* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node ++ * ++ * @numa_node_id: CPU NUMA node id ++ * @avail_size: Available size in the memory ++ * @sub_type_hdr: Memory into which compute info will be filled in ++ * ++ * Return 0 if successful else return -ve value ++ */ ++static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size, ++ int proximity_domain, ++ struct crat_subtype_computeunit *sub_type_hdr) ++{ ++ const struct cpumask *cpumask; ++ ++ *avail_size -= sizeof(struct crat_subtype_computeunit); ++ if (*avail_size < 0) ++ return -ENOMEM; ++ ++ memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); ++ ++ /* Fill in subtype header data */ ++ sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; ++ sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); ++ sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; ++ ++ cpumask = cpumask_of_node(numa_node_id); ++ ++ /* Fill in CU data */ ++ sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT; ++ sub_type_hdr->proximity_domain = proximity_domain; ++ sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id); ++ if (sub_type_hdr->processor_id_low == -1) ++ return -EINVAL; ++ ++ sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask); ++ ++ return 0; ++} ++ ++/* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node ++ * ++ * @numa_node_id: CPU NUMA node id ++ * @avail_size: Available size in the memory ++ * @sub_type_hdr: Memory into which compute info will be filled in ++ * ++ * Return 0 if successful else return -ve value ++ */ ++static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size, ++ int proximity_domain, ++ struct crat_subtype_memory *sub_type_hdr) ++{ ++ uint64_t mem_in_bytes = 0; ++ pg_data_t *pgdat; ++ int zone_type; ++ ++ *avail_size -= sizeof(struct crat_subtype_computeunit); ++ if (*avail_size < 0) ++ return -ENOMEM; ++ ++ memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); ++ ++ /* Fill in subtype header data */ ++ sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; ++ sub_type_hdr->length = sizeof(struct crat_subtype_memory); ++ sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; ++ ++ /* Fill in Memory Subunit data */ ++ ++ /* Unlike si_meminfo, si_meminfo_node is not exported. So ++ * the following lines are duplicated from si_meminfo_node ++ * function */ ++ pgdat = NODE_DATA(numa_node_id); ++ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) ++ mem_in_bytes += pgdat->node_zones[zone_type].managed_pages; ++ mem_in_bytes <<= PAGE_SHIFT; ++ ++ sub_type_hdr->length_low = lower_32_bits(mem_in_bytes); ++ sub_type_hdr->length_high = upper_32_bits(mem_in_bytes); ++ sub_type_hdr->proximity_domain = proximity_domain; ++ ++ return 0; ++} ++ ++/* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU ++ * ++ * @pcrat_image: Fill in VCRAT for CPU ++ * @size: [IN] allocated size of crat_image. ++ * [OUT] actual size of data filled in crat_image ++ */ ++static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size) ++{ ++ struct crat_header *crat_table = (struct crat_header *)pcrat_image; ++ struct acpi_table_header *acpi_table; ++ acpi_status status; ++ struct crat_subtype_generic *sub_type_hdr; ++ int avail_size = *size; ++ int numa_node_id; ++ int ret = 0; ++ ++ if (pcrat_image == NULL || avail_size < VCRAT_SIZE_FOR_CPU) ++ return -EINVAL; ++ ++ /* Fill in CRAT Header. ++ * Modify length and total_entries as subunits are added. 
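
/*
 * Worked example, not from the patch, for kfd_fill_mem_info_for_cpu()
 * above: a node whose zones sum to 4,194,304 managed 4 KiB pages reports
 *   mem_in_bytes = 4194304 << 12 = 0x400000000 (16 GiB)
 *   length_low = 0x00000000, length_high = 0x00000004
 */
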
++ */ ++ avail_size -= sizeof(struct crat_header); ++ if (avail_size < 0) ++ return -ENOMEM; ++ ++ memset(crat_table, 0, sizeof(struct crat_header)); ++ memcpy(&crat_table->signature, CRAT_SIGNATURE, sizeof(crat_table->signature)); ++ crat_table->length = sizeof(struct crat_header); ++ ++ status = acpi_get_table("DSDT", 0, &acpi_table); ++ if (status == AE_NOT_FOUND) ++ pr_warn("DSDT table not found for OEM information\n"); ++ else { ++ crat_table->oem_revision = acpi_table->revision; ++ memcpy(crat_table->oem_id, acpi_table->oem_id, CRAT_OEMID_LENGTH); ++ memcpy(crat_table->oem_table_id, acpi_table->oem_table_id, CRAT_OEMTABLEID_LENGTH); ++ } ++ crat_table->total_entries = 0; ++ crat_table->num_domains = 0; ++ ++ sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); ++ ++ for_each_online_node(numa_node_id) { ++ /* Fill in Subtype: Compute Unit */ ++ ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size, ++ crat_table->num_domains, ++ (struct crat_subtype_computeunit *)sub_type_hdr); ++ if (ret < 0) ++ return ret; ++ crat_table->length += sub_type_hdr->length; ++ crat_table->total_entries++; ++ ++ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + ++ sub_type_hdr->length); ++ ++ /* Fill in Subtype: Memory */ ++ ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size, ++ crat_table->num_domains, ++ (struct crat_subtype_memory *)sub_type_hdr); ++ if (ret < 0) ++ return ret; ++ crat_table->length += sub_type_hdr->length; ++ crat_table->total_entries++; ++ ++ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + ++ sub_type_hdr->length); ++ ++ crat_table->num_domains++; ++ } ++ ++ /* TODO: Add cache Subtype for CPU. ++ * Currently, CPU cache information is available in function ++ * detect_cache_attributes(cpu) defined in the file ++ * ./arch/x86/kernel/cpu/intel_cacheinfo.c. This function is not exported ++ * and to get the same information the code needs to be duplicated. 
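(The per-node loop above appends variable-length subtypes back to back, so a consumer recovers the list the same way it was built: advance a generic header pointer by its own length byte, total_entries times. A minimal walker sketch against the structs in kfd_crat.h — validation omitted, illustrative only:)

    static void walk_crat_subtypes(struct crat_header *crat)
    {
            struct crat_subtype_generic *hdr =
                    (struct crat_subtype_generic *)(crat + 1);
            uint32_t i;

            for (i = 0; i < crat->total_entries; i++) {
                    /* dispatch on hdr->type: compute unit, memory, cache, ... */
                    hdr = (struct crat_subtype_generic *)
                            ((char *)hdr + hdr->length);
            }
    }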
++ */ ++ ++ *size = crat_table->length; ++ pr_info("Virtual CRAT table created for CPU\n"); ++ ++ return 0; ++} ++ ++static int kfd_fill_gpu_memory_affinity(int *avail_size, ++ struct kfd_dev *kdev, uint8_t type, uint64_t size, ++ struct crat_subtype_memory *sub_type_hdr, ++ uint32_t proximity_domain, ++ const struct kfd_local_mem_info *local_mem_info) ++{ ++ *avail_size -= sizeof(struct crat_subtype_memory); ++ if (*avail_size < 0) ++ return -ENOMEM; ++ ++ memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); ++ sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; ++ sub_type_hdr->length = sizeof(struct crat_subtype_memory); ++ sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED; ++ ++ sub_type_hdr->proximity_domain = proximity_domain; ++ ++ pr_debug("amdkfd: fill gpu memory affinity - type 0x%x size 0x%llx\n", ++ type, size); ++ ++ sub_type_hdr->length_low = lower_32_bits(size); ++ sub_type_hdr->length_high = upper_32_bits(size); ++ ++ sub_type_hdr->width = local_mem_info->vram_width; ++ sub_type_hdr->visibility_type = type; ++ ++ return 0; ++} ++ ++/* kfd_fill_gpu_direct_io_link - Fill in direct io link from GPU ++ * to its NUMA node ++ * ++ * @avail_size: Available size in the memory ++ * @kdev - [IN] GPU device ++ * @sub_type_hdr: Memory into which io link info will be filled in ++ * @proximity_domain - proximity domain of the GPU node ++ * ++ * Return 0 if successful else return -ve value ++ */ ++static int kfd_fill_gpu_direct_io_link(int *avail_size, ++ struct kfd_dev *kdev, ++ struct crat_subtype_iolink *sub_type_hdr, ++ uint32_t proximity_domain) ++{ ++ int proximity_domain_to; ++ *avail_size -= sizeof(struct crat_subtype_iolink); ++ if (*avail_size < 0) ++ return -ENOMEM; ++ ++ memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink)); ++ ++ /* Fill in subtype header data */ ++ sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY; ++ sub_type_hdr->length = sizeof(struct crat_subtype_iolink); ++ sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED; ++ ++ /* Fill in IOLINK subtype. ++ * TODO: Fill-in other fields of iolink subtype */ ++ sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS; ++ sub_type_hdr->proximity_domain_from = proximity_domain; ++ proximity_domain_to = ++ kfd_get_proximity_domain(kdev->pdev->bus); ++ if (proximity_domain_to == -1) ++ return -EINVAL; ++ ++ sub_type_hdr->proximity_domain_to = proximity_domain_to; ++ return 0; ++} ++ ++/* kfd_create_vcrat_image_gpu - Create Virtual CRAT for CPU ++ * ++ * @pcrat_image: Fill in VCRAT for GPU ++ * @size: [IN] allocated size of crat_image. ++ * [OUT] actual size of data filled in crat_image ++ */ ++static int kfd_create_vcrat_image_gpu(void *pcrat_image, ++ size_t *size, struct kfd_dev *kdev, ++ uint32_t proximity_domain) ++{ ++ struct crat_header *crat_table = (struct crat_header *)pcrat_image; ++ struct crat_subtype_generic *sub_type_hdr; ++ struct crat_subtype_computeunit *cu; ++ struct kfd_cu_info cu_info; ++ struct amd_iommu_device_info iommu_info; ++ int avail_size = *size; ++ uint32_t total_num_of_cu; ++ int num_of_cache_entries = 0; ++ int cache_mem_filled = 0; ++ int ret = 0; ++ const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP | ++ AMD_IOMMU_DEVICE_FLAG_PRI_SUP | ++ AMD_IOMMU_DEVICE_FLAG_PASID_SUP; ++ struct kfd_local_mem_info local_mem_info; ++ ++ if (pcrat_image == NULL || avail_size < VCRAT_SIZE_FOR_GPU) ++ return -EINVAL; ++ ++ /* Fill the CRAT Header. ++ * Modify length and total_entries as subunits are added. 
++ */ ++ avail_size -= sizeof(struct crat_header); ++ if (avail_size < 0) ++ return -ENOMEM; ++ ++ memset(crat_table, 0, sizeof(struct crat_header)); ++ ++ memcpy(&crat_table->signature, CRAT_SIGNATURE, sizeof(crat_table->signature)); ++ crat_table->length = sizeof(struct crat_header); /* Change length as we add more subtypes*/ ++ crat_table->num_domains = 1; ++ crat_table->total_entries = 0; ++ ++ /* Fill in Subtype: Compute Unit ++ * First fill in the sub type header and then sub type data ++ */ ++ avail_size -= sizeof(struct crat_subtype_computeunit); ++ if (avail_size < 0) ++ return -ENOMEM; ++ ++ sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1); ++ memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); ++ ++ sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; ++ sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); ++ sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; ++ ++ /* Fill CU subtype data */ ++ cu = (struct crat_subtype_computeunit *)sub_type_hdr; ++ cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT; ++ cu->proximity_domain = proximity_domain; ++ ++ kdev->kfd2kgd->get_cu_info(kdev->kgd, &cu_info); ++ cu->num_simd_per_cu = cu_info.simd_per_cu; ++ cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number; ++ cu->max_waves_simd = cu_info.max_waves_per_simd; ++ ++ cu->wave_front_size = cu_info.wave_front_size; ++ cu->array_count = cu_info.num_shader_arrays_per_engine * ++ cu_info.num_shader_engines; ++ total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh); ++ cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu); ++ cu->num_cu_per_array = cu_info.num_cu_per_sh; ++ cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu; ++ cu->num_banks = cu_info.num_shader_engines; ++ cu->lds_size_in_kb = cu_info.lds_size; ++ ++ cu->hsa_capability = 0; ++ ++ /* Check if this node supports IOMMU. During parsing this flag will ++ * translate to HSA_CAP_ATS_PRESENT */ ++ iommu_info.flags = 0; ++ if (0 == amd_iommu_device_info(kdev->pdev, &iommu_info)) { ++ if ((iommu_info.flags & required_iommu_flags) == required_iommu_flags) ++ cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT; ++ } ++ ++ crat_table->length += sub_type_hdr->length; ++ crat_table->total_entries++; ++ ++ /* Fill in Subtype: Memory. Only on systems with large BAR (no ++ * private FB), report memory as public. On other systems ++ * report the total FB size (public+private) as a single ++ * private heap. */ ++ kdev->kfd2kgd->get_local_mem_info(kdev->kgd, &local_mem_info); ++ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + ++ sub_type_hdr->length); ++ ++ if (local_mem_info.local_mem_size_private == 0) ++ ret = kfd_fill_gpu_memory_affinity(&avail_size, ++ kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC, ++ local_mem_info.local_mem_size_public, ++ (struct crat_subtype_memory *)sub_type_hdr, ++ proximity_domain, ++ &local_mem_info); ++ else ++ ret = kfd_fill_gpu_memory_affinity(&avail_size, ++ kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE, ++ local_mem_info.local_mem_size_public + ++ local_mem_info.local_mem_size_private, ++ (struct crat_subtype_memory *)sub_type_hdr, ++ proximity_domain, ++ &local_mem_info); ++ if (ret < 0) ++ return ret; ++ ++ crat_table->length += sizeof(struct crat_subtype_memory); ++ crat_table->total_entries++; ++ ++ /* TODO: Fill in cache information. 
This information is NOT readily ++ * available in KGD */ ++ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + ++ sub_type_hdr->length); ++ ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low, ++ avail_size, ++ &cu_info, ++ (struct crat_subtype_cache *)sub_type_hdr, ++ &cache_mem_filled, ++ &num_of_cache_entries); ++ ++ if (ret < 0) ++ return ret; ++ ++ crat_table->length += cache_mem_filled; ++ crat_table->total_entries += num_of_cache_entries; ++ avail_size -= cache_mem_filled; ++ ++ /* Fill in Subtype: IO_LINKS ++ * Only direct links are added here which is Link from GPU to ++ * to its NUMA node. Indirect links are added by userspace. ++ */ ++ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + ++ cache_mem_filled); ++ ret = kfd_fill_gpu_direct_io_link(&avail_size, kdev, ++ (struct crat_subtype_iolink *)sub_type_hdr, proximity_domain); ++ ++ if (ret < 0) ++ return ret; ++ ++ crat_table->length += sub_type_hdr->length; ++ crat_table->total_entries++; ++ ++ *size = crat_table->length; ++ pr_info("Virtual CRAT table created for GPU\n"); ++ ++ return ret; ++} ++ ++/* kfd_create_crat_image_virtual - Allocates memory for CRAT image and ++ * creates a Virtual CRAT (VCRAT) image ++ * ++ * NOTE: Call kfd_destroy_crat_image to free CRAT image memory ++ * ++ * @crat_image: VCRAT image created because ACPI does not have a ++ * CRAT for this device ++ * @size: [OUT] size of virtual crat_image ++ * @flags: COMPUTE_UNIT_CPU - Create VCRAT for CPU device ++ * COMPUTE_UNIT_GPU - Create VCRAT for GPU ++ * (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU ++ * -- this option is not currently implemented. The assumption ++ * is that all AMD APUs will have CRAT ++ * @kdev: Valid kfd_device required if flags contain COMPUTE_UNIT_GPU ++ * ++ * Return 0 if successful else return -ve value ++*/ ++int kfd_create_crat_image_virtual(void **crat_image, size_t *size, ++ int flags, struct kfd_dev *kdev, uint32_t proximity_domain) ++{ ++ void *pcrat_image; ++ int ret = 0; ++ ++ if (!crat_image) ++ return -EINVAL; ++ ++ *crat_image = NULL; ++ ++ /* Allocate one VCRAT_SIZE_FOR_CPU for CPU virtual CRAT image and ++ * VCRAT_SIZE_FOR_GPU for GPU virtual CRAT image. This should cover ++ * all the current conditions. A check is put not to overwrite beyond ++ * allocated size ++ */ ++ switch (flags) { ++ case COMPUTE_UNIT_CPU: ++ pcrat_image = kmalloc(VCRAT_SIZE_FOR_CPU, GFP_KERNEL); ++ if (!pcrat_image) ++ return -ENOMEM; ++ *size = VCRAT_SIZE_FOR_CPU; ++ ret = kfd_create_vcrat_image_cpu(pcrat_image, size); ++ break; ++ case COMPUTE_UNIT_GPU: ++ if (!kdev) ++ return -EINVAL; ++ pcrat_image = kmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL); ++ if (!pcrat_image) ++ return -ENOMEM; ++ *size = VCRAT_SIZE_FOR_GPU; ++ ret = kfd_create_vcrat_image_gpu(pcrat_image, size, ++ kdev, proximity_domain); ++ break; ++ case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) : ++ /*TODO:*/ ++ ret = -EINVAL; ++ pr_err("VCRAT not implemented for APU\n"); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ if (ret == 0) ++ *crat_image = pcrat_image; ++ ++ return ret; ++} ++ ++ ++/* kfd_destroy_crat_image ++ * ++ * @crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..) 
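(Taken together, the expected pairing of these entry points is create, parse, destroy. A hypothetical caller — error handling elided, proximity domain 0 assumed, not code from the patch — might read:)

    static void crat_usage_example(void)
    {
            void *crat_image = NULL;
            size_t image_size = 0;

            if (kfd_create_crat_image_acpi(&crat_image, &image_size) != 0)
                    /* no CRAT in ACPI: synthesize a virtual one for the CPU */
                    kfd_create_crat_image_virtual(&crat_image, &image_size,
                                                  COMPUTE_UNIT_CPU, NULL, 0);

            /* ... kfd_parse_crat_table(crat_image, ...) ... */

            kfd_destroy_crat_image(crat_image);
    }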
++ * ++ */ ++void kfd_destroy_crat_image(void *crat_image) ++{ ++ if (crat_image) ++ kfree(crat_image); ++ return; ++} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h +index a374fa3..9af3745 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h +@@ -24,6 +24,7 @@ + #define KFD_CRAT_H_INCLUDED + + #include <linux/types.h> ++#include "kfd_priv.h" + + #pragma pack(1) + +@@ -44,6 +45,10 @@ + + #define CRAT_OEMID_64BIT_MASK ((1ULL << (CRAT_OEMID_LENGTH * 8)) - 1) + ++/* Compute Unit flags */ ++#define COMPUTE_UNIT_CPU (1 << 0) /* Create Virtual CRAT for CPU */ ++#define COMPUTE_UNIT_GPU (1 << 1) /* Create Virtual CRAT for GPU */ ++ + struct crat_header { + uint32_t signature; + uint32_t length; +@@ -105,7 +110,7 @@ struct crat_subtype_computeunit { + uint8_t wave_front_size; + uint8_t num_banks; + uint16_t micro_engine_id; +- uint8_t num_arrays; ++ uint8_t array_count; + uint8_t num_cu_per_array; + uint8_t num_simd_per_cu; + uint8_t max_slots_scatch_cu; +@@ -127,13 +132,14 @@ struct crat_subtype_memory { + uint8_t length; + uint16_t reserved; + uint32_t flags; +- uint32_t promixity_domain; ++ uint32_t proximity_domain; + uint32_t base_addr_low; + uint32_t base_addr_high; + uint32_t length_low; + uint32_t length_high; + uint32_t width; +- uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH]; ++ uint8_t visibility_type; /* for virtual (dGPU) CRAT */ ++ uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH - 1]; + }; + + /* +@@ -222,9 +228,12 @@ struct crat_subtype_ccompute { + /* + * HSA IO Link Affinity structure and definitions + */ +-#define CRAT_IOLINK_FLAGS_ENABLED 0x00000001 +-#define CRAT_IOLINK_FLAGS_COHERENCY 0x00000002 +-#define CRAT_IOLINK_FLAGS_RESERVED 0xfffffffc ++#define CRAT_IOLINK_FLAGS_ENABLED (1 << 0) ++#define CRAT_IOLINK_FLAGS_NON_COHERENT (1 << 1) ++#define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT (1 << 2) ++#define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT (1 << 3) ++#define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA (1 << 4) ++#define CRAT_IOLINK_FLAGS_RESERVED_MASK 0xffffffe0 + + /* + * IO interface types +@@ -232,8 +241,16 @@ struct crat_subtype_ccompute { + #define CRAT_IOLINK_TYPE_UNDEFINED 0 + #define CRAT_IOLINK_TYPE_HYPERTRANSPORT 1 + #define CRAT_IOLINK_TYPE_PCIEXPRESS 2 +-#define CRAT_IOLINK_TYPE_OTHER 3 +-#define CRAT_IOLINK_TYPE_MAX 255 ++#define CRAT_IOLINK_TYPE_AMBA 3 ++#define CRAT_IOLINK_TYPE_MIPI 4 ++#define CRAT_IOLINK_TYPE_QPI_1_1 5 ++#define CRAT_IOLINK_TYPE_RESERVED1 6 ++#define CRAT_IOLINK_TYPE_RESERVED2 7 ++#define CRAT_IOLINK_TYPE_RAPID_IO 8 ++#define CRAT_IOLINK_TYPE_INFINIBAND 9 ++#define CRAT_IOLINK_TYPE_RESERVED3 10 ++#define CRAT_IOLINK_TYPE_OTHER 11 ++#define CRAT_IOLINK_TYPE_MAX 255 + + #define CRAT_IOLINK_RESERVED_LENGTH 24 + +@@ -291,4 +308,11 @@ struct cdit_header { + + #pragma pack() + ++int kfd_create_crat_image_acpi(void **crat_image, size_t *size); ++void kfd_destroy_crat_image(void *crat_image); ++int kfd_parse_crat_table(void *crat_image, ++ struct list_head *device_list, ++ uint32_t proximity_domain); ++int kfd_create_crat_image_virtual(void **crat_image, size_t *size, ++ int flags, struct kfd_dev *kdev, uint32_t proximity_domain); + #endif /* KFD_CRAT_H_INCLUDED */ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c +index d5e19b5..4f2311e 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c +@@ -42,8 +42,6 @@ + + static void dbgdev_address_watch_disable_nodiq(struct kfd_dev *dev) + { 
+- BUG_ON(!dev || !dev->kfd2kgd); +- + dev->kfd2kgd->address_watch_disable(dev->kgd); + } + +@@ -51,129 +49,118 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, + unsigned int pasid, uint64_t vmid0_address, + uint32_t *packet_buff, size_t size_in_bytes) + { ++ int status = 0; ++ unsigned int *ib_packet_buff = NULL; + struct pm4__release_mem *rm_packet; + struct pm4__indirect_buffer_pasid *ib_packet; ++ struct kernel_queue *kq = dbgdev->kq; ++ size_t pq_packets_size_in_bytes = sizeof(struct pm4__release_mem) + sizeof(struct pm4__indirect_buffer_pasid); + struct kfd_mem_obj *mem_obj; +- size_t pq_packets_size_in_bytes; ++ ++ uint64_t *rm_state = NULL; ++ + union ULARGE_INTEGER *largep; + union ULARGE_INTEGER addr; +- struct kernel_queue *kq; +- uint64_t *rm_state; +- unsigned int *ib_packet_buff; +- int status; +- +- BUG_ON(!dbgdev || !dbgdev->kq || !packet_buff || !size_in_bytes); +- +- kq = dbgdev->kq; +- +- pq_packets_size_in_bytes = sizeof(struct pm4__release_mem) + +- sizeof(struct pm4__indirect_buffer_pasid); +- +- /* +- * We acquire a buffer from DIQ +- * The receive packet buff will be sitting on the Indirect Buffer +- * and in the PQ we put the IB packet + sync packet(s). +- */ +- status = kq->ops.acquire_packet_buffer(kq, +- pq_packets_size_in_bytes / sizeof(uint32_t), +- &ib_packet_buff); +- if (status != 0) { +- pr_err("amdkfd: acquire_packet_buffer failed\n"); +- return status; +- } + +- memset(ib_packet_buff, 0, pq_packets_size_in_bytes); ++ do { ++ if ((kq == NULL) || (packet_buff == NULL) || (size_in_bytes == 0)) { ++ pr_debug("Error! kfd: In func %s >> Illegal packet parameters\n", __func__); ++ status = -EINVAL; ++ break; ++ } ++ /* todo - enter proper locking to be multithreaded safe */ + +- ib_packet = (struct pm4__indirect_buffer_pasid *) (ib_packet_buff); ++ /* We acquire a buffer from DIQ ++ * The receive packet buff will be sitting on the Indirect Buffer ++ * and in the PQ we put the IB packet + sync packet(s). ++ */ ++ status = kq->ops.acquire_packet_buffer(kq, pq_packets_size_in_bytes / sizeof(uint32_t), &ib_packet_buff); ++ if (status != 0) { ++ pr_debug("Error! kfd: In func %s >> acquire_packet_buffer failed\n", __func__); ++ break; ++ } + +- ib_packet->header.count = 3; +- ib_packet->header.opcode = IT_INDIRECT_BUFFER_PASID; +- ib_packet->header.type = PM4_TYPE_3; ++ memset(ib_packet_buff, 0, pq_packets_size_in_bytes); + +- largep = (union ULARGE_INTEGER *) &vmid0_address; ++ ib_packet = (struct pm4__indirect_buffer_pasid *) (ib_packet_buff); + +- ib_packet->bitfields2.ib_base_lo = largep->u.low_part >> 2; +- ib_packet->bitfields3.ib_base_hi = largep->u.high_part; ++ ib_packet->header.count = 3; ++ ib_packet->header.opcode = IT_INDIRECT_BUFFER_PASID; ++ ib_packet->header.type = PM4_TYPE_3; + +- ib_packet->control = (1 << 23) | (1 << 31) | +- ((size_in_bytes / sizeof(uint32_t)) & 0xfffff); ++ largep = (union ULARGE_INTEGER *) &vmid0_address; + +- ib_packet->bitfields5.pasid = pasid; ++ ib_packet->bitfields2.ib_base_lo = largep->u.low_part >> 2; ++ ib_packet->bitfields3.ib_base_hi = largep->u.high_part; + +- /* +- * for now we use release mem for GPU-CPU synchronization +- * Consider WaitRegMem + WriteData as a better alternative +- * we get a GART allocations ( gpu/cpu mapping), +- * for the sync variable, and wait until: +- * (a) Sync with HW +- * (b) Sync var is written by CP to mem. 
+- */ +- rm_packet = (struct pm4__release_mem *) (ib_packet_buff + +- (sizeof(struct pm4__indirect_buffer_pasid) / +- sizeof(unsigned int))); ++ ib_packet->control = (1 << 23) | (1 << 31) | ++ ((size_in_bytes / sizeof(uint32_t)) & 0xfffff); + +- status = kfd_gtt_sa_allocate(dbgdev->dev, sizeof(uint64_t), +- &mem_obj); ++ ib_packet->bitfields5.pasid = pasid; + +- if (status != 0) { +- pr_err("amdkfd: Failed to allocate GART memory\n"); +- kq->ops.rollback_packet(kq); +- return status; +- } ++ /* ++ * for now we use release mem for GPU-CPU synchronization ++ * Consider WaitRegMem + WriteData as a better alternative ++ * we get a GART allocations ( gpu/cpu mapping), ++ * for the sync variable, and wait until: ++ * (a) Sync with HW ++ * (b) Sync var is written by CP to mem. ++ */ ++ rm_packet = (struct pm4__release_mem *) (ib_packet_buff + ++ (sizeof(struct pm4__indirect_buffer_pasid) / sizeof(unsigned int))); + +- rm_state = (uint64_t *) mem_obj->cpu_ptr; ++ status = kfd_gtt_sa_allocate(dbgdev->dev, sizeof(uint64_t), ++ &mem_obj); + +- *rm_state = QUEUESTATE__ACTIVE_COMPLETION_PENDING; ++ if (status == 0) { + +- rm_packet->header.opcode = IT_RELEASE_MEM; +- rm_packet->header.type = PM4_TYPE_3; +- rm_packet->header.count = sizeof(struct pm4__release_mem) / +- sizeof(unsigned int) - 2; ++ rm_state = (uint64_t *) mem_obj->cpu_ptr; + +- rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; +- rm_packet->bitfields2.event_index = +- event_index___release_mem__end_of_pipe; ++ *rm_state = QUEUESTATE__ACTIVE_COMPLETION_PENDING; + +- rm_packet->bitfields2.cache_policy = cache_policy___release_mem__lru; +- rm_packet->bitfields2.atc = 0; +- rm_packet->bitfields2.tc_wb_action_ena = 1; ++ rm_packet->header.opcode = IT_RELEASE_MEM; ++ rm_packet->header.type = PM4_TYPE_3; ++ rm_packet->header.count = sizeof(struct pm4__release_mem) / sizeof(unsigned int) - 2; + +- addr.quad_part = mem_obj->gpu_addr; ++ rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; ++ rm_packet->bitfields2.event_index = event_index___release_mem__end_of_pipe; ++ rm_packet->bitfields2.cache_policy = cache_policy___release_mem__lru; ++ rm_packet->bitfields2.atc = 0; ++ rm_packet->bitfields2.tc_wb_action_ena = 1; + +- rm_packet->bitfields4.address_lo_32b = addr.u.low_part >> 2; +- rm_packet->address_hi = addr.u.high_part; ++ addr.quad_part = mem_obj->gpu_addr; + +- rm_packet->bitfields3.data_sel = +- data_sel___release_mem__send_64_bit_data; ++ rm_packet->bitfields4.address_lo_32b = addr.u.low_part >> 2; ++ rm_packet->address_hi = addr.u.high_part; + +- rm_packet->bitfields3.int_sel = +- int_sel___release_mem__send_data_after_write_confirm; ++ rm_packet->bitfields3.data_sel = data_sel___release_mem__send_64_bit_data; ++ rm_packet->bitfields3.int_sel = int_sel___release_mem__send_data_after_write_confirm; ++ rm_packet->bitfields3.dst_sel = dst_sel___release_mem__memory_controller; + +- rm_packet->bitfields3.dst_sel = +- dst_sel___release_mem__memory_controller; ++ rm_packet->data_lo = QUEUESTATE__ACTIVE; + +- rm_packet->data_lo = QUEUESTATE__ACTIVE; ++ kq->ops.submit_packet(kq); + +- kq->ops.submit_packet(kq); ++ /* Wait till CP writes sync code: */ + +- /* Wait till CP writes sync code: */ +- status = amdkfd_fence_wait_timeout( +- (unsigned int *) rm_state, +- QUEUESTATE__ACTIVE, 1500); ++ status = amdkfd_fence_wait_timeout( ++ (unsigned int *) rm_state, ++ QUEUESTATE__ACTIVE, 1500); ++ ++ } else { ++ pr_debug("Error! 
kfd: In func %s >> failed to allocate GART memory\n", __func__); ++ } ++ } while (false); + +- kfd_gtt_sa_free(dbgdev->dev, mem_obj); ++ if (rm_state != NULL) ++ kfd_gtt_sa_free(dbgdev->dev, mem_obj); + + return status; + } + + static int dbgdev_register_nodiq(struct kfd_dbgdev *dbgdev) + { +- BUG_ON(!dbgdev); +- +- /* +- * no action is needed in this case, +- * just make sure diq will not be used +- */ ++ /* no action is needed in this case, just make sure diq will not be used */ + + dbgdev->kq = NULL; + +@@ -182,57 +169,68 @@ static int dbgdev_register_nodiq(struct kfd_dbgdev *dbgdev) + + static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev) + { ++ ++ int status = 0; ++ struct kernel_queue *kq = NULL; + struct queue_properties properties; + unsigned int qid; +- struct kernel_queue *kq = NULL; +- int status; ++ struct process_queue_manager *pqm = dbgdev->pqm; + +- BUG_ON(!dbgdev || !dbgdev->pqm || !dbgdev->dev); ++ do { + +- status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL, +- &properties, 0, KFD_QUEUE_TYPE_DIQ, +- &qid); ++ if (!pqm) { ++ pr_debug("Error! kfd: In func %s >> No PQM\n", __func__); ++ status = -EFAULT; ++ break; ++ } + +- if (status) { +- pr_err("amdkfd: Failed to create DIQ\n"); +- return status; +- } ++ properties.type = KFD_QUEUE_TYPE_DIQ; + +- pr_debug("DIQ Created with queue id: %d\n", qid); ++ status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL, ++ &properties, &qid); + +- kq = pqm_get_kernel_queue(dbgdev->pqm, qid); ++ if (status != 0) { ++ pr_debug("Error! kfd: In func %s >> Create Queue failed\n", __func__); ++ break; ++ } + +- if (kq == NULL) { +- pr_err("amdkfd: Error getting DIQ\n"); +- pqm_destroy_queue(dbgdev->pqm, qid); +- return -EFAULT; +- } ++ pr_debug("kfd: DIQ Created with queue id: %d\n", qid); ++ ++ kq = pqm_get_kernel_queue(dbgdev->pqm, qid); ++ ++ if (kq == NULL) { ++ pr_debug("Error! 
kfd: In func %s >> Error getting Kernel Queue\n", __func__); ++ status = -ENOMEM; ++ break; ++ } ++ ++ dbgdev->kq = kq; + +- dbgdev->kq = kq; ++ } while (false); + + return status; + } + + static int dbgdev_unregister_nodiq(struct kfd_dbgdev *dbgdev) + { +- BUG_ON(!dbgdev || !dbgdev->dev); +- + /* disable watch address */ ++ + dbgdev_address_watch_disable_nodiq(dbgdev->dev); + return 0; + } + + static int dbgdev_unregister_diq(struct kfd_dbgdev *dbgdev) + { +- /* todo - disable address watch */ +- int status; +- +- BUG_ON(!dbgdev || !dbgdev->pqm || !dbgdev->kq); +- +- status = pqm_destroy_queue(dbgdev->pqm, +- dbgdev->kq->queue->properties.queue_id); +- dbgdev->kq = NULL; +- ++ /* todo - if needed, kill wavefronts and disable watch */ ++ int status = 0; ++ if ((dbgdev == NULL) || (dbgdev->pqm == NULL) || (dbgdev->kq == NULL)) { ++ pr_debug("kfd Err:In func %s >> can't destroy diq\n", __func__); ++ status = -EFAULT; ++ } else { ++ pqm_destroy_queue(dbgdev->pqm, ++ dbgdev->kq->queue->properties.queue_id); ++ dbgdev->kq = NULL; ++ } + return status; + } + +@@ -241,341 +239,350 @@ static void dbgdev_address_watch_set_registers( + union TCP_WATCH_ADDR_H_BITS *addrHi, + union TCP_WATCH_ADDR_L_BITS *addrLo, + union TCP_WATCH_CNTL_BITS *cntl, +- unsigned int index, unsigned int vmid) ++ unsigned int index, unsigned int vmid, ++ unsigned int asic_family) + { + union ULARGE_INTEGER addr; + +- BUG_ON(!adw_info || !addrHi || !addrLo || !cntl); +- + addr.quad_part = 0; + addrHi->u32All = 0; + addrLo->u32All = 0; + cntl->u32All = 0; + + if (adw_info->watch_mask != NULL) +- cntl->bitfields.mask = +- (uint32_t) (adw_info->watch_mask[index] & +- ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK); ++ cntl->bitfields.mask = (uint32_t) (adw_info->watch_mask[index] & ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK); + else + cntl->bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK; + + addr.quad_part = (unsigned long long) adw_info->watch_address[index]; + +- addrHi->bitfields.addr = addr.u.high_part & +- ADDRESS_WATCH_REG_ADDHIGH_MASK; ++ addrHi->bitfields.addr = addr.u.high_part & ADDRESS_WATCH_REG_ADDHIGH_MASK; + addrLo->bitfields.addr = + (addr.u.low_part >> ADDRESS_WATCH_REG_ADDLOW_SHIFT); + + cntl->bitfields.mode = adw_info->watch_mode[index]; + cntl->bitfields.vmid = (uint32_t) vmid; +- /* for now assume it is an ATC address */ +- cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT; +- ++ /* for APU assume it is an ATC address. 
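(The packing a few lines up splits one 64-bit watch address across two register fields through the ULARGE_INTEGER union; schematically — a sketch reusing the driver's own ADDRESS_WATCH_REG_* names, with watch_address standing in for adw_info->watch_address[i]:)

    static void split_watch_address(uint64_t watch_address,
                                    uint32_t *hi, uint32_t *lo)
    {
            union ULARGE_INTEGER addr;

            addr.quad_part = watch_address;
            *hi = addr.u.high_part & ADDRESS_WATCH_REG_ADDHIGH_MASK;
            *lo = addr.u.low_part >> ADDRESS_WATCH_REG_ADDLOW_SHIFT;
    }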
*/ ++ if (KFD_IS_DGPU(asic_family) == false) ++ cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT; + pr_debug("\t\t%20s %08x\n", "set reg mask :", cntl->bitfields.mask); +- pr_debug("\t\t%20s %08x\n", "set reg add high :", +- addrHi->bitfields.addr); +- pr_debug("\t\t%20s %08x\n", "set reg add low :", +- addrLo->bitfields.addr); ++ pr_debug("\t\t%20s %08x\n", "set reg add high :", addrHi->bitfields.addr); ++ pr_debug("\t\t%20s %08x\n", "set reg add low :", addrLo->bitfields.addr); ++ + } + + static int dbgdev_address_watch_nodiq(struct kfd_dbgdev *dbgdev, + struct dbg_address_watch_info *adw_info) + { ++ ++ int status = 0; ++ + union TCP_WATCH_ADDR_H_BITS addrHi; + union TCP_WATCH_ADDR_L_BITS addrLo; + union TCP_WATCH_CNTL_BITS cntl; +- struct kfd_process_device *pdd; ++ ++ unsigned int vmid; + unsigned int i; + +- BUG_ON(!dbgdev || !dbgdev->dev || !adw_info); ++ struct kfd_process_device *pdd; + +- /* taking the vmid for that process on the safe way using pdd */ +- pdd = kfd_get_process_device_data(dbgdev->dev, +- adw_info->process); +- if (!pdd) { +- pr_err("amdkfd: Failed to get pdd for wave control no DIQ\n"); +- return -EFAULT; +- } ++ do { ++ /* taking the vmid for that process on the safe way using pdd */ ++ pdd = kfd_get_process_device_data(dbgdev->dev, ++ adw_info->process); ++ if (!pdd) { ++ pr_debug("Error! kfd: In func %s >> no PDD available\n", __func__); ++ status = -EFAULT; ++ break; ++ } + +- addrHi.u32All = 0; +- addrLo.u32All = 0; +- cntl.u32All = 0; ++ addrHi.u32All = 0; ++ addrLo.u32All = 0; ++ cntl.u32All = 0; + +- if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) || +- (adw_info->num_watch_points == 0)) { +- pr_err("amdkfd: num_watch_points is invalid\n"); +- return -EINVAL; +- } ++ vmid = pdd->qpd.vmid; + +- if ((adw_info->watch_mode == NULL) || +- (adw_info->watch_address == NULL)) { +- pr_err("amdkfd: adw_info fields are not valid\n"); +- return -EINVAL; +- } ++ if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) ++ || (adw_info->num_watch_points == 0)) { ++ status = -EINVAL; ++ break; ++ } + +- for (i = 0 ; i < adw_info->num_watch_points ; i++) { +- dbgdev_address_watch_set_registers(adw_info, &addrHi, &addrLo, +- &cntl, i, pdd->qpd.vmid); +- +- pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); +- pr_debug("\t\t%20s %08x\n", "register index :", i); +- pr_debug("\t\t%20s %08x\n", "vmid is :", pdd->qpd.vmid); +- pr_debug("\t\t%20s %08x\n", "Address Low is :", +- addrLo.bitfields.addr); +- pr_debug("\t\t%20s %08x\n", "Address high is :", +- addrHi.bitfields.addr); +- pr_debug("\t\t%20s %08x\n", "Address high is :", +- addrHi.bitfields.addr); +- pr_debug("\t\t%20s %08x\n", "Control Mask is :", +- cntl.bitfields.mask); +- pr_debug("\t\t%20s %08x\n", "Control Mode is :", +- cntl.bitfields.mode); +- pr_debug("\t\t%20s %08x\n", "Control Vmid is :", +- cntl.bitfields.vmid); +- pr_debug("\t\t%20s %08x\n", "Control atc is :", +- cntl.bitfields.atc); +- pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); +- +- pdd->dev->kfd2kgd->address_watch_execute( +- dbgdev->dev->kgd, +- i, +- cntl.u32All, +- addrHi.u32All, +- addrLo.u32All); +- } ++ if ((adw_info->watch_mode == NULL) || (adw_info->watch_address == NULL)) { ++ status = -EINVAL; ++ break; ++ } + +- return 0; ++ for (i = 0; i < adw_info->num_watch_points; i++) { ++ ++ dbgdev_address_watch_set_registers( ++ adw_info, ++ &addrHi, ++ &addrLo, ++ &cntl, ++ i, ++ vmid, ++ dbgdev->dev->device_info->asic_family ++ ); ++ ++ pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); ++ 
pr_debug("\t\t%20s %08x\n", "register index :", i); ++ pr_debug("\t\t%20s %08x\n", "vmid is :", vmid); ++ pr_debug("\t\t%20s %08x\n", "Address Low is :", addrLo.bitfields.addr); ++ pr_debug("\t\t%20s %08x\n", "Address high is :", addrHi.bitfields.addr); ++ pr_debug("\t\t%20s %08x\n", "Address high is :", addrHi.bitfields.addr); ++ pr_debug("\t\t%20s %08x\n", "Control Mask is :", cntl.bitfields.mask); ++ pr_debug("\t\t%20s %08x\n", "Control Mode is :", cntl.bitfields.mode); ++ pr_debug("\t\t%20s %08x\n", "Control Vmid is :", cntl.bitfields.vmid); ++ pr_debug("\t\t%20s %08x\n", "Control atc is :", cntl.bitfields.atc); ++ pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); ++ ++ pdd->dev->kfd2kgd->address_watch_execute( ++ dbgdev->dev->kgd, ++ i, ++ cntl.u32All, ++ addrHi.u32All, ++ addrLo.u32All); ++ } ++ ++ } while (false); ++ ++ return status; + } + + static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, + struct dbg_address_watch_info *adw_info) + { +- struct pm4__set_config_reg *packets_vec; ++ ++ int status = 0; ++ unsigned int i = 0; + union TCP_WATCH_ADDR_H_BITS addrHi; + union TCP_WATCH_ADDR_L_BITS addrLo; + union TCP_WATCH_CNTL_BITS cntl; +- struct kfd_mem_obj *mem_obj; +- unsigned int aw_reg_add_dword; +- uint32_t *packet_buff_uint; +- unsigned int i; +- int status; +- size_t ib_size = sizeof(struct pm4__set_config_reg) * 4; ++ + /* we do not control the vmid in DIQ mode, just a place holder */ + unsigned int vmid = 0; + +- BUG_ON(!dbgdev || !dbgdev->dev || !adw_info); ++ struct kfd_mem_obj *mem_obj; ++ uint32_t *packet_buff_uint = NULL; ++ ++ struct pm4__set_config_reg *packets_vec = NULL; ++ ++ size_t ib_size = sizeof(struct pm4__set_config_reg) * 4; ++ ++ unsigned int aw_reg_add_dword; + + addrHi.u32All = 0; + addrLo.u32All = 0; + cntl.u32All = 0; + +- if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) || +- (adw_info->num_watch_points == 0)) { +- pr_err("amdkfd: num_watch_points is invalid\n"); +- return -EINVAL; +- } ++ do { + +- if ((NULL == adw_info->watch_mode) || +- (NULL == adw_info->watch_address)) { +- pr_err("amdkfd: adw_info fields are not valid\n"); +- return -EINVAL; +- } ++ if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) || (adw_info->num_watch_points == 0)) { ++ status = -EINVAL; ++ break; ++ } ++ ++ if ((NULL == adw_info->watch_mode) || (NULL == adw_info->watch_address)) { ++ status = -EINVAL; ++ break; ++ } + +- status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); ++ status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); + +- if (status != 0) { +- pr_err("amdkfd: Failed to allocate GART memory\n"); +- return status; +- } ++ if (status != 0) ++ break; + +- packet_buff_uint = mem_obj->cpu_ptr; +- +- memset(packet_buff_uint, 0, ib_size); +- +- packets_vec = (struct pm4__set_config_reg *) (packet_buff_uint); +- +- packets_vec[0].header.count = 1; +- packets_vec[0].header.opcode = IT_SET_CONFIG_REG; +- packets_vec[0].header.type = PM4_TYPE_3; +- packets_vec[0].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET; +- packets_vec[0].bitfields2.insert_vmid = 1; +- packets_vec[1].ordinal1 = packets_vec[0].ordinal1; +- packets_vec[1].bitfields2.insert_vmid = 0; +- packets_vec[2].ordinal1 = packets_vec[0].ordinal1; +- packets_vec[2].bitfields2.insert_vmid = 0; +- packets_vec[3].ordinal1 = packets_vec[0].ordinal1; +- packets_vec[3].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET; +- packets_vec[3].bitfields2.insert_vmid = 1; +- +- for (i = 0; i < adw_info->num_watch_points; i++) { +- dbgdev_address_watch_set_registers(adw_info, 
+- &addrHi, +- &addrLo, +- &cntl, +- i, +- vmid); +- +- pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); +- pr_debug("\t\t%20s %08x\n", "register index :", i); +- pr_debug("\t\t%20s %08x\n", "vmid is :", vmid); +- pr_debug("\t\t%20s %p\n", "Add ptr is :", +- adw_info->watch_address); +- pr_debug("\t\t%20s %08llx\n", "Add is :", +- adw_info->watch_address[i]); +- pr_debug("\t\t%20s %08x\n", "Address Low is :", +- addrLo.bitfields.addr); +- pr_debug("\t\t%20s %08x\n", "Address high is :", +- addrHi.bitfields.addr); +- pr_debug("\t\t%20s %08x\n", "Control Mask is :", +- cntl.bitfields.mask); +- pr_debug("\t\t%20s %08x\n", "Control Mode is :", +- cntl.bitfields.mode); +- pr_debug("\t\t%20s %08x\n", "Control Vmid is :", +- cntl.bitfields.vmid); +- pr_debug("\t\t%20s %08x\n", "Control atc is :", +- cntl.bitfields.atc); +- pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); +- +- aw_reg_add_dword = +- dbgdev->dev->kfd2kgd->address_watch_get_offset( +- dbgdev->dev->kgd, +- i, +- ADDRESS_WATCH_REG_CNTL); ++ packet_buff_uint = mem_obj->cpu_ptr; ++ ++ memset(packet_buff_uint, 0, ib_size); + +- aw_reg_add_dword /= sizeof(uint32_t); ++ packets_vec = (struct pm4__set_config_reg *) (packet_buff_uint); + +- packets_vec[0].bitfields2.reg_offset = +- aw_reg_add_dword - AMD_CONFIG_REG_BASE; ++ packets_vec[0].header.count = 1; ++ packets_vec[0].header.opcode = IT_SET_CONFIG_REG; ++ packets_vec[0].header.type = PM4_TYPE_3; ++ packets_vec[0].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET; ++ packets_vec[0].bitfields2.insert_vmid = 1; ++ packets_vec[1].ordinal1 = packets_vec[0].ordinal1; ++ packets_vec[1].bitfields2.insert_vmid = 0; ++ packets_vec[2].ordinal1 = packets_vec[0].ordinal1; ++ packets_vec[2].bitfields2.insert_vmid = 0; ++ packets_vec[3].ordinal1 = packets_vec[0].ordinal1; ++ packets_vec[3].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET; ++ packets_vec[3].bitfields2.insert_vmid = 1; + +- packets_vec[0].reg_data[0] = cntl.u32All; ++ for (i = 0; i < adw_info->num_watch_points; i++) { + +- aw_reg_add_dword = +- dbgdev->dev->kfd2kgd->address_watch_get_offset( +- dbgdev->dev->kgd, ++ dbgdev_address_watch_set_registers( ++ adw_info, ++ &addrHi, ++ &addrLo, ++ &cntl, + i, +- ADDRESS_WATCH_REG_ADDR_HI); ++ vmid, ++ dbgdev->dev->device_info->asic_family ++ ); ++ ++ pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); ++ pr_debug("\t\t%20s %08x\n", "register index :", i); ++ pr_debug("\t\t%20s %08x\n", "vmid is :", vmid); ++ pr_debug("\t\t%20s %p\n", "Add ptr is :", adw_info->watch_address); ++ pr_debug("\t\t%20s %08llx\n", "Add is :", adw_info->watch_address[i]); ++ pr_debug("\t\t%20s %08x\n", "Address Low is :", addrLo.bitfields.addr); ++ pr_debug("\t\t%20s %08x\n", "Address high is :", addrHi.bitfields.addr); ++ pr_debug("\t\t%20s %08x\n", "Control Mask is :", cntl.bitfields.mask); ++ pr_debug("\t\t%20s %08x\n", "Control Mode is :", cntl.bitfields.mode); ++ pr_debug("\t\t%20s %08x\n", "Control Vmid is :", cntl.bitfields.vmid); ++ pr_debug("\t\t%20s %08x\n", "Control atc is :", cntl.bitfields.atc); ++ pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); ++ ++ aw_reg_add_dword = ++ dbgdev->dev->kfd2kgd ++ ->address_watch_get_offset( ++ dbgdev->dev->kgd, ++ i, ++ ADDRESS_WATCH_REG_CNTL); ++ ++ packets_vec[0].bitfields2.reg_offset = aw_reg_add_dword - CONFIG_REG_BASE; ++ packets_vec[0].reg_data[0] = cntl.u32All; + +- aw_reg_add_dword /= sizeof(uint32_t); ++ aw_reg_add_dword = ++ dbgdev->dev->kfd2kgd ++ ->address_watch_get_offset( ++ dbgdev->dev->kgd, ++ i, ++ 
ADDRESS_WATCH_REG_ADDR_HI); + +- packets_vec[1].bitfields2.reg_offset = +- aw_reg_add_dword - AMD_CONFIG_REG_BASE; +- packets_vec[1].reg_data[0] = addrHi.u32All; + +- aw_reg_add_dword = +- dbgdev->dev->kfd2kgd->address_watch_get_offset( +- dbgdev->dev->kgd, +- i, +- ADDRESS_WATCH_REG_ADDR_LO); ++ packets_vec[1].bitfields2.reg_offset = aw_reg_add_dword - CONFIG_REG_BASE; ++ packets_vec[1].reg_data[0] = addrHi.u32All; + +- aw_reg_add_dword /= sizeof(uint32_t); ++ aw_reg_add_dword = ++ dbgdev->dev->kfd2kgd ++ ->address_watch_get_offset( ++ dbgdev->dev->kgd, ++ i, ++ ADDRESS_WATCH_REG_ADDR_LO); + +- packets_vec[2].bitfields2.reg_offset = +- aw_reg_add_dword - AMD_CONFIG_REG_BASE; +- packets_vec[2].reg_data[0] = addrLo.u32All; + +- /* enable watch flag if address is not zero*/ +- if (adw_info->watch_address[i] > 0) +- cntl.bitfields.valid = 1; +- else +- cntl.bitfields.valid = 0; ++ packets_vec[2].bitfields2.reg_offset = aw_reg_add_dword - CONFIG_REG_BASE; ++ packets_vec[2].reg_data[0] = addrLo.u32All; + +- aw_reg_add_dword = +- dbgdev->dev->kfd2kgd->address_watch_get_offset( +- dbgdev->dev->kgd, +- i, +- ADDRESS_WATCH_REG_CNTL); ++ /* enable watch flag if address is not zero*/ ++ if (adw_info->watch_address[i] > 0) ++ cntl.bitfields.valid = 1; ++ else ++ cntl.bitfields.valid = 0; + +- aw_reg_add_dword /= sizeof(uint32_t); ++ aw_reg_add_dword = ++ dbgdev->dev->kfd2kgd ++ ->address_watch_get_offset( ++ dbgdev->dev->kgd, ++ i, ++ ADDRESS_WATCH_REG_CNTL); + +- packets_vec[3].bitfields2.reg_offset = +- aw_reg_add_dword - AMD_CONFIG_REG_BASE; +- packets_vec[3].reg_data[0] = cntl.u32All; + +- status = dbgdev_diq_submit_ib( +- dbgdev, +- adw_info->process->pasid, +- mem_obj->gpu_addr, +- packet_buff_uint, +- ib_size); ++ packets_vec[3].bitfields2.reg_offset = aw_reg_add_dword - CONFIG_REG_BASE; ++ packets_vec[3].reg_data[0] = cntl.u32All; ++ ++ status = dbgdev_diq_submit_ib( ++ dbgdev, ++ adw_info->process->pasid, ++ mem_obj->gpu_addr, ++ packet_buff_uint, ++ ib_size); ++ ++ if (status != 0) { ++ pr_debug("Error! 
kfd: In func %s >> failed to submit DIQ packet\n", __func__); ++ break; ++ } + +- if (status != 0) { +- pr_err("amdkfd: Failed to submit IB to DIQ\n"); +- break; + } +- } + +- kfd_gtt_sa_free(dbgdev->dev, mem_obj); ++ } while (false); ++ if (packet_buff_uint != NULL) ++ kfd_gtt_sa_free(dbgdev->dev, mem_obj); ++ + return status; ++ + } + + static int dbgdev_wave_control_set_registers( + struct dbg_wave_control_info *wac_info, + union SQ_CMD_BITS *in_reg_sq_cmd, +- union GRBM_GFX_INDEX_BITS *in_reg_gfx_index) ++ union GRBM_GFX_INDEX_BITS *in_reg_gfx_index, ++ unsigned int asic_family) + { + int status = 0; + union SQ_CMD_BITS reg_sq_cmd; + union GRBM_GFX_INDEX_BITS reg_gfx_index; +- struct HsaDbgWaveMsgAMDGen2 *pMsg; +- +- BUG_ON(!wac_info || !in_reg_sq_cmd || !in_reg_gfx_index); + + reg_sq_cmd.u32All = 0; ++ + reg_gfx_index.u32All = 0; +- pMsg = &wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2; + + switch (wac_info->mode) { +- /* Send command to single wave */ +- case HSA_DBG_WAVEMODE_SINGLE: +- /* +- * Limit access to the process waves only, +- * by setting vmid check +- */ ++ case HSA_DBG_WAVEMODE_SINGLE: /* Send command to single wave */ ++ /*limit access to the process waves only,by setting vmid check */ + reg_sq_cmd.bits.check_vmid = 1; +- reg_sq_cmd.bits.simd_id = pMsg->ui32.SIMD; +- reg_sq_cmd.bits.wave_id = pMsg->ui32.WaveId; ++ reg_sq_cmd.bits.simd_id = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.SIMD; ++ reg_sq_cmd.bits.wave_id = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.WaveId; + reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_SINGLE; + +- reg_gfx_index.bits.sh_index = pMsg->ui32.ShaderArray; +- reg_gfx_index.bits.se_index = pMsg->ui32.ShaderEngine; +- reg_gfx_index.bits.instance_index = pMsg->ui32.HSACU; ++ reg_gfx_index.bits.sh_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.ShaderArray; ++ reg_gfx_index.bits.se_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.ShaderEngine; ++ reg_gfx_index.bits.instance_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.HSACU; + + break; + +- /* Send command to all waves with matching VMID */ +- case HSA_DBG_WAVEMODE_BROADCAST_PROCESS: ++ case HSA_DBG_WAVEMODE_BROADCAST_PROCESS: /* Send command to all waves with matching VMID */ ++ + + reg_gfx_index.bits.sh_broadcast_writes = 1; + reg_gfx_index.bits.se_broadcast_writes = 1; + reg_gfx_index.bits.instance_broadcast_writes = 1; + + reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_BROADCAST; +- + break; + +- /* Send command to all CU waves with matching VMID */ +- case HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU: ++ case HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU: /* Send command to all CU waves with matching VMID */ + + reg_sq_cmd.bits.check_vmid = 1; + reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_BROADCAST; + +- reg_gfx_index.bits.sh_index = pMsg->ui32.ShaderArray; +- reg_gfx_index.bits.se_index = pMsg->ui32.ShaderEngine; +- reg_gfx_index.bits.instance_index = pMsg->ui32.HSACU; ++ reg_gfx_index.bits.sh_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.ShaderArray; ++ reg_gfx_index.bits.se_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.ShaderEngine; ++ reg_gfx_index.bits.instance_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.HSACU; + + break; + + default: +- return -EINVAL; ++ status = -EINVAL; ++ break; + } + + switch (wac_info->operand) { + case HSA_DBG_WAVEOP_HALT: +- reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT; ++ if (asic_family == CHIP_KAVERI) { ++ reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT; ++ pr_debug("kfd:dbgdev: halting 
KV\n"); ++ } else { ++ reg_sq_cmd.bits_sethalt.cmd = SQ_IND_CMD_NEW_SETHALT; ++ reg_sq_cmd.bits_sethalt.data = SQ_IND_CMD_DATA_HALT; ++ pr_debug("kfd:dbgdev: halting CZ\n"); ++ } + break; + + case HSA_DBG_WAVEOP_RESUME: +- reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME; ++ if (asic_family == CHIP_KAVERI) { ++ reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME; ++ pr_debug("kfd:dbgdev: resuming KV\n"); ++ } else { ++ reg_sq_cmd.bits_sethalt.cmd = SQ_IND_CMD_NEW_SETHALT; ++ reg_sq_cmd.bits_sethalt.data = SQ_IND_CMD_DATA_RESUME; ++ pr_debug("kfd:dbgdev: resuming CZ\n"); ++ } + break; + + case HSA_DBG_WAVEOP_KILL: +@@ -601,128 +608,114 @@ static int dbgdev_wave_control_set_registers( + } + + if (status == 0) { +- *in_reg_sq_cmd = reg_sq_cmd; ++ *in_reg_sq_cmd = reg_sq_cmd; + *in_reg_gfx_index = reg_gfx_index; + } +- + return status; ++ + } + + static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, + struct dbg_wave_control_info *wac_info) + { + +- int status; ++ int status = 0; + union SQ_CMD_BITS reg_sq_cmd; + union GRBM_GFX_INDEX_BITS reg_gfx_index; + struct kfd_mem_obj *mem_obj; +- uint32_t *packet_buff_uint; +- struct pm4__set_config_reg *packets_vec; ++ uint32_t *packet_buff_uint = NULL; ++ struct pm4__set_config_reg *packets_vec = NULL; + size_t ib_size = sizeof(struct pm4__set_config_reg) * 3; + +- BUG_ON(!dbgdev || !wac_info); +- + reg_sq_cmd.u32All = 0; ++ do { + +- status = dbgdev_wave_control_set_registers(wac_info, ®_sq_cmd, +- ®_gfx_index); +- if (status) { +- pr_err("amdkfd: Failed to set wave control registers\n"); +- return status; +- } +- +- /* we do not control the VMID in DIQ,so reset it to a known value */ +- reg_sq_cmd.bits.vm_id = 0; +- +- pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); +- +- pr_debug("\t\t mode is: %u\n", wac_info->mode); +- pr_debug("\t\t operand is: %u\n", wac_info->operand); +- pr_debug("\t\t trap id is: %u\n", wac_info->trapId); +- pr_debug("\t\t msg value is: %u\n", +- wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); +- pr_debug("\t\t vmid is: N/A\n"); +- +- pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid); +- pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd); +- pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id); +- pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id); +- pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode); +- pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id); +- pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id); +- +- pr_debug("\t\t ibw is : %u\n", +- reg_gfx_index.bitfields.instance_broadcast_writes); +- pr_debug("\t\t ii is : %u\n", +- reg_gfx_index.bitfields.instance_index); +- pr_debug("\t\t sebw is : %u\n", +- reg_gfx_index.bitfields.se_broadcast_writes); +- pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index); +- pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); +- pr_debug("\t\t sbw is : %u\n", +- reg_gfx_index.bitfields.sh_broadcast_writes); +- +- pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); +- +- status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); +- +- if (status != 0) { +- pr_err("amdkfd: Failed to allocate GART memory\n"); +- return status; +- } +- +- packet_buff_uint = mem_obj->cpu_ptr; ++ status = dbgdev_wave_control_set_registers(wac_info, ++ ®_sq_cmd, ++ ®_gfx_index, ++ dbgdev->dev->device_info->asic_family); + +- memset(packet_buff_uint, 0, ib_size); ++ /* we do not control the VMID in DIQ,so reset it to a known value */ ++ 
reg_sq_cmd.bits.vm_id = 0; ++ if (status != 0) ++ break; ++ pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); ++ ++ pr_debug("\t\t mode is: %u\n", wac_info->mode); ++ pr_debug("\t\t operand is: %u\n", wac_info->operand); ++ pr_debug("\t\t trap id is: %u\n", wac_info->trapId); ++ pr_debug("\t\t msg value is: %u\n", wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); ++ pr_debug("\t\t vmid is: N/A\n"); ++ ++ pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid); ++ pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd); ++ pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id); ++ pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id); ++ pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode); ++ pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id); ++ pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id); ++ ++ pr_debug("\t\t ibw is : %u\n", reg_gfx_index.bitfields.instance_broadcast_writes); ++ pr_debug("\t\t ii is : %u\n", reg_gfx_index.bitfields.instance_index); ++ pr_debug("\t\t sebw is : %u\n", reg_gfx_index.bitfields.se_broadcast_writes); ++ pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index); ++ pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); ++ pr_debug("\t\t sbw is : %u\n", reg_gfx_index.bitfields.sh_broadcast_writes); ++ ++ pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); ++ ++ status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); ++ ++ if (status != 0) ++ break; + +- packets_vec = (struct pm4__set_config_reg *) packet_buff_uint; +- packets_vec[0].header.count = 1; +- packets_vec[0].header.opcode = IT_SET_UCONFIG_REG; +- packets_vec[0].header.type = PM4_TYPE_3; +- packets_vec[0].bitfields2.reg_offset = +- GRBM_GFX_INDEX / (sizeof(uint32_t)) - +- USERCONFIG_REG_BASE; ++ packet_buff_uint = mem_obj->cpu_ptr; + +- packets_vec[0].bitfields2.insert_vmid = 0; +- packets_vec[0].reg_data[0] = reg_gfx_index.u32All; ++ memset(packet_buff_uint, 0, ib_size); + +- packets_vec[1].header.count = 1; +- packets_vec[1].header.opcode = IT_SET_CONFIG_REG; +- packets_vec[1].header.type = PM4_TYPE_3; +- packets_vec[1].bitfields2.reg_offset = SQ_CMD / (sizeof(uint32_t)) - +- AMD_CONFIG_REG_BASE; ++ packets_vec = (struct pm4__set_config_reg *) packet_buff_uint; ++ packets_vec[0].header.count = 1; ++ packets_vec[0].header.opcode = IT_SET_UCONFIG_REG; ++ packets_vec[0].header.type = PM4_TYPE_3; ++ packets_vec[0].bitfields2.reg_offset = GRBM_GFX_INDEX / (sizeof(uint32_t)) - USERCONFIG_REG_BASE; ++ packets_vec[0].bitfields2.insert_vmid = 0; ++ packets_vec[0].reg_data[0] = reg_gfx_index.u32All; + +- packets_vec[1].bitfields2.vmid_shift = SQ_CMD_VMID_OFFSET; +- packets_vec[1].bitfields2.insert_vmid = 1; +- packets_vec[1].reg_data[0] = reg_sq_cmd.u32All; ++ packets_vec[1].header.count = 1; ++ packets_vec[1].header.opcode = IT_SET_CONFIG_REG; ++ packets_vec[1].header.type = PM4_TYPE_3; ++ packets_vec[1].bitfields2.reg_offset = SQ_CMD / (sizeof(uint32_t)) - CONFIG_REG_BASE; ++ packets_vec[1].bitfields2.vmid_shift = SQ_CMD_VMID_OFFSET; ++ packets_vec[1].bitfields2.insert_vmid = 1; ++ packets_vec[1].reg_data[0] = reg_sq_cmd.u32All; + +- /* Restore the GRBM_GFX_INDEX register */ ++ /* Restore the GRBM_GFX_INDEX register */ + +- reg_gfx_index.u32All = 0; +- reg_gfx_index.bits.sh_broadcast_writes = 1; +- reg_gfx_index.bits.instance_broadcast_writes = 1; +- reg_gfx_index.bits.se_broadcast_writes = 1; ++ reg_gfx_index.u32All = 0; ++ reg_gfx_index.bits.sh_broadcast_writes = 1; ++ 
reg_gfx_index.bits.instance_broadcast_writes = 1;
++ reg_gfx_index.bits.se_broadcast_writes = 1;
+
+
+- packets_vec[2].ordinal1 = packets_vec[0].ordinal1;
+- packets_vec[2].bitfields2.reg_offset =
+- GRBM_GFX_INDEX / (sizeof(uint32_t)) -
+- USERCONFIG_REG_BASE;
++ packets_vec[2].ordinal1 = packets_vec[0].ordinal1;
++ packets_vec[2].bitfields2.reg_offset = GRBM_GFX_INDEX / (sizeof(uint32_t)) - USERCONFIG_REG_BASE;
++ packets_vec[2].bitfields2.insert_vmid = 0;
++ packets_vec[2].reg_data[0] = reg_gfx_index.u32All;
+
+- packets_vec[2].bitfields2.insert_vmid = 0;
+- packets_vec[2].reg_data[0] = reg_gfx_index.u32All;
++ status = dbgdev_diq_submit_ib(
++ dbgdev,
++ wac_info->process->pasid,
++ mem_obj->gpu_addr,
++ packet_buff_uint,
++ ib_size);
+
+- status = dbgdev_diq_submit_ib(
+- dbgdev,
+- wac_info->process->pasid,
+- mem_obj->gpu_addr,
+- packet_buff_uint,
+- ib_size);
++ if (status != 0)
++ pr_debug("%s\n", " Critical Error ! Submit diq packet failed ");
+
+- if (status != 0)
+- pr_err("amdkfd: Failed to submit IB to DIQ\n");
++ } while (false);
+
+- kfd_gtt_sa_free(dbgdev->dev, mem_obj);
++ if (packet_buff_uint != NULL)
++ kfd_gtt_sa_free(dbgdev->dev, mem_obj);
+
+ return status;
+ }
+@@ -730,66 +723,69 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev,
+ static int dbgdev_wave_control_nodiq(struct kfd_dbgdev *dbgdev,
+ struct dbg_wave_control_info *wac_info)
+ {
+- int status;
++ int status = 0;
++ unsigned int vmid = 0xffff;
+ union SQ_CMD_BITS reg_sq_cmd;
+ union GRBM_GFX_INDEX_BITS reg_gfx_index;
+- struct kfd_process_device *pdd;
+
+- BUG_ON(!dbgdev || !dbgdev->dev || !wac_info);
++ struct kfd_process_device *pdd = NULL;
+
+ reg_sq_cmd.u32All = 0;
++ status = 0;
+
+ /* taking the VMID for that process on the safe way using PDD */
+ pdd = kfd_get_process_device_data(dbgdev->dev, wac_info->process);
+
+- if (!pdd) {
+- pr_err("amdkfd: Failed to get pdd for wave control no DIQ\n");
+- return -EFAULT;
+- }
+- status = dbgdev_wave_control_set_registers(wac_info, &reg_sq_cmd,
+- &reg_gfx_index);
+- if (status) {
+- pr_err("amdkfd: Failed to set wave control registers\n");
+- return status;
++ if (pdd) {
++ status = dbgdev_wave_control_set_registers(wac_info,
++ &reg_sq_cmd,
++ &reg_gfx_index,
++ dbgdev->dev->device_info->asic_family);
++ if (status == 0) {
++
++ /* for non DIQ we need to patch the VMID: */
++
++ vmid = pdd->qpd.vmid;
++ reg_sq_cmd.bits.vm_id = vmid;
++
++ pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *");
++
++ pr_debug("\t\t mode is: %u\n", wac_info->mode);
++ pr_debug("\t\t operand is: %u\n", wac_info->operand);
++ pr_debug("\t\t trap id is: %u\n", wac_info->trapId);
++ pr_debug("\t\t msg value is: %u\n", wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value);
++ pr_debug("\t\t vmid is: %u\n", vmid);
++
++ pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid);
++ pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd);
++ pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id);
++ pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id);
++ pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode);
++ pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id);
++ pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id);
++
++ pr_debug("\t\t ibw is : %u\n", reg_gfx_index.bitfields.instance_broadcast_writes);
++ pr_debug("\t\t ii is : %u\n", reg_gfx_index.bitfields.instance_index);
++ pr_debug("\t\t sebw is : %u\n", reg_gfx_index.bitfields.se_broadcast_writes);
++ pr_debug("\t\t se_ind is : %u\n",
reg_gfx_index.bitfields.se_index); ++ pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); ++ pr_debug("\t\t sbw is : %u\n", reg_gfx_index.bitfields.sh_broadcast_writes); ++ ++ pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); ++ ++ dbgdev->dev->kfd2kgd ++ ->wave_control_execute(dbgdev->dev->kgd, ++ reg_gfx_index.u32All, ++ reg_sq_cmd.u32All); ++ } else { ++ status = -EINVAL; ++ } ++ } else { ++ status = -EFAULT; + } + +- /* for non DIQ we need to patch the VMID: */ ++ return status; + +- reg_sq_cmd.bits.vm_id = pdd->qpd.vmid; +- +- pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); +- +- pr_debug("\t\t mode is: %u\n", wac_info->mode); +- pr_debug("\t\t operand is: %u\n", wac_info->operand); +- pr_debug("\t\t trap id is: %u\n", wac_info->trapId); +- pr_debug("\t\t msg value is: %u\n", +- wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); +- pr_debug("\t\t vmid is: %u\n", pdd->qpd.vmid); +- +- pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid); +- pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd); +- pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id); +- pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id); +- pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode); +- pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id); +- pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id); +- +- pr_debug("\t\t ibw is : %u\n", +- reg_gfx_index.bitfields.instance_broadcast_writes); +- pr_debug("\t\t ii is : %u\n", +- reg_gfx_index.bitfields.instance_index); +- pr_debug("\t\t sebw is : %u\n", +- reg_gfx_index.bitfields.se_broadcast_writes); +- pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index); +- pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); +- pr_debug("\t\t sbw is : %u\n", +- reg_gfx_index.bitfields.sh_broadcast_writes); +- +- pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); +- +- return dbgdev->dev->kfd2kgd->wave_control_execute(dbgdev->dev->kgd, +- reg_gfx_index.u32All, +- reg_sq_cmd.u32All); + } + + int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) +@@ -800,13 +796,8 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) + union GRBM_GFX_INDEX_BITS reg_gfx_index; + struct kfd_process_device *pdd; + struct dbg_wave_control_info wac_info; +- int temp; +- int first_vmid_to_scan = 8; +- int last_vmid_to_scan = 15; +- +- first_vmid_to_scan = ffs(dev->shared_resources.compute_vmid_bitmap) - 1; +- temp = dev->shared_resources.compute_vmid_bitmap >> first_vmid_to_scan; +- last_vmid_to_scan = first_vmid_to_scan + ffz(temp); ++ int first_vmid_to_scan = dev->vm_info.first_vmid_kfd; ++ int last_vmid_to_scan = dev->vm_info.last_vmid_kfd; + + reg_sq_cmd.u32All = 0; + status = 0; +@@ -823,7 +814,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) + for (vmid = first_vmid_to_scan; vmid <= last_vmid_to_scan; vmid++) { + if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_valid + (dev->kgd, vmid)) { +- if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_valid ++ if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_pasid + (dev->kgd, vmid) == p->pasid) { + pr_debug("Killing wave fronts of vmid %d and pasid %d\n", + vmid, p->pasid); +@@ -833,7 +824,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) + } + + if (vmid > last_vmid_to_scan) { +- pr_err("amdkfd: didn't found vmid for pasid (%d)\n", p->pasid); ++ pr_err("amdkfd: didn't find vmid for pasid 
(%d)\n", p->pasid); + return -EFAULT; + } + +@@ -843,7 +834,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) + return -EFAULT; + + status = dbgdev_wave_control_set_registers(&wac_info, ®_sq_cmd, +- ®_gfx_index); ++ ®_gfx_index, dev->device_info->asic_family); + if (status != 0) + return -EINVAL; + +@@ -858,15 +849,12 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) + } + + void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, +- enum DBGDEV_TYPE type) ++ DBGDEV_TYPE type) + { +- BUG_ON(!pdbgdev || !pdev); +- + pdbgdev->dev = pdev; + pdbgdev->kq = NULL; + pdbgdev->type = type; + pdbgdev->pqm = NULL; +- + switch (type) { + case DBGDEV_TYPE_NODIQ: + pdbgdev->dbgdev_register = dbgdev_register_nodiq; +@@ -876,10 +864,12 @@ void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, + break; + case DBGDEV_TYPE_DIQ: + default: ++ + pdbgdev->dbgdev_register = dbgdev_register_diq; + pdbgdev->dbgdev_unregister = dbgdev_unregister_diq; + pdbgdev->dbgdev_wave_control = dbgdev_wave_control_diq; + pdbgdev->dbgdev_address_watch = dbgdev_address_watch_diq; ++ + break; + } + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h +index 03424c2..82f48ff 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h +@@ -23,6 +23,10 @@ + #ifndef KFD_DBGDEV_H_ + #define KFD_DBGDEV_H_ + ++/* ++ * SQ_IND_CMD_CMD enum ++ */ ++ + enum { + SQ_CMD_VMID_OFFSET = 28, + ADDRESS_WATCH_CNTL_OFFSET = 24 +@@ -48,9 +52,9 @@ enum { + + /* CONFIG reg space definition */ + enum { +- AMD_CONFIG_REG_BASE = 0x2000, /* in dwords */ +- AMD_CONFIG_REG_END = 0x2B00, +- AMD_CONFIG_REG_SIZE = AMD_CONFIG_REG_END - AMD_CONFIG_REG_BASE ++ CONFIG_REG_BASE = 0x2000, /* in dwords */ ++ CONFIG_REG_END = 0x2B00, ++ CONFIG_REG_SIZE = CONFIG_REG_END - CONFIG_REG_BASE + }; + + /* SH reg space definition */ +@@ -60,22 +64,43 @@ enum { + SH_REG_SIZE = SH_REG_END - SH_REG_BASE + }; + ++/* SQ_CMD definitions */ ++ ++enum { ++ SQ_IND_CMD_DATA_RESUME = 0, ++ SQ_IND_CMD_DATA_HALT = 1 ++}; ++ ++enum SQ_IND_CMD_NEW { ++ SQ_IND_CMD_NEW_NULL = 0x00000000, ++ SQ_IND_CMD_NEW_SETHALT = 0x00000001, ++ SQ_IND_CMD_NEW_SAVECTX = 0x00000002, ++ SQ_IND_CMD_NEW_KILL = 0x00000003, ++ SQ_IND_CMD_NEW_DEBUG = 0x00000004, ++ SQ_IND_CMD_NEW_TRAP = 0x00000005, ++ SQ_IND_CMD_NEW_SET_PRIO = 0x00000006 ++ ++}; ++ + enum SQ_IND_CMD_CMD { + SQ_IND_CMD_CMD_NULL = 0x00000000, + SQ_IND_CMD_CMD_HALT = 0x00000001, + SQ_IND_CMD_CMD_RESUME = 0x00000002, + SQ_IND_CMD_CMD_KILL = 0x00000003, + SQ_IND_CMD_CMD_DEBUG = 0x00000004, +- SQ_IND_CMD_CMD_TRAP = 0x00000005, ++ SQ_IND_CMD_CMD_TRAP = 0x00000005 + }; ++/* ++ * SQ_IND_CMD_MODE enum ++ */ + +-enum SQ_IND_CMD_MODE { ++typedef enum SQ_IND_CMD_MODE { + SQ_IND_CMD_MODE_SINGLE = 0x00000000, + SQ_IND_CMD_MODE_BROADCAST = 0x00000001, + SQ_IND_CMD_MODE_BROADCAST_QUEUE = 0x00000002, + SQ_IND_CMD_MODE_BROADCAST_PIPE = 0x00000003, + SQ_IND_CMD_MODE_BROADCAST_ME = 0x00000004, +-}; ++} SQ_IND_CMD_MODE; + + union SQ_IND_INDEX_BITS { + struct { +@@ -106,18 +131,32 @@ union SQ_IND_CMD_BITS { + union SQ_CMD_BITS { + struct { + uint32_t cmd:3; +- uint32_t:1; ++ uint32_t:1; + uint32_t mode:3; + uint32_t check_vmid:1; + uint32_t trap_id:3; +- uint32_t:5; ++ uint32_t:5; + uint32_t wave_id:4; + uint32_t simd_id:2; +- uint32_t:2; ++ uint32_t:2; + uint32_t queue_id:3; +- uint32_t:1; ++ uint32_t:1; + uint32_t vm_id:4; + } bitfields, bits; ++ struct { ++ uint32_t cmd:3; ++ uint32_t:1; ++ 
uint32_t mode:3; ++ uint32_t check_vmid:1; ++ uint32_t data:3; ++ uint32_t:5; ++ uint32_t wave_id:4; ++ uint32_t simd_id:2; ++ uint32_t:2; ++ uint32_t queue_id:3; ++ uint32_t:1; ++ uint32_t vm_id:4; ++ } bitfields_sethalt, bits_sethalt; + uint32_t u32All; + signed int i32All; + float f32All; +@@ -169,7 +208,7 @@ union TCP_WATCH_ADDR_L_BITS { + }; + + enum { +- QUEUESTATE__INVALID = 0, /* so by default we'll get invalid state */ ++ QUEUESTATE__INVALID = 0, /* so by default we'll get invalid state */ + QUEUESTATE__ACTIVE_COMPLETION_PENDING, + QUEUESTATE__ACTIVE + }; +@@ -187,7 +226,6 @@ union ULARGE_INTEGER { + #define KFD_CIK_VMID_END_OFFSET (KFD_CIK_VMID_START_OFFSET + (8)) + + +-void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, +- enum DBGDEV_TYPE type); ++void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, DBGDEV_TYPE type); + +-#endif /* KFD_DBGDEV_H_ */ ++#endif /* KFD_DBGDEV_H_ */ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c +index 56d6763..5d269ea 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c +@@ -36,42 +36,50 @@ + + static DEFINE_MUTEX(kfd_dbgmgr_mutex); + +-struct mutex *kfd_get_dbgmgr_mutex(void) ++struct mutex * ++get_dbgmgr_mutex(void) + { + return &kfd_dbgmgr_mutex; + } + ++/*===========================================================================*/ + +-static void kfd_dbgmgr_uninitialize(struct kfd_dbgmgr *pmgr) ++static void ++kfd_dbgmgr_uninitialize(struct kfd_dbgmgr *pmgr) + { +- BUG_ON(!pmgr); +- + kfree(pmgr->dbgdev); +- + pmgr->dbgdev = NULL; + pmgr->pasid = 0; + pmgr->dev = NULL; + } + +-void kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr) ++/*===========================================================================*/ ++ ++void ++kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr) + { + if (pmgr != NULL) { + kfd_dbgmgr_uninitialize(pmgr); + kfree(pmgr); ++ pmgr = NULL; + } + } + +-bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) ++/*===========================================================================*/ ++ ++bool ++kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) + { +- enum DBGDEV_TYPE type = DBGDEV_TYPE_DIQ; ++ DBGDEV_TYPE type = DBGDEV_TYPE_DIQ; + struct kfd_dbgmgr *new_buff; + + BUG_ON(pdev == NULL); + BUG_ON(!pdev->init_complete); + + new_buff = kfd_alloc_struct(new_buff); +- if (!new_buff) { +- pr_err("amdkfd: Failed to allocate dbgmgr instance\n"); ++ if (!new_buff) ++ { ++ dev_err(NULL, "Error! kfd: In func %s >> failed to allocate dbgmgr instance\n", __func__); + return false; + } + +@@ -79,7 +87,7 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) + new_buff->dev = pdev; + new_buff->dbgdev = kfd_alloc_struct(new_buff->dbgdev); + if (!new_buff->dbgdev) { +- pr_err("amdkfd: Failed to allocate dbgdev instance\n"); ++ dev_err(NULL, "Error! 
kfd: In func %s >> failed to allocate dbgdev\n", __func__); + kfree(new_buff); + return false; + } +@@ -94,75 +102,200 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) + return true; + } + +-long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p) ++/*===========================================================================*/ ++ ++long ++kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p) + { +- BUG_ON(!p || !pmgr || !pmgr->dbgdev); ++ long status = 0; + +- if (pmgr->pasid != 0) { +- pr_debug("H/W debugger is already active using pasid %d\n", +- pmgr->pasid); +- return -EBUSY; +- } ++ do { ++ ++ if ((pmgr == NULL) || (pmgr->dev == NULL) || (pmgr->dbgdev == NULL)) { ++ dev_info(NULL, "Error! kfd: In func %s >> Illegal pointers\n", __func__); ++ /* Invalid Pointer. */ ++ status = -EINVAL; ++ break; ++ } ++ if (pmgr->pasid != 0) { ++ /* HW debugger is already active. */ ++ status = -EBUSY; ++ break; ++ } ++ ++ /* remember pasid */ ++ ++ pmgr->pasid = p->pasid; ++ ++ /* provide the pqm for diq generation */ + +- /* remember pasid */ +- pmgr->pasid = p->pasid; ++ pmgr->dbgdev->pqm = &p->pqm; + +- /* provide the pqm for diq generation */ +- pmgr->dbgdev->pqm = &p->pqm; ++ /* activate the actual registering */ ++ /* todo: you should lock with the process mutex here */ ++ pmgr->dbgdev->dbgdev_register(pmgr->dbgdev); ++ /* todo: you should unlock with the process mutex here */ + +- /* activate the actual registering */ +- pmgr->dbgdev->dbgdev_register(pmgr->dbgdev); ++ } while (false); + +- return 0; ++ return status; + } + +-long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p) ++/* ========================================================================== */ ++ ++long ++kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p) + { +- BUG_ON(!p || !pmgr || !pmgr->dbgdev); + +- /* Is the requests coming from the already registered process? */ +- if (pmgr->pasid != p->pasid) { +- pr_debug("H/W debugger is not registered by calling pasid %d\n", +- p->pasid); +- return -EINVAL; +- } ++ long status = 0; + +- pmgr->dbgdev->dbgdev_unregister(pmgr->dbgdev); ++ do { + +- pmgr->pasid = 0; ++ if ((pmgr == NULL) || (pmgr->dev == NULL) ++ || (pmgr->dbgdev == NULL) || (p == NULL)) { ++ dev_info(NULL, "Error! kfd: In func %s >> Illegal pointers\n", __func__); ++ /* Invalid Pointer */ ++ status = -EINVAL; ++ break; ++ } ++ if (pmgr->pasid != p->pasid) { ++ /* Is the requests coming from the already registered process? */ ++ status = -EINVAL; ++ break; ++ } ++ ++ /* todo: you should lock with the process mutex here */ ++ ++ pmgr->dbgdev->dbgdev_unregister(pmgr->dbgdev); + +- return 0; ++ /* todo: you should unlock with the process mutex here */ ++ ++ pmgr->pasid = 0; ++ ++ } while (false); ++ ++ return status; + } + +-long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, +- struct dbg_wave_control_info *wac_info) ++/* =========================================================================== */ ++ ++long ++kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, struct dbg_wave_control_info *wac_info) + { +- BUG_ON(!pmgr || !pmgr->dbgdev || !wac_info); ++ long status = 0; + +- /* Is the requests coming from the already registered process? 
*/ +- if (pmgr->pasid != wac_info->process->pasid) { +- pr_debug("H/W debugger support was not registered for requester pasid %d\n", +- wac_info->process->pasid); +- return -EINVAL; +- } ++ dev_info(NULL, "kfd: In func %s\n", __func__); ++ ++ do { ++ ++ if ((pmgr == NULL) || (pmgr->dev == NULL) || (pmgr->dbgdev == NULL) || (wac_info == NULL) ++ || (wac_info->process == NULL)) { ++ /* Invalid Pointer */ ++ dev_info(NULL, "Error! kfd: In func %s >> Illegal pointers\n", __func__); ++ status = -EINVAL; ++ break; ++ } ++ /* Is the requests coming from the already registered process? */ ++ if (pmgr->pasid != wac_info->process->pasid) { ++ /* HW debugger support was not registered for requester process */ ++ status = -EINVAL; ++ break; ++ } ++ ++ status = (long) pmgr->dbgdev->dbgdev_wave_control(pmgr->dbgdev, wac_info); ++ ++ } while (false); ++ ++ return status; + +- return (long) pmgr->dbgdev->dbgdev_wave_control(pmgr->dbgdev, wac_info); + } + +-long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, +- struct dbg_address_watch_info *adw_info) ++/* =========================================================================== */ ++ ++long ++kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, struct dbg_address_watch_info *adw_info) + { +- BUG_ON(!pmgr || !pmgr->dbgdev || !adw_info); ++ long status = 0; + ++ dev_info(NULL, "kfd: In func %s\n", __func__); + +- /* Is the requests coming from the already registered process? */ +- if (pmgr->pasid != adw_info->process->pasid) { +- pr_debug("H/W debugger support was not registered for requester pasid %d\n", +- adw_info->process->pasid); +- return -EINVAL; +- } ++ do { ++ ++ if ((pmgr == NULL) || (pmgr->dev == NULL) || (pmgr->dbgdev == NULL) || (adw_info == NULL) ++ || (adw_info->process == NULL)) { ++ /* Invalid Pointer */ ++ dev_info(NULL, "Error! kfd: In func %s >> Illegal pointers\n", __func__); ++ status = -EINVAL; ++ break; ++ } ++ /* Is the requests coming from the already registered process? */ ++ if (pmgr->pasid != adw_info->process->pasid) { ++ /* HW debugger support was not registered for requester process */ ++ status = -EINVAL; ++ break; ++ } ++ ++ status = (long) pmgr->dbgdev->dbgdev_address_watch(pmgr->dbgdev, adw_info); ++ ++ } while (false); ++ ++ return status; + +- return (long) pmgr->dbgdev->dbgdev_address_watch(pmgr->dbgdev, +- adw_info); + } + ++ ++/* =========================================================================== */ ++/* ++ * Handle abnormal process termination ++ * if we are in the midst of a debug session, we should kill all pending waves ++ * of the debugged process and unregister the process from the Debugger. ++ */ ++long ++kfd_dbgmgr_abnormal_termination(struct kfd_dbgmgr *pmgr, struct kfd_process *process) ++{ ++ long status = 0; ++ struct dbg_wave_control_info wac_info; ++ ++ dev_info(NULL, "kfd: In func %s\n", __func__); ++ ++ do { ++ ++ if ((pmgr == NULL) || (pmgr->dev == NULL) || (pmgr->dbgdev == NULL)) { ++ /* Invalid Pointer */ ++ dev_info(NULL, "Error! 
kfd: In func %s >> Illegal pointers\n", __func__); ++ status = -EINVAL; ++ break; ++ } ++ /* first, we kill all the wavefronts of this process */ ++ ++ wac_info.process = process; ++ wac_info.mode = HSA_DBG_WAVEMODE_BROADCAST_PROCESS; ++ wac_info.operand = HSA_DBG_WAVEOP_KILL; ++ wac_info.trapId = 0x0; /* not used for the KILL */ ++ wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value = 0; /* not used for kill */ ++ wac_info.dbgWave_msg.MemoryVA = NULL; /* not used for kill */ ++ ++ status = (long) pmgr->dbgdev->dbgdev_wave_control(pmgr->dbgdev, &wac_info); ++ ++ if (status != 0) { ++ dev_info(NULL, "Error! kfd: In func %s: wave control failed, status is: %ld\n", __func__, status); ++ break; ++ } ++ if (pmgr->pasid == wac_info.process->pasid) { ++ /* if terminated process was registered for debug, then unregister it */ ++ status = kfd_dbgmgr_unregister(pmgr, process); ++ pmgr->pasid = 0; ++ } ++ if (status != 0) ++ dev_info(NULL, ++ "Error! kfd: In func %s: unregister failed, status is: %ld debugger can not be reused\n", ++ __func__, status); ++ ++ } while (false); ++ ++ return status; ++ ++} ++ ++ ++/*///////////////////////////////////////////////////////////////////////////////////////// */ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h +index 257a745..2b6484e 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h +@@ -26,252 +26,242 @@ + + #include "kfd_priv.h" + +-/* must align with hsakmttypes definition */ ++/* ++ * SQ_IND_CMD_CMD enum ++ */ ++ ++ ++/* must align with hsakmttypes definition. */ + #pragma pack(push, 4) + +-enum HSA_DBG_WAVEOP { +- HSA_DBG_WAVEOP_HALT = 1, /* Halts a wavefront */ +- HSA_DBG_WAVEOP_RESUME = 2, /* Resumes a wavefront */ +- HSA_DBG_WAVEOP_KILL = 3, /* Kills a wavefront */ +- HSA_DBG_WAVEOP_DEBUG = 4, /* Causes wavefront to enter +- debug mode */ +- HSA_DBG_WAVEOP_TRAP = 5, /* Causes wavefront to take +- a trap */ ++typedef enum _HSA_DBG_WAVEOP { ++ HSA_DBG_WAVEOP_HALT = 1, /* Halts a wavefront */ ++ HSA_DBG_WAVEOP_RESUME = 2, /* Resumes a wavefront */ ++ HSA_DBG_WAVEOP_KILL = 3, /* Kills a wavefront */ ++ HSA_DBG_WAVEOP_DEBUG = 4, /* Causes wavefront to enter debug mode */ ++ HSA_DBG_WAVEOP_TRAP = 5, /* Causes wavefront to take a trap */ + HSA_DBG_NUM_WAVEOP = 5, + HSA_DBG_MAX_WAVEOP = 0xFFFFFFFF +-}; ++} HSA_DBG_WAVEOP; + +-enum HSA_DBG_WAVEMODE { +- /* send command to a single wave */ +- HSA_DBG_WAVEMODE_SINGLE = 0, +- /* +- * Broadcast to all wavefronts of all processes is not +- * supported for HSA user mode +- */ +- +- /* send to waves within current process */ +- HSA_DBG_WAVEMODE_BROADCAST_PROCESS = 2, +- /* send to waves within current process on CU */ +- HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU = 3, ++typedef enum _HSA_DBG_WAVEMODE { ++ HSA_DBG_WAVEMODE_SINGLE = 0, /* send command to a single wave */ ++ /* Broadcast to all wavefronts of all processes is not supported for HSA user mode */ ++ HSA_DBG_WAVEMODE_BROADCAST_PROCESS = 2, /* send to waves within current process */ ++ HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU = 3, /* send to waves within current process on CU */ + HSA_DBG_NUM_WAVEMODE = 3, + HSA_DBG_MAX_WAVEMODE = 0xFFFFFFFF +-}; ++} HSA_DBG_WAVEMODE; + +-enum HSA_DBG_WAVEMSG_TYPE { ++typedef enum _HSA_DBG_WAVEMSG_TYPE { + HSA_DBG_WAVEMSG_AUTO = 0, + HSA_DBG_WAVEMSG_USER = 1, + HSA_DBG_WAVEMSG_ERROR = 2, + HSA_DBG_NUM_WAVEMSG, + HSA_DBG_MAX_WAVEMSG = 0xFFFFFFFF +-}; ++} HSA_DBG_WAVEMSG_TYPE; + +-enum HSA_DBG_WATCH_MODE { +- HSA_DBG_WATCH_READ = 0, /* 
Read operations only */ +- HSA_DBG_WATCH_NONREAD = 1, /* Write or Atomic operations only */ +- HSA_DBG_WATCH_ATOMIC = 2, /* Atomic Operations only */ +- HSA_DBG_WATCH_ALL = 3, /* Read, Write or Atomic operations */ ++typedef enum _HSA_DBG_WATCH_MODE { ++ HSA_DBG_WATCH_READ = 0, /* Read operations only */ ++ HSA_DBG_WATCH_NONREAD = 1, /* Write or Atomic operations only */ ++ HSA_DBG_WATCH_ATOMIC = 2, /* Atomic Operations only */ ++ HSA_DBG_WATCH_ALL = 3, /* Read, Write or Atomic operations */ + HSA_DBG_WATCH_NUM, + HSA_DBG_WATCH_SIZE = 0xFFFFFFFF +-}; ++} HSA_DBG_WATCH_MODE; + + /* This structure is hardware specific and may change in the future */ +-struct HsaDbgWaveMsgAMDGen2 { ++typedef struct _HsaDbgWaveMsgAMDGen2 { + union { +- struct ui32 { +- uint32_t UserData:8; /* user data */ +- uint32_t ShaderArray:1; /* Shader array */ +- uint32_t Priv:1; /* Privileged */ +- uint32_t Reserved0:4; /* This field is reserved, +- should be 0 */ +- uint32_t WaveId:4; /* wave id */ +- uint32_t SIMD:2; /* SIMD id */ +- uint32_t HSACU:4; /* Compute unit */ +- uint32_t ShaderEngine:2;/* Shader engine */ +- uint32_t MessageType:2; /* see HSA_DBG_WAVEMSG_TYPE */ +- uint32_t Reserved1:4; /* This field is reserved, +- should be 0 */ ++ struct { ++ uint32_t UserData:8; /* user data */ ++ uint32_t ShaderArray:1; /* Shader array */ ++ uint32_t Priv:1; /* Privileged */ ++ uint32_t Reserved0:4; /* This field is reserved, should be 0 */ ++ uint32_t WaveId:4; /* wave id */ ++ uint32_t SIMD:2; /* SIMD id */ ++ uint32_t HSACU:4; /* Compute unit */ ++ uint32_t ShaderEngine:2; /* Shader engine */ ++ uint32_t MessageType:2; /* see HSA_DBG_WAVEMSG_TYPE */ ++ uint32_t Reserved1:4; /* This field is reserved, should be 0 */ + } ui32; + uint32_t Value; + }; +- uint32_t Reserved2; +-}; + +-union HsaDbgWaveMessageAMD { +- struct HsaDbgWaveMsgAMDGen2 WaveMsgInfoGen2; +- /* for future HsaDbgWaveMsgAMDGen3; */ +-}; +- +-struct HsaDbgWaveMessage { +- void *MemoryVA; /* ptr to associated host-accessible data */ +- union HsaDbgWaveMessageAMD DbgWaveMsg; +-}; ++ uint32_t Reserved2; + +-/* +- * TODO: This definitions to be MOVED to kfd_event, once it is implemented. +- * +- * HSA sync primitive, Event and HW Exception notification API definitions. +- * The API functions allow the runtime to define a so-called sync-primitive, +- * a SW object combining a user-mode provided "syncvar" and a scheduler event +- * that can be signaled through a defined GPU interrupt. A syncvar is +- * a process virtual memory location of a certain size that can be accessed +- * by CPU and GPU shader code within the process to set and query the content +- * within that memory. The definition of the content is determined by the HSA +- * runtime and potentially GPU shader code interfacing with the HSA runtime. +- * The syncvar values may be commonly written through an PM4 WRITE_DATA packet +- * in the user mode instruction stream. The OS scheduler event is typically +- * associated and signaled by an interrupt issued by the GPU, but other HSA +- * system interrupt conditions from other HW (e.g. IOMMUv2) may be surfaced +- * by the KFD by this mechanism, too. 
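HsaDbgWaveMsgAMDGen2 is a union of bitfields over a raw 32-bit Value, so a debugger can decode an incoming wave message in a single assignment. A standalone decode sketch using the same field widths as the struct above; the sample value is made up:

#include <stdint.h>
#include <stdio.h>

/* Field widths copied from HsaDbgWaveMsgAMDGen2 above. */
union wave_msg {
        struct {
                uint32_t UserData:8;
                uint32_t ShaderArray:1;
                uint32_t Priv:1;
                uint32_t Reserved0:4;
                uint32_t WaveId:4;
                uint32_t SIMD:2;
                uint32_t HSACU:4;
                uint32_t ShaderEngine:2;
                uint32_t MessageType:2;
                uint32_t Reserved1:4;
        } ui32;
        uint32_t Value;
};

int main(void)
{
        union wave_msg msg = { .Value = 0x00A5007Fu };  /* arbitrary sample */

        printf("user data %u, wave %u, simd %u, cu %u, se %u, type %u\n",
               msg.ui32.UserData, msg.ui32.WaveId, msg.ui32.SIMD,
               msg.ui32.HSACU, msg.ui32.ShaderEngine, msg.ui32.MessageType);
        return 0;
}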
*/ +- +-/* these are the new definitions for events */ +-enum HSA_EVENTTYPE { +- HSA_EVENTTYPE_SIGNAL = 0, /* user-mode generated GPU signal */ +- HSA_EVENTTYPE_NODECHANGE = 1, /* HSA node change (attach/detach) */ +- HSA_EVENTTYPE_DEVICESTATECHANGE = 2, /* HSA device state change +- (start/stop) */ +- HSA_EVENTTYPE_HW_EXCEPTION = 3, /* GPU shader exception event */ +- HSA_EVENTTYPE_SYSTEM_EVENT = 4, /* GPU SYSCALL with parameter info */ +- HSA_EVENTTYPE_DEBUG_EVENT = 5, /* GPU signal for debugging */ +- HSA_EVENTTYPE_PROFILE_EVENT = 6,/* GPU signal for profiling */ +- HSA_EVENTTYPE_QUEUE_EVENT = 7, /* GPU signal queue idle state +- (EOP pm4) */ ++} HsaDbgWaveMsgAMDGen2; ++ ++typedef union _HsaDbgWaveMessageAMD { ++ HsaDbgWaveMsgAMDGen2 WaveMsgInfoGen2; ++ /* for future HsaDbgWaveMsgAMDGen3; */ ++} HsaDbgWaveMessageAMD; ++ ++typedef struct _HsaDbgWaveMessage { ++ void *MemoryVA; /* ptr to associated host-accessible data */ ++ HsaDbgWaveMessageAMD DbgWaveMsg; ++} HsaDbgWaveMessage; ++ ++/* TODO: This definitions to be MOVED to kfd_event, once it is implemented. ++ ++ HSA sync primitive, Event and HW Exception notification API definitions ++ The API functions allow the runtime to define a so-called sync-primitive, a SW object ++ combining a user-mode provided "syncvar" and a scheduler event that can be signaled ++ through a defined GPU interrupt. A syncvar is a process virtual memory location of ++ a certain size that can be accessed by CPU and GPU shader code within the process to set ++ and query the content within that memory. The definition of the content is determined by ++ the HSA runtime and potentially GPU shader code interfacing with the HSA runtime. ++ The syncvar values may be commonly written through an PM4 WRITE_DATA packet in the ++ user mode instruction stream. The OS scheduler event is typically associated and ++ signaled by an interrupt issued by the GPU, but other HSA system interrupt conditions ++ from other HW (e.g. IOMMUv2) may besurfaced by the KFD by this mechanism, too. */ ++ ++/* these are the new definitions for events */ ++ ++typedef enum _HSA_EVENTTYPE { ++ HSA_EVENTTYPE_SIGNAL = 0, /* /user-mode generated GPU signal */ ++ HSA_EVENTTYPE_NODECHANGE = 1, /* HSA node change (attach/detach) */ ++ HSA_EVENTTYPE_DEVICESTATECHANGE = 2, /* HSA device state change( start/stop ) */ ++ HSA_EVENTTYPE_HW_EXCEPTION = 3, /* GPU shader exception event */ ++ HSA_EVENTTYPE_SYSTEM_EVENT = 4, /* GPU SYSCALL with parameter info */ ++ HSA_EVENTTYPE_DEBUG_EVENT = 5, /* GPU signal for debugging */ ++ HSA_EVENTTYPE_PROFILE_EVENT = 6, /* GPU signal for profiling */ ++ HSA_EVENTTYPE_QUEUE_EVENT = 7, /* GPU signal queue idle state (EOP pm4) */ + /* ... 
*/ + HSA_EVENTTYPE_MAXID, + HSA_EVENTTYPE_TYPE_SIZE = 0xFFFFFFFF +-}; ++} HSA_EVENTTYPE; ++ ++typedef uint32_t HSA_EVENTID; + +-/* Sub-definitions for various event types: Syncvar */ +-struct HsaSyncVar { +- union SyncVar { +- void *UserData; /* pointer to user mode data */ +- uint64_t UserDataPtrValue; /* 64bit compatibility of value */ ++/* Subdefinitions for various event types: Syncvar */ ++ ++typedef struct _HsaSyncVar { ++ union { ++ void *UserData; /* pointer to user mode data */ ++ uint64_t UserDataPtrValue; /* 64bit compatibility of value */ + } SyncVar; + uint64_t SyncVarSize; +-}; ++} HsaSyncVar; + +-/* Sub-definitions for various event types: NodeChange */ ++/* ++ Subdefinitions for various event types: NodeChange ++*/ + +-enum HSA_EVENTTYPE_NODECHANGE_FLAGS { ++typedef enum _HSA_EVENTTYPE_NODECHANGE_FLAGS { + HSA_EVENTTYPE_NODECHANGE_ADD = 0, + HSA_EVENTTYPE_NODECHANGE_REMOVE = 1, + HSA_EVENTTYPE_NODECHANGE_SIZE = 0xFFFFFFFF +-}; ++} HSA_EVENTTYPE_NODECHANGE_FLAGS; + +-struct HsaNodeChange { +- /* HSA node added/removed on the platform */ +- enum HSA_EVENTTYPE_NODECHANGE_FLAGS Flags; +-}; ++typedef struct _HsaNodeChange { ++ HSA_EVENTTYPE_NODECHANGE_FLAGS Flags; /* HSA node added/removed on the platform */ ++} HsaNodeChange; ++ ++/* ++ Sub-definitions for various event types: DeviceStateChange ++*/ + +-/* Sub-definitions for various event types: DeviceStateChange */ +-enum HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS { +- /* device started (and available) */ +- HSA_EVENTTYPE_DEVICESTATUSCHANGE_START = 0, +- /* device stopped (i.e. unavailable) */ +- HSA_EVENTTYPE_DEVICESTATUSCHANGE_STOP = 1, ++typedef enum _HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS { ++ HSA_EVENTTYPE_DEVICESTATUSCHANGE_START = 0, /* device started (and available) */ ++ HSA_EVENTTYPE_DEVICESTATUSCHANGE_STOP = 1, /* device stopped (i.e. unavailable) */ + HSA_EVENTTYPE_DEVICESTATUSCHANGE_SIZE = 0xFFFFFFFF +-}; ++} HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS; + +-enum HSA_DEVICE { ++typedef enum _HSA_DEVICE { + HSA_DEVICE_CPU = 0, + HSA_DEVICE_GPU = 1, + MAX_HSA_DEVICE = 2 +-}; ++} HSA_DEVICE; + +-struct HsaDeviceStateChange { ++typedef struct _HsaDeviceStateChange { + uint32_t NodeId; /* F-NUMA node that contains the device */ +- enum HSA_DEVICE Device; /* device type: GPU or CPU */ +- enum HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS Flags; /* event flags */ +-}; ++ HSA_DEVICE Device; /* device type: GPU or CPU */ ++ HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS Flags; /* event flags */ ++} HsaDeviceStateChange; + +-struct HsaEventData { +- enum HSA_EVENTTYPE EventType; /* event type */ +- union EventData { +- /* +- * return data associated with HSA_EVENTTYPE_SIGNAL +- * and other events +- */ +- struct HsaSyncVar SyncVar; ++typedef struct _HsaEventData { ++ HSA_EVENTTYPE EventType; /* event type */ ++ union { ++ /* return data associated with HSA_EVENTTYPE_SIGNAL and other events */ ++ HsaSyncVar SyncVar; + + /* data associated with HSA_EVENTTYPE_NODE_CHANGE */ +- struct HsaNodeChange NodeChangeState; ++ HsaNodeChange NodeChangeState; + + /* data associated with HSA_EVENTTYPE_DEVICE_STATE_CHANGE */ +- struct HsaDeviceStateChange DeviceState; ++ HsaDeviceStateChange DeviceState; + } EventData; + +- /* the following data entries are internal to the KFD & thunk itself */ ++ /* the following data entries are internal to the KFD & thunk itself. 
*/ + +- /* internal thunk store for Event data (OsEventHandle) */ +- uint64_t HWData1; +- /* internal thunk store for Event data (HWAddress) */ +- uint64_t HWData2; +- /* internal thunk store for Event data (HWData) */ +- uint32_t HWData3; +-}; ++ uint64_t HWData1; /* internal thunk store for Event data (OsEventHandle) */ ++ uint64_t HWData2; /* internal thunk store for Event data (HWAddress) */ ++ uint32_t HWData3; /* internal thunk store for Event data (HWData) */ ++} HsaEventData; + +-struct HsaEventDescriptor { +- /* event type to allocate */ +- enum HSA_EVENTTYPE EventType; +- /* H-NUMA node containing GPU device that is event source */ +- uint32_t NodeId; +- /* pointer to user mode syncvar data, syncvar->UserDataPtrValue +- * may be NULL +- */ +- struct HsaSyncVar SyncVar; +-}; ++typedef struct _HsaEventDescriptor { ++ HSA_EVENTTYPE EventType; /* event type to allocate */ ++ uint32_t NodeId; /* H-NUMA node containing GPU device that is event source */ ++ HsaSyncVar SyncVar; /* pointer to user mode syncvar data, syncvar->UserDataPtrValue may be NULL */ ++} HsaEventDescriptor; ++ ++typedef struct _HsaEvent { ++ HSA_EVENTID EventId; ++ HsaEventData EventData; ++} HsaEvent; + +-struct HsaEvent { +- uint32_t EventId; +- struct HsaEventData EventData; +-}; + + #pragma pack(pop) + +-enum DBGDEV_TYPE { ++typedef enum _DBGDEV_TYPE { + DBGDEV_TYPE_ILLEGAL = 0, + DBGDEV_TYPE_NODIQ = 1, + DBGDEV_TYPE_DIQ = 2, + DBGDEV_TYPE_TEST = 3 +-}; ++} DBGDEV_TYPE; + + struct dbg_address_watch_info { + struct kfd_process *process; +- enum HSA_DBG_WATCH_MODE *watch_mode; ++ HSA_DBG_WATCH_MODE *watch_mode; + uint64_t *watch_address; + uint64_t *watch_mask; +- struct HsaEvent *watch_event; ++ HsaEvent *watch_event; + uint32_t num_watch_points; + }; + + struct dbg_wave_control_info { + struct kfd_process *process; + uint32_t trapId; +- enum HSA_DBG_WAVEOP operand; +- enum HSA_DBG_WAVEMODE mode; +- struct HsaDbgWaveMessage dbgWave_msg; ++ HSA_DBG_WAVEOP operand; ++ HSA_DBG_WAVEMODE mode; ++ HsaDbgWaveMessage dbgWave_msg; + }; + + struct kfd_dbgdev { + + /* The device that owns this data. */ ++ + struct kfd_dev *dev; + + /* kernel queue for DIQ */ ++ + struct kernel_queue *kq; + + /* a pointer to the pqm of the calling process */ ++ + struct process_queue_manager *pqm; + + /* type of debug device ( DIQ, non DIQ, etc. 
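To ask the KFD for an event, the thunk fills an HsaEventDescriptor with the event type, the H-NUMA node of the GPU that will signal it, and a syncvar whose user pointer may be NULL. A reduced sketch of that setup; the typedefs are cut down from the definitions above and the node id is an arbitrary example:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef enum { EVENTTYPE_SIGNAL = 0 } eventtype_t;  /* HSA_EVENTTYPE_SIGNAL */

typedef struct {
        union {
                void    *UserData;          /* pointer to user mode data */
                uint64_t UserDataPtrValue;  /* 64bit value compatibility */
        } SyncVar;
        uint64_t SyncVarSize;
} syncvar_t;

typedef struct {
        eventtype_t EventType;
        uint32_t    NodeId;    /* H-NUMA node of the signalling GPU */
        syncvar_t   SyncVar;
} event_descriptor_t;

int main(void)
{
        event_descriptor_t desc = {
                .EventType = EVENTTYPE_SIGNAL,
                .NodeId = 1,                 /* example node */
                .SyncVar = { { NULL }, 0 },  /* no user syncvar attached */
        };

        printf("type=%d node=%u size=%llu\n", desc.EventType, desc.NodeId,
               (unsigned long long)desc.SyncVar.SyncVarSize);
        return 0;
}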
) */ +- enum DBGDEV_TYPE type; ++ ++ DBGDEV_TYPE type; + + /* virtualized function pointers to device dbg */ ++ + int (*dbgdev_register)(struct kfd_dbgdev *dbgdev); + int (*dbgdev_unregister)(struct kfd_dbgdev *dbgdev); +- int (*dbgdev_address_watch)(struct kfd_dbgdev *dbgdev, +- struct dbg_address_watch_info *adw_info); +- int (*dbgdev_wave_control)(struct kfd_dbgdev *dbgdev, +- struct dbg_wave_control_info *wac_info); ++ int (*dbgdev_address_watch)(struct kfd_dbgdev *dbgdev, struct dbg_address_watch_info *adw_info); ++ int (*dbgdev_wave_control)(struct kfd_dbgdev *dbgdev, struct dbg_wave_control_info *wac_info); + + }; + +@@ -282,13 +272,12 @@ struct kfd_dbgmgr { + }; + + /* prototypes for debug manager functions */ +-struct mutex *kfd_get_dbgmgr_mutex(void); ++struct mutex *get_dbgmgr_mutex(void); + void kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr); + bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev); + long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p); + long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p); +-long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, +- struct dbg_wave_control_info *wac_info); +-long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, +- struct dbg_address_watch_info *adw_info); +-#endif /* KFD_DBGMGR_H_ */ ++long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, struct dbg_wave_control_info *wac_info); ++long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, struct dbg_address_watch_info *adw_info); ++long kfd_dbgmgr_abnormal_termination(struct kfd_dbgmgr *pmgr, struct kfd_process *process); ++#endif /* KFD_DBGMGR_H_ */ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c +index 3f95f7c..20592ba 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c +@@ -24,9 +24,11 @@ + #include <linux/bsearch.h> + #include <linux/pci.h> + #include <linux/slab.h> ++#include <linux/highmem.h> + #include "kfd_priv.h" + #include "kfd_device_queue_manager.h" + #include "kfd_pm4_headers.h" ++#include "cwsr_trap_handler_carrizo.h" + + #define MQD_SIZE_ALIGNED 768 + +@@ -38,7 +40,8 @@ static const struct kfd_device_info kaveri_device_info = { + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, +- .mqd_size_aligned = MQD_SIZE_ALIGNED ++ .mqd_size_aligned = MQD_SIZE_ALIGNED, ++ .is_need_iommu_device = true + }; + + static const struct kfd_device_info carrizo_device_info = { +@@ -49,14 +52,50 @@ static const struct kfd_device_info carrizo_device_info = { + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, +- .mqd_size_aligned = MQD_SIZE_ALIGNED ++ .mqd_size_aligned = MQD_SIZE_ALIGNED, ++ .is_need_iommu_device = true + }; + ++static const struct kfd_device_info tonga_device_info = { ++ .asic_family = CHIP_TONGA, ++ .max_pasid_bits = 16, ++ .max_no_of_hqd = 24, ++ .ih_ring_entry_size = 4 * sizeof(uint32_t), ++ .event_interrupt_class = &event_interrupt_class_cik, ++ .num_of_watch_points = 4, ++ .mqd_size_aligned = MQD_SIZE_ALIGNED, ++ .is_need_iommu_device = false ++}; ++ ++static const struct kfd_device_info fiji_device_info = { ++ .asic_family = CHIP_FIJI, ++ .max_pasid_bits = 16, ++ .max_no_of_hqd = 24, ++ .ih_ring_entry_size = 4 * sizeof(uint32_t), ++ .event_interrupt_class = &event_interrupt_class_cik, ++ .num_of_watch_points = 4, ++ .mqd_size_aligned = MQD_SIZE_ALIGNED, ++ 
.is_need_iommu_device = false ++} ++; + struct kfd_deviceid { + unsigned short did; + const struct kfd_device_info *device_info; + }; + ++/* ++ * // ++// TONGA/AMETHYST device IDs (performance segment) ++// ++#define DEVICE_ID_VI_TONGA_P_6920 0x6920 // unfused ++#define DEVICE_ID_VI_TONGA_P_6921 0x6921 // Amethyst XT ++#define DEVICE_ID_VI_TONGA_P_6928 0x6928 // Tonga GL XT ++#define DEVICE_ID_VI_TONGA_P_692B 0x692B // Tonga GL PRO ++#define DEVICE_ID_VI_TONGA_P_692F 0x692F // Tonga GL PRO VF ++#define DEVICE_ID_VI_TONGA_P_6938 0x6938 // Tonga XT ++#define DEVICE_ID_VI_TONGA_P_6939 0x6939 // Tonga PRO ++ * ++ */ + /* Please keep this sorted by increasing device id. */ + static const struct kfd_deviceid supported_devices[] = { + { 0x1304, &kaveri_device_info }, /* Kaveri */ +@@ -85,13 +124,23 @@ static const struct kfd_deviceid supported_devices[] = { + { 0x9874, &carrizo_device_info }, /* Carrizo */ + { 0x9875, &carrizo_device_info }, /* Carrizo */ + { 0x9876, &carrizo_device_info }, /* Carrizo */ +- { 0x9877, &carrizo_device_info } /* Carrizo */ ++ { 0x9877, &carrizo_device_info }, /* Carrizo */ ++ { 0x6920, &tonga_device_info }, /* Tonga */ ++ { 0x6921, &tonga_device_info }, /* Tonga */ ++ { 0x6928, &tonga_device_info }, /* Tonga */ ++ { 0x692B, &tonga_device_info }, /* Tonga */ ++ { 0x692F, &tonga_device_info }, /* Tonga */ ++ { 0x6938, &tonga_device_info }, /* Tonga */ ++ { 0x6939, &tonga_device_info }, /* Tonga */ ++ { 0x7300, &fiji_device_info } /* Fiji */ + }; + + static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, + unsigned int chunk_size); + static void kfd_gtt_sa_fini(struct kfd_dev *kfd); + ++static int kfd_resume(struct kfd_dev *kfd); ++ + static const struct kfd_device_info *lookup_device_info(unsigned short did) + { + size_t i; +@@ -117,6 +166,8 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, + if (!device_info) + return NULL; + ++ BUG_ON(!f2g); ++ + kfd = kzalloc(sizeof(*kfd), GFP_KERNEL); + if (!kfd) + return NULL; +@@ -170,15 +221,8 @@ static bool device_iommu_pasid_init(struct kfd_dev *kfd) + pasid_limit, + kfd->doorbell_process_limit - 1); + +- err = amd_iommu_init_device(kfd->pdev, pasid_limit); +- if (err < 0) { +- dev_err(kfd_device, "error initializing iommu device\n"); +- return false; +- } +- + if (!kfd_set_pasid_limit(pasid_limit)) { + dev_err(kfd_device, "error setting pasid limit\n"); +- amd_iommu_free_device(kfd->pdev); + return false; + } + +@@ -219,13 +263,81 @@ static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid, + return AMD_IOMMU_INV_PRI_RSP_INVALID; + } + ++static int kfd_cwsr_init(struct kfd_dev *kfd) ++{ ++ /* ++ * Initialize the CWSR required memory for TBA and TMA ++ * only support CWSR on VI and up with FW version >=625. 
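lookup_device_info is a linear scan of the supported_devices table that returns the kfd_device_info for a PCI device id, or NULL for unsupported parts, which makes kgd2kfd_probe bail out. The same pattern in miniature, with a table trimmed to three of the ids listed above:

#include <stddef.h>
#include <stdio.h>

struct device_info { const char *asic; };

static const struct device_info kaveri = { "Kaveri" };
static const struct device_info tonga  = { "Tonga"  };
static const struct device_info fiji   = { "Fiji"   };

static const struct { unsigned short did; const struct device_info *info; }
supported[] = {
        { 0x1304, &kaveri },
        { 0x6920, &tonga  },
        { 0x7300, &fiji   },
};

static const struct device_info *lookup(unsigned short did)
{
        size_t i;

        for (i = 0; i < sizeof(supported) / sizeof(supported[0]); i++)
                if (supported[i].did == did)
                        return supported[i].info;
        return NULL;   /* unsupported device: probe gives up */
}

int main(void)
{
        const struct device_info *info = lookup(0x6920);

        printf("%s\n", info ? info->asic : "unsupported");
        return 0;
}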
++ */ ++ if (cwsr_enable && ++ (kfd->mec_fw_version >= KFD_CWSR_CZ_FW_VER)) { ++ void *cwsr_addr = NULL; ++ unsigned int size = sizeof(cwsr_trap_carrizo_hex); ++ ++ if (size > PAGE_SIZE) { ++ pr_err("amdkfd: wrong CWSR ISA size.\n"); ++ return -EINVAL; ++ } ++ kfd->cwsr_size = ++ ALIGN(size, PAGE_SIZE) + PAGE_SIZE; ++ kfd->cwsr_pages = alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, ++ get_order(kfd->cwsr_size)); ++ if (!kfd->cwsr_pages) { ++ pr_err("amdkfd: error alloc CWSR isa memory.\n"); ++ return -ENOMEM; ++ } ++ /*Only first page used for cwsr ISA code */ ++ cwsr_addr = kmap(kfd->cwsr_pages); ++ memset(cwsr_addr, 0, PAGE_SIZE); ++ memcpy(cwsr_addr, cwsr_trap_carrizo_hex, size); ++ kunmap(kfd->cwsr_pages); ++ kfd->tma_offset = ALIGN(size, PAGE_SIZE); ++ kfd->cwsr_enabled = true; ++ dev_info(kfd_device, ++ "Reserved %d pages for cwsr.\n", ++ (kfd->cwsr_size >> PAGE_SHIFT)); ++ } ++ ++ return 0; ++} ++ ++static void kfd_cwsr_fini(struct kfd_dev *kfd) ++{ ++ if (kfd->cwsr_pages) ++ __free_pages(kfd->cwsr_pages, get_order(kfd->cwsr_size)); ++} ++ + bool kgd2kfd_device_init(struct kfd_dev *kfd, + const struct kgd2kfd_shared_resources *gpu_resources) + { + unsigned int size; ++ unsigned int vmid_bitmap_kfd, vmid_num_kfd; ++ ++ kfd->mec_fw_version = kfd->kfd2kgd->get_fw_version(kfd->kgd, ++ KGD_ENGINE_MEC1); + + kfd->shared_resources = *gpu_resources; + ++ vmid_bitmap_kfd = kfd->shared_resources.compute_vmid_bitmap; ++ kfd->vm_info.first_vmid_kfd = ffs(vmid_bitmap_kfd) - 1; ++ kfd->vm_info.last_vmid_kfd = fls(vmid_bitmap_kfd) - 1; ++ vmid_num_kfd = kfd->vm_info.last_vmid_kfd ++ - kfd->vm_info.first_vmid_kfd + 1; ++ kfd->vm_info.vmid_num_kfd = vmid_num_kfd; ++ ++ /* If MEC firmware is too old, turn off hws multiple process mapping */ ++ if (kfd->mec_fw_version < KFD_MULTI_PROC_MAPPING_HWS_SUPPORT) ++ kfd->max_proc_per_quantum = 0; ++ /* Verify module parameters regarding mapped process number*/ ++ else if ((hws_max_conc_proc < 0) ++ || (hws_max_conc_proc > vmid_num_kfd)) { ++ dev_err(kfd_device, ++ "hws_max_conc_proc (%d) must be between 0 and %d, use %d instead\n", ++ hws_max_conc_proc, vmid_num_kfd, vmid_num_kfd); ++ kfd->max_proc_per_quantum = vmid_num_kfd; ++ } else ++ kfd->max_proc_per_quantum = hws_max_conc_proc; ++ + /* calculate max size of mqds needed for queues */ + size = max_num_of_queues_per_device * + kfd->device_info->mqd_size_aligned; +@@ -280,16 +392,6 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, + goto kfd_interrupt_error; + } + +- if (!device_iommu_pasid_init(kfd)) { +- dev_err(kfd_device, +- "Error initializing iommuv2 for device (%x:%x)\n", +- kfd->pdev->vendor, kfd->pdev->device); +- goto device_iommu_pasid_error; +- } +- amd_iommu_set_invalidate_ctx_cb(kfd->pdev, +- iommu_pasid_shutdown_callback); +- amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb); +- + kfd->dqm = device_queue_manager_init(kfd); + if (!kfd->dqm) { + dev_err(kfd_device, +@@ -298,13 +400,21 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, + goto device_queue_manager_error; + } + +- if (kfd->dqm->ops.start(kfd->dqm) != 0) { +- dev_err(kfd_device, +- "Error starting queuen manager for device (%x:%x)\n", +- kfd->pdev->vendor, kfd->pdev->device); +- goto dqm_start_error; ++ if (kfd->device_info->is_need_iommu_device) { ++ if (!device_iommu_pasid_init(kfd)) { ++ dev_err(kfd_device, ++ "Error initializing iommuv2 for device (%x:%x)\n", ++ kfd->pdev->vendor, kfd->pdev->device); ++ goto device_iommu_pasid_error; ++ } + } + ++ if (kfd_cwsr_init(kfd)) ++ goto device_iommu_pasid_error; ++ ++ if 
(kfd_resume(kfd)) ++ goto kfd_resume_error; ++ + kfd->dbgmgr = NULL; + + kfd->init_complete = true; +@@ -316,11 +426,11 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, + + goto out; + +-dqm_start_error: ++kfd_resume_error: ++ kfd_cwsr_fini(kfd); ++device_iommu_pasid_error: + device_queue_manager_uninit(kfd->dqm); + device_queue_manager_error: +- amd_iommu_free_device(kfd->pdev); +-device_iommu_pasid_error: + kfd_interrupt_exit(kfd); + kfd_interrupt_error: + kfd_topology_remove_device(kfd); +@@ -338,8 +448,9 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, + void kgd2kfd_device_exit(struct kfd_dev *kfd) + { + if (kfd->init_complete) { ++ kgd2kfd_suspend(kfd); ++ kfd_cwsr_fini(kfd); + device_queue_manager_uninit(kfd->dqm); +- amd_iommu_free_device(kfd->pdev); + kfd_interrupt_exit(kfd); + kfd_topology_remove_device(kfd); + kfd_gtt_sa_fini(kfd); +@@ -355,32 +466,68 @@ void kgd2kfd_suspend(struct kfd_dev *kfd) + + if (kfd->init_complete) { + kfd->dqm->ops.stop(kfd->dqm); +- amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); +- amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); +- amd_iommu_free_device(kfd->pdev); ++ if (kfd->device_info->is_need_iommu_device) { ++ amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); ++ amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); ++ amd_iommu_free_device(kfd->pdev); ++ } + } + } + +-int kgd2kfd_resume(struct kfd_dev *kfd) ++int kgd2kfd_evict_bo(struct kfd_dev *dev, void *mem) + { +- unsigned int pasid_limit; +- int err; ++ return evict_bo(dev, mem); ++} + ++int kgd2kfd_restore(struct kfd_dev *kfd) ++{ ++ return restore(kfd); ++} ++ ++int kgd2kfd_resume(struct kfd_dev *kfd) ++{ + BUG_ON(kfd == NULL); + +- pasid_limit = kfd_get_pasid_limit(); ++ if (!kfd->init_complete) ++ return 0; ++ ++ return kfd_resume(kfd); ++ ++} ++ ++static int kfd_resume(struct kfd_dev *kfd) ++{ ++ int err = 0; ++ ++ if (kfd->device_info->is_need_iommu_device) { ++ unsigned int pasid_limit = kfd_get_pasid_limit(); + +- if (kfd->init_complete) { + err = amd_iommu_init_device(kfd->pdev, pasid_limit); +- if (err < 0) ++ if (err) + return -ENXIO; + amd_iommu_set_invalidate_ctx_cb(kfd->pdev, +- iommu_pasid_shutdown_callback); +- amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb); +- kfd->dqm->ops.start(kfd->dqm); ++ iommu_pasid_shutdown_callback); ++ amd_iommu_set_invalid_ppr_cb(kfd->pdev, ++ iommu_invalid_ppr_cb); + } + +- return 0; ++ err = kfd->dqm->ops.start(kfd->dqm); ++ if (err) { ++ dev_err(kfd_device, ++ "Error starting queue manager for device (%x:%x)\n", ++ kfd->pdev->vendor, kfd->pdev->device); ++ goto dqm_start_error; ++ } ++ ++ kfd->kfd2kgd->write_config_static_mem(kfd->kgd, true, 1, 3, 0); ++ ++ return err; ++ ++dqm_start_error: ++ if (kfd->device_info->is_need_iommu_device) ++ amd_iommu_free_device(kfd->pdev); ++ ++ return err; + } + + /* This is called directly from KGD at ISR. */ +@@ -399,6 +546,58 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) + spin_unlock(&kfd->interrupt_lock); + } + ++int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm) ++{ ++ struct kfd_process *p; ++ struct kfd_process_device *pdd; ++ int r; ++ ++ BUG_ON(kfd == NULL); ++ if (!kfd->init_complete) ++ return 0; ++ ++ /* Because we are called from arbitrary context (workqueue) as opposed ++ * to process context, kfd_process could attempt to exit while we are ++ * running so the lookup function returns a read-locked process. 
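kgd2kfd_device_init derives the KFD VMID range from the compute_vmid_bitmap handed over by the kgd: first_vmid_kfd is ffs() - 1, last_vmid_kfd is fls() - 1, and the count between them sizes the vmid allocator and bounds the wave-reset scan. A userspace check of that arithmetic, substituting the GCC builtins __builtin_ffs and __builtin_clz for the kernel's ffs()/fls(); the sample bitmap is illustrative:

#include <stdio.h>

int main(void)
{
        unsigned int bitmap = 0xFF00;            /* compute VMIDs 8..15 */
        int first = __builtin_ffs(bitmap) - 1;   /* kernel: ffs() - 1 */
        int last  = 31 - __builtin_clz(bitmap);  /* kernel: fls() - 1 */
        int num   = last - first + 1;

        printf("first_vmid_kfd=%d last_vmid_kfd=%d vmid_num_kfd=%d\n",
               first, last, num);                /* prints 8, 15, 8 */
        return 0;
}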
*/ ++ p = kfd_lookup_process_by_mm(mm); ++ if (!p) ++ return -ENODEV; ++ ++ r = -ENODEV; ++ pdd = kfd_get_process_device_data(kfd, p); ++ if (pdd) ++ r = process_evict_queues(kfd->dqm, &pdd->qpd); ++ ++ up_read(&p->lock); ++ return r; ++} ++ ++int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm) ++{ ++ struct kfd_process *p; ++ struct kfd_process_device *pdd; ++ int r; ++ ++ BUG_ON(kfd == NULL); ++ if (!kfd->init_complete) ++ return 0; ++ ++ /* Because we are called from arbitrary context (workqueue) as opposed ++ * to process context, kfd_process could attempt to exit while we are ++ * running so the lookup function returns a read-locked process. */ ++ p = kfd_lookup_process_by_mm(mm); ++ if (!p) ++ return -ENODEV; ++ ++ r = -ENODEV; ++ pdd = kfd_get_process_device_data(kfd, p); ++ if (pdd) ++ r = process_restore_queues(kfd->dqm, &pdd->qpd); ++ ++ up_read(&p->lock); ++ return r; ++} ++ + static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, + unsigned int chunk_size) + { +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +index 42de22b..e123390 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +@@ -44,9 +44,10 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd); + +-static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock); +-static int destroy_queues_cpsch(struct device_queue_manager *dqm, +- bool preempt_static_queues, bool lock); ++static int execute_queues_cpsch(struct device_queue_manager *dqm); ++static int unmap_queues_cpsch(struct device_queue_manager *dqm, ++ enum kfd_unmap_queues_filter filter, ++ uint32_t filter_param, bool reset); + + static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, +@@ -116,11 +117,11 @@ static int allocate_vmid(struct device_queue_manager *dqm, + if (dqm->vmid_bitmap == 0) + return -ENOMEM; + +- bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap, CIK_VMID_NUM); ++ bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap, ++ dqm->dev->vm_info.vmid_num_kfd); + clear_bit(bit, (unsigned long *)&dqm->vmid_bitmap); + +- /* Kaveri kfd vmid's starts from vmid 8 */ +- allocated_vmid = bit + KFD_VMID_START_OFFSET; ++ allocated_vmid = bit + dqm->dev->vm_info.first_vmid_kfd; + pr_debug("kfd: vmid allocation %d\n", allocated_vmid); + qpd->vmid = allocated_vmid; + q->properties.vmid = allocated_vmid; +@@ -128,6 +129,11 @@ static int allocate_vmid(struct device_queue_manager *dqm, + set_pasid_vmid_mapping(dqm, q->process->pasid, q->properties.vmid); + program_sh_mem_settings(dqm, qpd); + ++ dqm->dev->kfd2kgd->set_vm_context_page_table_base(dqm->dev->kgd, ++ allocated_vmid, ++ qpd->page_table_base); ++ /*invalidate the VM context after pasid and vmid mapping is set up*/ ++ radeon_flush_tlb(dqm->dev, qpd->pqm->process->pasid); + return 0; + } + +@@ -135,7 +141,7 @@ static void deallocate_vmid(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) + { +- int bit = qpd->vmid - KFD_VMID_START_OFFSET; ++ int bit = qpd->vmid - dqm->dev->vm_info.first_vmid_kfd; + + /* Release the vmid mapping */ + set_pasid_vmid_mapping(dqm, 0, qpd->vmid); +@@ -175,6 +181,14 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, + } + *allocated_vmid = qpd->vmid; + q->properties.vmid = qpd->vmid; ++ /* ++ * Eviction state logic: we 
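allocate_vmid hands out the lowest free bit of dqm->vmid_bitmap and offsets it by first_vmid_kfd, so the bitmap stays zero-based while the hardware VMIDs start wherever the kgd placed them; deallocate_vmid reverses the offset. The same bookkeeping in plain C, with GCC builtins standing in for find_first_bit/clear_bit and an assumed offset of 8:

#include <errno.h>
#include <stdio.h>

static unsigned int vmid_bitmap = 0xFF;   /* vmid_num_kfd = 8 free slots */
static const int first_vmid_kfd = 8;      /* example offset from the kgd */

static int allocate_vmid(void)
{
        int bit;

        if (vmid_bitmap == 0)
                return -ENOMEM;                  /* all VMIDs in use */
        bit = __builtin_ffs(vmid_bitmap) - 1;    /* kernel: find_first_bit */
        vmid_bitmap &= ~(1u << bit);             /* kernel: clear_bit */
        return bit + first_vmid_kfd;
}

static void deallocate_vmid(int vmid)
{
        vmid_bitmap |= 1u << (vmid - first_vmid_kfd);   /* release the bit */
}

int main(void)
{
        int a = allocate_vmid(), b = allocate_vmid();

        printf("allocated %d and %d\n", a, b);          /* 8 and 9 */
        deallocate_vmid(a);
        printf("reallocated %d\n", allocate_vmid());    /* 8 again */
        return 0;
}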
only mark active queues as evicted ++ * to avoid the overhead of restoring inactive queues later ++ */ ++ if (qpd->evicted) ++ q->properties.is_evicted = (q->properties.queue_size > 0 && ++ q->properties.queue_percent > 0 && ++ q->properties.queue_address != 0); + + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) + retval = create_compute_queue_nocpsch(dqm, q, qpd); +@@ -281,8 +295,12 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, + q->pipe, + q->queue); + ++ dqm->dev->kfd2kgd->alloc_memory_of_scratch( ++ dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid); ++ + retval = mqd->load_mqd(mqd, q->mqd, q->pipe, +- q->queue, (uint32_t __user *) q->properties.write_ptr); ++ q->queue, (uint32_t __user *) q->properties.write_ptr, ++ qpd->page_table_base); + if (retval != 0) { + deallocate_hqd(dqm, q); + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); +@@ -362,34 +380,56 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) + { + int retval; + struct mqd_manager *mqd; ++ struct kfd_process_device *pdd; ++ + bool prev_active = false; + + BUG_ON(!dqm || !q || !q->mqd); + + mutex_lock(&dqm->lock); ++ ++ pdd = kfd_get_process_device_data(q->device, q->process); ++ if (!pdd) { ++ mutex_unlock(&dqm->lock); ++ return -ENODEV; ++ } + mqd = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + if (mqd == NULL) { + mutex_unlock(&dqm->lock); + return -ENOMEM; + } ++ /* ++ * Eviction state logic: we only mark active queues as evicted ++ * to avoid the overhead of restoring inactive queues later ++ */ ++ if (pdd->qpd.evicted > 0) ++ q->properties.is_evicted = (q->properties.queue_size > 0 && ++ q->properties.queue_percent > 0 && ++ q->properties.queue_address != 0); + ++ /* save previous activity state for counters */ + if (q->properties.is_active) + prev_active = true; + +- /* +- * +- * check active state vs. the previous state +- * and modify counter accordingly +- */ ++ + retval = mqd->update_mqd(mqd, q->mqd, &q->properties); ++ if (sched_policy == KFD_SCHED_POLICY_NO_HWS && ++ q->properties.type == KFD_QUEUE_TYPE_COMPUTE) ++ retval = mqd->load_mqd(mqd, q->mqd, q->pipe, ++ q->queue, ++ (uint32_t __user *)q->properties.write_ptr, 0); ++ /* ++ * check active state vs. 
the previous state ++ * and modify counter accordingly ++ */ + if ((q->properties.is_active) && (!prev_active)) + dqm->queue_count++; + else if ((!q->properties.is_active) && (prev_active)) + dqm->queue_count--; + + if (sched_policy != KFD_SCHED_POLICY_NO_HWS) +- retval = execute_queues_cpsch(dqm, false); ++ retval = execute_queues_cpsch(dqm); + + mutex_unlock(&dqm->lock); + return retval; +@@ -415,15 +455,115 @@ static struct mqd_manager *get_mqd_manager_nocpsch( + return mqd; + } + ++int process_evict_queues(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd) ++{ ++ struct queue *q, *next; ++ struct mqd_manager *mqd; ++ int retval = 0; ++ ++ BUG_ON(!dqm || !qpd); ++ ++ mutex_lock(&dqm->lock); ++ if (qpd->evicted++ > 0) { /* already evicted, do nothing */ ++ mutex_unlock(&dqm->lock); ++ return 0; ++ } ++ /* unactivate all active queues on the qpd */ ++ list_for_each_entry_safe(q, next, &qpd->queues_list, list) { ++ mqd = dqm->ops.get_mqd_manager(dqm, ++ get_mqd_type_from_queue_type(q->properties.type)); ++ if (!mqd) { /* should not be here */ ++ BUG(); ++ continue; ++ } ++ /* if the queue is not active anyway, it is not evicted */ ++ if (q->properties.is_active == true) ++ q->properties.is_evicted = true; ++ ++ retval = mqd->update_mqd(mqd, q->mqd, &q->properties); ++ if (sched_policy == KFD_SCHED_POLICY_NO_HWS && ++ q->properties.type == KFD_QUEUE_TYPE_COMPUTE) ++ retval = mqd->load_mqd(mqd, q->mqd, q->pipe, ++ q->queue, ++ (uint32_t __user *)q->properties.write_ptr, 0); ++ if (q->properties.is_evicted) ++ dqm->queue_count--; ++ } ++ if (sched_policy != KFD_SCHED_POLICY_NO_HWS) ++ retval = execute_queues_cpsch(dqm); ++ ++ mutex_unlock(&dqm->lock); ++ return retval; ++ ++} ++ ++int process_restore_queues(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd) ++{ ++ struct queue *q, *next; ++ struct mqd_manager *mqd; ++ int retval = 0; ++ ++ ++ BUG_ON(!dqm || !qpd); ++ ++ mutex_lock(&dqm->lock); ++ if (qpd->evicted == 0) { /* already restored, do nothing */ ++ mutex_unlock(&dqm->lock); ++ return 0; ++ } ++ ++ if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */ ++ qpd->evicted--; ++ mutex_unlock(&dqm->lock); ++ return 0; ++ } ++ ++ /* activate all active queues on the qpd */ ++ list_for_each_entry_safe(q, next, &qpd->queues_list, list) { ++ mqd = dqm->ops.get_mqd_manager(dqm, ++ get_mqd_type_from_queue_type(q->properties.type)); ++ if (!mqd) { /* should not be here */ ++ BUG(); ++ continue; ++ } ++ if (q->properties.is_evicted) { ++ q->properties.is_evicted = false; ++ retval = mqd->update_mqd(mqd, q->mqd, &q->properties); ++ if (sched_policy == KFD_SCHED_POLICY_NO_HWS && ++ q->properties.type == KFD_QUEUE_TYPE_COMPUTE) ++ retval = ++ mqd->load_mqd( ++ mqd, ++ q->mqd, ++ q->pipe, ++ q->queue, ++ (uint32_t __user *)q->properties.write_ptr, ++ 0); ++ dqm->queue_count++; ++ } ++ } ++ if (sched_policy != KFD_SCHED_POLICY_NO_HWS) ++ retval = execute_queues_cpsch(dqm); ++ ++ if (retval == 0) ++ qpd->evicted = 0; ++ mutex_unlock(&dqm->lock); ++ return retval; ++ ++} ++ + static int register_process_nocpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) + { ++ struct kfd_process_device *pdd; + struct device_process_node *n; + int retval; + + BUG_ON(!dqm || !qpd); + +- pr_debug("kfd: In func %s\n", __func__); ++ pr_debug("In func %s\n", __func__); + + n = kzalloc(sizeof(struct device_process_node), GFP_KERNEL); + if (!n) +@@ -434,6 +574,11 @@ static int register_process_nocpsch(struct device_queue_manager *dqm, + 
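process_evict_queues and process_restore_queues nest through the qpd->evicted counter: only the first evict actually deactivates queues, and only the restore that brings the count back down reactivates them, so overlapping eviction triggers stay balanced. A sketch of just that counting logic, with the queue manipulation replaced by printouts:

#include <stdio.h>

static int evicted;   /* stands in for qpd->evicted */

static void evict(void)
{
        if (evicted++ > 0)        /* already evicted: count and return */
                return;
        printf("deactivating all queues\n");
}

static void restore(void)
{
        if (evicted == 0)         /* already restored: nothing to do */
                return;
        if (evicted > 1) {        /* still referenced: just decrement */
                evicted--;
                return;
        }
        printf("reactivating all queues\n");
        evicted = 0;
}

int main(void)
{
        evict();     /* deactivates */
        evict();     /* counts only */
        restore();   /* decrements only */
        restore();   /* reactivates */
        return 0;
}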
mutex_lock(&dqm->lock); + list_add(&n->list, &dqm->queues); + ++ pdd = qpd_to_pdd(qpd); ++ qpd->page_table_base = ++ dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm); ++ pr_debug("Retrieved PD address == 0x%08u\n", qpd->page_table_base); ++ + retval = dqm->ops_asic_specific.register_process(dqm, qpd); + + dqm->processes_count++; +@@ -499,7 +644,6 @@ static void init_interrupts(struct device_queue_manager *dqm) + if (is_pipe_enabled(dqm, 0, i)) + dqm->dev->kfd2kgd->init_interrupts(dqm->dev->kgd, i); + } +- + static int init_scheduler(struct device_queue_manager *dqm) + { + int retval = 0; +@@ -534,7 +678,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm) + for (i = 0; i < get_pipes_per_mec(dqm); i++) + dqm->allocated_queues[i] = (1 << get_queues_per_pipe(dqm)) - 1; + +- dqm->vmid_bitmap = (1 << VMID_PER_DEVICE) - 1; ++ dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1; + dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1; + + init_scheduler(dqm); +@@ -607,8 +751,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, + if (retval != 0) + return retval; + +- q->properties.sdma_queue_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; +- q->properties.sdma_engine_id = q->sdma_id / CIK_SDMA_ENGINE_NUM; ++ q->properties.sdma_queue_id = q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; ++ q->properties.sdma_engine_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; + + pr_debug("kfd: sdma id is: %d\n", q->sdma_id); + pr_debug(" sdma queue id: %d\n", q->properties.sdma_queue_id); +@@ -623,7 +767,7 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, + } + + retval = mqd->load_mqd(mqd, q->mqd, 0, +- 0, NULL); ++ 0, NULL, 0); + if (retval != 0) { + deallocate_sdma_queue(dqm, q->sdma_id); + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); +@@ -646,8 +790,7 @@ static int set_sched_resources(struct device_queue_manager *dqm) + + pr_debug("kfd: In func %s\n", __func__); + +- res.vmid_mask = (1 << VMID_PER_DEVICE) - 1; +- res.vmid_mask <<= KFD_VMID_START_OFFSET; ++ res.vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap; + + res.queue_mask = 0; + for (i = 0; i < KGD_MAX_QUEUES; ++i) { +@@ -696,6 +839,7 @@ static int initialize_cpsch(struct device_queue_manager *dqm) + dqm->queue_count = dqm->processes_count = 0; + dqm->sdma_queue_count = 0; + dqm->active_runlist = false; ++ dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1; + retval = dqm->ops_asic_specific.initialize(dqm); + if (retval != 0) + goto fail_init_pipelines; +@@ -716,7 +860,7 @@ static int start_cpsch(struct device_queue_manager *dqm) + + retval = 0; + +- retval = pm_init(&dqm->packets, dqm); ++ retval = pm_init(&dqm->packets, dqm, dqm->dev->mec_fw_version); + if (retval != 0) + goto fail_packet_manager_init; + +@@ -743,7 +887,9 @@ static int start_cpsch(struct device_queue_manager *dqm) + kfd_bind_process_to_device(dqm->dev, + node->qpd->pqm->process); + +- execute_queues_cpsch(dqm, true); ++ mutex_lock(&dqm->lock); ++ execute_queues_cpsch(dqm); ++ mutex_unlock(&dqm->lock); + + return 0; + fail_allocate_vidmem: +@@ -760,7 +906,11 @@ static int stop_cpsch(struct device_queue_manager *dqm) + + BUG_ON(!dqm); + +- destroy_queues_cpsch(dqm, true, true); ++ mutex_lock(&dqm->lock); ++ ++ unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, false); ++ ++ mutex_unlock(&dqm->lock); + + list_for_each_entry(node, &dqm->queues, list) { + pdd = qpd_to_pdd(node->qpd); +@@ -799,7 +949,7 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm, + list_add(&kq->list, 
&qpd->priv_queue_list); + dqm->queue_count++; + qpd->is_debug = true; +- execute_queues_cpsch(dqm, false); ++ execute_queues_cpsch(dqm); + mutex_unlock(&dqm->lock); + + return 0; +@@ -815,11 +965,11 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm, + + mutex_lock(&dqm->lock); + /* here we actually preempt the DIQ */ +- destroy_queues_cpsch(dqm, true, false); ++ unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, false); + list_del(&kq->list); + dqm->queue_count--; + qpd->is_debug = false; +- execute_queues_cpsch(dqm, false); ++ execute_queues_cpsch(dqm); + /* + * Unconditionally decrement this counter, regardless of the queue's + * type. +@@ -830,14 +980,6 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm, + mutex_unlock(&dqm->lock); + } + +-static void select_sdma_engine_id(struct queue *q) +-{ +- static int sdma_id; +- +- q->sdma_id = sdma_id; +- sdma_id = (sdma_id + 1) % 2; +-} +- + static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd, int *allocate_vmid) + { +@@ -860,9 +1002,15 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, + goto out; + } + +- if (q->properties.type == KFD_QUEUE_TYPE_SDMA) +- select_sdma_engine_id(q); +- ++ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { ++ retval = allocate_sdma_queue(dqm, &q->sdma_id); ++ if (retval != 0) ++ goto out; ++ q->properties.sdma_queue_id = ++ q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; ++ q->properties.sdma_engine_id = ++ q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; ++ } + mqd = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + +@@ -870,8 +1018,19 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, + mutex_unlock(&dqm->lock); + return -ENOMEM; + } ++ /* ++ * Eviction state logic: we only mark active queues as evicted ++ * to avoid the overhead of restoring inactive queues later ++ */ ++ if (qpd->evicted) ++ q->properties.is_evicted = (q->properties.queue_size > 0 && ++ q->properties.queue_percent > 0 && ++ q->properties.queue_address != 0); + + dqm->ops_asic_specific.init_sdma_vm(dqm, q, qpd); ++ ++ q->properties.tba_addr = qpd->tba_addr; ++ q->properties.tma_addr = qpd->tma_addr; + retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); + if (retval != 0) +@@ -880,7 +1039,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, + list_add(&q->list, &qpd->queues_list); + if (q->properties.is_active) { + dqm->queue_count++; +- retval = execute_queues_cpsch(dqm, false); ++ retval = execute_queues_cpsch(dqm); + } + + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) +@@ -917,20 +1076,20 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr, + return 0; + } + +-static int destroy_sdma_queues(struct device_queue_manager *dqm, ++static int unmap_sdma_queues(struct device_queue_manager *dqm, + unsigned int sdma_engine) + { + return pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA, +- KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES, 0, false, ++ KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false, + sdma_engine); + } + +-static int destroy_queues_cpsch(struct device_queue_manager *dqm, +- bool preempt_static_queues, bool lock) ++/* dqm->lock mutex has to be locked before calling this function */ ++static int unmap_queues_cpsch(struct device_queue_manager *dqm, ++ enum kfd_unmap_queues_filter filter, ++ uint32_t filter_param, bool reset) + { + int retval; +- enum 
kfd_preempt_type_filter preempt_type; +- struct kfd_process_device *pdd; + + BUG_ON(!dqm); + +@@ -940,23 +1099,21 @@ static int destroy_queues_cpsch(struct device_queue_manager *dqm, + mutex_lock(&dqm->lock); + if (!dqm->active_runlist) + goto out; ++ if (dqm->active_runlist == false) ++ return retval; + + pr_debug("kfd: Before destroying queues, sdma queue count is : %u\n", + dqm->sdma_queue_count); + + if (dqm->sdma_queue_count > 0) { +- destroy_sdma_queues(dqm, 0); +- destroy_sdma_queues(dqm, 1); ++ unmap_sdma_queues(dqm, 0); ++ unmap_sdma_queues(dqm, 1); + } + +- preempt_type = preempt_static_queues ? +- KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES : +- KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES; +- + retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE, +- preempt_type, 0, false, 0); ++ filter, filter_param, reset, 0); + if (retval != 0) +- goto out; ++ return retval; + + *dqm->fence_addr = KFD_FENCE_INIT; + pm_send_query_status(&dqm->packets, dqm->fence_gpu_addr, +@@ -965,55 +1122,47 @@ static int destroy_queues_cpsch(struct device_queue_manager *dqm, + retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED, + QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS); + if (retval != 0) { +- pdd = kfd_get_process_device_data(dqm->dev, +- kfd_get_process(current)); +- pdd->reset_wavefronts = true; +- goto out; ++ pr_err("kfd: unmapping queues failed."); ++ return retval; + } ++ + pm_release_ib(&dqm->packets); + dqm->active_runlist = false; + +-out: +- if (lock) +- mutex_unlock(&dqm->lock); + return retval; + } + +-static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock) ++/* dqm->lock mutex has to be locked before calling this function */ ++static int execute_queues_cpsch(struct device_queue_manager *dqm) + { + int retval; + + BUG_ON(!dqm); + +- if (lock) +- mutex_lock(&dqm->lock); +- +- retval = destroy_queues_cpsch(dqm, false, false); ++ retval = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, ++ 0, false); + if (retval != 0) { + pr_err("kfd: the cp might be in an unrecoverable state due to an unsuccessful queues preemption"); +- goto out; ++ return retval; + } + + if (dqm->queue_count <= 0 || dqm->processes_count <= 0) { + retval = 0; +- goto out; ++ return retval; + } + + if (dqm->active_runlist) { + retval = 0; +- goto out; ++ return retval; + } + + retval = pm_send_runlist(&dqm->packets, &dqm->queues); + if (retval != 0) { + pr_err("kfd: failed to execute runlist"); +- goto out; ++ return retval; + } + dqm->active_runlist = true; + +-out: +- if (lock) +- mutex_unlock(&dqm->lock); + return retval; + } + +@@ -1051,14 +1200,16 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, + goto failed; + } + +- if (q->properties.type == KFD_QUEUE_TYPE_SDMA) ++ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + dqm->sdma_queue_count--; ++ deallocate_sdma_queue(dqm, q->sdma_id); ++ } + + list_del(&q->list); + if (q->properties.is_active) + dqm->queue_count--; + +- execute_queues_cpsch(dqm, false); ++ retval = execute_queues_cpsch(dqm); + + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); + +@@ -1072,7 +1223,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, + + mutex_unlock(&dqm->lock); + +- return 0; ++ return retval; + + failed: + failed_try_destroy_debugged_queue: +@@ -1156,6 +1307,172 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm, + return false; + } + ++static int set_trap_handler(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd, ++ uint64_t tba_addr, ++ uint64_t tma_addr) ++{ ++ 
uint64_t *tma; ++ ++ tma = (uint64_t *)(qpd->cwsr_kaddr + dqm->dev->tma_offset); ++ tma[0] = tba_addr; ++ tma[1] = tma_addr; ++ return 0; ++} ++ ++ ++static int set_page_directory_base(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd) ++{ ++ struct kfd_process_device *pdd; ++ uint32_t pd_base; ++ int retval = 0; ++ ++ BUG_ON(!dqm || !qpd); ++ ++ mutex_lock(&dqm->lock); ++ ++ pdd = qpd_to_pdd(qpd); ++ ++ /* Retrieve PD base */ ++ pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm); ++ ++ /* If it has not changed, just get out */ ++ if (qpd->page_table_base == pd_base) ++ goto out; ++ ++ /* Update PD Base in QPD */ ++ qpd->page_table_base = pd_base; ++ pr_debug("Updated PD address == 0x%08u\n", pd_base); ++ ++ /* ++ * Preempt queues, destroy runlist and create new runlist. Queues ++ * will have the update PD base address ++ */ ++ if (sched_policy != KFD_SCHED_POLICY_NO_HWS) ++ retval = execute_queues_cpsch(dqm); ++ ++out: ++ mutex_unlock(&dqm->lock); ++ ++ return retval; ++} ++ ++static int process_termination_nocpsch(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd) ++{ ++ struct queue *q, *next; ++ struct mqd_manager *mqd; ++ struct device_process_node *cur, *next_dpn; ++ ++ mutex_lock(&dqm->lock); ++ ++ /* Clear all user mode queues */ ++ list_for_each_entry_safe(q, next, &qpd->queues_list, list) { ++ mqd = dqm->ops.get_mqd_manager(dqm, ++ get_mqd_type_from_queue_type(q->properties.type)); ++ if (!mqd) { ++ mutex_unlock(&dqm->lock); ++ return -ENOMEM; ++ } ++ ++ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { ++ dqm->sdma_queue_count--; ++ deallocate_sdma_queue(dqm, q->sdma_id); ++ } ++ ++ list_del(&q->list); ++ if (q->properties.is_active) ++ dqm->queue_count--; ++ ++ dqm->total_queue_count--; ++ mqd->destroy_mqd(mqd, q->mqd, ++ KFD_PREEMPT_TYPE_WAVEFRONT_RESET, ++ QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS, ++ q->pipe, q->queue); ++ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); ++ if (list_empty(&qpd->queues_list)) ++ deallocate_vmid(dqm, qpd, q); ++ } ++ ++ /* Unregister process */ ++ list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) { ++ if (qpd == cur->qpd) { ++ list_del(&cur->list); ++ kfree(cur); ++ dqm->processes_count--; ++ break; ++ } ++ } ++ ++ mutex_unlock(&dqm->lock); ++ ++ return 0; ++} ++ ++ ++static int process_termination_cpsch(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd) ++{ ++ int retval; ++ struct queue *q, *next; ++ struct kernel_queue *kq, *kq_next; ++ struct mqd_manager *mqd; ++ struct device_process_node *cur, *next_dpn; ++ ++ retval = 0; ++ ++ mutex_lock(&dqm->lock); ++ ++ /* Clean all kernel queues */ ++ list_for_each_entry_safe(kq, kq_next, &qpd->priv_queue_list, list) { ++ list_del(&kq->list); ++ dqm->queue_count--; ++ qpd->is_debug = false; ++ dqm->total_queue_count--; ++ } ++ ++ /* Clear all user mode queues */ ++ list_for_each_entry(q, &qpd->queues_list, list) { ++ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { ++ dqm->sdma_queue_count--; ++ deallocate_sdma_queue(dqm, q->sdma_id); ++ } ++ ++ if (q->properties.is_active) ++ dqm->queue_count--; ++ ++ dqm->total_queue_count--; ++ } ++ ++ /* Unregister process */ ++ list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) { ++ if (qpd == cur->qpd) { ++ list_del(&cur->list); ++ kfree(cur); ++ dqm->processes_count--; ++ break; ++ } ++ } ++ ++ retval = execute_queues_cpsch(dqm); ++ ++ /* lastly, free mqd resources */ ++ list_for_each_entry_safe(q, next, &qpd->queues_list, list) { ++ mqd = dqm->ops.get_mqd_manager(dqm, ++ 
get_mqd_type_from_queue_type(q->properties.type)); ++ if (!mqd) { ++ mutex_unlock(&dqm->lock); ++ return -ENOMEM; ++ } ++ list_del(&q->list); ++ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); ++ } ++ ++ mutex_unlock(&dqm->lock); ++ return retval; ++} ++ + struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) + { + struct device_queue_manager *dqm; +@@ -1186,6 +1503,9 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) + dqm->ops.create_kernel_queue = create_kernel_queue_cpsch; + dqm->ops.destroy_kernel_queue = destroy_kernel_queue_cpsch; + dqm->ops.set_cache_memory_policy = set_cache_memory_policy; ++ dqm->ops.set_trap_handler = set_trap_handler; ++ dqm->ops.set_page_directory_base = set_page_directory_base; ++ dqm->ops.process_termination = process_termination_cpsch; + break; + case KFD_SCHED_POLICY_NO_HWS: + /* initialize dqm for no cp scheduling */ +@@ -1200,6 +1520,9 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) + dqm->ops.initialize = initialize_nocpsch; + dqm->ops.uninitialize = uninitialize_nocpsch; + dqm->ops.set_cache_memory_policy = set_cache_memory_policy; ++ dqm->ops.set_trap_handler = set_trap_handler; ++ dqm->ops.set_page_directory_base = set_page_directory_base; ++ dqm->ops.process_termination = process_termination_nocpsch; + break; + default: + BUG(); +@@ -1214,6 +1537,11 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) + case CHIP_KAVERI: + device_queue_manager_init_cik(&dqm->ops_asic_specific); + break; ++ ++ case CHIP_TONGA: ++ case CHIP_FIJI: ++ device_queue_manager_init_vi_tonga(&dqm->ops_asic_specific); ++ break; + } + + if (dqm->ops.initialize(dqm) != 0) { +@@ -1231,3 +1559,20 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm) + dqm->ops.uninitialize(dqm); + kfree(dqm); + } ++ ++int kfd_process_vm_fault(struct device_queue_manager *dqm, ++ unsigned int pasid) ++{ ++ struct kfd_process_device *pdd; ++ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); ++ int ret = 0; ++ ++ if (!p) ++ return -EINVAL; ++ pdd = kfd_get_process_device_data(dqm->dev, p); ++ if (pdd) ++ ret = process_evict_queues(dqm, &pdd->qpd); ++ up_read(&p->lock); ++ ++ return ret; ++} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +index faf820a..d6af017 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +@@ -29,10 +29,7 @@ + #include "kfd_priv.h" + #include "kfd_mqd_manager.h" + +-#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (500) +-#define CIK_VMID_NUM (8) +-#define KFD_VMID_START_OFFSET (8) +-#define VMID_PER_DEVICE CIK_VMID_NUM ++#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (9000) + #define KFD_DQM_FIRST_PIPE (0) + #define CIK_SDMA_QUEUES (4) + #define CIK_SDMA_QUEUES_PER_ENGINE (2) +@@ -79,6 +76,12 @@ struct device_process_node { + * @set_cache_memory_policy: Sets memory policy (cached/ non cached) for the + * memory apertures. + * ++ * @set_page_directory_base: Sets the PD base address (GPU local memory) ++ * in all the queues of the relevant process running on the specified device. ++ * It preempts the queues, updates the value and execute the runlist again. ++ * ++ * @process_termination: Clears all process queues belongs to that device. 
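The two new entries round out a classic kernel function-pointer table: device_queue_manager_init() installs either the cpsch or the nocpsch implementations depending on sched_policy, and callers dispatch through dqm->ops without knowing which policy is active. A minimal sketch of such a dispatch site follows; the wrapper name dqm_terminate_process is hypothetical, only the struct members come from the patch.

    static int dqm_terminate_process(struct device_queue_manager *dqm,
                                     struct qcm_process_device *qpd)
    {
            /* The same call site serves process_termination_cpsch and
             * process_termination_nocpsch; init code picked one of them. */
            if (!dqm->ops.process_termination)
                    return -EINVAL;
            return dqm->ops.process_termination(dqm, qpd);
    }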
++ * + */ + + struct device_queue_manager_ops { +@@ -122,6 +125,16 @@ struct device_queue_manager_ops { + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size); ++ ++ int (*set_trap_handler)(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd, ++ uint64_t tba_addr, ++ uint64_t tma_addr); ++ ++ int (*set_page_directory_base)(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd); ++ int (*process_termination)(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd); + }; + + struct device_queue_manager_asic_ops { +@@ -178,12 +191,20 @@ struct device_queue_manager { + + void device_queue_manager_init_cik(struct device_queue_manager_asic_ops *ops); + void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops); ++void device_queue_manager_init_vi_tonga( ++ struct device_queue_manager_asic_ops *ops); + void program_sh_mem_settings(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); + unsigned int get_queues_num(struct device_queue_manager *dqm); + unsigned int get_queues_per_pipe(struct device_queue_manager *dqm); + unsigned int get_pipes_per_mec(struct device_queue_manager *dqm); + ++int process_evict_queues(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd); ++int process_restore_queues(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd); ++ ++ + static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) + { + return (pdd->lds_base >> 16) & 0xFF; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c +index 48dc056..da55e39c 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c +@@ -24,6 +24,7 @@ + #include "kfd_device_queue_manager.h" + #include "cik_regs.h" + #include "oss/oss_2_4_sh_mask.h" ++#include "gca/gfx_7_2_sh_mask.h" + + static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, +@@ -125,6 +126,7 @@ static int register_process_cik(struct device_queue_manager *dqm, + } else { + temp = get_sh_mem_bases_nybble_64(pdd); + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); ++ qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__PRIVATE_ATC__SHIFT; + } + + pr_debug("kfd: is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c +index 7e9cae9..c023e50 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c +@@ -39,6 +39,31 @@ static int initialize_cpsch_vi(struct device_queue_manager *dqm); + static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd); + ++/* ++ * Tonga device queue manager functions ++ */ ++static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd, ++ enum cache_policy default_policy, ++ enum cache_policy alternate_policy, ++ void __user *alternate_aperture_base, ++ uint64_t alternate_aperture_size); ++static int register_process_vi_tonga(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd); ++static void init_sdma_vm_tonga(struct device_queue_manager *dqm, ++ struct queue *q, ++ struct qcm_process_device *qpd); ++ ++void 
device_queue_manager_init_vi_tonga( ++ struct device_queue_manager_asic_ops *ops) ++{ ++ ops->set_cache_memory_policy = set_cache_memory_policy_vi_tonga; ++ ops->register_process = register_process_vi_tonga; ++ ops->initialize = initialize_cpsch_vi; ++ ops->init_sdma_vm = init_sdma_vm_tonga; ++} ++ ++ + void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops) + { + ops->set_cache_memory_policy = set_cache_memory_policy_vi; +@@ -104,6 +129,33 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, + return true; + } + ++static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd, ++ enum cache_policy default_policy, ++ enum cache_policy alternate_policy, ++ void __user *alternate_aperture_base, ++ uint64_t alternate_aperture_size) ++{ ++ uint32_t default_mtype; ++ uint32_t ape1_mtype; ++ ++ default_mtype = (default_policy == cache_policy_coherent) ? ++ MTYPE_UC : ++ MTYPE_NC_NV; ++ ++ ape1_mtype = (alternate_policy == cache_policy_coherent) ? ++ MTYPE_UC : ++ MTYPE_NC_NV; ++ ++ qpd->sh_mem_config = ++ SH_MEM_ALIGNMENT_MODE_UNALIGNED << ++ SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | ++ default_mtype << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | ++ ape1_mtype << SH_MEM_CONFIG__APE1_MTYPE__SHIFT; ++ ++ return true; ++} ++ + static int register_process_vi(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) + { +@@ -137,6 +189,8 @@ static int register_process_vi(struct device_queue_manager *dqm, + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); + qpd->sh_mem_config |= SH_MEM_ADDRESS_MODE_HSA64 << + SH_MEM_CONFIG__ADDRESS_MODE__SHIFT; ++ qpd->sh_mem_config |= 1 << ++ SH_MEM_CONFIG__PRIVATE_ATC__SHIFT; + } + + pr_debug("kfd: is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", +@@ -145,6 +199,41 @@ static int register_process_vi(struct device_queue_manager *dqm, + return 0; + } + ++static int register_process_vi_tonga(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd) ++{ ++ struct kfd_process_device *pdd; ++ unsigned int temp; ++ ++ BUG_ON(!dqm || !qpd); ++ ++ pdd = qpd_to_pdd(qpd); ++ ++ /* check if sh_mem_config register already configured */ ++ if (qpd->sh_mem_config == 0) { ++ qpd->sh_mem_config = ++ SH_MEM_ALIGNMENT_MODE_UNALIGNED << ++ SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | ++ MTYPE_UC << ++ SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | ++ MTYPE_UC << ++ SH_MEM_CONFIG__APE1_MTYPE__SHIFT; ++ ++ qpd->sh_mem_ape1_limit = 0; ++ qpd->sh_mem_ape1_base = 0; ++ } ++ ++ /* On dGPU we're always in GPUVM64 addressing mode with 64-bit ++ * aperture addresses. 
*/ ++ temp = get_sh_mem_bases_nybble_64(pdd); ++ qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); ++ ++ pr_debug("kfd: sh_mem_bases nybble: 0x%X and register 0x%X\n", ++ temp, qpd->sh_mem_bases); ++ ++ return 0; ++} ++ + static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) + { +@@ -161,6 +250,23 @@ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + q->properties.sdma_vm_addr = value; + } + ++static void init_sdma_vm_tonga(struct device_queue_manager *dqm, ++ struct queue *q, ++ struct qcm_process_device *qpd) ++{ ++ uint32_t value = 0; ++ ++ if (q->process->is_32bit_user_mode) ++ value |= (1 << SDMA0_RLC0_VIRTUAL_ADDR__PTR32__SHIFT) | ++ get_sh_mem_bases_32(qpd_to_pdd(qpd)); ++ else ++ value |= ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) << ++ SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) & ++ SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK; ++ q->properties.sdma_vm_addr = value; ++} ++ ++ + static int initialize_cpsch_vi(struct device_queue_manager *dqm) + { + return 0; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +index 453c5d6..d6a7e2a 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +@@ -142,12 +142,11 @@ int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma) + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + +- pr_debug("kfd: mapping doorbell page in %s\n" ++ pr_debug("kfd: mapping doorbell page in kfd_doorbell_mmap\n" + " target user address == 0x%08llX\n" + " physical address == 0x%08llX\n" + " vm_flags == 0x%04lX\n" + " size == 0x%04lX\n", +- __func__, + (unsigned long long) vma->vm_start, address, vma->vm_flags, + doorbell_process_allocation()); + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c +index d1ce83d..23b5936 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c +@@ -32,11 +32,10 @@ + #include "kfd_events.h" + #include <linux/device.h> + +-/* +- * A task can only be on a single wait_queue at a time, but we need to support ++/* A task can only be on a single wait_queue at a time, but we need to support + * waiting on multiple events (any/all). +- * Instead of each event simply having a wait_queue with sleeping tasks, it +- * has a singly-linked list of tasks. ++ * Instead of each event simply having a wait_queue with sleeping tasks, it has a ++ * singly-linked list of tasks. + * A thread that wants to sleep creates an array of these, one for each event + * and adds one to each event's waiter chain. + */ +@@ -52,12 +51,11 @@ struct kfd_event_waiter { + uint32_t input_index; + }; + +-/* +- * Over-complicated pooled allocator for event notification slots. ++/* Over-complicated pooled allocator for event notification slots. + * +- * Each signal event needs a 64-bit signal slot where the signaler will write +- * a 1 before sending an interrupt.l (This is needed because some interrupts +- * do not contain enough spare data bits to identify an event.) ++ * Each signal event needs a 64-bit signal slot where the signaler will write a 1 ++ * before sending an interrupt.l (This is needed because some interrupts do not ++ * contain enough spare data bits to identify an event.) + * We get whole pages from vmalloc and map them to the process VA. + * Individual signal events are then allocated a slot in a page. 
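To make the slot accounting concrete before the kernel version appears below, here is a self-contained userspace model of that pooled allocator: a per-page used-slot bitmap searched first-fit, plus a free counter. The kernel code uses find_first_zero_bit()/__set_bit() on page->used_slot_bitmap; the hand-rolled bit scan here exists only so the sketch builds outside the kernel.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define SLOTS 256                       /* stands in for SLOTS_PER_PAGE */

    struct slot_page {
            uint64_t bitmap[SLOTS / 64];    /* one bit per 64-bit signal slot */
            unsigned int free_slots;
    };

    /* First-fit: find the lowest clear bit, mark it used, return its index. */
    static bool alloc_slot(struct slot_page *p, unsigned int *out)
    {
            if (p->free_slots == 0)
                    return false;
            for (unsigned int i = 0; i < SLOTS; i++) {
                    if (!(p->bitmap[i / 64] & (1ULL << (i % 64)))) {
                            p->bitmap[i / 64] |= 1ULL << (i % 64);
                            p->free_slots--;
                            *out = i;
                            return true;
                    }
            }
            return false;
    }

    int main(void)
    {
            struct slot_page page = { .free_slots = SLOTS };
            unsigned int slot;

            if (alloc_slot(&page, &slot))
                    printf("allocated slot %u, %u left\n", slot, page.free_slots);
            return 0;
    }

Freeing is the mirror image, clear the bit and bump free_slots, which is what release_event_notification_slot() does.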
+ */ +@@ -65,6 +63,7 @@ struct kfd_event_waiter { + struct signal_page { + struct list_head event_pages; /* kfd_process.signal_event_pages */ + uint64_t *kernel_address; ++ uint64_t handle; + uint64_t __user *user_address; + uint32_t page_index; /* Index into the mmap aperture. */ + unsigned int free_slots; +@@ -74,8 +73,7 @@ struct signal_page { + #define SLOTS_PER_PAGE KFD_SIGNAL_EVENT_LIMIT + #define SLOT_BITMAP_SIZE BITS_TO_LONGS(SLOTS_PER_PAGE) + #define BITS_PER_PAGE (ilog2(SLOTS_PER_PAGE)+1) +-#define SIGNAL_PAGE_SIZE (sizeof(struct signal_page) + \ +- SLOT_BITMAP_SIZE * sizeof(long)) ++#define SIGNAL_PAGE_SIZE (sizeof(struct signal_page) + SLOT_BITMAP_SIZE * sizeof(long)) + + /* + * For signal events, the event ID is used as the interrupt user data. +@@ -85,23 +83,27 @@ struct signal_page { + #define INTERRUPT_DATA_BITS 8 + #define SIGNAL_EVENT_ID_SLOT_SHIFT 0 + ++/* We can only create 8 debug events */ ++ ++#define KFD_DEBUG_EVENT_LIMIT 8 ++#define KFD_DEBUG_EVENT_MASK 0x1F ++#define KFD_DEBUG_EVENT_SHIFT 5 ++ + static uint64_t *page_slots(struct signal_page *page) + { + return page->kernel_address; + } + +-static bool allocate_free_slot(struct kfd_process *process, +- struct signal_page **out_page, +- unsigned int *out_slot_index) ++static bool ++allocate_free_slot(struct kfd_process *process, ++ struct signal_page **out_page, ++ unsigned int *out_slot_index) + { + struct signal_page *page; + + list_for_each_entry(page, &process->signal_event_pages, event_pages) { + if (page->free_slots > 0) { +- unsigned int slot = +- find_first_zero_bit(page->used_slot_bitmap, +- SLOTS_PER_PAGE); +- ++ unsigned int slot = find_first_zero_bit(page->used_slot_bitmap, SLOTS_PER_PAGE); + __set_bit(slot, page->used_slot_bitmap); + page->free_slots--; + +@@ -130,6 +132,8 @@ static bool allocate_signal_page(struct file *devkfd, struct kfd_process *p) + { + void *backing_store; + struct signal_page *page; ++ unsigned int slot; ++ int i; + + page = kzalloc(SIGNAL_PAGE_SIZE, GFP_KERNEL); + if (!page) +@@ -137,17 +141,23 @@ static bool allocate_signal_page(struct file *devkfd, struct kfd_process *p) + + page->free_slots = SLOTS_PER_PAGE; + +- backing_store = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, ++ backing_store = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, \ + get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); + if (!backing_store) + goto fail_alloc_signal_store; + + /* prevent user-mode info leaks */ +- memset(backing_store, (uint8_t) UNSIGNALED_EVENT_SLOT, +- KFD_SIGNAL_EVENT_LIMIT * 8); +- ++ memset(backing_store, (uint8_t) UNSIGNALED_EVENT_SLOT, KFD_SIGNAL_EVENT_LIMIT * 8); + page->kernel_address = backing_store; + ++ /* Set bits of debug events to prevent allocation */ ++ for (i = 0 ; i < KFD_DEBUG_EVENT_LIMIT ; i++) { ++ slot = (i << KFD_DEBUG_EVENT_SHIFT) | ++ KFD_DEBUG_EVENT_MASK; ++ __set_bit(slot, page->used_slot_bitmap); ++ page->free_slots--; ++ } ++ + if (list_empty(&p->signal_event_pages)) + page->page_index = 0; + else +@@ -169,10 +179,10 @@ static bool allocate_signal_page(struct file *devkfd, struct kfd_process *p) + return false; + } + +-static bool allocate_event_notification_slot(struct file *devkfd, +- struct kfd_process *p, +- struct signal_page **page, +- unsigned int *signal_slot_index) ++static bool ++allocate_event_notification_slot(struct file *devkfd, struct kfd_process *p, ++ struct signal_page **page, ++ unsigned int *signal_slot_index) + { + bool ret; + +@@ -186,6 +196,88 @@ static bool allocate_event_notification_slot(struct file *devkfd, + return ret; + } + 
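The reservation loop just added to allocate_signal_page() pins down which slots the debug path may use: with KFD_DEBUG_EVENT_SHIFT = 5 and KFD_DEBUG_EVENT_MASK = 0x1F, the reserved slots are exactly those whose low five bits are all set, i.e. 31, 63, 95, 127, 159, 191, 223 and 255, eight slots in total, matching KFD_DEBUG_EVENT_LIMIT. The same formula is used below by allocate_debug_event_notification_slot() to pick a slot from debug_event_count. A trivial standalone check of the arithmetic:

    #include <stdio.h>

    #define KFD_DEBUG_EVENT_LIMIT 8
    #define KFD_DEBUG_EVENT_MASK  0x1F
    #define KFD_DEBUG_EVENT_SHIFT 5

    int main(void)
    {
            /* Prints slots 31, 63, 95, 127, 159, 191, 223, 255. */
            for (int i = 0; i < KFD_DEBUG_EVENT_LIMIT; i++)
                    printf("debug event %d -> slot %d\n", i,
                           (i << KFD_DEBUG_EVENT_SHIFT) | KFD_DEBUG_EVENT_MASK);
            return 0;
    }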
++static bool ++allocate_signal_page_dgpu(struct kfd_process *p, ++ uint64_t *kernel_address, uint64_t handle) ++{ ++ struct signal_page *my_page; ++ ++ my_page = kzalloc(SIGNAL_PAGE_SIZE, GFP_KERNEL); ++ if (!my_page) ++ return false; ++ ++ /* prevent user-mode info leaks */ ++ memset(kernel_address, (uint8_t) UNSIGNALED_EVENT_SLOT, ++ KFD_SIGNAL_EVENT_LIMIT * 8); ++ ++ my_page->kernel_address = kernel_address; ++ my_page->handle = handle; ++ my_page->user_address = NULL; ++ my_page->free_slots = SLOTS_PER_PAGE; ++ if (list_empty(&p->signal_event_pages)) ++ my_page->page_index = 0; ++ else ++ my_page->page_index = list_tail_entry(&p->signal_event_pages, ++ struct signal_page, ++ event_pages)->page_index + 1; ++ ++ pr_debug("allocated new event signal page at %p, for process %p\n", ++ my_page, p); ++ pr_debug("page index is %d\n", my_page->page_index); ++ ++ list_add(&my_page->event_pages, &p->signal_event_pages); ++ ++ return true; ++} ++ ++void kfd_free_signal_page_dgpu(struct kfd_process *p, uint64_t handle) ++{ ++ struct signal_page *page, *tmp; ++ ++ list_for_each_entry_safe(page, tmp, &p->signal_event_pages, ++ event_pages) { ++ if (page->handle == handle) { ++ list_del(&page->event_pages); ++ kfree(page); ++ break; ++ } ++ } ++} ++ ++static bool ++allocate_debug_event_notification_slot(struct file *devkfd, ++ struct kfd_process *p, ++ struct signal_page **out_page, ++ unsigned int *out_slot_index) ++{ ++ struct signal_page *page; ++ unsigned int slot; ++ bool ret; ++ ++ if (list_empty(&p->signal_event_pages)) { ++ ret = allocate_signal_page(devkfd, p); ++ if (ret == false) ++ return ret; ++ } ++ ++ page = list_entry((&p->signal_event_pages)->next, struct signal_page, ++ event_pages); ++ slot = (p->debug_event_count << KFD_DEBUG_EVENT_SHIFT) | ++ KFD_DEBUG_EVENT_MASK; ++ ++ pr_debug("page == %p\n", page); ++ pr_debug("slot == %d\n", slot); ++ ++ page_slots(page)[slot] = UNSIGNALED_EVENT_SLOT; ++ *out_page = page; ++ *out_slot_index = slot; ++ ++ pr_debug("allocated debug event signal slot in page %p, slot %d\n", ++ page, slot); ++ ++ return true; ++} ++ + /* Assumes that the process's event_mutex is locked. */ + static void release_event_notification_slot(struct signal_page *page, + size_t slot_index) +@@ -202,10 +294,7 @@ static struct signal_page *lookup_signal_page_by_index(struct kfd_process *p, + { + struct signal_page *page; + +- /* +- * This is safe because we don't delete signal pages until the +- * process exits. +- */ ++ /* This is safe because we don't delete signal pages until the process exits. */ + list_for_each_entry(page, &p->signal_event_pages, event_pages) + if (page->page_index == page_index) + return page; +@@ -213,10 +302,7 @@ static struct signal_page *lookup_signal_page_by_index(struct kfd_process *p, + return NULL; + } + +-/* +- * Assumes that p->event_mutex is held and of course that p is not going +- * away (current or locked). +- */ ++/* Assumes that p->event_mutex is held and of course that p is not going away (current or locked). 
*/ + static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id) + { + struct kfd_event *ev; +@@ -231,32 +317,27 @@ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id) + static u32 make_signal_event_id(struct signal_page *page, + unsigned int signal_slot_index) + { +- return page->page_index | +- (signal_slot_index << SIGNAL_EVENT_ID_SLOT_SHIFT); ++ return page->page_index | (signal_slot_index << SIGNAL_EVENT_ID_SLOT_SHIFT); + } + +-/* +- * Produce a kfd event id for a nonsignal event. +- * These are arbitrary numbers, so we do a sequential search through +- * the hash table for an unused number. ++/* Produce a kfd event id for a nonsignal event. ++ * These are arbitrary numbers, so we do a sequential search through the hash table ++ * for an unused number. + */ + static u32 make_nonsignal_event_id(struct kfd_process *p) + { + u32 id; + + for (id = p->next_nonsignal_event_id; +- id < KFD_LAST_NONSIGNAL_EVENT_ID && +- lookup_event_by_id(p, id) != NULL; +- id++) ++ id < KFD_LAST_NONSIGNAL_EVENT_ID && lookup_event_by_id(p, id) != NULL; ++ id++) + ; + + if (id < KFD_LAST_NONSIGNAL_EVENT_ID) { + +- /* +- * What if id == LAST_NONSIGNAL_EVENT_ID - 1? +- * Then next_nonsignal_event_id = LAST_NONSIGNAL_EVENT_ID so +- * the first loop fails immediately and we proceed with the +- * wraparound loop below. ++ /* What if id == LAST_NONSIGNAL_EVENT_ID - 1? ++ * Then next_nonsignal_event_id = LAST_NONSIGNAL_EVENT_ID so the first loop ++ * fails immediately and we proceed with the wraparound loop below. + */ + p->next_nonsignal_event_id = id + 1; + +@@ -264,54 +345,68 @@ static u32 make_nonsignal_event_id(struct kfd_process *p) + } + + for (id = KFD_FIRST_NONSIGNAL_EVENT_ID; +- id < KFD_LAST_NONSIGNAL_EVENT_ID && +- lookup_event_by_id(p, id) != NULL; +- id++) ++ id < KFD_LAST_NONSIGNAL_EVENT_ID && lookup_event_by_id(p, id) != NULL; ++ id++) + ; + + + if (id < KFD_LAST_NONSIGNAL_EVENT_ID) { + p->next_nonsignal_event_id = id + 1; + return id; ++ } else { ++ p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID; ++ return 0; + } +- +- p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID; +- return 0; + } + +-static struct kfd_event *lookup_event_by_page_slot(struct kfd_process *p, +- struct signal_page *page, +- unsigned int signal_slot) ++static struct kfd_event * ++lookup_event_by_page_slot(struct kfd_process *p, ++ struct signal_page *page, unsigned int signal_slot) + { + return lookup_event_by_id(p, make_signal_event_id(page, signal_slot)); + } + +-static int create_signal_event(struct file *devkfd, +- struct kfd_process *p, +- struct kfd_event *ev) ++static int ++create_signal_event(struct file *devkfd, struct kfd_process *p, struct kfd_event *ev) + { +- if (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT) { ++ if ((ev->type == KFD_EVENT_TYPE_SIGNAL) && ++ (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT)) { + pr_warn("amdkfd: Signal event wasn't created because limit was reached\n"); + return -ENOMEM; ++ } else if ((ev->type == KFD_EVENT_TYPE_DEBUG) && ++ (p->debug_event_count == KFD_DEBUG_EVENT_LIMIT)) { ++ pr_warn("amdkfd: Debug event wasn't created because limit was reached\n"); ++ return -ENOMEM; + } + +- if (!allocate_event_notification_slot(devkfd, p, &ev->signal_page, ++ if (ev->type == KFD_EVENT_TYPE_SIGNAL) { ++ if (!allocate_event_notification_slot(devkfd, p, ++ &ev->signal_page, + &ev->signal_slot_index)) { +- pr_warn("amdkfd: Signal event wasn't created because out of kernel memory\n"); +- return -ENOMEM; +- } ++ pr_warn("amdkfd: 
Signal event wasn't created because out of kernel memory\n"); ++ return -ENOMEM; ++ } + +- p->signal_event_count++; ++ p->signal_event_count++; + +- ev->user_signal_address = +- &ev->signal_page->user_address[ev->signal_slot_index]; ++ if ((p->signal_event_count & KFD_DEBUG_EVENT_MASK) == ++ KFD_DEBUG_EVENT_MASK) ++ p->signal_event_count++; + +- ev->event_id = make_signal_event_id(ev->signal_page, +- ev->signal_slot_index); ++ } else if (ev->type == KFD_EVENT_TYPE_DEBUG) { ++ if (!allocate_debug_event_notification_slot(devkfd, p, ++ &ev->signal_page, ++ &ev->signal_slot_index)) { ++ pr_warn("amdkfd: Debug event wasn't created because out of kernel memory\n"); ++ return -ENOMEM; ++ } + +- pr_debug("signal event number %zu created with id %d, address %p\n", +- p->signal_event_count, ev->event_id, +- ev->user_signal_address); ++ p->debug_event_count++; ++ } ++ ++ ev->user_signal_address = &ev->signal_page->user_address[ev->signal_slot_index]; ++ ++ ev->event_id = make_signal_event_id(ev->signal_page, ev->signal_slot_index); + + pr_debug("signal event number %zu created with id %d, address %p\n", + p->signal_event_count, ev->event_id, +@@ -320,12 +415,10 @@ static int create_signal_event(struct file *devkfd, + return 0; + } + +-/* +- * No non-signal events are supported yet. +- * We create them as events that never signal. +- * Set event calls from user-mode are failed. +- */ +-static int create_other_event(struct kfd_process *p, struct kfd_event *ev) ++/* No non-signal events are supported yet. ++ * We create them as events that never signal. Set event calls from user-mode are failed. */ ++static int ++create_other_event(struct kfd_process *p, struct kfd_event *ev) + { + ev->event_id = make_nonsignal_event_id(p); + if (ev->event_id == 0) +@@ -341,20 +434,25 @@ void kfd_event_init_process(struct kfd_process *p) + INIT_LIST_HEAD(&p->signal_event_pages); + p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID; + p->signal_event_count = 0; ++ p->debug_event_count = 0; + } + + static void destroy_event(struct kfd_process *p, struct kfd_event *ev) + { + if (ev->signal_page != NULL) { +- release_event_notification_slot(ev->signal_page, +- ev->signal_slot_index); +- p->signal_event_count--; ++ if (ev->type == KFD_EVENT_TYPE_SIGNAL) { ++ release_event_notification_slot(ev->signal_page, ++ ev->signal_slot_index); ++ p->signal_event_count--; ++ if ((p->signal_event_count & KFD_DEBUG_EVENT_MASK) == ++ KFD_DEBUG_EVENT_MASK) ++ p->signal_event_count--; ++ } else if (ev->type == KFD_EVENT_TYPE_DEBUG) { ++ p->debug_event_count--; ++ } + } + +- /* +- * Abandon the list of waiters. Individual waiting threads will +- * clean up their own data. +- */ ++ /* Abandon the list of waiters. Individual waiting threads will clean up their own data.*/ + list_del(&ev->waiters); + + hash_del(&ev->events); +@@ -371,18 +469,17 @@ static void destroy_events(struct kfd_process *p) + destroy_event(p, ev); + } + +-/* +- * We assume that the process is being destroyed and there is no need to +- * unmap the pages or keep bookkeeping data in order. +- */ ++/* We assume that the process is being destroyed and there is no need to unmap the pages ++ * or keep bookkeeping data in order. 
*/ + static void shutdown_signal_pages(struct kfd_process *p) + { + struct signal_page *page, *tmp; + +- list_for_each_entry_safe(page, tmp, &p->signal_event_pages, +- event_pages) { +- free_pages((unsigned long)page->kernel_address, +- get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); ++ list_for_each_entry_safe(page, tmp, &p->signal_event_pages, event_pages) { ++ if (page->user_address) { ++ free_pages((unsigned long)page->kernel_address, ++ get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); ++ } + kfree(page); + } + } +@@ -395,8 +492,7 @@ void kfd_event_free_process(struct kfd_process *p) + + static bool event_can_be_gpu_signaled(const struct kfd_event *ev) + { +- return ev->type == KFD_EVENT_TYPE_SIGNAL || +- ev->type == KFD_EVENT_TYPE_DEBUG; ++ return ev->type == KFD_EVENT_TYPE_SIGNAL || ev->type == KFD_EVENT_TYPE_DEBUG; + } + + static bool event_can_be_cpu_signaled(const struct kfd_event *ev) +@@ -407,11 +503,12 @@ static bool event_can_be_cpu_signaled(const struct kfd_event *ev) + int kfd_event_create(struct file *devkfd, struct kfd_process *p, + uint32_t event_type, bool auto_reset, uint32_t node_id, + uint32_t *event_id, uint32_t *event_trigger_data, +- uint64_t *event_page_offset, uint32_t *event_slot_index) ++ uint64_t *event_page_offset, uint32_t *event_slot_index, ++ void *kern_addr) + { + int ret = 0; +- struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL); + ++ struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) + return -ENOMEM; + +@@ -421,17 +518,20 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, + + INIT_LIST_HEAD(&ev->waiters); + +- *event_page_offset = 0; +- + mutex_lock(&p->event_mutex); + ++ if (kern_addr && list_empty(&p->signal_event_pages)) ++ allocate_signal_page_dgpu(p, kern_addr, *event_page_offset); ++ ++ *event_page_offset = 0; ++ + switch (event_type) { + case KFD_EVENT_TYPE_SIGNAL: + case KFD_EVENT_TYPE_DEBUG: + ret = create_signal_event(devkfd, p, ev); + if (!ret) { + *event_page_offset = (ev->signal_page->page_index | +- KFD_MMAP_EVENTS_MASK); ++ KFD_MMAP_TYPE_EVENTS); + *event_page_offset <<= PAGE_SHIFT; + *event_slot_index = ev->signal_slot_index; + } +@@ -538,8 +638,7 @@ int kfd_reset_event(struct kfd_process *p, uint32_t event_id) + + static void acknowledge_signal(struct kfd_process *p, struct kfd_event *ev) + { +- page_slots(ev->signal_page)[ev->signal_slot_index] = +- UNSIGNALED_EVENT_SLOT; ++ page_slots(ev->signal_page)[ev->signal_slot_index] = UNSIGNALED_EVENT_SLOT; + } + + static bool is_slot_signaled(struct signal_page *page, unsigned int index) +@@ -547,8 +646,7 @@ static bool is_slot_signaled(struct signal_page *page, unsigned int index) + return page_slots(page)[index] != UNSIGNALED_EVENT_SLOT; + } + +-static void set_event_from_interrupt(struct kfd_process *p, +- struct kfd_event *ev) ++static void set_event_from_interrupt(struct kfd_process *p, struct kfd_event *ev) + { + if (ev && event_can_be_gpu_signaled(ev)) { + acknowledge_signal(p, ev); +@@ -561,42 +659,39 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, + { + struct kfd_event *ev; + +- /* +- * Because we are called from arbitrary context (workqueue) as opposed ++ /* Because we are called from arbitrary context (workqueue) as opposed + * to process context, kfd_process could attempt to exit while we are +- * running so the lookup function returns a locked process. +- */ ++ * running so the lookup function returns a read-locked process. 
*/ + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); +- + if (!p) + return; /* Presumably process exited. */ + + mutex_lock(&p->event_mutex); + +- if (valid_id_bits >= INTERRUPT_DATA_BITS) { ++ if ((valid_id_bits >= INTERRUPT_DATA_BITS) && ++ ((partial_id & KFD_DEBUG_EVENT_MASK) == ++ KFD_DEBUG_EVENT_MASK)) { + /* Partial ID is a full ID. */ + ev = lookup_event_by_id(p, partial_id); + set_event_from_interrupt(p, ev); + } else { +- /* +- * Partial ID is in fact partial. For now we completely +- * ignore it, but we could use any bits we did receive to +- * search faster. +- */ ++ /* Partial ID is in fact partial. For now we completely ignore it, ++ * but we could use any bits we did receive to search faster. */ + struct signal_page *page; + unsigned i; + +- list_for_each_entry(page, &p->signal_event_pages, event_pages) +- for (i = 0; i < SLOTS_PER_PAGE; i++) ++ list_for_each_entry(page, &p->signal_event_pages, event_pages) { ++ for (i = 0; i < SLOTS_PER_PAGE; i++) { + if (is_slot_signaled(page, i)) { +- ev = lookup_event_by_page_slot(p, +- page, i); ++ ev = lookup_event_by_page_slot(p, page, i); + set_event_from_interrupt(p, ev); + } ++ } ++ } + } + + mutex_unlock(&p->event_mutex); +- mutex_unlock(&p->mutex); ++ up_read(&p->lock); + } + + static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) +@@ -604,20 +699,20 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) + struct kfd_event_waiter *event_waiters; + uint32_t i; + +- event_waiters = kmalloc_array(num_events, +- sizeof(struct kfd_event_waiter), +- GFP_KERNEL); ++ event_waiters = kmalloc(num_events * sizeof(struct kfd_event_waiter), GFP_KERNEL); + +- for (i = 0; (event_waiters) && (i < num_events) ; i++) { +- INIT_LIST_HEAD(&event_waiters[i].waiters); +- event_waiters[i].sleeping_task = current; +- event_waiters[i].activated = false; ++ if (event_waiters) { ++ for (i = 0; i < num_events; i++) { ++ INIT_LIST_HEAD(&event_waiters[i].waiters); ++ event_waiters[i].sleeping_task = current; ++ event_waiters[i].activated = false; ++ } + } + + return event_waiters; + } + +-static int init_event_waiter(struct kfd_process *p, ++static int init_event_waiter_get_status(struct kfd_process *p, + struct kfd_event_waiter *waiter, + uint32_t event_id, + uint32_t input_index) +@@ -632,13 +727,21 @@ static int init_event_waiter(struct kfd_process *p, + waiter->activated = ev->signaled; + ev->signaled = ev->signaled && !ev->auto_reset; + +- list_add(&waiter->waiters, &ev->waiters); +- + return 0; + } + ++static void init_event_waiter_add_to_waitlist(struct kfd_event_waiter *waiter) ++{ ++ struct kfd_event *ev = waiter->event; ++ ++ /* Only add to the wait list if we actually need to ++ * wait on this event. 
*/ ++ if (!waiter->activated) ++ list_add(&waiter->waiters, &ev->waiters); ++} ++ + static bool test_event_condition(bool all, uint32_t num_events, +- struct kfd_event_waiter *event_waiters) ++ struct kfd_event_waiter *event_waiters) + { + uint32_t i; + uint32_t activated_count = 0; +@@ -663,23 +766,15 @@ static bool copy_signaled_event_data(uint32_t num_events, + struct kfd_event_waiter *event_waiters, + struct kfd_event_data __user *data) + { +- struct kfd_hsa_memory_exception_data *src; +- struct kfd_hsa_memory_exception_data __user *dst; +- struct kfd_event_waiter *waiter; +- struct kfd_event *event; + uint32_t i; + +- for (i = 0; i < num_events; i++) { +- waiter = &event_waiters[i]; +- event = waiter->event; +- if (waiter->activated && event->type == KFD_EVENT_TYPE_MEMORY) { +- dst = &data[waiter->input_index].memory_exception_data; +- src = &event->memory_exception_data; +- if (copy_to_user(dst, src, +- sizeof(struct kfd_hsa_memory_exception_data))) ++ for (i = 0; i < num_events; i++) ++ if (event_waiters[i].activated && ++ event_waiters[i].event->type == KFD_EVENT_TYPE_MEMORY) ++ if (copy_to_user(&data[event_waiters[i].input_index].memory_exception_data, ++ &event_waiters[i].event->memory_exception_data, ++ sizeof(struct kfd_hsa_memory_exception_data))) + return false; +- } +- } + + return true; + +@@ -695,11 +790,9 @@ static long user_timeout_to_jiffies(uint32_t user_timeout_ms) + if (user_timeout_ms == KFD_EVENT_TIMEOUT_INFINITE) + return MAX_SCHEDULE_TIMEOUT; + +- /* +- * msecs_to_jiffies interprets all values above 2^31-1 as infinite, ++ /* msecs_to_jiffies interprets all values above 2^31-1 as infinite, + * but we consider them finite. +- * This hack is wrong, but nobody is likely to notice. +- */ ++ * This hack is wrong, but nobody is likely to notice. */ + user_timeout_ms = min_t(uint32_t, user_timeout_ms, 0x7FFFFFFF); + + return msecs_to_jiffies(user_timeout_ms) + 1; +@@ -724,11 +817,16 @@ int kfd_wait_on_events(struct kfd_process *p, + (struct kfd_event_data __user *) data; + uint32_t i; + int ret = 0; ++ + struct kfd_event_waiter *event_waiters = NULL; + long timeout = user_timeout_to_jiffies(user_timeout_ms); + + mutex_lock(&p->event_mutex); + ++ /* Set to something unreasonable - this is really ++ * just a bool for now. */ ++ *wait_result = KFD_WAIT_TIMEOUT; ++ + event_waiters = alloc_event_waiters(num_events); + if (!event_waiters) { + ret = -ENOMEM; +@@ -744,14 +842,34 @@ int kfd_wait_on_events(struct kfd_process *p, + goto fail; + } + +- ret = init_event_waiter(p, &event_waiters[i], ++ ret = init_event_waiter_get_status(p, &event_waiters[i], + event_data.event_id, i); + if (ret) + goto fail; + } + ++ /* Check condition once. */ ++ if (test_event_condition(all, num_events, event_waiters)) { ++ if (copy_signaled_event_data(num_events, ++ event_waiters, events)) ++ *wait_result = KFD_WAIT_COMPLETE; ++ else ++ *wait_result = KFD_WAIT_ERROR; ++ free_waiters(num_events, event_waiters); ++ } else { ++ /* Add to wait lists if we need to wait. */ ++ for (i = 0; i < num_events; i++) ++ init_event_waiter_add_to_waitlist(&event_waiters[i]); ++ } ++ + mutex_unlock(&p->event_mutex); + ++ /* Return if all waits were already satisfied. 
*/ ++ if (*wait_result != KFD_WAIT_TIMEOUT) { ++ __set_current_state(TASK_RUNNING); ++ return ret; ++ } ++ + while (true) { + if (fatal_signal_pending(current)) { + ret = -EINTR; +@@ -760,17 +878,17 @@ int kfd_wait_on_events(struct kfd_process *p, + + if (signal_pending(current)) { + /* +- * This is wrong when a nonzero, non-infinite timeout +- * is specified. We need to use +- * ERESTARTSYS_RESTARTBLOCK, but struct restart_block +- * contains a union with data for each user and it's +- * in generic kernel code that I don't want to +- * touch yet. ++ * This is wrong when a nonzero, non-infinite timeout is specified. ++ * We need to use ERESTARTSYS_RESTARTBLOCK, but struct restart_block ++ * contains a union with data for each user and it's in generic ++ * kernel code that I don't want to touch yet. + */ + ret = -ERESTARTSYS; + break; + } + ++ set_current_state(TASK_INTERRUPTIBLE); ++ + if (test_event_condition(all, num_events, event_waiters)) { + if (copy_signaled_event_data(num_events, + event_waiters, events)) +@@ -785,7 +903,7 @@ int kfd_wait_on_events(struct kfd_process *p, + break; + } + +- timeout = schedule_timeout_interruptible(timeout); ++ timeout = schedule_timeout(timeout); + } + __set_current_state(TASK_RUNNING); + +@@ -825,8 +943,7 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) + page = lookup_signal_page_by_index(p, page_index); + if (!page) { + /* Probably KFD bug, but mmap is user-accessible. */ +- pr_debug("signal page could not be found for page_index %u\n", +- page_index); ++ pr_debug("signal page could not be found for page_index %u\n", page_index); + return -EINVAL; + } + +@@ -858,23 +975,29 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) + static void lookup_events_by_type_and_signal(struct kfd_process *p, + int type, void *event_data) + { +- struct kfd_hsa_memory_exception_data *ev_data; + struct kfd_event *ev; + int bkt; + bool send_signal = true; + +- ev_data = (struct kfd_hsa_memory_exception_data *) event_data; +- +- hash_for_each(p->events, bkt, ev, events) ++ hash_for_each(p->events, bkt, ev, events) { + if (ev->type == type) { + send_signal = false; + dev_dbg(kfd_device, + "Event found: id %X type %d", + ev->event_id, ev->type); + set_event(ev); +- if (ev->type == KFD_EVENT_TYPE_MEMORY && ev_data) +- ev->memory_exception_data = *ev_data; ++ if (ev->type == KFD_EVENT_TYPE_MEMORY && event_data) ++ ev->memory_exception_data = ++ *(struct kfd_hsa_memory_exception_data *)event_data; + } ++ } ++ ++ if (type == KFD_EVENT_TYPE_MEMORY) { ++ dev_warn(kfd_device, ++ "Sending SIGSEGV to HSA Process with PID %d ", ++ p->lead_thread->pid); ++ send_sig(SIGSEGV, p->lead_thread, 0); ++ } + + /* Send SIGTERM no event of type "type" has been found*/ + if (send_signal) { +@@ -901,7 +1024,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, + /* + * Because we are called from arbitrary context (workqueue) as opposed + * to process context, kfd_process could attempt to exit while we are +- * running so the lookup function returns a locked process. ++ * running so the lookup function returns a read-locked process. 
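The reworked wait loop above follows the canonical kernel sleep pattern: set_current_state(TASK_INTERRUPTIBLE) is issued before test_event_condition(), so a wakeup arriving between the condition check and schedule_timeout() flips the task back to TASK_RUNNING and the subsequent schedule_timeout() returns without sleeping; no wakeup can be lost. A compressed sketch of the wait side (a pattern illustration, not a standalone program; condition_is_true() stands in for test_event_condition() and timeout is assumed declared as long):

    for (;;) {
            set_current_state(TASK_INTERRUPTIBLE);  /* 1: publish intent to sleep */
            if (condition_is_true())                /* 2: only then test          */
                    break;
            timeout = schedule_timeout(timeout);    /* 3: sleep until wake/timer  */
            if (!timeout)
                    break;                          /* timed out                  */
    }
    __set_current_state(TASK_RUNNING);

Switching from schedule_timeout_interruptible() to plain schedule_timeout() belongs to the same fix: the _interruptible variant sets TASK_INTERRUPTIBLE itself, which would overwrite the TASK_RUNNING state a waker may have just set after the condition test, reopening the lost-wakeup window.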
+ */ + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + +@@ -916,24 +1039,24 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, + memory_exception_data.gpu_id = dev->id; + memory_exception_data.va = address; + /* Set failure reason */ +- memory_exception_data.failure.NotPresent = 1; +- memory_exception_data.failure.NoExecute = 0; +- memory_exception_data.failure.ReadOnly = 0; ++ memory_exception_data.failure.NotPresent = true; ++ memory_exception_data.failure.NoExecute = false; ++ memory_exception_data.failure.ReadOnly = false; + if (vma) { + if (vma->vm_start > address) { +- memory_exception_data.failure.NotPresent = 1; +- memory_exception_data.failure.NoExecute = 0; +- memory_exception_data.failure.ReadOnly = 0; ++ memory_exception_data.failure.NotPresent = true; ++ memory_exception_data.failure.NoExecute = false; ++ memory_exception_data.failure.ReadOnly = false; + } else { +- memory_exception_data.failure.NotPresent = 0; ++ memory_exception_data.failure.NotPresent = false; + if (is_write_requested && !(vma->vm_flags & VM_WRITE)) +- memory_exception_data.failure.ReadOnly = 1; ++ memory_exception_data.failure.ReadOnly = true; + else +- memory_exception_data.failure.ReadOnly = 0; ++ memory_exception_data.failure.ReadOnly = false; + if (is_execute_requested && !(vma->vm_flags & VM_EXEC)) +- memory_exception_data.failure.NoExecute = 1; ++ memory_exception_data.failure.NoExecute = true; + else +- memory_exception_data.failure.NoExecute = 0; ++ memory_exception_data.failure.NoExecute = false; + } + } + +@@ -946,7 +1069,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, + &memory_exception_data); + + mutex_unlock(&p->event_mutex); +- mutex_unlock(&p->mutex); ++ up_read(&p->lock); + } + + void kfd_signal_hw_exception_event(unsigned int pasid) +@@ -954,7 +1077,7 @@ void kfd_signal_hw_exception_event(unsigned int pasid) + /* + * Because we are called from arbitrary context (workqueue) as opposed + * to process context, kfd_process could attempt to exit while we are +- * running so the lookup function returns a locked process. ++ * running so the lookup function returns a read-locked process. + */ + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + +@@ -967,5 +1090,42 @@ void kfd_signal_hw_exception_event(unsigned int pasid) + lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_HW_EXCEPTION, NULL); + + mutex_unlock(&p->event_mutex); +- mutex_unlock(&p->mutex); ++ up_read(&p->lock); ++} ++ ++void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, ++ struct kfd_vm_fault_info *info) ++{ ++ struct kfd_event *ev; ++ int bkt; ++ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); ++ struct kfd_hsa_memory_exception_data memory_exception_data; ++ ++ if (!p) ++ return; /* Presumably process exited. */ ++ memset(&memory_exception_data, 0, sizeof(memory_exception_data)); ++ memory_exception_data.gpu_id = dev->id; ++ /* Set failure reason */ ++ if (info) { ++ memory_exception_data.va = (info->page_addr) << PAGE_SHIFT; ++ memory_exception_data.failure.NotPresent = ++ info->prot_valid ? true : false; ++ memory_exception_data.failure.NoExecute = ++ info->prot_exec ? true : false; ++ memory_exception_data.failure.ReadOnly = ++ info->prot_write ? 
true : false; ++ } ++ mutex_lock(&p->event_mutex); ++ ++ hash_for_each(p->events, bkt, ev, events) { ++ if (ev->type == KFD_EVENT_TYPE_MEMORY) { ++ ev->memory_exception_data = memory_exception_data; ++ set_event(ev); ++ } ++ } ++ ++ mutex_unlock(&p->event_mutex); ++ up_read(&p->lock); ++ + } ++ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_events.h +index 28f6838..d7987eb 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.h +@@ -34,8 +34,7 @@ + #define KFD_FIRST_NONSIGNAL_EVENT_ID KFD_EVENT_ID_NONSIGNAL_MASK + #define KFD_LAST_NONSIGNAL_EVENT_ID UINT_MAX + +-/* +- * Written into kfd_signal_slot_t to indicate that the event is not signaled. ++/* Written into kfd_signal_slot_t to indicate that the event is not signaled. + * Since the event protocol may need to write the event ID into memory, this + * must not be a valid event ID. + * For the sake of easy memset-ing, this must be a byte pattern. +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +index 2b65510..587f847 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +@@ -33,7 +33,7 @@ + #include <linux/time.h> + #include "kfd_priv.h" + #include <linux/mm.h> +-#include <linux/mman.h> ++#include <uapi/asm-generic/mman-common.h> + #include <asm/processor.h> + + /* +@@ -278,21 +278,36 @@ + #define MAKE_GPUVM_APP_BASE(gpu_num) \ + (((uint64_t)(gpu_num) << 61) + 0x1000000000000L) + +-#define MAKE_GPUVM_APP_LIMIT(base) \ +- (((uint64_t)(base) & \ +- 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL) ++#define MAKE_GPUVM_APP_LIMIT(base, size) \ ++ (((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1) + +-#define MAKE_SCRATCH_APP_BASE(gpu_num) \ +- (((uint64_t)(gpu_num) << 61) + 0x100000000L) ++#define MAKE_SCRATCH_APP_BASE() \ ++ (((uint64_t)(0x1UL) << 61) + 0x100000000L) + + #define MAKE_SCRATCH_APP_LIMIT(base) \ + (((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) + +-#define MAKE_LDS_APP_BASE(gpu_num) \ +- (((uint64_t)(gpu_num) << 61) + 0x0) ++#define MAKE_LDS_APP_BASE() \ ++ (((uint64_t)(0x1UL) << 61) + 0x0) ++ + #define MAKE_LDS_APP_LIMIT(base) \ + (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) + ++ ++#define DGPU_VM_BASE_DEFAULT 0x100000 ++ ++int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd, ++ uint64_t base, uint64_t limit) ++{ ++ if (base < (pdd->qpd.cwsr_base + pdd->dev->cwsr_size)) { ++ pr_err("Set dgpu vm base 0x%llx failed.\n", base); ++ return -EINVAL; ++ } ++ pdd->dgpu_base = base; ++ pdd->dgpu_limit = limit; ++ return 0; ++} ++ + int kfd_init_apertures(struct kfd_process *process) + { + uint8_t id = 0; +@@ -300,13 +315,16 @@ int kfd_init_apertures(struct kfd_process *process) + struct kfd_process_device *pdd; + + /*Iterating over all devices*/ +- while ((dev = kfd_topology_enum_kfd_devices(id)) != NULL && +- id < NUM_OF_SUPPORTED_GPUS) { ++ while (kfd_topology_enum_kfd_devices(id, &dev) == 0) { ++ if (!dev) { ++ id++; /* Skip non GPU devices */ ++ continue; ++ } + + pdd = kfd_create_process_device_data(dev, process); + if (pdd == NULL) { + pr_err("Failed to create process device data\n"); +- return -1; ++ goto err; + } + /* + * For 64 bit process aperture will be statically reserved in +@@ -322,19 +340,24 @@ int kfd_init_apertures(struct kfd_process *process) + * node id couldn't be 0 - the three MSB bits of + * aperture shoudn't be 0 + */ +- pdd->lds_base = MAKE_LDS_APP_BASE(id + 1); ++ pdd->lds_base = 
MAKE_LDS_APP_BASE(); + + pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); + + pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1); + +- pdd->gpuvm_limit = +- MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base); ++ pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT( ++ pdd->gpuvm_base, ++ dev->shared_resources.gpuvm_size); + +- pdd->scratch_base = MAKE_SCRATCH_APP_BASE(id + 1); ++ pdd->scratch_base = MAKE_SCRATCH_APP_BASE(); + + pdd->scratch_limit = + MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); ++ ++ if (KFD_IS_DGPU(dev->device_info->asic_family)) ++ pdd->qpd.cwsr_base = DGPU_VM_BASE_DEFAULT; ++ + } + + dev_dbg(kfd_device, "node id %u\n", id); +@@ -350,6 +373,32 @@ int kfd_init_apertures(struct kfd_process *process) + } + + return 0; ++ ++err: ++ return -1; + } + ++void radeon_flush_tlb(struct kfd_dev *dev, uint32_t pasid) ++{ ++ uint8_t vmid; ++ int first_vmid_to_scan = 8; ++ int last_vmid_to_scan = 15; + ++ const struct kfd2kgd_calls *f2g = dev->kfd2kgd; ++ /* Scan all registers in the range ATC_VMID8_PASID_MAPPING .. ATC_VMID15_PASID_MAPPING ++ * to check which VMID the current process is mapped to ++ * and flush TLB for this VMID if found*/ ++ for (vmid = first_vmid_to_scan; vmid <= last_vmid_to_scan; vmid++) { ++ if (f2g->get_atc_vmid_pasid_mapping_valid( ++ dev->kgd, vmid)) { ++ if (f2g->get_atc_vmid_pasid_mapping_pasid( ++ dev->kgd, vmid) == pasid) { ++ dev_dbg(kfd_device, ++ "TLB of vmid %u", vmid); ++ f2g->write_vmid_invalidate_request( ++ dev->kgd, vmid); ++ break; ++ } ++ } ++ } ++} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +index 7f134aa..a8cdbc8 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +@@ -172,8 +172,7 @@ static void interrupt_wq(struct work_struct *work) + sizeof(uint32_t))]; + + while (dequeue_ih_ring_entry(dev, ih_ring_entry)) +- dev->device_info->event_interrupt_class->interrupt_wq(dev, +- ih_ring_entry); ++ dev->device_info->event_interrupt_class->interrupt_wq(dev, ih_ring_entry); + } + + bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry) +@@ -181,8 +180,7 @@ bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry) + /* integer and bitwise OR so there is no boolean short-circuiting */ + unsigned wanted = 0; + +- wanted |= dev->device_info->event_interrupt_class->interrupt_isr(dev, +- ih_ring_entry); ++ wanted |= dev->device_info->event_interrupt_class->interrupt_isr(dev, ih_ring_entry); + + return wanted != 0; + } +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +index d135cd0..513cfe6 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +@@ -143,7 +143,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, + kq->queue->pipe = KFD_CIK_HIQ_PIPE; + kq->queue->queue = KFD_CIK_HIQ_QUEUE; + kq->mqd->load_mqd(kq->mqd, kq->queue->mqd, kq->queue->pipe, +- kq->queue->queue, NULL); ++ kq->queue->queue, NULL, 0); + } else { + /* allocate fence for DIQ */ + +@@ -213,20 +213,23 @@ static int acquire_packet_buffer(struct kernel_queue *kq, + + BUG_ON(!kq || !buffer_ptr); + ++ /* When rptr == wptr, the buffer is empty. ++ * When rptr == wptr + 1, the buffer is full. ++ * It is always rptr that advances to the position of wptr, rather than ++ * the opposite. So we can only use up to queue_size_dwords - 1 dwords. 
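The empty/full convention spelled out in this comment yields the standard ring-buffer free-space formula used right below, available = (rptr + size - 1 - wptr) % size, with one slot always kept open so that rptr == wptr unambiguously means empty. A standalone check of the arithmetic (the 16-dword size is made up, purely to exercise the formula):

    #include <stdio.h>

    /* Free dwords in a ring that keeps one slot open:
     * rptr == wptr     -> empty (size - 1 usable)
     * rptr == wptr + 1 -> full  (0 usable)          */
    static unsigned int ring_free(unsigned int rptr, unsigned int wptr,
                                  unsigned int size)
    {
            return (rptr + size - 1 - wptr) % size;
    }

    int main(void)
    {
            printf("%u\n", ring_free(0, 0, 16));    /* empty   -> 15 */
            printf("%u\n", ring_free(5, 4, 16));    /* full    -> 0  */
            printf("%u\n", ring_free(2, 9, 16));    /* wrapped -> 8  */
            return 0;
    }

The wrap-around guard that follows (rejecting packet_size_in_dwords >= rptr before rolling wptr back to position 0) keeps the relocated packet from running into the reader after the nop fill.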
++ */ + rptr = *kq->rptr_kernel; + wptr = *kq->wptr_kernel; + queue_address = (unsigned int *)kq->pq_kernel_addr; + queue_size_dwords = kq->queue->properties.queue_size / sizeof(uint32_t); + +- pr_debug("rptr: %d\n", rptr); +- pr_debug("wptr: %d\n", wptr); +- pr_debug("queue_address 0x%p\n", queue_address); ++ pr_debug("amdkfd: In func %s\n rptr: %d\n wptr: %d\n queue_address 0x%p\n", ++ __func__, rptr, wptr, queue_address); + +- available_size = (rptr - 1 - wptr + queue_size_dwords) % ++ available_size = (rptr + queue_size_dwords - 1 - wptr) % + queue_size_dwords; + +- if (packet_size_in_dwords >= queue_size_dwords || +- packet_size_in_dwords >= available_size) { ++ if (packet_size_in_dwords > available_size) { + /* + * make sure calling functions know + * acquire_packet_buffer() failed +@@ -236,6 +239,13 @@ static int acquire_packet_buffer(struct kernel_queue *kq, + } + + if (wptr + packet_size_in_dwords >= queue_size_dwords) { ++ /* make sure after rolling back to position 0, there is ++ * still enough space. */ ++ if (packet_size_in_dwords >= rptr) { ++ *buffer_ptr = NULL; ++ return -ENOMEM; ++ } ++ /* fill nops, roll back and start at position 0 */ + while (wptr > 0) { + queue_address[wptr] = kq->nop_packet; + wptr = (wptr + 1) % queue_size_dwords; +@@ -295,6 +305,8 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, + + switch (dev->device_info->asic_family) { + case CHIP_CARRIZO: ++ case CHIP_TONGA: ++ case CHIP_FIJI: + kernel_queue_init_vi(&kq->ops_asic_specific); + break; + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c +index 850a562..e9b886d 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c +@@ -29,10 +29,11 @@ + #define KFD_DRIVER_AUTHOR "AMD Inc. 
and others" + + #define KFD_DRIVER_DESC "Standalone HSA driver for AMD's GPUs" +-#define KFD_DRIVER_DATE "20150421" +-#define KFD_DRIVER_MAJOR 0 +-#define KFD_DRIVER_MINOR 7 +-#define KFD_DRIVER_PATCHLEVEL 2 ++#define KFD_DRIVER_DATE "20160129" ++#define KFD_DRIVER_MAJOR 1 ++#define KFD_DRIVER_MINOR 8 ++#define KFD_DRIVER_PATCHLEVEL 1 ++#define KFD_DRIVER_RC_LEVEL "" + + static const struct kgd2kfd_calls kgd2kfd = { + .exit = kgd2kfd_exit, +@@ -42,6 +43,10 @@ static const struct kgd2kfd_calls kgd2kfd = { + .interrupt = kgd2kfd_interrupt, + .suspend = kgd2kfd_suspend, + .resume = kgd2kfd_resume, ++ .evict_bo = kgd2kfd_evict_bo, ++ .restore = kgd2kfd_restore, ++ .quiesce_mm = kgd2kfd_quiesce_mm, ++ .resume_mm = kgd2kfd_resume_mm, + }; + + int sched_policy = KFD_SCHED_POLICY_HWS; +@@ -49,6 +54,15 @@ module_param(sched_policy, int, 0444); + MODULE_PARM_DESC(sched_policy, + "Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)"); + ++int hws_max_conc_proc = 0; ++module_param(hws_max_conc_proc, int, 0444); ++MODULE_PARM_DESC(hws_max_conc_proc, ++ "Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency (Default), #VMIDs for KFD = Maximum)"); ++ ++int cwsr_enable = 1; ++module_param(cwsr_enable, int, 0444); ++MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))"); ++ + int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT; + module_param(max_num_of_queues_per_device, int, 0444); + MODULE_PARM_DESC(max_num_of_queues_per_device, +@@ -61,6 +75,11 @@ MODULE_PARM_DESC(send_sigterm, + + static int amdkfd_init_completed; + ++int debug_largebar = 0; ++module_param(debug_largebar, int, 0444); ++MODULE_PARM_DESC(debug_largebar, ++ "Debug large-bar flag used to simulate large-bar capability on non-large bar machine (0 = disable, 1 = enable)"); ++ + int kgd2kfd_init(unsigned interface_version, const struct kgd2kfd_calls **g2f) + { + if (!amdkfd_init_completed) +@@ -149,4 +168,5 @@ MODULE_DESCRIPTION(KFD_DRIVER_DESC); + MODULE_LICENSE("GPL and additional rights"); + MODULE_VERSION(__stringify(KFD_DRIVER_MAJOR) "." + __stringify(KFD_DRIVER_MINOR) "." 
+- __stringify(KFD_DRIVER_PATCHLEVEL)); ++ __stringify(KFD_DRIVER_PATCHLEVEL) ++ KFD_DRIVER_RC_LEVEL); +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +index b1ef136..ef1dc9b 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +@@ -31,6 +31,9 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, + return mqd_manager_init_cik(type, dev); + case CHIP_CARRIZO: + return mqd_manager_init_vi(type, dev); ++ case CHIP_TONGA: ++ case CHIP_FIJI: ++ return mqd_manager_init_vi_tonga(type, dev); + } + + return NULL; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h +index 213a71e..eb60192 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h +@@ -67,7 +67,8 @@ struct mqd_manager { + + int (*load_mqd)(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, +- uint32_t __user *wptr); ++ uint32_t __user *wptr, ++ uint32_t page_table_base); + + int (*update_mqd)(struct mqd_manager *mm, void *mqd, + struct queue_properties *q); +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +index 6acc431..62dbdca 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +@@ -31,11 +31,71 @@ + #include "cik_structs.h" + #include "oss/oss_2_4_sh_mask.h" + ++#define AQL_ENABLE 1 ++ + static inline struct cik_mqd *get_mqd(void *mqd) + { + return (struct cik_mqd *)mqd; + } + ++static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) ++{ ++ return (struct cik_sdma_rlc_registers *)mqd; ++} ++ ++static void update_cu_mask(struct mqd_manager *mm, void *mqd, ++ struct queue_properties *q) ++{ ++ struct cik_mqd *m; ++ struct kfd_cu_info cu_info; ++ uint32_t mgmt_se_mask; ++ uint32_t cu_sh_mask, cu_sh_shift; ++ uint32_t cu_mask; ++ int se, sh; ++ ++ if (q->cu_mask == 0) ++ return; ++ ++ m = get_mqd(mqd); ++ m->compute_static_thread_mgmt_se0 = 0; ++ m->compute_static_thread_mgmt_se1 = 0; ++ m->compute_static_thread_mgmt_se2 = 0; ++ m->compute_static_thread_mgmt_se3 = 0; ++ ++ mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); ++ cu_mask = q->cu_mask; ++ for (se = 0; se < cu_info.num_shader_engines && cu_mask; se++) { ++ mgmt_se_mask = 0; ++ for (sh = 0; sh < 2 && cu_mask; sh++) { ++ cu_sh_shift = hweight32(cu_info.cu_bitmap[se][sh]); ++ cu_sh_mask = (1 << cu_sh_shift) - 1; ++ mgmt_se_mask |= (cu_mask & cu_sh_mask) << (sh * 16); ++ cu_mask >>= cu_sh_shift; ++ } ++ switch (se) { ++ case 0: ++ m->compute_static_thread_mgmt_se0 = mgmt_se_mask; ++ break; ++ case 1: ++ m->compute_static_thread_mgmt_se1 = mgmt_se_mask; ++ break; ++ case 2: ++ m->compute_static_thread_mgmt_se2 = mgmt_se_mask; ++ break; ++ case 3: ++ m->compute_static_thread_mgmt_se3 = mgmt_se_mask; ++ break; ++ default: ++ break; ++ } ++ } ++ pr_debug("kfd: update cu mask to %#x %#x %#x %#x\n", ++ m->compute_static_thread_mgmt_se0, ++ m->compute_static_thread_mgmt_se1, ++ m->compute_static_thread_mgmt_se2, ++ m->compute_static_thread_mgmt_se3); ++} ++ + static int init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +@@ -152,15 +212,16 @@ static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, + } + + static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, +- uint32_t queue_id, 
uint32_t __user *wptr) ++ uint32_t queue_id, uint32_t __user *wptr, ++ uint32_t page_table_base) + { + return mm->dev->kfd2kgd->hqd_load +- (mm->dev->kgd, mqd, pipe_id, queue_id, wptr); ++ (mm->dev->kgd, mqd, pipe_id, queue_id, wptr, page_table_base); + } + + static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, +- uint32_t __user *wptr) ++ uint32_t __user *wptr, uint32_t page_table_base) + { + return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd); + } +@@ -197,11 +258,14 @@ static int update_mqd(struct mqd_manager *mm, void *mqd, + m->cp_hqd_pq_control |= NO_UPDATE_RPTR; + } + ++ update_cu_mask(mm, mqd, q); ++ + m->cp_hqd_active = 0; + q->is_active = false; + if (q->queue_size > 0 && + q->queue_address != 0 && +- q->queue_percent > 0) { ++ q->queue_percent > 0 && ++ !q->is_evicted) { + m->cp_hqd_active = 1; + q->is_active = true; + } +@@ -217,8 +281,8 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, + BUG_ON(!mm || !mqd || !q); + + m = get_sdma_mqd(mqd); +- m->sdma_rlc_rb_cntl = ffs(q->queue_size / sizeof(unsigned int)) << +- SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | ++ m->sdma_rlc_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) ++ << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | + q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | + 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | + 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; +@@ -239,7 +303,8 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, + q->is_active = false; + if (q->queue_size > 0 && + q->queue_address != 0 && +- q->queue_percent > 0) { ++ q->queue_percent > 0 && ++ !q->is_evicted) { + m->sdma_rlc_rb_cntl |= + 1 << SDMA0_RLC0_RB_CNTL__RB_ENABLE__SHIFT; + +@@ -388,7 +453,8 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, + q->is_active = false; + if (q->queue_size > 0 && + q->queue_address != 0 && +- q->queue_percent > 0) { ++ q->queue_percent > 0 && ++ !q->is_evicted) { + m->cp_hqd_active = 1; + q->is_active = true; + } +@@ -396,16 +462,6 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, + return 0; + } + +-struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) +-{ +- struct cik_sdma_rlc_registers *m; +- +- BUG_ON(!mqd); +- +- m = (struct cik_sdma_rlc_registers *)mqd; +- +- return m; +-} + + struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +index a9b9882..4260c2f 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +@@ -28,9 +28,9 @@ + #include "kfd_priv.h" + #include "kfd_mqd_manager.h" + #include "vi_structs.h" +-#include "gca/gfx_8_0_sh_mask.h" +-#include "gca/gfx_8_0_enum.h" +- ++#include "asic_reg/gca/gfx_8_0_sh_mask.h" ++#include "asic_reg/gca/gfx_8_0_enum.h" ++#include "oss/oss_3_0_sh_mask.h" + #define CP_MQD_CONTROL__PRIV_STATE__SHIFT 0x8 + + static inline struct vi_mqd *get_mqd(void *mqd) +@@ -38,6 +38,64 @@ static inline struct vi_mqd *get_mqd(void *mqd) + return (struct vi_mqd *)mqd; + } + ++static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) ++{ ++ return (struct vi_sdma_mqd *)mqd; ++} ++ ++static void update_cu_mask(struct mqd_manager *mm, void *mqd, ++ struct queue_properties *q) ++{ ++ struct vi_mqd *m; ++ struct kfd_cu_info cu_info; ++ uint32_t mgmt_se_mask; ++ uint32_t cu_sh_mask, cu_sh_shift; ++ uint32_t cu_mask; ++ int se, sh; ++ ++ if (q->cu_mask == 0) ++ return; ++ ++ m = 
get_mqd(mqd); ++ m->compute_static_thread_mgmt_se0 = 0; ++ m->compute_static_thread_mgmt_se1 = 0; ++ m->compute_static_thread_mgmt_se2 = 0; ++ m->compute_static_thread_mgmt_se3 = 0; ++ ++ mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); ++ cu_mask = q->cu_mask; ++ for (se = 0; se < cu_info.num_shader_engines && cu_mask; se++) { ++ mgmt_se_mask = 0; ++ for (sh = 0; sh < 2 && cu_mask; sh++) { ++ cu_sh_shift = hweight32(cu_info.cu_bitmap[se][sh]); ++ cu_sh_mask = (1 << cu_sh_shift) - 1; ++ mgmt_se_mask |= (cu_mask & cu_sh_mask) << (sh * 16); ++ cu_mask >>= cu_sh_shift; ++ } ++ switch (se) { ++ case 0: ++ m->compute_static_thread_mgmt_se0 = mgmt_se_mask; ++ break; ++ case 1: ++ m->compute_static_thread_mgmt_se1 = mgmt_se_mask; ++ break; ++ case 2: ++ m->compute_static_thread_mgmt_se2 = mgmt_se_mask; ++ break; ++ case 3: ++ m->compute_static_thread_mgmt_se3 = mgmt_se_mask; ++ break; ++ default: ++ break; ++ } ++ } ++ pr_debug("kfd: update cu mask to %#x %#x %#x %#x\n", ++ m->compute_static_thread_mgmt_se0, ++ m->compute_static_thread_mgmt_se1, ++ m->compute_static_thread_mgmt_se2, ++ m->compute_static_thread_mgmt_se3); ++} ++ + static int init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +@@ -84,6 +142,25 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, + if (q->format == KFD_QUEUE_FORMAT_AQL) + m->cp_hqd_iq_rptr = 1; + ++ if (q->tba_addr) { ++ m->cp_hqd_persistent_state |= ++ (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); ++ m->compute_pgm_rsrc2 |= ++ (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); ++ m->cp_hqd_ctx_save_base_addr_lo = ++ lower_32_bits(q->ctx_save_restore_area_address); ++ m->cp_hqd_ctx_save_base_addr_hi = ++ upper_32_bits(q->ctx_save_restore_area_address); ++ m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; ++ m->cp_hqd_cntl_stack_size = q->ctl_stack_size; ++ m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; ++ m->cp_hqd_wg_state_offset = q->ctl_stack_size; ++ m->compute_tba_lo = lower_32_bits(q->tba_addr >> 8); ++ m->compute_tba_hi = upper_32_bits(q->tba_addr >> 8); ++ m->compute_tma_lo = lower_32_bits(q->tma_addr >> 8); ++ m->compute_tma_hi = upper_32_bits(q->tma_addr >> 8); ++ } ++ + *mqd = m; + if (gart_addr != NULL) + *gart_addr = addr; +@@ -94,10 +171,10 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, + + static int load_mqd(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, +- uint32_t __user *wptr) ++ uint32_t __user *wptr, uint32_t page_table_base) + { + return mm->dev->kfd2kgd->hqd_load +- (mm->dev->kgd, mqd, pipe_id, queue_id, wptr); ++ (mm->dev->kgd, mqd, pipe_id, queue_id, wptr, page_table_base); + } + + static int __update_mqd(struct mqd_manager *mm, void *mqd, +@@ -155,12 +232,19 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | + 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT; + } ++ if (q->tba_addr) ++ m->cp_hqd_ctx_save_control = ++ atc_bit << CP_HQD_CTX_SAVE_CONTROL__ATC__SHIFT | ++ mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT; ++ ++ update_cu_mask(mm, mqd, q); + + m->cp_hqd_active = 0; + q->is_active = false; + if (q->queue_size > 0 && + q->queue_address != 0 && +- q->queue_percent > 0) { ++ q->queue_percent > 0 && ++ !q->is_evicted) { + m->cp_hqd_active = 1; + q->is_active = true; + } +@@ -175,6 +259,12 @@ static int update_mqd(struct mqd_manager *mm, void *mqd, + return __update_mqd(mm, mqd, q, MTYPE_CC, 1); + } + ++static int 
update_mqd_tonga(struct mqd_manager *mm, void *mqd, ++ struct queue_properties *q) ++{ ++ return __update_mqd(mm, mqd, q, MTYPE_UC, 0); ++} ++ + static int destroy_mqd(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, +@@ -233,6 +323,111 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, + return retval; + } + ++static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, ++ struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, ++ struct queue_properties *q) ++{ ++ int retval; ++ struct vi_sdma_mqd *m; ++ ++ ++ BUG_ON(!mm || !mqd || !mqd_mem_obj); ++ ++ retval = kfd_gtt_sa_allocate(mm->dev, ++ sizeof(struct vi_sdma_mqd), ++ mqd_mem_obj); ++ ++ if (retval != 0) ++ return -ENOMEM; ++ ++ m = (struct vi_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; ++ ++ memset(m, 0, sizeof(struct vi_sdma_mqd)); ++ ++ *mqd = m; ++ if (gart_addr != NULL) ++ *gart_addr = (*mqd_mem_obj)->gpu_addr; ++ ++ retval = mm->update_mqd(mm, m, q); ++ ++ return retval; ++} ++ ++static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, ++ struct kfd_mem_obj *mqd_mem_obj) ++{ ++ BUG_ON(!mm || !mqd); ++ kfd_gtt_sa_free(mm->dev, mqd_mem_obj); ++} ++ ++static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, ++ uint32_t pipe_id, uint32_t queue_id, ++ uint32_t __user *wptr, uint32_t page_table_base) ++{ ++ return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd); ++} ++ ++static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, ++ struct queue_properties *q) ++{ ++ struct vi_sdma_mqd *m; ++ BUG_ON(!mm || !mqd || !q); ++ ++ m = get_sdma_mqd(mqd); ++ m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) ++ << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | ++ q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | ++ 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | ++ 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; ++ ++ m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); ++ m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); ++ m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); ++ m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); ++ m->sdmax_rlcx_doorbell = q->doorbell_off << ++ SDMA0_RLC0_DOORBELL__OFFSET__SHIFT | ++ 1 << SDMA0_RLC0_DOORBELL__ENABLE__SHIFT; ++ ++ m->sdmax_rlcx_virtual_addr = q->sdma_vm_addr; ++ ++ m->sdma_engine_id = q->sdma_engine_id; ++ m->sdma_queue_id = q->sdma_queue_id; ++ ++ q->is_active = false; ++ if (q->queue_size > 0 && ++ q->queue_address != 0 && ++ q->queue_percent > 0 && ++ !q->is_evicted) { ++ m->sdmax_rlcx_rb_cntl |= ++ 1 << SDMA0_RLC0_RB_CNTL__RB_ENABLE__SHIFT; ++ ++ q->is_active = true; ++ } ++ ++ return 0; ++} ++ ++/* ++ * * preempt type here is ignored because there is only one way ++ * * to preempt sdma queue ++ */ ++static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, ++ enum kfd_preempt_type type, ++ unsigned int timeout, uint32_t pipe_id, ++ uint32_t queue_id) ++{ ++ return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); ++} ++ ++static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, ++ uint64_t queue_address, uint32_t pipe_id, ++ uint32_t queue_id) ++{ ++ return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); ++} ++ ++ ++ + struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) + { +@@ -268,6 +463,12 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, + mqd->is_occupied = is_occupied; + break; + case KFD_MQD_TYPE_SDMA: ++ mqd->init_mqd = 
init_mqd_sdma; ++ mqd->uninit_mqd = uninit_mqd_sdma; ++ mqd->load_mqd = load_mqd_sdma; ++ mqd->update_mqd = update_mqd_sdma; ++ mqd->destroy_mqd = destroy_mqd_sdma; ++ mqd->is_occupied = is_occupied_sdma; + break; + default: + kfree(mqd); +@@ -276,3 +477,17 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, + + return mqd; + } ++ ++struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, ++ struct kfd_dev *dev) ++{ ++ struct mqd_manager *mqd; ++ ++ mqd = mqd_manager_init_vi(type, dev); ++ if (!mqd) ++ return NULL; ++ if ((type == KFD_MQD_TYPE_CP) || (type == KFD_MQD_TYPE_COMPUTE)) ++ mqd->update_mqd = update_mqd_tonga; ++ return mqd; ++} ++ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +index 7e92921..55f7098 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +@@ -57,25 +57,37 @@ static void pm_calc_rlib_size(struct packet_manager *pm, + { + unsigned int process_count, queue_count; + unsigned int map_queue_size; ++ unsigned int max_proc_per_quantum = 1; + +- BUG_ON(!pm || !rlib_size || !over_subscription); ++ struct kfd_dev *dev = pm->dqm->dev; ++ ++ BUG_ON(!pm || !rlib_size || !over_subscription || !dev); + + process_count = pm->dqm->processes_count; + queue_count = pm->dqm->queue_count; + +- /* check if there is over subscription*/ ++ /* check if there is over subscription ++ * Note: the arbitration between the number of VMIDs and ++ * hws_max_conc_proc has been done in ++ * kgd2kfd_device_init(). ++ */ ++ + *over_subscription = false; +- if ((process_count > 1) || queue_count > get_queues_num(pm->dqm)) { ++ ++ if (dev->max_proc_per_quantum > 1) ++ max_proc_per_quantum = dev->max_proc_per_quantum; ++ ++ if ((process_count > max_proc_per_quantum) || ++ queue_count > get_queues_num(pm->dqm)) { + *over_subscription = true; + pr_debug("kfd: over subscribed runlist\n"); + } + +- map_queue_size = +- (pm->dqm->dev->device_info->asic_family == CHIP_CARRIZO) ? ++ map_queue_size = KFD_IS_VI(pm->dqm->dev->device_info->asic_family) ? 
+ sizeof(struct pm4_mes_map_queues) : + sizeof(struct pm4_map_queues); + /* calculate run list ib allocation size */ +- *rlib_size = process_count * sizeof(struct pm4_map_process) + ++ *rlib_size = process_count * pm->pmf->get_map_process_packet_size() + + queue_count * map_queue_size; + + /* +@@ -102,11 +114,14 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, + + pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription); + ++ mutex_lock(&pm->lock); ++ + retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size, + &pm->ib_buffer_obj); + + if (retval != 0) { + pr_err("kfd: failed to allocate runlist IB\n"); ++ mutex_unlock(&pm->lock); + return retval; + } + +@@ -115,6 +130,8 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, + + memset(*rl_buffer, 0, *rl_buffer_size); + pm->allocated = true; ++ ++ mutex_unlock(&pm->lock); + return retval; + } + +@@ -122,9 +139,24 @@ static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer, + uint64_t ib, size_t ib_size_in_dwords, bool chain) + { + struct pm4_runlist *packet; ++ int concurrent_proc_cnt = 0; ++ struct kfd_dev *kfd = pm->dqm->dev; + + BUG_ON(!pm || !buffer || !ib); + ++ /* Determine the number of processes to map together to HW: ++ * it can not exceed the number of VMIDs available to the ++ * scheduler, and it is determined by the smaller of the number ++ * of processes in the runlist and kfd module parameter ++ * hws_max_conc_proc. ++ * Note: the arbitration between the number of VMIDs and ++ * hws_max_conc_proc has been done in ++ * kgd2kfd_device_init(). ++ */ ++ concurrent_proc_cnt = min(pm->dqm->processes_count, ++ kfd->max_proc_per_quantum); ++ ++ + packet = (struct pm4_runlist *)buffer; + + memset(buffer, 0, sizeof(struct pm4_runlist)); +@@ -135,6 +167,7 @@ static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer, + packet->bitfields4.chain = chain ? 1 : 0; + packet->bitfields4.offload_polling = 0; + packet->bitfields4.valid = 1; ++ packet->bitfields4.process_cnt = concurrent_proc_cnt; + packet->ordinal2 = lower_32_bits(ib); + packet->bitfields3.ib_base_hi = upper_32_bits(ib); + +@@ -181,6 +214,90 @@ static int pm_create_map_process(struct packet_manager *pm, uint32_t *buffer, + return 0; + } + ++static int pm_create_map_process_scratch_kv(struct packet_manager *pm, ++ uint32_t *buffer, struct qcm_process_device *qpd) ++{ ++ struct pm4_map_process_scratch_kv *packet; ++ struct queue *cur; ++ uint32_t num_queues; ++ ++ BUG_ON(!pm || !buffer || !qpd); ++ ++ packet = (struct pm4_map_process_scratch_kv *)buffer; ++ ++ pr_debug("kfd: In func %s\n", __func__); ++ ++ memset(buffer, 0, sizeof(struct pm4_map_process_scratch_kv)); ++ ++ packet->header.u32all = build_pm4_header(IT_MAP_PROCESS, ++ sizeof(struct pm4_map_process_scratch_kv)); ++ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; ++ packet->bitfields2.process_quantum = 1; ++ packet->bitfields2.pasid = qpd->pqm->process->pasid; ++ packet->bitfields3.page_table_base = qpd->page_table_base; ++ packet->bitfields14.gds_size = qpd->gds_size; ++ packet->bitfields14.num_gws = qpd->num_gws; ++ packet->bitfields14.num_oac = qpd->num_oac; ++ num_queues = 0; ++ list_for_each_entry(cur, &qpd->queues_list, list) ++ num_queues++; ++ packet->bitfields14.num_queues = (qpd->is_debug) ? 
0 : num_queues; ++ ++ packet->sh_mem_config = qpd->sh_mem_config; ++ packet->sh_mem_bases = qpd->sh_mem_bases; ++ packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; ++ packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; ++ ++ packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; ++ ++ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); ++ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); ++ ++ return 0; ++} ++ ++static int pm_create_map_process_scratch(struct packet_manager *pm, ++ uint32_t *buffer, struct qcm_process_device *qpd) ++{ ++ struct pm4_map_process_scratch *packet; ++ struct queue *cur; ++ uint32_t num_queues; ++ ++ BUG_ON(!pm || !buffer || !qpd); ++ ++ packet = (struct pm4_map_process_scratch *)buffer; ++ ++ pr_debug("kfd: In func %s\n", __func__); ++ ++ memset(buffer, 0, sizeof(struct pm4_map_process_scratch)); ++ ++ packet->header.u32all = build_pm4_header(IT_MAP_PROCESS, ++ sizeof(struct pm4_map_process_scratch)); ++ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; ++ packet->bitfields2.process_quantum = 1; ++ packet->bitfields2.pasid = qpd->pqm->process->pasid; ++ packet->bitfields3.page_table_base = qpd->page_table_base; ++ packet->bitfields10.gds_size = qpd->gds_size; ++ packet->bitfields10.num_gws = qpd->num_gws; ++ packet->bitfields10.num_oac = qpd->num_oac; ++ num_queues = 0; ++ list_for_each_entry(cur, &qpd->queues_list, list) ++ num_queues++; ++ packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : num_queues; ++ ++ packet->sh_mem_config = qpd->sh_mem_config; ++ packet->sh_mem_bases = qpd->sh_mem_bases; ++ packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; ++ packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; ++ ++ packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; ++ ++ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); ++ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); ++ ++ return 0; ++} ++ + static int pm_create_map_queue_vi(struct packet_manager *pm, uint32_t *buffer, + struct queue *q, bool is_static) + { +@@ -218,7 +335,7 @@ static int pm_create_map_queue_vi(struct packet_manager *pm, uint32_t *buffer, + queue_type__mes_map_queues__debug_interface_queue_vi; + break; + case KFD_QUEUE_TYPE_SDMA: +- packet->bitfields2.engine_sel = ++ packet->bitfields2.engine_sel = q->properties.sdma_engine_id + + engine_sel__mes_map_queues__sdma0_vi; + use_static = false; /* no static queues under SDMA */ + break; +@@ -278,7 +395,7 @@ static int pm_create_map_queue(struct packet_manager *pm, uint32_t *buffer, + engine_sel__mes_map_queues__compute; + break; + case KFD_QUEUE_TYPE_SDMA: +- packet->bitfields2.engine_sel = ++ packet->bitfields2.engine_sel = q->properties.sdma_engine_id + + engine_sel__mes_map_queues__sdma0; + use_static = false; /* no static queues under SDMA */ + break; +@@ -347,12 +464,12 @@ static int pm_create_runlist_ib(struct packet_manager *pm, + return -ENOMEM; + } + +- retval = pm_create_map_process(pm, &rl_buffer[rl_wptr], qpd); ++ retval = pm->pmf->map_process(pm, &rl_buffer[rl_wptr], qpd); + if (retval != 0) + return retval; + + proccesses_mapped++; +- inc_wptr(&rl_wptr, sizeof(struct pm4_map_process), ++ inc_wptr(&rl_wptr, pm->pmf->get_map_process_packet_size(), + alloc_size_bytes); + + list_for_each_entry(kq, &qpd->priv_queue_list, list) { +@@ -362,8 +479,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, + pr_debug("kfd: static_queue, mapping kernel q %d, is debug status %d\n", + kq->queue->queue, qpd->is_debug); + +- if (pm->dqm->dev->device_info->asic_family == 
+- CHIP_CARRIZO) ++ if (KFD_IS_VI(pm->dqm->dev->device_info->asic_family)) + retval = pm_create_map_queue_vi(pm, + &rl_buffer[rl_wptr], + kq->queue, +@@ -388,8 +504,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, + pr_debug("kfd: static_queue, mapping user queue %d, is debug status %d\n", + q->queue, qpd->is_debug); + +- if (pm->dqm->dev->device_info->asic_family == +- CHIP_CARRIZO) ++ if (KFD_IS_VI(pm->dqm->dev->device_info->asic_family)) + retval = pm_create_map_queue_vi(pm, + &rl_buffer[rl_wptr], + q, +@@ -422,7 +537,23 @@ static int pm_create_runlist_ib(struct packet_manager *pm, + return 0; + } + +-int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) ++static int get_map_process_packet_size(void) ++{ ++ return sizeof(struct pm4_map_process); ++} ++ ++static int get_map_process_packet_size_scratch_kv(void) ++{ ++ return sizeof(struct pm4_map_process_scratch_kv); ++} ++ ++static int get_map_process_packet_size_scratch(void) ++{ ++ return sizeof(struct pm4_map_process_scratch); ++} ++ ++int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, ++ uint16_t fw_ver) + { + BUG_ON(!dqm); + +@@ -433,8 +564,37 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) + mutex_destroy(&pm->lock); + return -ENOMEM; + } ++ pm->pmf = kzalloc(sizeof(struct packet_manager_firmware), GFP_KERNEL); + pm->allocated = false; + ++ switch (pm->dqm->dev->device_info->asic_family) { ++ case CHIP_KAVERI: ++ if (fw_ver >= KFD_SCRATCH_KV_FW_VER) { ++ pm->pmf->map_process = pm_create_map_process_scratch_kv; ++ pm->pmf->get_map_process_packet_size = ++ get_map_process_packet_size_scratch_kv; ++ } else { ++ pm->pmf->map_process = pm_create_map_process; ++ pm->pmf->get_map_process_packet_size = ++ get_map_process_packet_size; ++ } ++ break; ++ case CHIP_CARRIZO: ++ case CHIP_TONGA: ++ case CHIP_FIJI: ++ if (fw_ver >= KFD_SCRATCH_CZ_FW_VER) { ++ pm->pmf->map_process = pm_create_map_process_scratch; ++ pm->pmf->get_map_process_packet_size = ++ get_map_process_packet_size_scratch; ++ } else { ++ pm->pmf->map_process = pm_create_map_process; ++ pm->pmf->get_map_process_packet_size = ++ get_map_process_packet_size; ++ } ++ break; ++ ++ } ++ + return 0; + } + +@@ -444,6 +604,7 @@ void pm_uninit(struct packet_manager *pm) + + mutex_destroy(&pm->lock); + kernel_queue_uninit(pm->priv_queue); ++ kfree(pm->pmf); + } + + int pm_send_set_resources(struct packet_manager *pm, +@@ -576,7 +737,7 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, + } + + int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, +- enum kfd_preempt_type_filter mode, ++ enum kfd_unmap_queues_filter filter, + uint32_t filter_param, bool reset, + unsigned int sdma_engine) + { +@@ -596,8 +757,8 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, + + packet = (struct pm4_unmap_queues *)buffer; + memset(buffer, 0, sizeof(struct pm4_unmap_queues)); +- pr_debug("kfd: static_queue: unmapping queues: mode is %d , reset is %d , type is %d\n", +- mode, reset, type); ++ pr_debug("kfd: static_queue: unmapping queues: filter is %d , reset is %d , type is %d\n", ++ filter, reset, type); + packet->header.u32all = build_pm4_header(IT_UNMAP_QUEUES, + sizeof(struct pm4_unmap_queues)); + switch (type) { +@@ -622,26 +783,26 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, + packet->bitfields2.action = + action__mes_unmap_queues__preempt_queues; + +- switch (mode) { +- case 
KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE: ++ switch (filter) { ++ case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_specified_queues; + packet->bitfields2.num_queues = 1; + packet->bitfields3b.doorbell_offset0 = filter_param; + break; +- case KFD_PREEMPT_TYPE_FILTER_BY_PASID: ++ case KFD_UNMAP_QUEUES_FILTER_BY_PASID: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; + packet->bitfields3a.pasid = filter_param; + break; +- case KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES: ++ case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_all_active_queues; + break; +- case KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES: ++ case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: + /* in this case, we do not preempt static queues */ +- packet->bitfields2.queue_sel = +- queue_sel__mes_unmap_queues__perform_request_on_dynamic_queues_only; ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_unmap_queues__perform_request_on_dynamic_queues_only; + break; + default: + BUG(); +@@ -669,3 +830,4 @@ void pm_release_ib(struct packet_manager *pm) + } + mutex_unlock(&pm->lock); + } ++ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h +index 5b393f3..e7570cc 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h +@@ -127,7 +127,8 @@ struct pm4_runlist { + uint32_t offload_polling:1; + uint32_t reserved3:1; + uint32_t valid:1; +- uint32_t reserved4:8; ++ uint32_t process_cnt:4; ++ uint32_t reserved4:4; + } bitfields4; + uint32_t ordinal4; + }; +@@ -186,6 +187,123 @@ struct pm4_map_process { + }; + #endif + ++/*--------------------MES_MAP_PROCESS_SCRATCH-------------------- */ ++ ++#ifndef PM4_MES_MAP_PROCESS_SCRATCH_DEFINED ++#define PM4_MES_MAP_PROCESS_SCRATCH_DEFINED ++ ++struct pm4_map_process_scratch { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; ++ }; ++ ++ union { ++ struct { ++ uint32_t pasid:16; ++ uint32_t reserved1:8; ++ uint32_t diq_enable:1; ++ uint32_t process_quantum:7; ++ } bitfields2; ++ uint32_t ordinal2; ++ }; ++ ++ union { ++ struct { ++ uint32_t page_table_base:28; ++ uint32_t reserved3:4; ++ } bitfields3; ++ uint32_t ordinal3; ++ }; ++ ++ uint32_t reserved; ++ ++ uint32_t sh_mem_bases; ++ uint32_t sh_mem_config; ++ uint32_t sh_mem_ape1_base; ++ uint32_t sh_mem_ape1_limit; ++ ++ uint32_t sh_hidden_private_base_vmid; ++ ++ uint32_t reserved2; ++ uint32_t reserved3; ++ ++ uint32_t gds_addr_lo; ++ uint32_t gds_addr_hi; ++ ++ union { ++ struct { ++ uint32_t num_gws:6; ++ uint32_t reserved4:2; ++ uint32_t num_oac:4; ++ uint32_t reserved5:4; ++ uint32_t gds_size:6; ++ uint32_t num_queues:10; ++ } bitfields10; ++ uint32_t ordinal10; ++ }; ++ ++ uint32_t completion_signal_lo; ++ uint32_t completion_signal_hi; ++ ++}; ++#endif ++ ++#ifndef PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH ++#define PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH ++ ++struct pm4_map_process_scratch_kv { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; ++ }; ++ ++ union { ++ struct { ++ uint32_t pasid:16; ++ uint32_t reserved1:8; ++ uint32_t diq_enable:1; ++ uint32_t process_quantum:7; ++ } bitfields2; ++ uint32_t ordinal2; ++ }; ++ ++ union { ++ struct { ++ uint32_t page_table_base:28; ++ uint32_t reserved2:4; ++ } bitfields3; ++ uint32_t ordinal3; ++ }; ++ ++ uint32_t reserved3; ++ uint32_t 
sh_mem_bases; ++ uint32_t sh_mem_config; ++ uint32_t sh_mem_ape1_base; ++ uint32_t sh_mem_ape1_limit; ++ uint32_t sh_hidden_private_base_vmid; ++ uint32_t reserved4; ++ uint32_t reserved5; ++ uint32_t gds_addr_lo; ++ uint32_t gds_addr_hi; ++ ++ union { ++ struct { ++ uint32_t num_gws:6; ++ uint32_t reserved6:2; ++ uint32_t num_oac:4; ++ uint32_t reserved7:4; ++ uint32_t gds_size:6; ++ uint32_t num_queues:10; ++ } bitfields14; ++ uint32_t ordinal14; ++ }; ++ ++ uint32_t completion_signal_lo32; ++uint32_t completion_signal_hi32; ++}; ++#endif ++ + /*--------------------MES_MAP_QUEUES--------------------*/ + + #ifndef PM4_MES_MAP_QUEUES_DEFINED +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +index 4750cab..c654471 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +@@ -30,13 +30,45 @@ + #include <linux/atomic.h> + #include <linux/workqueue.h> + #include <linux/spinlock.h> ++#include <linux/idr.h> + #include <linux/kfd_ioctl.h> ++#include <linux/pid.h> ++#include <linux/interval_tree.h> + #include <kgd_kfd_interface.h> + ++#include <drm/amd_rdma.h> ++ + #define KFD_SYSFS_FILE_MODE 0444 + +-#define KFD_MMAP_DOORBELL_MASK 0x8000000000000 +-#define KFD_MMAP_EVENTS_MASK 0x4000000000000 ++/* GPU ID hash width in bits */ ++#define KFD_GPU_ID_HASH_WIDTH 16 ++ ++/* Use upper bits of mmap offset to store KFD driver specific information. ++ * BITS[63:62] - Encode MMAP type ++ * BITS[61:46] - Encode gpu_id. To identify to which GPU the offset belongs to ++ * BITS[45:40] - Reserved. Not Used. ++ * BITS[39:0] - MMAP offset value. Used by TTM. ++ * ++ * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. Hence, these ++ * defines are w.r.t to PAGE_SIZE ++ */ ++#define KFD_MMAP_TYPE_SHIFT (62 - PAGE_SHIFT) ++#define KFD_MMAP_TYPE_MASK (0x3ULL << KFD_MMAP_TYPE_SHIFT) ++#define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT) ++#define KFD_MMAP_TYPE_EVENTS (0x2ULL << KFD_MMAP_TYPE_SHIFT) ++#define KFD_MMAP_TYPE_MAP_BO (0x1ULL << KFD_MMAP_TYPE_SHIFT) ++#define KFD_MMAP_TYPE_RESERVED_MEM (0x0ULL << KFD_MMAP_TYPE_SHIFT) ++ ++#define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT) ++#define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \ ++ << KFD_MMAP_GPU_ID_SHIFT) ++#define KFD_MMAP_GPU_ID(gpu_id) ((((uint64_t)gpu_id) << KFD_MMAP_GPU_ID_SHIFT)\ ++ & KFD_MMAP_GPU_ID_MASK) ++#define KFD_MMAP_GPU_ID_GET(offset) ((offset & KFD_MMAP_GPU_ID_MASK) \ ++ >> KFD_MMAP_GPU_ID_SHIFT) ++ ++#define KFD_MMAP_OFFSET_VALUE_MASK (0xFFFFFFFFFFULL >> PAGE_SHIFT) ++#define KFD_MMAP_OFFSET_VALUE_GET(offset) (offset & KFD_MMAP_OFFSET_VALUE_MASK) + + /* + * When working with cp scheduler we should assign the HIQ manually or via +@@ -48,8 +80,6 @@ + #define KFD_CIK_HIQ_PIPE 4 + #define KFD_CIK_HIQ_QUEUE 0 + +-/* GPU ID hash width in bits */ +-#define KFD_GPU_ID_HASH_WIDTH 16 + + /* Macro for allocating structures */ + #define kfd_alloc_struct(ptr_to_struct) \ +@@ -74,12 +104,26 @@ extern int max_num_of_queues_per_device; + /* Kernel module parameter to specify the scheduling policy */ + extern int sched_policy; + ++extern int cwsr_enable; ++ ++/* ++ * Kernel module parameter to specify the maximum process ++ * number per HW scheduler ++ */ ++extern int hws_max_conc_proc; ++ + /* + * Kernel module parameter to specify whether to send sigterm to HSA process on + * unhandled exception + */ + extern int send_sigterm; + ++/* ++ * This kernel module is used to simulate large bar machine on non-large bar ++ * enabled machines. 
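++ * It only fakes the capability reported to user mode; CPU access to
++ * memory that is not really host-accessible will still fail, so the
++ * flag is meant for driver debugging only.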
++ */ ++extern int debug_largebar; ++ + /** + * enum kfd_sched_policy + * +@@ -114,14 +158,17 @@ enum cache_policy { + + enum asic_family_type { + CHIP_KAVERI = 0, +- CHIP_CARRIZO ++ CHIP_CARRIZO, ++ CHIP_TONGA, ++ CHIP_FIJI + }; + ++#define KFD_IS_VI(chip) ((chip) >= CHIP_CARRIZO && (chip) <= CHIP_FIJI) ++#define KFD_IS_DGPU(chip) ((chip) >= CHIP_TONGA && (chip) <= CHIP_FIJI) ++ + struct kfd_event_interrupt_class { +- bool (*interrupt_isr)(struct kfd_dev *dev, +- const uint32_t *ih_ring_entry); +- void (*interrupt_wq)(struct kfd_dev *dev, +- const uint32_t *ih_ring_entry); ++ bool (*interrupt_isr)(struct kfd_dev *dev, const uint32_t *ih_ring_entry); ++ void (*interrupt_wq)(struct kfd_dev *dev, const uint32_t *ih_ring_entry); + }; + + struct kfd_device_info { +@@ -132,6 +179,7 @@ struct kfd_device_info { + size_t ih_ring_entry_size; + uint8_t num_of_watch_points; + uint16_t mqd_size_aligned; ++ bool is_need_iommu_device; + }; + + struct kfd_mem_obj { +@@ -141,6 +189,12 @@ struct kfd_mem_obj { + uint32_t *cpu_ptr; + }; + ++struct kfd_vmid_info { ++ uint32_t first_vmid_kfd; ++ uint32_t last_vmid_kfd; ++ uint32_t vmid_num_kfd; ++}; ++ + struct kfd_dev { + struct kgd_dev *kgd; + +@@ -165,11 +219,12 @@ struct kfd_dev { + */ + + struct kgd2kfd_shared_resources shared_resources; ++ struct kfd_vmid_info vm_info; + + const struct kfd2kgd_calls *kfd2kgd; + struct mutex doorbell_mutex; +- DECLARE_BITMAP(doorbell_available_index, +- KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); ++ unsigned long doorbell_available_index[DIV_ROUND_UP( ++ KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, BITS_PER_LONG)]; + + void *gtt_mem; + uint64_t gtt_start_gpu_addr; +@@ -179,6 +234,11 @@ struct kfd_dev { + unsigned int gtt_sa_chunk_size; + unsigned int gtt_sa_num_of_chunks; + ++ /* QCM Device instance */ ++ struct device_queue_manager *dqm; ++ ++ bool init_complete; ++ + /* Interrupts */ + void *interrupt_ring; + size_t interrupt_ring_size; +@@ -187,10 +247,6 @@ struct kfd_dev { + struct work_struct interrupt_work; + spinlock_t interrupt_lock; + +- /* QCM Device instance */ +- struct device_queue_manager *dqm; +- +- bool init_complete; + /* + * Interrupts of interest to KFD are copied + * from the HW ring into a SW ring. +@@ -198,7 +254,26 @@ struct kfd_dev { + bool interrupts_active; + + /* Debug manager */ +- struct kfd_dbgmgr *dbgmgr; ++ struct kfd_dbgmgr *dbgmgr; ++ ++ /* MEC firmware version*/ ++ uint16_t mec_fw_version; ++ ++ /* Maximum process number mapped to HW scheduler */ ++ unsigned int max_proc_per_quantum; ++ ++ /* cwsr */ ++ bool cwsr_enabled; ++ struct page *cwsr_pages; ++ uint32_t cwsr_size; ++ uint32_t tma_offset; /*Offset for TMA from the start of cwsr_mem*/ ++}; ++ ++struct kfd_bo { ++ void *mem; ++ struct interval_tree_node it; ++ struct kfd_dev *dev; ++ struct list_head cb_data_head; + }; + + /* KGD2KFD callbacks */ +@@ -221,22 +296,22 @@ void kfd_chardev_exit(void); + struct device *kfd_chardev(void); + + /** +- * enum kfd_preempt_type_filter ++ * enum kfd_unmap_queues_filter + * +- * @KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE: Preempts single queue. ++ * @KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: Preempts single queue. + * +- * @KFD_PRERMPT_TYPE_FILTER_ALL_QUEUES: Preempts all queues in the ++ * @KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: Preempts all queues in the + * running queues list. + * +- * @KFD_PRERMPT_TYPE_FILTER_BY_PASID: Preempts queues that belongs to ++ * @KFD_UNMAP_QUEUES_FILTER_BY_PASID: Preempts queues that belongs to + * specific process. 
+ * + */ +-enum kfd_preempt_type_filter { +- KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE, +- KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES, +- KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES, +- KFD_PREEMPT_TYPE_FILTER_BY_PASID ++enum kfd_unmap_queues_filter { ++ KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE, ++ KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, ++ KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, ++ KFD_UNMAP_QUEUES_FILTER_BY_PASID + }; + + enum kfd_preempt_type { +@@ -324,6 +399,7 @@ struct queue_properties { + uint32_t __iomem *doorbell_ptr; + uint32_t doorbell_off; + bool is_interop; ++ bool is_evicted; /* true -> queue is evicted */ + bool is_active; + /* Not relevant for user mode queues in cp scheduling */ + unsigned int vmid; +@@ -336,6 +412,11 @@ struct queue_properties { + uint32_t eop_ring_buffer_size; + uint64_t ctx_save_restore_area_address; + uint32_t ctx_save_restore_area_size; ++ uint32_t ctl_stack_size; ++ uint64_t tba_addr; ++ uint64_t tma_addr; ++ /* Relevant for CU */ ++ uint32_t cu_mask; + }; + + /** +@@ -424,6 +505,7 @@ struct qcm_process_device { + unsigned int queue_count; + unsigned int vmid; + bool is_debug; ++ unsigned evicted; /* eviction counter, 0=active */ + /* + * All the memory management data should be here too + */ +@@ -436,8 +518,22 @@ struct qcm_process_device { + uint32_t gds_size; + uint32_t num_gws; + uint32_t num_oac; ++ uint32_t sh_hidden_private_base; ++ ++ /*cwsr memory*/ ++ int cwsr_mem_handle; ++ uint64_t cwsr_base; ++ uint64_t tba_addr; ++ uint64_t tma_addr; ++ void *cwsr_kaddr; + }; + ++/*8 byte handle containing GPU ID in the most significant 4 bytes and ++ * idr_handle in the least significant 4 bytes*/ ++#define MAKE_HANDLE(gpu_id, idr_handle) (((uint64_t)(gpu_id) << 32) + idr_handle) ++#define GET_GPU_ID(handle) (handle >> 32) ++#define GET_IDR_HANDLE(handle) (handle & 0xFFFFFFFF) ++ + /* Data that is per-process-per device. */ + struct kfd_process_device { + /* +@@ -449,6 +545,8 @@ struct kfd_process_device { + /* The device that owns this data. */ + struct kfd_dev *dev; + ++ /* The process that owns this kfd_process_device. */ ++ struct kfd_process *process; + + /* per-process-per device QCM data structure */ + struct qcm_process_device qpd; +@@ -460,10 +558,23 @@ struct kfd_process_device { + uint64_t gpuvm_limit; + uint64_t scratch_base; + uint64_t scratch_limit; ++ uint64_t dgpu_base; ++ uint64_t dgpu_limit; ++ uint64_t mapped_size; ++ uint64_t last_eviction; ++ bool evicted; ++ ++ uint64_t sh_hidden_private_base_vmid; + + /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */ + bool bound; + ++ /* VM context for GPUVM allocations */ ++ void *vm; ++ ++ /* GPUVM allocations storage */ ++ struct idr alloc_idr; ++ + /* This flag tells if we should reset all + * wavefronts on process termination + */ +@@ -482,7 +593,7 @@ struct kfd_process { + + struct mm_struct *mm; + +- struct mutex mutex; ++ struct rw_semaphore lock; + + /* + * In any process, the thread that started main() is the lead +@@ -513,6 +624,8 @@ struct kfd_process { + /* Size is queue_array_size, up to MAX_PROCESS_QUEUES. */ + struct kfd_queue **queues; + ++ unsigned long allocated_queue_bitmap[DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, BITS_PER_LONG)]; ++ + /*Is the user space process 32 bit?*/ + bool is_32bit_user_mode; + +@@ -520,10 +633,12 @@ struct kfd_process { + struct mutex event_mutex; + /* All events in process hashed by ID, linked on kfd_event.events. */ + DECLARE_HASHTABLE(events, 4); +- struct list_head signal_event_pages; /* struct slot_page_header. 
+- event_pages */ ++ struct list_head signal_event_pages; /* struct slot_page_header.event_pages */ + u32 next_nonsignal_event_id; + size_t signal_event_count; ++ size_t debug_event_count; ++ ++ struct rb_root bo_interval_tree; + }; + + /** +@@ -546,9 +661,10 @@ struct amdkfd_ioctl_desc { + + void kfd_process_create_wq(void); + void kfd_process_destroy_wq(void); +-struct kfd_process *kfd_create_process(const struct task_struct *); ++struct kfd_process *kfd_create_process(struct file *filep); + struct kfd_process *kfd_get_process(const struct task_struct *); + struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); ++struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm); + + struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, + struct kfd_process *p); +@@ -558,6 +674,29 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, + struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, + struct kfd_process *p); + ++int kfd_reserved_mem_mmap(struct kfd_process *process, struct vm_area_struct *vma); ++ ++/* KFD process API for creating and translating handles */ ++int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, ++ void *mem, uint64_t start, ++ uint64_t length); ++void *kfd_process_device_translate_handle(struct kfd_process_device *p, ++ int handle); ++struct kfd_bo *kfd_process_device_find_bo(struct kfd_process_device *pdd, ++ int handle); ++void *kfd_process_find_bo_from_interval(struct kfd_process *p, ++ uint64_t start_addr, ++ uint64_t last_addr); ++void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, ++ int handle); ++ ++void run_rdma_free_callback(struct kfd_bo *buf_obj); ++struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid); ++ ++/* kfd dgpu memory */ ++int kfd_map_memory_to_gpu(struct kfd_dev *dev, void *mem, ++ struct kfd_process *p, struct kfd_process_device *pdd); ++ + /* Process device data iterator */ + struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p); + struct kfd_process_device *kfd_get_next_process_device_data(struct kfd_process *p, +@@ -600,7 +739,11 @@ int kfd_topology_add_device(struct kfd_dev *gpu); + int kfd_topology_remove_device(struct kfd_dev *gpu); + struct kfd_dev *kfd_device_by_id(uint32_t gpu_id); + struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev); +-struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx); ++struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd); ++uint32_t kfd_get_gpu_id(struct kfd_dev *dev); ++int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev); ++int kfd_numa_node_to_apic_id(int numa_node_id); ++int kfd_get_proximity_domain(const struct pci_bus *bus); + + /* Interrupts */ + int kfd_interrupt_init(struct kfd_dev *dev); +@@ -615,9 +758,12 @@ int kgd2kfd_resume(struct kfd_dev *kfd); + + /* amdkfd Apertures */ + int kfd_init_apertures(struct kfd_process *process); ++int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd, ++ uint64_t base, uint64_t limit); + + /* Queue Context Management */ +-struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd); ++inline uint32_t lower_32(uint64_t x); ++inline uint32_t upper_32(uint64_t x); + + int init_queue(struct queue **q, const struct queue_properties *properties); + void uninit_queue(struct queue *q); +@@ -630,11 +776,15 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); + struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE 
type, + struct kfd_dev *dev); ++struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, ++ struct kfd_dev *dev); + struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); + void device_queue_manager_uninit(struct device_queue_manager *dqm); + struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, + enum kfd_queue_type type); + void kernel_queue_uninit(struct kernel_queue *kq); ++int kfd_process_vm_fault(struct device_queue_manager *dqm, ++ unsigned int pasid); + + /* Process Queue Manager */ + struct process_queue_node { +@@ -649,18 +799,16 @@ int pqm_create_queue(struct process_queue_manager *pqm, + struct kfd_dev *dev, + struct file *f, + struct queue_properties *properties, +- unsigned int flags, +- enum kfd_queue_type type, + unsigned int *qid); + int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid); + int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, + struct queue_properties *p); ++int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, ++ struct queue_properties *p); + struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm, + unsigned int qid); +- +-int amdkfd_fence_wait_timeout(unsigned int *fence_addr, +- unsigned int fence_value, +- unsigned long timeout); ++int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm); ++int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm); + + /* Packet Manager */ + +@@ -668,7 +816,9 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr, + + #define KFD_FENCE_COMPLETED (100) + #define KFD_FENCE_INIT (10) +-#define KFD_UNMAP_LATENCY (150) ++#define KFD_UNMAP_LATENCY (40) ++ ++struct packet_manager_firmware; + + struct packet_manager { + struct device_queue_manager *dqm; +@@ -676,9 +826,19 @@ struct packet_manager { + struct mutex lock; + bool allocated; + struct kfd_mem_obj *ib_buffer_obj; ++ ++ struct packet_manager_firmware *pmf; + }; + +-int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); ++struct packet_manager_firmware { ++ /* Support different firmware versions for map process packet */ ++ int (*map_process)(struct packet_manager *pm, uint32_t *buffer, ++ struct qcm_process_device *qpd); ++ int (*get_map_process_packet_size)(void); ++}; ++ ++int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, ++ uint16_t fw_ver); + void pm_uninit(struct packet_manager *pm); + int pm_send_set_resources(struct packet_manager *pm, + struct scheduling_resources *res); +@@ -687,7 +847,7 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, + uint32_t fence_value); + + int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, +- enum kfd_preempt_type_filter mode, ++ enum kfd_unmap_queues_filter mode, + uint32_t filter_param, bool reset, + unsigned int sdma_engine); + +@@ -696,6 +856,9 @@ void pm_release_ib(struct packet_manager *pm); + uint64_t kfd_get_number_elems(struct kfd_dev *kfd); + phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, + struct kfd_process *process); ++int amdkfd_fence_wait_timeout(unsigned int *fence_addr, ++ unsigned int fence_value, ++ unsigned long timeout); + + /* Events */ + extern const struct kfd_event_interrupt_class event_interrupt_class_cik; +@@ -714,8 +877,7 @@ int kfd_wait_on_events(struct kfd_process *p, + uint32_t num_events, void __user *data, + bool all, uint32_t user_timeout_ms, + enum kfd_event_wait_result *wait_result); +-void kfd_signal_event_interrupt(unsigned int pasid, uint32_t 
partial_id, +- uint32_t valid_id_bits); ++void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, uint32_t valid_id_bits); + void kfd_signal_iommu_event(struct kfd_dev *dev, + unsigned int pasid, unsigned long address, + bool is_write_requested, bool is_execute_requested); +@@ -723,11 +885,28 @@ void kfd_signal_hw_exception_event(unsigned int pasid); + int kfd_set_event(struct kfd_process *p, uint32_t event_id); + int kfd_reset_event(struct kfd_process *p, uint32_t event_id); + int kfd_event_create(struct file *devkfd, struct kfd_process *p, +- uint32_t event_type, bool auto_reset, uint32_t node_id, +- uint32_t *event_id, uint32_t *event_trigger_data, +- uint64_t *event_page_offset, uint32_t *event_slot_index); ++ uint32_t event_type, bool auto_reset, uint32_t node_id, ++ uint32_t *event_id, uint32_t *event_trigger_data, ++ uint64_t *event_page_offset, uint32_t *event_slot_index, ++ void *kern_addr); + int kfd_event_destroy(struct kfd_process *p, uint32_t event_id); ++void kfd_free_signal_page_dgpu(struct kfd_process *p, uint64_t handle); ++ ++void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, ++ struct kfd_vm_fault_info *info); ++ ++void radeon_flush_tlb(struct kfd_dev *dev, uint32_t pasid); + + int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); ++int kgd2kfd_evict_bo(struct kfd_dev *dev, void *mem); ++int kgd2kfd_restore(struct kfd_dev *kfd); ++int evict_size(struct kfd_process *p, int size, int type); ++int evict_bo(struct kfd_dev *dev, void *mem); ++int restore(struct kfd_dev *kfd); ++ ++#define KFD_SCRATCH_CZ_FW_VER 600 ++#define KFD_SCRATCH_KV_FW_VER 413 ++#define KFD_MULTI_PROC_MAPPING_HWS_SUPPORT 600 ++#define KFD_CWSR_CZ_FW_VER 625 + + #endif +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c +index 035bbc9..a069c3d 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c +@@ -28,6 +28,10 @@ + #include <linux/amd-iommu.h> + #include <linux/notifier.h> + #include <linux/compat.h> ++#include <linux/mm.h> ++#include <asm/tlb.h> ++#include <linux/highmem.h> ++#include <uapi/asm-generic/mman-common.h> + + struct mm_struct; + +@@ -41,6 +45,7 @@ struct mm_struct; + */ + #define INITIAL_QUEUE_ARRAY_SIZE 16 + ++static int evict_pdd(struct kfd_process_device *pdd); + /* + * List of struct kfd_process (field kfd_process). + * Unique/indexed by mm_struct* +@@ -58,8 +63,14 @@ struct kfd_process_release_work { + struct kfd_process *p; + }; + +-static struct kfd_process *find_process(const struct task_struct *thread); ++#define MIN_IDR_ID 1 ++#define MAX_IDR_ID 0 /*0 - for unlimited*/ ++ ++static struct kfd_process *find_process(const struct task_struct *thread, ++ bool lock); + static struct kfd_process *create_process(const struct task_struct *thread); ++static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep); ++ + + void kfd_process_create_wq(void) + { +@@ -75,10 +86,12 @@ void kfd_process_destroy_wq(void) + } + } + +-struct kfd_process *kfd_create_process(const struct task_struct *thread) ++struct kfd_process *kfd_create_process(struct file *filep) + { + struct kfd_process *process; + ++ struct task_struct *thread = current; ++ + BUG_ON(!kfd_process_wq); + + if (thread->mm == NULL) +@@ -99,7 +112,7 @@ struct kfd_process *kfd_create_process(const struct task_struct *thread) + mutex_lock(&kfd_processes_mutex); + + /* A prior open of /dev/kfd could have already created the process. 
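+ * If it did, that existing kfd_process is found and reused instead of
+ * creating a second one for the same mm_struct.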
*/ +- process = find_process(thread); ++ process = find_process(thread, false); + if (process) + pr_debug("kfd: process already found\n"); + +@@ -110,6 +123,8 @@ struct kfd_process *kfd_create_process(const struct task_struct *thread) + + up_write(&thread->mm->mmap_sem); + ++ kfd_process_init_cwsr(process, filep); ++ + return process; + } + +@@ -124,7 +139,7 @@ struct kfd_process *kfd_get_process(const struct task_struct *thread) + if (thread->group_leader->mm != thread->mm) + return ERR_PTR(-EINVAL); + +- process = find_process(thread); ++ process = find_process(thread, false); + + return process; + } +@@ -141,23 +156,164 @@ static struct kfd_process *find_process_by_mm(const struct mm_struct *mm) + return NULL; + } + +-static struct kfd_process *find_process(const struct task_struct *thread) ++static struct kfd_process *find_process(const struct task_struct *thread, ++ bool lock) + { + struct kfd_process *p; + int idx; + + idx = srcu_read_lock(&kfd_processes_srcu); + p = find_process_by_mm(thread->mm); ++ if (p && lock) ++ down_read(&p->lock); + srcu_read_unlock(&kfd_processes_srcu, idx); + + return p; + } + ++/* This returns with process->lock read-locked. */ ++struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid) ++{ ++ struct task_struct *task = NULL; ++ struct kfd_process *p = NULL; ++ ++ if (!pid) ++ task = current; ++ else ++ task = get_pid_task(pid, PIDTYPE_PID); ++ ++ if (task) ++ p = find_process(task, true); ++ ++ return p; ++} ++ ++int evict_size(struct kfd_process *process, int size, int type) ++{ ++ struct kfd_process_device *pdd, *temp_pdd = NULL; ++ struct kfd_process *p = process; ++ int temp = 0; ++ ++ down_write(&p->lock); ++ ++ if (type == EVICT_FIRST_PDD) { ++ ++ list_for_each_entry(pdd, &p->per_device_data, per_device_list) { ++ pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", ++ pdd->dev->id, p->pasid); ++ if (pdd->mapped_size >= size) { ++ evict_pdd(pdd); ++ return 0; ++ } ++ ++ } ++ } else if (type == EVICT_BIGGEST_PDD) { ++ ++ list_for_each_entry(pdd, &p->per_device_data, per_device_list) { ++ pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", ++ pdd->dev->id, p->pasid); ++ if (pdd->mapped_size >= temp) { ++ temp = pdd->mapped_size; ++ temp_pdd = pdd; ++ } ++ ++ } ++ if (temp_pdd->mapped_size > size) { ++ evict_pdd(temp_pdd); ++ return 0; ++ } ++ ++ } ++ up_write(&p->lock); ++ return 0; ++ ++} ++ ++int evict_bo(struct kfd_dev *dev, void *mem) ++{ ++ struct kfd_process_device *pdd; ++ ++ pdd = dev->kfd2kgd->get_pdd_from_buffer_object(dev->kgd, ++ ((struct kgd_mem *)mem)); ++ ++ if (pdd) ++ evict_pdd(pdd); ++ ++ return 0; ++} ++ ++static int evict_pdd(struct kfd_process_device *pdd) ++{ ++ void *mem; ++ int id; ++ ++ /*process_evict_queues(struct device_queue_manager *dqm, pdd->qpd)*/ ++ /* ++ * Remove all handles from idr and release appropriate ++ * local memory object ++ */ ++ idr_for_each_entry(&pdd->alloc_idr, mem, id) { ++ pdd->dev->kfd2kgd->unmap_memory_to_gpu( ++ pdd->dev->kgd, mem, pdd->vm); ++ } ++ pdd->last_eviction = jiffies; ++ pdd->mapped_size = 0; ++ pdd->evicted = true; ++ ++ /*flush_tlb_all();*/ ++ ++ return 0; ++} ++ ++int restore(struct kfd_dev *kfd) ++{ ++ struct kfd_process *p = NULL; ++ /* TODO still working on how to get the process */ ++ struct kfd_process_device *pdd = kfd_get_process_device_data(kfd, p); ++ void *mem; ++ int id; ++ ++ /* need to run on all processes*/ ++ down_write(&p->lock); ++ ++ list_for_each_entry(pdd, &p->per_device_data, per_device_list) { ++ 
pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", ++ pdd->dev->id, p->pasid); ++ ++ /* ++ * Remove all handles from idr and release appropriate ++ * local memory object ++ */ ++ if (pdd->evicted) { ++ idr_for_each_entry(&pdd->alloc_idr, mem, id) { ++ pdd->dev->kfd2kgd->map_memory_to_gpu( ++ pdd->dev->kgd, ++ mem, pdd->vm); ++ pdd->last_eviction = 0; ++ pdd->mapped_size = 0; ++ } ++ ++ /*process_restore_queues ++ * (struct device_queue_manager *dqm, pdd->qpd)*/ ++ } else { ++ pdd->evicted = false; ++ } ++ } ++ up_write(&p->lock); ++ return 0; ++} ++ ++/* No process locking is needed in this function, because the process ++ * is not findable any more. We must assume that no other thread is ++ * using it any more, otherwise we couldn't safely free the process ++ * stucture in the end. */ + static void kfd_process_wq_release(struct work_struct *work) + { + struct kfd_process_release_work *my_work; +- struct kfd_process_device *pdd, *temp; ++ struct kfd_process_device *pdd, *temp, *peer_pdd; + struct kfd_process *p; ++ struct kfd_bo *buf_obj; ++ int id; + + my_work = (struct kfd_process_release_work *) work; + +@@ -166,19 +322,40 @@ static void kfd_process_wq_release(struct work_struct *work) + pr_debug("Releasing process (pasid %d) in workqueue\n", + p->pasid); + +- mutex_lock(&p->mutex); +- +- list_for_each_entry_safe(pdd, temp, &p->per_device_data, +- per_device_list) { ++ list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", + pdd->dev->id, p->pasid); + +- if (pdd->reset_wavefronts) +- dbgdev_wave_reset_wavefronts(pdd->dev, p); ++ if (pdd->dev->device_info->is_need_iommu_device) ++ amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); ++ ++ /* ++ * Remove all handles from idr and release appropriate ++ * local memory object ++ */ ++ idr_for_each_entry(&pdd->alloc_idr, buf_obj, id) { ++ list_for_each_entry(peer_pdd, ++ &p->per_device_data, per_device_list) { ++ pdd->dev->kfd2kgd->unmap_memory_to_gpu( ++ peer_pdd->dev->kgd, ++ buf_obj->mem, peer_pdd->vm); ++ } ++ ++ run_rdma_free_callback(buf_obj); ++ pdd->dev->kfd2kgd->free_memory_of_gpu( ++ pdd->dev->kgd, buf_obj->mem); ++ kfd_process_device_remove_obj_handle(pdd, id); ++ } ++ } + +- amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); ++ list_for_each_entry_safe(pdd, temp, &p->per_device_data, ++ per_device_list) { ++ radeon_flush_tlb(pdd->dev, p->pasid); ++ /* Destroy the GPUVM VM context */ ++ if (pdd->vm) ++ pdd->dev->kfd2kgd->destroy_process_vm( ++ pdd->dev->kgd, pdd->vm); + list_del(&pdd->per_device_list); +- + kfree(pdd); + } + +@@ -186,15 +363,11 @@ static void kfd_process_wq_release(struct work_struct *work) + + kfd_pasid_free(p->pasid); + +- mutex_unlock(&p->mutex); +- +- mutex_destroy(&p->mutex); +- + kfree(p->queues); + + kfree(p); + +- kfree(work); ++ kfree((void *)work); + } + + static void kfd_process_destroy_delayed(struct rcu_head *rcu) +@@ -223,6 +396,8 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, + { + struct kfd_process *p; + struct kfd_process_device *pdd = NULL; ++ struct kfd_dev *dev = NULL; ++ long status = -EFAULT; + + /* + * The kfd_process structure can not be free because the +@@ -236,9 +411,31 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, + mutex_unlock(&kfd_processes_mutex); + synchronize_srcu(&kfd_processes_srcu); + +- mutex_lock(&p->mutex); ++ down_write(&p->lock); ++ ++ /* Iterate over all process device data structures and if the pdd is in ++ * 
debug mode,we should first force unregistration, then we will be ++ * able to destroy the queues */ ++ list_for_each_entry(pdd, &p->per_device_data, per_device_list) { ++ dev = pdd->dev; ++ mutex_lock(get_dbgmgr_mutex()); ++ ++ if ((dev != NULL) && ++ (dev->dbgmgr) && ++ (dev->dbgmgr->pasid == p->pasid)) { ++ ++ status = kfd_dbgmgr_unregister(dev->dbgmgr, p); ++ if (status == 0) { ++ kfd_dbgmgr_destroy(dev->dbgmgr); ++ dev->dbgmgr = NULL; ++ } ++ } ++ mutex_unlock(get_dbgmgr_mutex()); ++ } ++ ++ ++ /* now we can uninit the pqm: */ + +- /* In case our notifier is called before IOMMU notifier */ + pqm_uninit(&p->pqm); + + /* Iterate over all process device data structure and check +@@ -256,7 +453,7 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, + } + } + +- mutex_unlock(&p->mutex); ++ up_write(&p->lock); + + /* + * Because we drop mm_count inside kfd_process_destroy_delayed +@@ -272,6 +469,94 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = { + .release = kfd_process_notifier_release, + }; + ++static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep) ++{ ++ int err; ++ unsigned long offset; ++ struct kfd_process_device *temp, *pdd = NULL; ++ void *mem = NULL; ++ struct kfd_dev *dev = NULL; ++ struct qcm_process_device *qpd = NULL; ++ ++ down_write(&p->lock); ++ list_for_each_entry_safe(pdd, temp, &p->per_device_data, ++ per_device_list) { ++ dev = pdd->dev; ++ qpd = &pdd->qpd; ++ if (!dev->cwsr_enabled || qpd->tba_addr) ++ continue; ++ if (qpd->cwsr_base) { ++ /* cwsr_base is only set for DGPU */ ++ ++ /* can't hold the process lock while ++ * allocating from KGD */ ++ up_write(&p->lock); ++ ++ err = dev->kfd2kgd->alloc_memory_of_gpu( ++ dev->kgd, qpd->cwsr_base, dev->cwsr_size, ++ pdd->vm, (struct kgd_mem **)&mem, ++ NULL, &qpd->cwsr_kaddr, pdd, ++ ALLOC_MEM_FLAGS_GTT | ++ ALLOC_MEM_FLAGS_NONPAGED | ++ ALLOC_MEM_FLAGS_EXECUTE_ACCESS | ++ ALLOC_MEM_FLAGS_NO_SUBSTITUTE); ++ if (err) ++ goto err_alloc_tba; ++ err = kfd_map_memory_to_gpu(dev, mem, p, pdd); ++ if (err) ++ goto err_map_tba; ++ ++ down_write(&p->lock); ++ /* Check if someone else allocated the memory ++ * while we weren't looking */ ++ if (qpd->tba_addr) { ++ up_write(&p->lock); ++ dev->kfd2kgd->unmap_memory_to_gpu(dev->kgd, ++ (struct kgd_mem *)mem, pdd->vm); ++ dev->kfd2kgd->free_memory_of_gpu(dev->kgd, mem); ++ down_write(&p->lock); ++ } else { ++ qpd->cwsr_mem_handle = ++ kfd_process_device_create_obj_handle( ++ pdd, mem, qpd->cwsr_base, ++ dev->cwsr_size); ++ if (qpd->cwsr_mem_handle < 0) ++ goto err_create_handle; ++ ++ memcpy(qpd->cwsr_kaddr, kmap(dev->cwsr_pages), ++ PAGE_SIZE); ++ kunmap(dev->cwsr_pages); ++ qpd->tba_addr = qpd->cwsr_base; ++ } ++ } else { ++ offset = (kfd_get_gpu_id(dev) | ++ KFD_MMAP_TYPE_RESERVED_MEM) << PAGE_SHIFT; ++ qpd->tba_addr = (uint64_t)vm_mmap(filep, 0, ++ dev->cwsr_size, PROT_READ | PROT_EXEC, ++ MAP_SHARED, offset); ++ qpd->cwsr_kaddr = (void *)qpd->tba_addr; ++ } ++ if (IS_ERR_VALUE(qpd->tba_addr)) { ++ pr_err("Failure to set tba address. 
error -%d.\n", ++ (int)qpd->tba_addr); ++ qpd->tba_addr = 0; ++ qpd->cwsr_kaddr = NULL; ++ } else ++ qpd->tma_addr = qpd->tba_addr + dev->tma_offset; ++ pr_debug("set tba :0x%llx, tma:0x%llx for pqm.\n", ++ qpd->tba_addr, qpd->tma_addr); ++ } ++ ++err_create_handle: ++ up_write(&p->lock); ++ return err; ++ ++err_map_tba: ++ dev->kfd2kgd->free_memory_of_gpu(dev->kgd, mem); ++err_alloc_tba: ++ return err; ++} ++ + static struct kfd_process *create_process(const struct task_struct *thread) + { + struct kfd_process *process; +@@ -282,6 +567,8 @@ static struct kfd_process *create_process(const struct task_struct *thread) + if (!process) + goto err_alloc_process; + ++ process->bo_interval_tree = RB_ROOT; ++ + process->queues = kmalloc_array(INITIAL_QUEUE_ARRAY_SIZE, + sizeof(process->queues[0]), GFP_KERNEL); + if (!process->queues) +@@ -291,7 +578,7 @@ static struct kfd_process *create_process(const struct task_struct *thread) + if (process->pasid == 0) + goto err_alloc_pasid; + +- mutex_init(&process->mutex); ++ init_rwsem(&process->lock); + + process->mm = thread->mm; + +@@ -364,8 +651,22 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, + INIT_LIST_HEAD(&pdd->qpd.queues_list); + INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); + pdd->qpd.dqm = dev->dqm; ++ pdd->qpd.pqm = &p->pqm; ++ pdd->qpd.evicted = 0; + pdd->reset_wavefronts = false; ++ pdd->process = p; + list_add(&pdd->per_device_list, &p->per_device_data); ++ ++ /* Init idr used for memory handle translation */ ++ idr_init(&pdd->alloc_idr); ++ ++ /* Create the GPUVM context for this specific device */ ++ if (dev->kfd2kgd->create_process_vm(dev->kgd, &pdd->vm)) { ++ pr_err("Failed to create process VM object\n"); ++ list_del(&pdd->per_device_list); ++ kfree(pdd); ++ pdd = NULL; ++ } + } + + return pdd; +@@ -393,9 +694,11 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, + if (pdd->bound) + return pdd; + +- err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread); +- if (err < 0) +- return ERR_PTR(err); ++ if (dev->device_info->is_need_iommu_device) { ++ err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread); ++ if (err < 0) ++ return ERR_PTR(err); ++ } + + pdd->bound = true; + +@@ -420,18 +723,21 @@ void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid) + + pr_debug("Unbinding process %d from IOMMU\n", pasid); + +- if ((dev->dbgmgr) && (dev->dbgmgr->pasid == p->pasid)) +- kfd_dbgmgr_destroy(dev->dbgmgr); +- +- pqm_uninit(&p->pqm); ++ mutex_lock(get_dbgmgr_mutex()); + +- pdd = kfd_get_process_device_data(dev, p); ++ if ((dev->dbgmgr) && (dev->dbgmgr->pasid == p->pasid)) { + +- if (!pdd) { +- mutex_unlock(&p->mutex); +- return; ++ if (kfd_dbgmgr_unregister(dev->dbgmgr, p) == 0) { ++ kfd_dbgmgr_destroy(dev->dbgmgr); ++ dev->dbgmgr = NULL; ++ } + } + ++ mutex_unlock(get_dbgmgr_mutex()); ++ ++ pqm_uninit(&p->pqm); ++ ++ pdd = kfd_get_process_device_data(dev, p); + if (pdd->reset_wavefronts) { + dbgdev_wave_reset_wavefronts(pdd->dev, p); + pdd->reset_wavefronts = false; +@@ -444,9 +750,10 @@ void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid) + * We don't call amd_iommu_unbind_pasid() here + * because the IOMMU called us. 
+ */ +- pdd->bound = false; ++ if (pdd) ++ pdd->bound = false; + +- mutex_unlock(&p->mutex); ++ up_write(&p->lock); + } + + struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p) +@@ -469,7 +776,116 @@ bool kfd_has_process_device_data(struct kfd_process *p) + return !(list_empty(&p->per_device_data)); + } + +-/* This returns with process->mutex locked. */ ++/* Create specific handle mapped to mem from process local memory idr ++ * Assumes that the process lock is held. */ ++int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, ++ void *mem, uint64_t start, ++ uint64_t length) ++{ ++ int handle; ++ struct kfd_bo *buf_obj; ++ struct kfd_process *p; ++ ++ BUG_ON(pdd == NULL); ++ BUG_ON(mem == NULL); ++ ++ p = pdd->process; ++ ++ buf_obj = kmalloc(sizeof(*buf_obj), GFP_KERNEL); ++ ++ if (!buf_obj) ++ return -ENOMEM; ++ ++ buf_obj->it.start = start; ++ buf_obj->it.last = start + length - 1; ++ interval_tree_insert(&buf_obj->it, &p->bo_interval_tree); ++ ++ buf_obj->mem = mem; ++ buf_obj->dev = pdd->dev; ++ ++ INIT_LIST_HEAD(&buf_obj->cb_data_head); ++ ++ idr_preload(GFP_KERNEL); ++ ++ handle = idr_alloc(&pdd->alloc_idr, buf_obj, MIN_IDR_ID, MAX_IDR_ID, ++ GFP_NOWAIT); ++ ++ idr_preload_end(); ++ ++ return handle; ++} ++ ++struct kfd_bo *kfd_process_device_find_bo(struct kfd_process_device *pdd, ++ int handle) ++{ ++ BUG_ON(pdd == NULL); ++ ++ if (handle < 0) ++ return NULL; ++ ++ return (struct kfd_bo *)idr_find(&pdd->alloc_idr, handle); ++} ++ ++/* Translate specific handle from process local memory idr ++ * Assumes that the process lock is held. */ ++void *kfd_process_device_translate_handle(struct kfd_process_device *pdd, ++ int handle) ++{ ++ struct kfd_bo *buf_obj; ++ ++ buf_obj = kfd_process_device_find_bo(pdd, handle); ++ ++ return buf_obj->mem; ++} ++ ++void *kfd_process_find_bo_from_interval(struct kfd_process *p, ++ uint64_t start_addr, ++ uint64_t last_addr) ++{ ++ struct interval_tree_node *it_node; ++ struct kfd_bo *buf_obj; ++ ++ it_node = interval_tree_iter_first(&p->bo_interval_tree, ++ start_addr, last_addr); ++ if (!it_node) { ++ pr_err("%llu - %llu does not relate to an existing buffer\n", ++ start_addr, last_addr); ++ return NULL; ++ } ++ ++ BUG_ON(NULL != interval_tree_iter_next(it_node, ++ start_addr, last_addr)); ++ ++ buf_obj = container_of(it_node, struct kfd_bo, it); ++ ++ return buf_obj; ++} ++ ++/* Remove specific handle from process local memory idr ++ * Assumes that the process lock is held. */ ++void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, ++ int handle) ++{ ++ struct kfd_bo *buf_obj; ++ struct kfd_process *p; ++ ++ BUG_ON(pdd == NULL); ++ ++ p = pdd->process; ++ ++ if (handle < 0) ++ return; ++ ++ buf_obj = kfd_process_device_find_bo(pdd, handle); ++ ++ idr_remove(&pdd->alloc_idr, handle); ++ ++ interval_tree_remove(&buf_obj->it, &p->bo_interval_tree); ++ ++ kfree(buf_obj); ++} ++ ++/* This returns with process->lock read-locked. */ + struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) + { + struct kfd_process *p; +@@ -479,7 +895,7 @@ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) + + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + if (p->pasid == pasid) { +- mutex_lock(&p->mutex); ++ down_read(&p->lock); + break; + } + } +@@ -488,3 +904,53 @@ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) + + return p; + } ++ ++/* This returns with process->lock read-locked. 
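++ * The caller is responsible for dropping the lock with up_read(&p->lock).
++ * A minimal sketch of the expected calling pattern:
++ *
++ *	p = kfd_lookup_process_by_mm(mm);
++ *	if (p) {
++ *		... operate on the process ...
++ *		up_read(&p->lock);
++ *	}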
++ */
++struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm)
++{
++	struct kfd_process *p;
++
++	int idx = srcu_read_lock(&kfd_processes_srcu);
++
++	p = find_process_by_mm(mm);
++	if (p != NULL)
++		down_read(&p->lock);
++
++	srcu_read_unlock(&kfd_processes_srcu, idx);
++
++	return p;
++}
++
++int kfd_reserved_mem_mmap(struct kfd_process *process, struct vm_area_struct *vma)
++{
++	unsigned long pfn, i;
++	int ret = 0;
++	struct kfd_dev *dev = kfd_device_by_id(vma->vm_pgoff);
++
++	if (dev == NULL)
++		return -EINVAL;
++	if ((vma->vm_start & (PAGE_SIZE - 1)) ||
++		(vma->vm_end & (PAGE_SIZE - 1))) {
++		pr_err("KFD only supports page-aligned memory mappings.\n");
++		return -EINVAL;
++	}
++
++	pr_debug("kfd reserved mem mmap called.\n");
++	/* Two kinds of reserved memory may be mapped here in the future:
++	 * 1. Trap handler code and parameters (TBA and TMA, 2 pages total)
++	 * 2. Relaunch stack (control block, 1 page for Carrizo)
++	 */
++
++	for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); ++i) {
++		pfn = page_to_pfn(&dev->cwsr_pages[i]);
++		vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND
++			| VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP;
++		/* mapping the page to user process */
++		ret = remap_pfn_range(vma, vma->vm_start + (i << PAGE_SHIFT),
++				pfn, PAGE_SIZE, vma->vm_page_prot);
++		if (ret)
++			break;
++	}
++	return ret;
++}
++
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+index 46f497e..e79cd42 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+@@ -89,23 +89,36 @@ void pqm_uninit(struct process_queue_manager *pqm)
+ {
+ 	int retval;
+ 	struct process_queue_node *pqn, *next;
++	struct kfd_process_device *pdd;
++	struct kfd_dev *dev = NULL;
+ 
+ 	BUG_ON(!pqm);
+ 
+ 	pr_debug("In func %s\n", __func__);
+ 
+ 	list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) {
+-		retval = pqm_destroy_queue(
+-			pqm,
+-			(pqn->q != NULL) ?
+- pqn->q->properties.queue_id : +- pqn->kq->queue->properties.queue_id); +- +- if (retval != 0) { +- pr_err("kfd: failed to destroy queue\n"); +- return; ++ if (pqn->q) ++ dev = pqn->q->device; ++ else if (pqn->kq) ++ dev = pqn->kq->dev; ++ else ++ BUG(); ++ ++ pdd = kfd_get_process_device_data(dev, pqm->process); ++ if (pdd) { ++ retval = dev->dqm->ops.process_termination ++ (dev->dqm, &pdd->qpd); ++ if (retval != 0) ++ pdd->reset_wavefronts = true; + } + } ++ ++ list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) { ++ uninit_queue(pqn->q); ++ list_del(&pqn->process_queue_list); ++ kfree(pqn); ++ } ++ + kfree(pqm->queue_slot_bitmap); + pqm->queue_slot_bitmap = NULL; + } +@@ -148,23 +161,19 @@ int pqm_create_queue(struct process_queue_manager *pqm, + struct kfd_dev *dev, + struct file *f, + struct queue_properties *properties, +- unsigned int flags, +- enum kfd_queue_type type, + unsigned int *qid) + { + int retval; + struct kfd_process_device *pdd; +- struct queue_properties q_properties; + struct queue *q; + struct process_queue_node *pqn; + struct kernel_queue *kq; + int num_queues = 0; + struct queue *cur; ++ enum kfd_queue_type type = properties->type; + + BUG_ON(!pqm || !dev || !properties || !qid); + +- memset(&q_properties, 0, sizeof(struct queue_properties)); +- memcpy(&q_properties, properties, sizeof(struct queue_properties)); + q = NULL; + kq = NULL; + +@@ -192,10 +201,9 @@ int pqm_create_queue(struct process_queue_manager *pqm, + if (retval != 0) + return retval; + +- if (list_empty(&pqm->queues)) { +- pdd->qpd.pqm = pqm; ++ if (list_empty(&pdd->qpd.queues_list) && ++ list_empty(&pdd->qpd.priv_queue_list)) + dev->dqm->ops.register_process(dev->dqm, &pdd->qpd); +- } + + pqn = kzalloc(sizeof(struct process_queue_node), GFP_KERNEL); + if (!pqn) { +@@ -205,18 +213,34 @@ int pqm_create_queue(struct process_queue_manager *pqm, + + switch (type) { + case KFD_QUEUE_TYPE_SDMA: ++ if (dev->dqm->sdma_queue_count >= CIK_SDMA_QUEUES) { ++ pr_err("kfd: over-subscription is not allowed for SDMA.\n"); ++ retval = -EPERM; ++ goto err_create_queue; ++ } ++ ++ retval = create_cp_queue(pqm, dev, &q, properties, f, *qid); ++ if (retval != 0) ++ goto err_create_queue; ++ pqn->q = q; ++ pqn->kq = NULL; ++ retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd, ++ &q->properties.vmid); ++ pr_debug("DQM returned %d for create_queue\n", retval); ++ print_queue(q); ++ break; + + case KFD_QUEUE_TYPE_COMPUTE: + /* check if there is over subscription */ + if ((sched_policy == KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) && +- ((dev->dqm->processes_count >= VMID_PER_DEVICE) || ++ ((dev->dqm->processes_count >= dev->vm_info.vmid_num_kfd) || + (dev->dqm->queue_count >= get_queues_num(dev->dqm)))) { + pr_err("kfd: over-subscription is not allowed in radeon_kfd.sched_policy == 1\n"); + retval = -EPERM; + goto err_create_queue; + } + +- retval = create_cp_queue(pqm, dev, &q, &q_properties, f, *qid); ++ retval = create_cp_queue(pqm, dev, &q, properties, f, *qid); + if (retval != 0) + goto err_create_queue; + pqn->q = q; +@@ -253,9 +277,8 @@ int pqm_create_queue(struct process_queue_manager *pqm, + list_add(&pqn->process_queue_list, &pqm->queues); + + if (q) { +- *properties = q->properties; + pr_debug("kfd: PQM done creating queue\n"); +- print_queue_properties(properties); ++ print_queue_properties(&q->properties); + } + + return retval; +@@ -265,7 +288,8 @@ int pqm_create_queue(struct process_queue_manager *pqm, + err_allocate_pqn: + /* check if queues list is empty unregister process from 
device */ + clear_bit(*qid, pqm->queue_slot_bitmap); +- if (list_empty(&pqm->queues)) ++ if (list_empty(&pdd->qpd.queues_list) && ++ list_empty(&pdd->qpd.priv_queue_list)) + dev->dqm->ops.unregister_process(dev->dqm, &pdd->qpd); + return retval; + } +@@ -314,9 +338,11 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) + if (pqn->q) { + dqm = pqn->q->device->dqm; + retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); +- if (retval != 0) ++ if (retval != 0) { ++ if (retval == -ETIME) ++ pdd->reset_wavefronts = true; + return retval; +- ++ } + uninit_queue(pqn->q); + } + +@@ -324,7 +350,8 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) + kfree(pqn); + clear_bit(qid, pqm->queue_slot_bitmap); + +- if (list_empty(&pqm->queues)) ++ if (list_empty(&pdd->qpd.queues_list) && ++ list_empty(&pdd->qpd.priv_queue_list)) + dqm->ops.unregister_process(dqm, &pdd->qpd); + + return retval; +@@ -358,6 +385,31 @@ int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, + return 0; + } + ++int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, ++ struct queue_properties *p) ++{ ++ int retval; ++ struct process_queue_node *pqn; ++ ++ BUG_ON(!pqm); ++ ++ pqn = get_queue_by_qid(pqm, qid); ++ if (!pqn) { ++ pr_debug("amdkfd: No queue %d exists for update operation\n", ++ qid); ++ return -EFAULT; ++ } ++ ++ pqn->q->properties.cu_mask = p->cu_mask; ++ ++ retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm, ++ pqn->q); ++ if (retval != 0) ++ return retval; ++ ++ return 0; ++} ++ + struct kernel_queue *pqm_get_kernel_queue( + struct process_queue_manager *pqm, + unsigned int qid) +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c b/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c +new file mode 100644 +index 0000000..69bdaf1 +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c +@@ -0,0 +1,296 @@ ++/* ++ * Copyright 2015 Advanced Micro Devices, Inc. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. 
++ */ ++ ++#include <linux/device.h> ++#include <linux/export.h> ++#include <linux/pid.h> ++#include <linux/err.h> ++#include <linux/slab.h> ++#include "amd_rdma.h" ++#include "kfd_priv.h" ++ ++ ++struct rdma_cb { ++ struct list_head node; ++ struct amd_p2p_info amd_p2p_data; ++ void (*free_callback)(void *client_priv); ++ void *client_priv; ++}; ++ ++/** ++ * This function makes the pages underlying a range of GPU virtual memory ++ * accessible for DMA operations from another PCIe device ++ * ++ * \param address - The start address in the Unified Virtual Address ++ * space in the specified process ++ * \param length - The length of requested mapping ++ * \param pid - Pointer to structure pid to which address belongs. ++ * Could be NULL for current process address space. ++ * \param p2p_data - On return: Pointer to structure describing ++ * underlying pages/locations ++ * \param free_callback - Pointer to callback which will be called when access ++ * to such memory must be stopped immediately: Memory ++ * was freed, GECC events, etc. ++ * Client should immediately stop any transfer ++ * operations and returned as soon as possible. ++ * After return all resources associated with address ++ * will be release and no access will be allowed. ++ * \param client_priv - Pointer to be passed as parameter on ++ * 'free_callback; ++ * ++ * \return 0 if operation was successful ++ */ ++static int get_pages(uint64_t address, uint64_t length, struct pid *pid, ++ struct amd_p2p_info **amd_p2p_data, ++ void (*free_callback)(void *client_priv), ++ void *client_priv) ++{ ++ struct kfd_bo *buf_obj; ++ struct kgd_mem *mem; ++ struct sg_table *sg_table_tmp; ++ struct kfd_dev *dev; ++ uint64_t last = address + length - 1; ++ uint64_t offset; ++ struct kfd_process *p; ++ struct rdma_cb *rdma_cb_data; ++ int ret = 0; ++ ++ p = kfd_lookup_process_by_pid(pid); ++ if (!p) { ++ pr_err("could not find the process in %s.\n", ++ __func__); ++ return -EINVAL; ++ } ++ ++ buf_obj = kfd_process_find_bo_from_interval(p, address, last); ++ if (!buf_obj) { ++ pr_err("can not find a kfd_bo for the range\n"); ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ rdma_cb_data = kmalloc(sizeof(*rdma_cb_data), GFP_KERNEL); ++ if (!rdma_cb_data) { ++ *amd_p2p_data = NULL; ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ mem = buf_obj->mem; ++ dev = buf_obj->dev; ++ offset = address - buf_obj->it.start; ++ ++ ret = dev->kfd2kgd->pin_get_sg_table_bo(dev->kgd, mem, ++ offset, length, &sg_table_tmp); ++ ++ if (ret) { ++ pr_err("pin_get_sg_table_bo failed.\n"); ++ *amd_p2p_data = NULL; ++ goto free_mem; ++ } ++ ++ rdma_cb_data->amd_p2p_data.va = address; ++ rdma_cb_data->amd_p2p_data.size = length; ++ rdma_cb_data->amd_p2p_data.pid = pid; ++ rdma_cb_data->amd_p2p_data.priv = buf_obj; ++ rdma_cb_data->amd_p2p_data.pages = sg_table_tmp; ++ ++ rdma_cb_data->free_callback = free_callback; ++ rdma_cb_data->client_priv = client_priv; ++ ++ list_add(&rdma_cb_data->node, &buf_obj->cb_data_head); ++ ++ *amd_p2p_data = &rdma_cb_data->amd_p2p_data; ++ ++ goto out; ++ ++free_mem: ++ kfree(rdma_cb_data); ++out: ++ up_read(&p->lock); ++ ++ return ret; ++} ++ ++static int put_pages_helper(struct amd_p2p_info *p2p_data) ++{ ++ struct kfd_bo *buf_obj; ++ struct kfd_dev *dev; ++ struct sg_table *sg_table_tmp; ++ struct rdma_cb *rdma_cb_data; ++ ++ if (!p2p_data) { ++ pr_err("amd_p2p_info pointer is invalid.\n"); ++ return -EINVAL; ++ } ++ ++ rdma_cb_data = container_of(p2p_data, struct rdma_cb, amd_p2p_data); ++ ++ buf_obj = p2p_data->priv; ++ dev = buf_obj->dev; ++ 
sg_table_tmp = p2p_data->pages; ++ ++ list_del(&rdma_cb_data->node); ++ kfree(rdma_cb_data); ++ ++ dev->kfd2kgd->unpin_put_sg_table_bo(buf_obj->mem, sg_table_tmp); ++ ++ ++ return 0; ++} ++ ++void run_rdma_free_callback(struct kfd_bo *buf_obj) ++{ ++ struct rdma_cb *tmp, *rdma_cb_data; ++ ++ list_for_each_entry_safe(rdma_cb_data, tmp, ++ &buf_obj->cb_data_head, node) { ++ if (rdma_cb_data->free_callback) ++ rdma_cb_data->free_callback( ++ rdma_cb_data->client_priv); ++ ++ put_pages_helper(&rdma_cb_data->amd_p2p_data); ++ } ++} ++ ++/** ++ * ++ * This function release resources previously allocated by get_pages() call. ++ * ++ * \param p_p2p_data - A pointer to pointer to amd_p2p_info entries ++ * allocated by get_pages() call. ++ * ++ * \return 0 if operation was successful ++ */ ++static int put_pages(struct amd_p2p_info **p_p2p_data) ++{ ++ struct kfd_process *p = NULL; ++ int ret = 0; ++ ++ if (!(*p_p2p_data)) { ++ pr_err("amd_p2p_info pointer is invalid.\n"); ++ return -EINVAL; ++ } ++ ++ p = kfd_lookup_process_by_pid((*p_p2p_data)->pid); ++ if (!p) { ++ pr_err("could not find the process in %s\n", ++ __func__); ++ return -EINVAL; ++ } ++ ++ ret = put_pages_helper(*p_p2p_data); ++ ++ if (!ret) ++ *p_p2p_data = NULL; ++ ++ up_read(&p->lock); ++ ++ return ret; ++} ++ ++/** ++ * Check if given address belongs to GPU address space. ++ * ++ * \param address - Address to check ++ * \param pid - Process to which given address belongs. ++ * Could be NULL if current one. ++ * ++ * \return 0 - This is not GPU address managed by AMD driver ++ * 1 - This is GPU address managed by AMD driver ++ */ ++static int is_gpu_address(uint64_t address, struct pid *pid) ++{ ++ struct kfd_bo *buf_obj; ++ struct kfd_process *p; ++ ++ p = kfd_lookup_process_by_pid(pid); ++ if (!p) { ++ pr_err("could not find the process in %s.\n", ++ __func__); ++ return 0; ++ } ++ ++ buf_obj = kfd_process_find_bo_from_interval(p, address, address); ++ ++ up_read(&p->lock); ++ if (!buf_obj) ++ return 0; ++ else ++ return 1; ++} ++ ++/** ++ * Return the single page size to be used when building scatter/gather table ++ * for given range. ++ * ++ * \param address - Address ++ * \param length - Range length ++ * \param pid - Process id structure. Could be NULL if current one. ++ * \param page_size - On return: Page size ++ * ++ * \return 0 if operation was successful ++ */ ++static int get_page_size(uint64_t address, uint64_t length, struct pid *pid, ++ unsigned long *page_size) ++{ ++ /* ++ * As local memory is always consecutive, we can assume the local ++ * memory page size to be arbitrary. ++ * Currently we assume the local memory page size to be the same ++ * as system memory, which is 4KB. ++ */ ++ *page_size = PAGE_SIZE; ++ ++ return 0; ++} ++ ++ ++/** ++ * Singleton object: rdma interface function pointers ++ */ ++static const struct amd_rdma_interface rdma_ops = { ++ .get_pages = get_pages, ++ .put_pages = put_pages, ++ .is_gpu_address = is_gpu_address, ++ .get_page_size = get_page_size, ++}; ++ ++/** ++ * amdkfd_query_rdma_interface - Return interface (function pointers table) for ++ * rdma interface ++ * ++ * ++ * \param interace - OUT: Pointer to interface ++ * ++ * \return 0 if operation was successful. 
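++ *
++ * A minimal usage sketch for a hypothetical peer-driver client (the
++ * client-side names va, size, pid, client_free_cb and client_priv are
++ * illustrative, not part of this API):
++ *
++ *	const struct amd_rdma_interface *rdma_ops;
++ *	struct amd_p2p_info *p2p_info;
++ *
++ *	if (amdkfd_query_rdma_interface(&rdma_ops))
++ *		return -ENODEV;
++ *	if (rdma_ops->is_gpu_address(va, pid) &&
++ *	    rdma_ops->get_pages(va, size, pid, &p2p_info,
++ *				client_free_cb, client_priv) == 0) {
++ *		... DMA to/from the buffer described by p2p_info->pages ...
++ *		rdma_ops->put_pages(&p2p_info);
++ *	}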
++ */ ++int amdkfd_query_rdma_interface(const struct amd_rdma_interface **ops) ++{ ++ *ops = &rdma_ops; ++ ++ return 0; ++} ++EXPORT_SYMBOL(amdkfd_query_rdma_interface); ++ ++ ++ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +index 1e50647..ba1c61c 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +@@ -28,16 +28,19 @@ + #include <linux/hash.h> + #include <linux/cpufreq.h> + #include <linux/log2.h> ++#include <linux/dmi.h> ++#include <linux/atomic.h> + + #include "kfd_priv.h" + #include "kfd_crat.h" + #include "kfd_topology.h" + +-static struct list_head topology_device_list; +-static int topology_crat_parsed; ++/* topology_device_list - Master list of all topology devices */ ++struct list_head topology_device_list; + static struct kfd_system_properties sys_props; + + static DECLARE_RWSEM(topology_lock); ++static atomic_t topology_crat_proximity_domain; + + struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) + { +@@ -57,311 +60,61 @@ struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) + return device; + } + +-struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) ++uint32_t kfd_get_gpu_id(struct kfd_dev *dev) + { + struct kfd_topology_device *top_dev; +- struct kfd_dev *device = NULL; ++ uint32_t gpu_id = 0; + + down_read(&topology_lock); + + list_for_each_entry(top_dev, &topology_device_list, list) +- if (top_dev->gpu->pdev == pdev) { +- device = top_dev->gpu; ++ if (top_dev->gpu == dev) { ++ gpu_id = top_dev->gpu_id; + break; + } + + up_read(&topology_lock); + +- return device; +-} +- +-static int kfd_topology_get_crat_acpi(void *crat_image, size_t *size) +-{ +- struct acpi_table_header *crat_table; +- acpi_status status; +- +- if (!size) +- return -EINVAL; +- +- /* +- * Fetch the CRAT table from ACPI +- */ +- status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); +- if (status == AE_NOT_FOUND) { +- pr_warn("CRAT table not found\n"); +- return -ENODATA; +- } else if (ACPI_FAILURE(status)) { +- const char *err = acpi_format_exception(status); +- +- pr_err("CRAT table error: %s\n", err); +- return -EINVAL; +- } +- +- if (*size >= crat_table->length && crat_image != NULL) +- memcpy(crat_image, crat_table, crat_table->length); +- +- *size = crat_table->length; +- +- return 0; ++ return gpu_id; + } + +-static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, +- struct crat_subtype_computeunit *cu) +-{ +- BUG_ON(!dev); +- BUG_ON(!cu); +- +- dev->node_props.cpu_cores_count = cu->num_cpu_cores; +- dev->node_props.cpu_core_id_base = cu->processor_id_low; +- if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) +- dev->node_props.capability |= HSA_CAP_ATS_PRESENT; +- +- pr_info("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, +- cu->processor_id_low); +-} +- +-static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, +- struct crat_subtype_computeunit *cu) +-{ +- BUG_ON(!dev); +- BUG_ON(!cu); +- +- dev->node_props.simd_id_base = cu->processor_id_low; +- dev->node_props.simd_count = cu->num_simd_cores; +- dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; +- dev->node_props.max_waves_per_simd = cu->max_waves_simd; +- dev->node_props.wave_front_size = cu->wave_front_size; +- dev->node_props.mem_banks_count = cu->num_banks; +- dev->node_props.array_count = cu->num_arrays; +- dev->node_props.cu_per_simd_array = cu->num_cu_per_array; +- dev->node_props.simd_per_cu = cu->num_simd_per_cu; +- dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; +- 
if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) +- dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; +- pr_info("CU GPU: simds=%d id_base=%d\n", cu->num_simd_cores, +- cu->processor_id_low); +-} +- +-/* kfd_parse_subtype_cu is called when the topology mutex is already acquired */ +-static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu) +-{ +- struct kfd_topology_device *dev; +- int i = 0; +- +- BUG_ON(!cu); +- +- pr_info("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", +- cu->proximity_domain, cu->hsa_capability); +- list_for_each_entry(dev, &topology_device_list, list) { +- if (cu->proximity_domain == i) { +- if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) +- kfd_populated_cu_info_cpu(dev, cu); +- +- if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) +- kfd_populated_cu_info_gpu(dev, cu); +- break; +- } +- i++; +- } +- +- return 0; +-} +- +-/* +- * kfd_parse_subtype_mem is called when the topology mutex is +- * already acquired +- */ +-static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem) ++struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) + { +- struct kfd_mem_properties *props; +- struct kfd_topology_device *dev; +- int i = 0; +- +- BUG_ON(!mem); +- +- pr_info("Found memory entry in CRAT table with proximity_domain=%d\n", +- mem->promixity_domain); +- list_for_each_entry(dev, &topology_device_list, list) { +- if (mem->promixity_domain == i) { +- props = kfd_alloc_struct(props); +- if (props == NULL) +- return -ENOMEM; +- +- if (dev->node_props.cpu_cores_count == 0) +- props->heap_type = HSA_MEM_HEAP_TYPE_FB_PRIVATE; +- else +- props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; +- +- if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) +- props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; +- if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) +- props->flags |= HSA_MEM_FLAGS_NON_VOLATILE; +- +- props->size_in_bytes = +- ((uint64_t)mem->length_high << 32) + +- mem->length_low; +- props->width = mem->width; ++ struct kfd_topology_device *top_dev; ++ struct kfd_dev *device = NULL; + +- dev->mem_bank_count++; +- list_add_tail(&props->list, &dev->mem_props); ++ down_read(&topology_lock); + ++ list_for_each_entry(top_dev, &topology_device_list, list) ++ if (top_dev->gpu && top_dev->gpu->pdev == pdev) { ++ device = top_dev->gpu; + break; + } +- i++; +- } +- +- return 0; +-} +- +-/* +- * kfd_parse_subtype_cache is called when the topology mutex +- * is already acquired +- */ +-static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache) +-{ +- struct kfd_cache_properties *props; +- struct kfd_topology_device *dev; +- uint32_t id; +- +- BUG_ON(!cache); +- +- id = cache->processor_id_low; + +- pr_info("Found cache entry in CRAT table with processor_id=%d\n", id); +- list_for_each_entry(dev, &topology_device_list, list) +- if (id == dev->node_props.cpu_core_id_base || +- id == dev->node_props.simd_id_base) { +- props = kfd_alloc_struct(props); +- if (props == NULL) +- return -ENOMEM; +- +- props->processor_id_low = id; +- props->cache_level = cache->cache_level; +- props->cache_size = cache->cache_size; +- props->cacheline_size = cache->cache_line_size; +- props->cachelines_per_tag = cache->lines_per_tag; +- props->cache_assoc = cache->associativity; +- props->cache_latency = cache->cache_latency; +- +- if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) +- props->cache_type |= HSA_CACHE_TYPE_DATA; +- if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) +- props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; +- if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) +- 
props->cache_type |= HSA_CACHE_TYPE_CPU; +- if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) +- props->cache_type |= HSA_CACHE_TYPE_HSACU; +- +- dev->cache_count++; +- dev->node_props.caches_count++; +- list_add_tail(&props->list, &dev->cache_props); +- +- break; +- } ++ up_read(&topology_lock); + +- return 0; ++ return device; + } + +-/* +- * kfd_parse_subtype_iolink is called when the topology mutex +- * is already acquired +- */ +-static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink) ++struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd) + { +- struct kfd_iolink_properties *props; +- struct kfd_topology_device *dev; +- uint32_t i = 0; +- uint32_t id_from; +- uint32_t id_to; +- +- BUG_ON(!iolink); +- +- id_from = iolink->proximity_domain_from; +- id_to = iolink->proximity_domain_to; ++ struct kfd_topology_device *top_dev; ++ struct kfd_dev *device = NULL; + +- pr_info("Found IO link entry in CRAT table with id_from=%d\n", id_from); +- list_for_each_entry(dev, &topology_device_list, list) { +- if (id_from == i) { +- props = kfd_alloc_struct(props); +- if (props == NULL) +- return -ENOMEM; +- +- props->node_from = id_from; +- props->node_to = id_to; +- props->ver_maj = iolink->version_major; +- props->ver_min = iolink->version_minor; +- +- /* +- * weight factor (derived from CDIR), currently always 1 +- */ +- props->weight = 1; +- +- props->min_latency = iolink->minimum_latency; +- props->max_latency = iolink->maximum_latency; +- props->min_bandwidth = iolink->minimum_bandwidth_mbs; +- props->max_bandwidth = iolink->maximum_bandwidth_mbs; +- props->rec_transfer_size = +- iolink->recommended_transfer_size; +- +- dev->io_link_count++; +- dev->node_props.io_links_count++; +- list_add_tail(&props->list, &dev->io_link_props); ++ down_read(&topology_lock); + ++ list_for_each_entry(top_dev, &topology_device_list, list) ++ if (top_dev->gpu && top_dev->gpu->kgd == kgd) { ++ device = top_dev->gpu; + break; + } +- i++; +- } + +- return 0; +-} +- +-static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr) +-{ +- struct crat_subtype_computeunit *cu; +- struct crat_subtype_memory *mem; +- struct crat_subtype_cache *cache; +- struct crat_subtype_iolink *iolink; +- int ret = 0; +- +- BUG_ON(!sub_type_hdr); +- +- switch (sub_type_hdr->type) { +- case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: +- cu = (struct crat_subtype_computeunit *)sub_type_hdr; +- ret = kfd_parse_subtype_cu(cu); +- break; +- case CRAT_SUBTYPE_MEMORY_AFFINITY: +- mem = (struct crat_subtype_memory *)sub_type_hdr; +- ret = kfd_parse_subtype_mem(mem); +- break; +- case CRAT_SUBTYPE_CACHE_AFFINITY: +- cache = (struct crat_subtype_cache *)sub_type_hdr; +- ret = kfd_parse_subtype_cache(cache); +- break; +- case CRAT_SUBTYPE_TLB_AFFINITY: +- /* +- * For now, nothing to do here +- */ +- pr_info("Found TLB entry in CRAT table (not processing)\n"); +- break; +- case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: +- /* +- * For now, nothing to do here +- */ +- pr_info("Found CCOMPUTE entry in CRAT table (not processing)\n"); +- break; +- case CRAT_SUBTYPE_IOLINK_AFFINITY: +- iolink = (struct crat_subtype_iolink *)sub_type_hdr; +- ret = kfd_parse_subtype_iolink(iolink); +- break; +- default: +- pr_warn("Unknown subtype (%d) in CRAT\n", +- sub_type_hdr->type); +- } ++ up_read(&topology_lock); + +- return ret; ++ return device; + } + ++/* Called with write topology_lock acquired */ + static void kfd_release_topology_device(struct kfd_topology_device *dev) + { + struct kfd_mem_properties *mem; +@@ -398,20 +151,22 @@ static void 
kfd_release_topology_device(struct kfd_topology_device *dev) + sys_props.num_devices--; + } + +-static void kfd_release_live_view(void) ++void kfd_release_live_view(void) + { + struct kfd_topology_device *dev; + ++ down_write(&topology_lock); + while (topology_device_list.next != &topology_device_list) { + dev = container_of(topology_device_list.next, + struct kfd_topology_device, list); + kfd_release_topology_device(dev); +-} +- ++ } ++ up_write(&topology_lock); + memset(&sys_props, 0, sizeof(sys_props)); + } + +-static struct kfd_topology_device *kfd_create_topology_device(void) ++struct kfd_topology_device *kfd_create_topology_device( ++ struct list_head *device_list) + { + struct kfd_topology_device *dev; + +@@ -425,65 +180,12 @@ static struct kfd_topology_device *kfd_create_topology_device(void) + INIT_LIST_HEAD(&dev->cache_props); + INIT_LIST_HEAD(&dev->io_link_props); + +- list_add_tail(&dev->list, &topology_device_list); ++ list_add_tail(&dev->list, device_list); + sys_props.num_devices++; + + return dev; + } + +-static int kfd_parse_crat_table(void *crat_image) +-{ +- struct kfd_topology_device *top_dev; +- struct crat_subtype_generic *sub_type_hdr; +- uint16_t node_id; +- int ret; +- struct crat_header *crat_table = (struct crat_header *)crat_image; +- uint16_t num_nodes; +- uint32_t image_len; +- +- if (!crat_image) +- return -EINVAL; +- +- num_nodes = crat_table->num_domains; +- image_len = crat_table->length; +- +- pr_info("Parsing CRAT table with %d nodes\n", num_nodes); +- +- for (node_id = 0; node_id < num_nodes; node_id++) { +- top_dev = kfd_create_topology_device(); +- if (!top_dev) { +- kfd_release_live_view(); +- return -ENOMEM; +- } +- } +- +- sys_props.platform_id = +- (*((uint64_t *)crat_table->oem_id)) & CRAT_OEMID_64BIT_MASK; +- sys_props.platform_oem = *((uint64_t *)crat_table->oem_table_id); +- sys_props.platform_rev = crat_table->revision; +- +- sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); +- while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) < +- ((char *)crat_image) + image_len) { +- if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) { +- ret = kfd_parse_subtype(sub_type_hdr); +- if (ret != 0) { +- kfd_release_live_view(); +- return ret; +- } +- } +- +- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + +- sub_type_hdr->length); +- } +- +- sys_props.generation_count++; +- topology_crat_parsed = 1; +- +- return 0; +-} +- +- + #define sysfs_show_gen_prop(buffer, fmt, ...) \ + snprintf(buffer, PAGE_SIZE, "%s"fmt, buffer, __VA_ARGS__) + #define sysfs_show_32bit_prop(buffer, name, value) \ +@@ -593,7 +295,7 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, + char *buffer) + { + ssize_t ret; +- uint32_t i; ++ uint32_t i, j; + struct kfd_cache_properties *cache; + + /* Making sure that the buffer is an empty string */ +@@ -611,12 +313,18 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, + sysfs_show_32bit_prop(buffer, "latency", cache->cache_latency); + sysfs_show_32bit_prop(buffer, "type", cache->cache_type); + snprintf(buffer, PAGE_SIZE, "%ssibling_map ", buffer); +- for (i = 0; i < KFD_TOPOLOGY_CPU_SIBLINGS; i++) +- ret = snprintf(buffer, PAGE_SIZE, "%s%d%s", +- buffer, cache->sibling_map[i], +- (i == KFD_TOPOLOGY_CPU_SIBLINGS-1) ? 
+- "\n" : ","); +- ++ for (i = 0; i < CRAT_SIBLINGMAP_SIZE; i++) ++ for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++) { ++ /* Check each bit */ ++ if (cache->sibling_map[i] & (1 << j)) ++ ret = snprintf(buffer, PAGE_SIZE, ++ "%s%d%s", buffer, 1, ","); ++ else ++ ret = snprintf(buffer, PAGE_SIZE, ++ "%s%d%s", buffer, 0, ","); ++ } ++ /* Replace the last "," with end of line */ ++ *(buffer + strlen(buffer) - 1) = 0xA; + return ret; + } + +@@ -635,6 +343,7 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, + char public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; + uint32_t i; + uint32_t log_max_watch_addr; ++ struct kfd_local_mem_info local_mem_info; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; +@@ -674,7 +383,6 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, + } else { + sysfs_show_32bit_prop(buffer, "mem_banks_count", + dev->node_props.mem_banks_count); +- } + + sysfs_show_32bit_prop(buffer, "caches_count", + dev->node_props.caches_count); +@@ -723,17 +431,30 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, + HSA_CAP_WATCH_POINTS_TOTALBITS_MASK); + } + ++ if (dev->gpu->device_info->asic_family == CHIP_TONGA) ++ dev->node_props.capability |= ++ HSA_CAP_AQL_QUEUE_DOUBLE_MAP; ++ + sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute", +- dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz( +- dev->gpu->kgd)); ++ dev->node_props.max_engine_clk_fcompute); + +- sysfs_show_64bit_prop(buffer, "local_mem_size", +- (unsigned long long int) 0); ++ /* ++ * If the ASIC is CZ, set local memory size to 0 to disable ++ * local memory support ++ */ ++ if (dev->gpu->device_info->asic_family != CHIP_CARRIZO) { ++ dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd, ++ &local_mem_info); ++ sysfs_show_64bit_prop(buffer, "local_mem_size", ++ local_mem_info.local_mem_size_private + ++ local_mem_info.local_mem_size_public); ++ } ++ else ++ sysfs_show_64bit_prop(buffer, "local_mem_size", ++ (unsigned long long int) 0); + + sysfs_show_32bit_prop(buffer, "fw_version", +- dev->gpu->kfd2kgd->get_fw_version( +- dev->gpu->kgd, +- KGD_ENGINE_MEC1)); ++ dev->gpu->mec_fw_version); + sysfs_show_32bit_prop(buffer, "capability", + dev->node_props.capability); + } +@@ -928,6 +649,7 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, + return 0; + } + ++/* Called with write topology lock acquired */ + static int kfd_build_sysfs_node_tree(void) + { + struct kfd_topology_device *dev; +@@ -944,6 +666,7 @@ static int kfd_build_sysfs_node_tree(void) + return 0; + } + ++/* Called with write topology lock acquired */ + static void kfd_remove_sysfs_node_tree(void) + { + struct kfd_topology_device *dev; +@@ -1015,88 +738,200 @@ static void kfd_topology_release_sysfs(void) + } + } + ++/* Called with write topology_lock acquired */ ++static int kfd_topology_update_device_list(struct list_head *temp_list, ++ struct list_head *master_list) ++{ ++ int num = 0; ++ ++ while (!list_empty(temp_list)) { ++ list_move_tail(temp_list->next, master_list); ++ num++; ++ } ++ return num; ++} ++ ++static void kfd_debug_print_topology(void) ++{ ++ struct kfd_topology_device *dev; ++ ++ down_read(&topology_lock); ++ ++ dev = list_last_entry(&topology_device_list, struct kfd_topology_device, list); ++ if (dev) { ++ if (dev->node_props.cpu_cores_count && dev->node_props.simd_count) { ++ pr_info("Topology: Add APU node [0x%0x:0x%0x]\n", ++ dev->node_props.device_id, dev->node_props.vendor_id); ++ } ++ else if (dev->node_props.cpu_cores_count) 
++ pr_info("Topology: Add CPU node\n"); ++ else if (dev->node_props.simd_count) ++ pr_info("Topology: Add dGPU node [0x%0x:0x%0x]\n", ++ dev->node_props.device_id, dev->node_props.vendor_id); ++ } ++ up_read(&topology_lock); ++} ++ ++/* Helper function for intializing platform_xx members of kfd_system_properties ++ */ ++static void kfd_update_system_properties(void) ++{ ++ struct kfd_topology_device *dev; ++ ++ down_read(&topology_lock); ++ dev = list_last_entry(&topology_device_list, struct kfd_topology_device, list); ++ if (dev) { ++ sys_props.platform_id = ++ (*((uint64_t *)dev->oem_id)) & CRAT_OEMID_64BIT_MASK; ++ sys_props.platform_oem = *((uint64_t *)dev->oem_table_id); ++ sys_props.platform_rev = dev->oem_revision; ++ } ++ up_read(&topology_lock); ++} ++ ++static void find_system_memory(const struct dmi_header *dm, ++ void *private) ++{ ++ struct kfd_mem_properties *mem; ++ u16 mem_width, mem_clock; ++ struct kfd_topology_device *kdev = ++ (struct kfd_topology_device *)private; ++ const u8 *dmi_data = (const u8 *)(dm + 1); ++ ++ if (dm->type == DMI_ENTRY_MEM_DEVICE && dm->length >= 0x15) { ++ mem_width = (u16)(*(const u16 *)(dmi_data + 0x6)); ++ mem_clock = (u16)(*(const u16 *)(dmi_data + 0x11)); ++ list_for_each_entry(mem, &kdev->mem_props, list) { ++ if (mem_width != 0xFFFF && mem_width != 0) ++ mem->width = mem_width; ++ if (mem_clock != 0) ++ mem->mem_clk_max = mem_clock; ++ } ++ } ++} ++/* kfd_add_non_crat_information - Add information that is not currently ++ * defined in CRAT but is necessary for KFD topology ++ * @dev - topology device to which addition info is added ++ */ ++static void kfd_add_non_crat_information(struct kfd_topology_device *kdev) ++{ ++ /* Check if CPU only node. */ ++ if (kdev->gpu == NULL) { ++ /* Add system memory information */ ++ dmi_walk(find_system_memory, kdev); ++ } ++ /* TODO: For GPU node, rearrange code from kfd_topology_add_device */ ++} ++ + int kfd_topology_init(void) + { + void *crat_image = NULL; + size_t image_size = 0; + int ret; ++ struct list_head temp_topology_device_list; ++ int cpu_only_node = 0; ++ struct kfd_topology_device *kdev; ++ int proximity_domain; ++ int num_nodes; ++ ++ /* topology_device_list - Master list of all topology devices ++ * temp_topology_device_list - temporary list created while parsing CRAT ++ * or VCRAT. Once parsing is complete the contents of list is moved to ++ * topology_device_list ++ */ + +- /* +- * Initialize the head for the topology device list ++ /* Initialize the head for the both the lists + */ + INIT_LIST_HEAD(&topology_device_list); ++ INIT_LIST_HEAD(&temp_topology_device_list); + init_rwsem(&topology_lock); +- topology_crat_parsed = 0; + + memset(&sys_props, 0, sizeof(sys_props)); + ++ /* Proximity domains in ACPI CRAT tables start counting at ++ * 0. The same should be true for virtual CRAT tables created ++ * at this stage. GPUs added later in kfd_topology_add_device ++ * use a counter. */ ++ proximity_domain = 0; ++ + /* +- * Get the CRAT image from the ACPI ++ * Get the CRAT image from the ACPI. If ACPI doesn't have one ++ * create a virtual CRAT. ++ * NOTE: The current implementation expects all AMD APUs to have ++ * CRAT. 
If no CRAT is available, it is assumed to be a CPU + */ +- ret = kfd_topology_get_crat_acpi(crat_image, &image_size); +- if (ret == 0 && image_size > 0) { +- pr_info("Found CRAT image with size=%zd\n", image_size); +- crat_image = kmalloc(image_size, GFP_KERNEL); +- if (!crat_image) { +- ret = -ENOMEM; +- pr_err("No memory for allocating CRAT image\n"); +- goto err; +- } +- ret = kfd_topology_get_crat_acpi(crat_image, &image_size); +- +- if (ret == 0) { +- down_write(&topology_lock); +- ret = kfd_parse_crat_table(crat_image); +- if (ret == 0) +- ret = kfd_topology_update_sysfs(); +- up_write(&topology_lock); +- } else { +- pr_err("Couldn't get CRAT table size from ACPI\n"); +- } +- kfree(crat_image); +- } else if (ret == -ENODATA) { +- ret = 0; +- } else { +- pr_err("Couldn't get CRAT table size from ACPI\n"); ++ ret = kfd_create_crat_image_acpi(&crat_image, &image_size); ++ if (ret != 0) { ++ ret = kfd_create_crat_image_virtual(&crat_image, &image_size, ++ COMPUTE_UNIT_CPU, NULL, ++ proximity_domain); ++ cpu_only_node = 1; ++ } ++ ++ if (ret == 0) ++ ret = kfd_parse_crat_table(crat_image, ++ &temp_topology_device_list, ++ proximity_domain); ++ else { ++ pr_err("Error getting/creating CRAT table\n"); ++ goto err; ++ } ++ ++ down_write(&topology_lock); ++ num_nodes = kfd_topology_update_device_list(&temp_topology_device_list, ++ &topology_device_list); ++ atomic_set(&topology_crat_proximity_domain, num_nodes-1); ++ ret = kfd_topology_update_sysfs(); ++ up_write(&topology_lock); ++ ++ if (ret == 0) { ++ sys_props.generation_count++; ++ kfd_update_system_properties(); ++ kfd_debug_print_topology(); ++ pr_info("Finished initializing topology\n"); ++ } ++ else ++ pr_err("Failed to update topology in sysfs ret=%d\n", ret); ++ ++ /* For nodes with GPU, this information gets added ++ * when GPU is detected (kfd_topology_add_device). */ ++ if (cpu_only_node) { ++ /* Add additional information to CPU only node created above */ ++ down_write(&topology_lock); ++ kdev = list_first_entry(&topology_device_list, ++ struct kfd_topology_device, list); ++ up_write(&topology_lock); ++ kfd_add_non_crat_information(kdev); + } + + err: +- pr_info("Finished initializing topology ret=%d\n", ret); ++ kfd_destroy_crat_image(crat_image); + return ret; + } + + void kfd_topology_shutdown(void) + { ++ down_write(&topology_lock); + kfd_topology_release_sysfs(); ++ up_write(&topology_lock); + kfd_release_live_view(); + } + +-static void kfd_debug_print_topology(void) +-{ +- struct kfd_topology_device *dev; +- uint32_t i = 0; +- +- pr_info("DEBUG PRINT OF TOPOLOGY:"); +- list_for_each_entry(dev, &topology_device_list, list) { +- pr_info("Node: %d\n", i); +- pr_info("\tGPU assigned: %s\n", (dev->gpu ? 
"yes" : "no")); +- pr_info("\tCPU count: %d\n", dev->node_props.cpu_cores_count); +- pr_info("\tSIMD count: %d", dev->node_props.simd_count); +- i++; +- } +-} +- + static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) + { + uint32_t hashout; + uint32_t buf[7]; + uint64_t local_mem_size; + int i; ++ struct kfd_local_mem_info local_mem_info; + + if (!gpu) + return 0; + +- local_mem_size = gpu->kfd2kgd->get_vmem_size(gpu->kgd); ++ gpu->kfd2kgd->get_local_mem_info(gpu->kgd, &local_mem_info); ++ ++ local_mem_size = local_mem_info.local_mem_size_private + ++ local_mem_info.local_mem_size_public; + + buf[0] = gpu->pdev->devfn; + buf[1] = gpu->pdev->subsystem_vendor; +@@ -1111,7 +946,13 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) + + return hashout; + } +- ++/* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If ++ * the GPU device is not already present in the topology device list ++ * then return NULL. This means a new topology device has to be ++ * created for this GPU. ++ * TODO: Rather than assiging @gpu to first topology device withtout ++ * gpu attached, it will better to have more stringent check. ++ */ + static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu) + { + struct kfd_topology_device *dev; +@@ -1119,13 +960,14 @@ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu) + + BUG_ON(!gpu); + ++ down_write(&topology_lock); + list_for_each_entry(dev, &topology_device_list, list) + if (dev->gpu == NULL && dev->node_props.simd_count > 0) { + dev->gpu = gpu; + out_dev = dev; + break; + } +- ++ up_write(&topology_lock); + return out_dev; + } + +@@ -1137,70 +979,146 @@ static void kfd_notify_gpu_change(uint32_t gpu_id, int arrival) + */ + } + ++/* kfd_fill_mem_clk_max_info - Since CRAT doesn't have memory clock info, ++ * patch this after CRAT parsing. ++ */ ++static void kfd_fill_mem_clk_max_info(struct kfd_topology_device *dev) ++{ ++ struct kfd_mem_properties *mem; ++ struct kfd_local_mem_info local_mem_info; ++ ++ if (dev == NULL) ++ return; ++ ++ /* Currently, amdgpu driver (amdgpu_mc) deals only with GPUs with ++ * single bank of VRAM local memory. ++ * for dGPUs - VCRAT reports only one bank of Local Memory ++ * for APUs - If CRAT from ACPI reports more than one bank, then ++ * all the banks will report the same mem_clk_max information ++ */ ++ dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd, ++ &local_mem_info); ++ ++ list_for_each_entry(mem, &dev->mem_props, list) ++ mem->mem_clk_max = local_mem_info.mem_clk_max; ++} ++ + int kfd_topology_add_device(struct kfd_dev *gpu) + { + uint32_t gpu_id; + struct kfd_topology_device *dev; +- int res; ++ struct kfd_cu_info cu_info; ++ int res = 0; ++ struct list_head temp_topology_device_list; ++ void *crat_image = NULL; ++ size_t image_size = 0; ++ int proximity_domain; + + BUG_ON(!gpu); + ++ INIT_LIST_HEAD(&temp_topology_device_list); ++ + gpu_id = kfd_generate_gpu_id(gpu); + + pr_debug("kfd: Adding new GPU (ID: 0x%x) to topology\n", gpu_id); + +- down_write(&topology_lock); +- /* +- * Try to assign the GPU to existing topology device (generated from +- * CRAT table ++ proximity_domain = atomic_inc_return(& ++ topology_crat_proximity_domain); ++ ++ /* Check to see if this gpu device exists in the topology_device_list. ++ * If so, assign the gpu to that device, ++ * else create a Virtual CRAT for this gpu device and then parse that CRAT ++ * to create a new topology device. 
Once created assign the gpu to that ++ * topology device + */ + dev = kfd_assign_gpu(gpu); + if (!dev) { +- pr_info("GPU was not found in the current topology. Extending.\n"); +- kfd_debug_print_topology(); +- dev = kfd_create_topology_device(); +- if (!dev) { +- res = -ENOMEM; ++ res = kfd_create_crat_image_virtual(&crat_image, &image_size, ++ COMPUTE_UNIT_GPU, ++ gpu, proximity_domain); ++ if (res == 0) ++ res = kfd_parse_crat_table(crat_image, ++ &temp_topology_device_list, proximity_domain); ++ else { ++ pr_err("Error in VCRAT for GPU (ID: 0x%x)\n", gpu_id); + goto err; + } +- dev->gpu = gpu; + +- /* +- * TODO: Make a call to retrieve topology information from the +- * GPU vBIOS +- */ ++ down_write(&topology_lock); ++ kfd_topology_update_device_list(&temp_topology_device_list, ++ &topology_device_list); + + /* + * Update the SYSFS tree, since we added another topology device + */ +- if (kfd_topology_update_sysfs() < 0) +- kfd_topology_release_sysfs(); +- ++ res = kfd_topology_update_sysfs(); ++ up_write(&topology_lock); ++ ++ if (res == 0) ++ sys_props.generation_count++; ++ else ++ pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n", ++ gpu_id, res); ++ dev = kfd_assign_gpu(gpu); ++ BUG_ON(!dev); + } + + dev->gpu_id = gpu_id; + gpu->id = gpu_id; ++ ++ /* TODO: Move the following lines to function ++ * kfd_add_non_crat_information */ ++ ++ /* Fill-in additional information that is not available in CRAT but ++ * needed for the topology */ ++ ++ dev->gpu->kfd2kgd->get_cu_info(dev->gpu->kgd, &cu_info); ++ dev->node_props.simd_arrays_per_engine = cu_info.num_shader_arrays_per_engine; ++ + dev->node_props.vendor_id = gpu->pdev->vendor; + dev->node_props.device_id = gpu->pdev->device; +- dev->node_props.location_id = (gpu->pdev->bus->number << 24) + +- (gpu->pdev->devfn & 0xffffff); +- /* +- * TODO: Retrieve max engine clock values from KGD +- */ +- +- if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) { +- dev->node_props.capability |= HSA_CAP_DOORBELL_PACKET_TYPE; +- pr_info("amdkfd: adding doorbell packet type capability\n"); ++ dev->node_props.location_id = PCI_DEVID(gpu->pdev->bus->number, ++ gpu->pdev->devfn); ++ dev->node_props.max_engine_clk_fcompute = ++ dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz(dev->gpu->kgd); ++ dev->node_props.max_engine_clk_ccompute = ++ cpufreq_quick_get_max(0) / 1000; ++ ++ kfd_fill_mem_clk_max_info(dev); ++ ++ switch (dev->gpu->device_info->asic_family) { ++ case CHIP_KAVERI: ++ dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_PRE_1_0 << ++ HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & ++ HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); ++ break; ++ case CHIP_CARRIZO: ++ case CHIP_TONGA: ++ case CHIP_FIJI: ++ pr_debug("amdkfd: adding doorbell packet type capability\n"); ++ dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_1_0 << ++ HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & ++ HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); ++ break; + } + +- res = 0; ++ /* Fix errors in CZ CRAT. ++ * simd_count: Carrizo CRAT reports wrong simd_count, probably because it ++ * doesn't consider masked out CUs ++ * capability flag: Carrizo CRAT doesn't report IOMMU flags. 
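++ * Both values are patched up below; the SIMD count is recomputed from
++ * the live cu_info queried from KGD.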
++ */
++ if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) {
++ dev->node_props.simd_count =
++ cu_info.simd_per_cu * cu_info.cu_active_number;
++ dev->node_props.capability |= HSA_CAP_ATS_PRESENT;
++ }
+
++ kfd_debug_print_topology();
+ err:
+- up_write(&topology_lock);
+-
+ if (res == 0)
+ kfd_notify_gpu_change(gpu_id, 1);
+
++ kfd_destroy_crat_image(crat_image);
+ return res;
+ }
+
+@@ -1233,22 +1151,26 @@ int kfd_topology_remove_device(struct kfd_dev *gpu)
+ return res;
+ }
+
+-/*
+- * When idx is out of bounds, the function will return NULL
++/* kfd_topology_enum_kfd_devices - Enumerate through all devices in KFD
++ * topology. If a GPU device is found at @idx, then a valid kfd_dev pointer
++ * is returned through @kdev
++ * Return - 0: On success (@kdev will be NULL for non-GPU nodes)
++ * -1: If end of list
+ */
+-struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx)
++int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev)
+ {
+
+ struct kfd_topology_device *top_dev;
+- struct kfd_dev *device = NULL;
+ uint8_t device_idx = 0;
+
++ *kdev = NULL;
+ down_read(&topology_lock);
+
+ list_for_each_entry(top_dev, &topology_device_list, list) {
+ if (device_idx == idx) {
+- device = top_dev->gpu;
+- break;
++ *kdev = top_dev->gpu;
++ up_read(&topology_lock);
++ return 0;
+ }
+
+ device_idx++;
+@@ -1256,6 +1178,57 @@ struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx)
+
+ up_read(&topology_lock);
+
+- return device;
++ return -1;
++
++}
++
++static int kfd_cpumask_to_apic_id(const struct cpumask *cpumask)
++{
++ const struct cpuinfo_x86 *cpuinfo;
++ int first_cpu_of_numa_node;
++
++ if (cpumask == NULL || cpumask == cpu_none_mask)
++ return -1;
++ first_cpu_of_numa_node = cpumask_first(cpumask);
++ cpuinfo = &cpu_data(first_cpu_of_numa_node);
++
++ return cpuinfo->apicid;
++}
++
++/* kfd_numa_node_to_apic_id - Returns the APIC ID of the first logical
++ * processor of the given NUMA node (numa_node_id)
++ * Return -1 on failure
++ */
++int kfd_numa_node_to_apic_id(int numa_node_id)
++{
++ if (numa_node_id == -1) {
++ pr_warn("Invalid NUMA Node. Use online CPU mask\n");
++ return kfd_cpumask_to_apic_id(cpu_online_mask);
++ }
++ return kfd_cpumask_to_apic_id(cpumask_of_node(numa_node_id));
++}
++
++/* kfd_get_proximity_domain - Find the proximity_domain (node id) to which
++ * the given PCI bus belongs. The CRAT table contains only the APIC ID
++ * of the parent NUMA node, so use that as the search parameter.
++ * Return -1 on failure
++ */
++int kfd_get_proximity_domain(const struct pci_bus *bus)
++{
++ struct kfd_topology_device *dev;
++ int proximity_domain = -1;
++
++ down_read(&topology_lock);
++
++ list_for_each_entry(dev, &topology_device_list, list)
++ if (dev->node_props.cpu_cores_count &&
++ dev->node_props.cpu_core_id_base ==
++ kfd_cpumask_to_apic_id(cpumask_of_pcibus(bus))) {
++ proximity_domain = dev->proximity_domain;
++ break;
++ }
++
++ up_read(&topology_lock);
+
++ return proximity_domain;
+ }
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+index c3ddb9b..ab28188 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+@@ -39,8 +39,16 @@
+ #define HSA_CAP_WATCH_POINTS_SUPPORTED 0x00000080
+ #define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00
+ #define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8
+-#define HSA_CAP_RESERVED 0xfffff000
++#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK 0x00003000
++#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT 12
++#define HSA_CAP_RESERVED 0xffffc000
++
++#define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0
++#define HSA_CAP_DOORBELL_TYPE_1_0 0x1
++#define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00
++#define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8
+ #define HSA_CAP_DOORBELL_PACKET_TYPE 0x00001000
++#define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000
+
+ struct kfd_node_properties {
+ uint32_t cpu_cores_count;
+@@ -91,8 +99,6 @@ struct kfd_mem_properties {
+ struct attribute attr;
+ };
+
+-#define KFD_TOPOLOGY_CPU_SIBLINGS 256
+-
+ #define HSA_CACHE_TYPE_DATA 0x00000001
+ #define HSA_CACHE_TYPE_INSTRUCTION 0x00000002
+ #define HSA_CACHE_TYPE_CPU 0x00000004
+@@ -109,7 +115,7 @@ struct kfd_cache_properties {
+ uint32_t cache_assoc;
+ uint32_t cache_latency;
+ uint32_t cache_type;
+- uint8_t sibling_map[KFD_TOPOLOGY_CPU_SIBLINGS];
++ uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE];
+ struct kobject *kobj;
+ struct attribute attr;
+ };
+@@ -135,8 +141,8 @@ struct kfd_iolink_properties {
+ struct kfd_topology_device {
+ struct list_head list;
+ uint32_t gpu_id;
++ uint32_t proximity_domain;
+ struct kfd_node_properties node_props;
+- uint32_t mem_bank_count;
+ struct list_head mem_props;
+ uint32_t cache_count;
+ struct list_head cache_props;
+@@ -150,6 +156,9 @@ struct kfd_topology_device {
+ struct attribute attr_gpuid;
+ struct attribute attr_name;
+ struct attribute attr_props;
++ uint8_t oem_id[CRAT_OEMID_LENGTH];
++ uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH];
++ uint32_t oem_revision;
+ };
+
+ struct kfd_system_properties {
+@@ -164,6 +173,8 @@ struct kfd_system_properties {
+ struct attribute attr_props;
+ };
+
+-
++struct kfd_topology_device *kfd_create_topology_device(
++ struct list_head *device_list);
++void kfd_release_live_view(void);
+
+ #endif /* __KFD_TOPOLOGY_H__ */
+diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+index 36f3766..5403164 100644
+--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
++++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+@@ -40,6 +40,41 @@ struct kfd_dev;
+ struct kgd_dev;
+
+ struct kgd_mem;
++struct kfd_process_device;
++struct amdgpu_bo;
++
++struct kfd_vm_fault_info {
++ uint64_t page_addr;
++ uint32_t vmid;
++ uint32_t mc_id;
++ uint32_t status;
++ bool prot_valid;
++ bool prot_read;
++ bool prot_write;
++ bool prot_exec;
++};
++
++struct kfd_cu_info {
++ uint32_t num_shader_engines;
++ uint32_t num_shader_arrays_per_engine;
++ uint32_t num_cu_per_sh;
++ uint32_t cu_active_number;
++ uint32_t cu_ao_mask;
++ uint32_t simd_per_cu;
++ uint32_t max_waves_per_simd;
++ uint32_t wave_front_size;
++ uint32_t max_scratch_slots_per_cu;
++ uint32_t lds_size;
++ uint32_t cu_bitmap[4][4];
++};
++
++/* For getting GPU local memory information from KGD */
++struct kfd_local_mem_info {
++ uint64_t local_mem_size_private;
++ uint64_t local_mem_size_public;
++ uint32_t vram_width;
++ uint32_t mem_clk_max;
++};
+
+ enum kgd_memory_pool {
+ KGD_POOL_SYSTEM_CACHEABLE = 1,
+@@ -80,8 +115,28 @@ struct kgd2kfd_shared_resources {
+
+ /* Number of bytes at start of aperture reserved for KGD. */
+ size_t doorbell_start_offset;
++
++ /* GPUVM address space size in bytes */
++ uint64_t gpuvm_size;
+ };
+
++/*
++ * Allocation flag domains; currently only the VRAM and GTT domains
++ * are supported
++ */
++#define ALLOC_MEM_FLAGS_VRAM (1 << 0)
++#define ALLOC_MEM_FLAGS_GTT (1 << 1)
++#define ALLOC_MEM_FLAGS_USERPTR (1 << 2)
++
++/*
++ * Allocation flags attributes/access options.
++ */
++#define ALLOC_MEM_FLAGS_NONPAGED (1 << 31)
++#define ALLOC_MEM_FLAGS_READONLY (1 << 30)
++#define ALLOC_MEM_FLAGS_PUBLIC (1 << 29)
++#define ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28)
++#define ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27)
++#define ALLOC_MEM_FLAGS_EXECUTE_ACCESS (1 << 26)
++
+ /**
+ * struct kfd2kgd_calls
+ *
+@@ -90,7 +145,7 @@ struct kgd2kfd_shared_resources
+ *
+ * @free_gtt_mem: Frees a buffer that was allocated on the gart aperture
+ *
+- * @get_vmem_size: Retrieves (physical) size of VRAM
++ * @get_local_mem_info: Retrieves information about GPU local memory
+ *
+ * @get_gpu_clock_counter: Retrieves GPU clock counter
+ *
+@@ -121,8 +176,23 @@ struct kgd2kfd_shared_resources
+ * @hqd_sdma_destroy: Destructs and preempts the SDMA queue assigned to that
+ * SDMA hqd slot.
+ *
++ * @map_memory_to_gpu: Allocates and pins BO, PD and all related PTs
++ *
++ * @unmap_memory_to_gpu: Releases and unpins BO, PD and all related PTs
++ *
+ * @get_fw_version: Returns FW versions from the header
+ *
++ * @set_num_of_requests: Sets the number of Peripheral Page Requests (PPRs)
++ * sent to the IOMMU when address translation fails
++ *
++ * @get_cu_info: Retrieves info about the activated CUs
++ *
++ * @get_dmabuf_info: Returns information about a dmabuf if it was
++ * created by the GPU driver
++ *
++ * @import_dmabuf: Imports a DMA buffer, creating a new kgd_mem object
++ * Supports only DMA buffers created by GPU driver on the same GPU
++ *
+ * This structure contains function pointers to services that the kgd driver
+ * provides to amdkfd driver.
+ *
+@@ -134,11 +204,23 @@ struct kfd2kgd_calls {
+
+ void (*free_gtt_mem)(struct kgd_dev *kgd, void *mem_obj);
+
++ void (*get_local_mem_info)(struct kgd_dev *kgd,
++ struct kfd_local_mem_info *mem_info);
+ uint64_t (*get_vmem_size)(struct kgd_dev *kgd);
+ uint64_t (*get_gpu_clock_counter)(struct kgd_dev *kgd);
+
+ uint32_t (*get_max_engine_clock_in_mhz)(struct kgd_dev *kgd);
+
++ int (*create_process_vm)(struct kgd_dev *kgd, void **vm);
++ void (*destroy_process_vm)(struct kgd_dev *kgd, void *vm);
++
++ int (*create_process_gpumem)(struct kgd_dev *kgd, uint64_t va, size_t size, void *vm, struct kgd_mem **mem);
++ void (*destroy_process_gpumem)(struct kgd_dev *kgd, struct kgd_mem *mem);
++
++ uint32_t (*get_process_page_dir)(void *vm);
++
++ int (*open_graphic_handle)(struct kgd_dev *kgd, uint64_t va, void *vm, int fd, uint32_t handle, struct kgd_mem **mem);
++
+ /* Register access functions */
+ void (*program_sh_mem_settings)(struct kgd_dev *kgd, uint32_t vmid,
+ uint32_t sh_mem_config, uint32_t sh_mem_ape1_base,
+@@ -151,9 +233,11 @@ struct kfd2kgd_calls {
+ uint32_t hpd_size, uint64_t hpd_gpu_addr);
+
+ int (*init_interrupts)(struct kgd_dev *kgd, uint32_t pipe_id);
++
+
+ int (*hqd_load)(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
+- uint32_t queue_id, uint32_t __user *wptr);
++ uint32_t queue_id, uint32_t __user *wptr,
++ uint32_t page_table_base);
+
+ int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd);
+
+@@ -168,7 +252,7 @@ struct kfd2kgd_calls {
+
+ int (*hqd_sdma_destroy)(struct kgd_dev *kgd, void *mqd,
+ unsigned int timeout);
+-
++
+ int (*address_watch_disable)(struct kgd_dev *kgd);
+ int (*address_watch_execute)(struct kgd_dev *kgd,
+ unsigned int watch_point_id,
+@@ -189,9 +273,53 @@ struct kfd2kgd_calls {
+ uint8_t vmid);
+ void (*write_vmid_invalidate_request)(struct kgd_dev *kgd,
+ uint8_t vmid);
++ int (*alloc_memory_of_gpu)(struct kgd_dev *kgd, uint64_t va,
++ size_t size, void *vm,
++ struct kgd_mem **mem, uint64_t *offset,
++ void **kptr, struct kfd_process_device *pdd,
++ uint32_t flags);
++ int (*free_memory_of_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem);
++ int (*map_memory_to_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem,
++ void *vm);
++ int (*unmap_memory_to_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem,
++ void *vm);
+
+ uint16_t (*get_fw_version)(struct kgd_dev *kgd,
+ enum kgd_engine_type type);
++
++ void (*set_num_of_requests)(struct kgd_dev *kgd,
++ uint8_t num_of_requests);
++ int (*alloc_memory_of_scratch)(struct kgd_dev *kgd,
++ uint64_t va, uint32_t vmid);
++ int (*write_config_static_mem)(struct kgd_dev *kgd, bool swizzle_enable,
++ uint8_t element_size, uint8_t index_stride, uint8_t mtype);
++ void (*get_cu_info)(struct kgd_dev *kgd,
++ struct kfd_cu_info *cu_info);
++ int (*mmap_bo)(struct kgd_dev *kgd, struct vm_area_struct *vma);
++ int (*map_gtt_bo_to_kernel)(struct kgd_dev *kgd,
++ struct kgd_mem *mem, void **kptr);
++ void (*set_vm_context_page_table_base)(struct kgd_dev *kgd, uint32_t vmid,
++ uint32_t page_table_base);
++ struct kfd_process_device* (*get_pdd_from_buffer_object)
++ (struct kgd_dev *kgd, struct kgd_mem *mem);
++ int (*return_bo_size)(struct kgd_dev *kgd, struct kgd_mem *mem);
++
++ int (*pin_get_sg_table_bo)(struct kgd_dev *kgd,
++ struct kgd_mem *mem, uint64_t offset,
++ uint64_t size, struct sg_table **ret_sg);
++ void (*unpin_put_sg_table_bo)(struct kgd_mem *mem,
++ struct sg_table *sg);
++
++ int (*get_dmabuf_info)(struct kgd_dev *kgd, int dma_buf_fd,
++ struct kgd_dev **dma_buf_kgd, uint64_t *bo_size,
++ void *metadata_buffer, size_t buffer_size,
++ uint32_t *metadata_size, uint32_t *flags);
++ int (*import_dmabuf)(struct kgd_dev *kgd, int dma_buf_fd, uint64_t va,
++ void *vm, struct kgd_mem **mem, uint64_t *size);
++
++ int (*get_vm_fault_info)(struct kgd_dev *kgd,
++ struct kfd_vm_fault_info *info);
++
+ };
+
+ /**
+@@ -210,6 +338,10 @@ struct kfd2kgd_calls {
+ *
+ * @resume: Notifies amdkfd about a resume action done to a kgd device
+ *
++ * @quiesce_mm: Quiesce all user queue access to the specified MM address space
++ *
++ * @resume_mm: Resume user queue access to the specified MM address space
++ *
+ * This structure contains function callback pointers so the kgd driver
+ * will notify to the amdkfd about certain status changes.
+ *
+@@ -224,9 +356,13 @@ struct kgd2kfd_calls {
+ void (*interrupt)(struct kfd_dev *kfd, const void *ih_ring_entry);
+ void (*suspend)(struct kfd_dev *kfd);
+ int (*resume)(struct kfd_dev *kfd);
++ int (*evict_bo)(struct kfd_dev *dev, void *ptr);
++ int (*restore)(struct kfd_dev *kfd);
++ int (*quiesce_mm)(struct kfd_dev *kfd, struct mm_struct *mm);
++ int (*resume_mm)(struct kfd_dev *kfd, struct mm_struct *mm);
+ };
+
+ int kgd2kfd_init(unsigned interface_version,
+ const struct kgd2kfd_calls **g2f);
+
+-#endif /* KGD_KFD_INTERFACE_H_INCLUDED */
++#endif /* KGD_KFD_INTERFACE_H_INCLUDED */
+--
+2.7.4
+
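
The doorbell-type capability added to kfd_topology.h above is a two-bit field packed into node_props.capability with the TOTALBITS mask/shift pair, exactly as the CHIP_KAVERI/CHIP_CARRIZO switch in kfd_topology_add_device() does. A minimal standalone sketch of that packing, assuming only the #define values quoted from the hunk (the helper functions and main() are invented for illustration, not driver code):

#include <stdint.h>
#include <stdio.h>

/* Values copied from the kfd_topology.h hunk above. */
#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK	0x00003000
#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT	12
#define HSA_CAP_DOORBELL_TYPE_PRE_1_0		0x0
#define HSA_CAP_DOORBELL_TYPE_1_0		0x1

/* Pack the doorbell type the same way kfd_topology_add_device() does. */
static uint32_t set_doorbell_type(uint32_t capability, uint32_t type)
{
	return capability |
		((type << HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
		 HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
}

/* Recover the field, e.g. for a tool parsing the sysfs capability word. */
static uint32_t get_doorbell_type(uint32_t capability)
{
	return (capability & HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK) >>
		HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT;
}

int main(void)
{
	uint32_t cap = 0;

	/* Carrizo/Tonga/Fiji take the 1.0 doorbell type in the patch. */
	cap = set_doorbell_type(cap, HSA_CAP_DOORBELL_TYPE_1_0);
	printf("capability=0x%08x doorbell_type=%u\n",
	       (unsigned)cap, (unsigned)get_doorbell_type(cap));
	return 0;
}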
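
The location_id change in kfd_topology_add_device() swaps the old open-coded packing, (bus->number << 24) + (devfn & 0xffffff), for PCI_DEVID(), which packs the bus number into bits 15:8 and devfn into bits 7:0. A small sketch contrasting the two encodings; PCI_DEVID is defined locally to match the kernel's <linux/pci.h> so this builds in userspace, and the bus/devfn values are made up:

#include <stdint.h>
#include <stdio.h>

/* Local stand-in matching the kernel's PCI_DEVID() from <linux/pci.h>. */
#define PCI_DEVID(bus, devfn)	((((uint16_t)(bus)) << 8) | (devfn))

int main(void)
{
	uint8_t bus = 0x03;	/* hypothetical values: */
	uint8_t devfn = 0x08;	/* device 1, function 0 on bus 3 */

	/* Old packing removed by the patch: bus number in the top byte. */
	uint32_t old_id = ((uint32_t)bus << 24) + (devfn & 0xffffff);
	/* New packing: a compact 16-bit bus/devfn identifier. */
	uint32_t new_id = PCI_DEVID(bus, devfn);

	printf("old location_id=0x%08x new location_id=0x%04x\n",
	       (unsigned)old_id, (unsigned)new_id);
	return 0;
}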
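
The kfd_topology_enum_kfd_devices() rework changes the caller contract: instead of returning a kfd_dev pointer (NULL when @idx is out of bounds), it now returns 0 with the device passed back through @kdev (left NULL for CPU-only nodes) and -1 past the end of the list. A self-contained mock of that contract; the stub node table and id values below are invented, not kernel code:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Minimal stand-ins so the calling pattern compiles in userspace. */
struct kfd_dev { uint32_t id; };

static struct kfd_dev gpu0 = { 0x1234 };
static struct kfd_dev *nodes[] = { NULL /* CPU-only node */, &gpu0 };

/* Mock mirroring the new return convention of the patched function. */
static int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev)
{
	*kdev = NULL;
	if (idx >= sizeof(nodes) / sizeof(nodes[0]))
		return -1;		/* end of list */
	*kdev = nodes[idx];		/* stays NULL for CPU-only nodes */
	return 0;
}

int main(void)
{
	struct kfd_dev *kdev;
	uint8_t idx;

	for (idx = 0; kfd_topology_enum_kfd_devices(idx, &kdev) == 0; idx++) {
		if (!kdev) {
			printf("node %u: CPU-only, no kfd_dev\n", (unsigned)idx);
			continue;
		}
		printf("node %u: GPU id 0x%x\n", (unsigned)idx, (unsigned)kdev->id);
	}
	return 0;
}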