Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1368-drm-amdkfd-revert-kfd-part-to-a-previous-state.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1368-drm-amdkfd-revert-kfd-part-to-a-previous-state.patch | 31349 |
1 file changed, 31349 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1368-drm-amdkfd-revert-kfd-part-to-a-previous-state.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1368-drm-amdkfd-revert-kfd-part-to-a-previous-state.patch
new file mode 100644
index 00000000..f6937453
--- /dev/null
+++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1368-drm-amdkfd-revert-kfd-part-to-a-previous-state.patch
@@ -0,0 +1,31349 @@
+From 3538bdf4c8b2d8f1f93fd806656bad0c82c6e60f Mon Sep 17 00:00:00 2001
+From: Chaudhary Amit Kumar <chaudharyamit.kumar@amd.com>
+Date: Thu, 18 Oct 2018 18:06:09 +0530
+Subject: [PATCH 1368/4131] drm/amdkfd: revert kfd part to a previous state
+
+Revert following files to "2ba6b00 drm/amd/powerplay: add profile mode for vega10.":
+
+ - drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd*
+ - drivers/gpu/drm/amd/amdkfd/*
+ - drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+ - include/uapi/linux/kfd_ioctl.h
+ - drivers/gpu/drm/radeon/radeon_kfd*
+
+Due to upstream, porting kfd patches to 4.13 all-open has many conflicts.
+It's hard to elegantly fix these conflicts. So we revert the kfd part to a
+previous commit, where we began to first port dkms patches in 4.12 hybrid.
+Then sequentially port all kfd patches.
+
+Change-Id: I75eda45f41ced2f4c444ded126e2b80b53d15f2a
+Signed-off-by: Le.Ma <Le.Ma@amd.com>
+Acked-by: Junwei Zhang <Jerry.Zhang@amd.com>
+Signed-off-by: kalyan.alle <kalyan.alle@amd.com>
+Signed-off-by: Chaudhary Amit Kumar <chaudharyamit.kumar@amd.com>
+---
+ drivers/gpu/drm/amd/amdgpu/Makefile | 4 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu.h | 97 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 354 +--
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 184 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c | 196 --
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 542 +---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 642 +----
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 1227 ----------
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 2578 --------------------
+ drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 246 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 50 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h | 1 +
+ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 13 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 82 +
+ drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 133 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 106 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 22 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 5 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 -
+ drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c | 21 +-
+ drivers/gpu/drm/amd/amdgpu/vid.h | 6 -
+ drivers/gpu/drm/amd/amdkfd/Kconfig | 3 +-
+ drivers/gpu/drm/amd/amdkfd/Makefile | 23 +-
+ drivers/gpu/drm/amd/amdkfd/backport/Makefile | 7 -
+ drivers/gpu/drm/amd/amdkfd/backport/backport.h | 6 -
+ drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 73 +-
+ drivers/gpu/drm/amd/amdkfd/cik_int.h | 24 +-
+ drivers/gpu/drm/amd/amdkfd/cik_regs.h | 3 +-
+ .../gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h | 1384 -----------
+ .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 1388 -----------
+ drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1361 +---------
+ drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 1304 ----------
+ drivers/gpu/drm/amd/amdkfd/kfd_crat.h | 42 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c | 219 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h | 32 -
+ drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c | 24 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h | 27 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c | 75 -
+ drivers/gpu/drm/amd/amdkfd/kfd_device.c | 890 +------
+ .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 1222 +++-------
+ .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 56 +-
+ .../drm/amd/amdkfd/kfd_device_queue_manager_cik.c | 80 +-
+ .../drm/amd/amdkfd/kfd_device_queue_manager_v9.c | 90 -
+ .../drm/amd/amdkfd/kfd_device_queue_manager_vi.c | 123 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 106 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_events.c | 253 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 124 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 133 -
+ drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 91 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_ipc.c | 275 ---
+ drivers/gpu/drm/amd/amdkfd/kfd_ipc.h | 51 -
+ drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 149 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h | 17 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c | 128 -
+ drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c | 377 ---
+ drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c | 361 ---
+ drivers/gpu/drm/amd/amdkfd/kfd_module.c | 61 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 54 -
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h | 18 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 240 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 528 ----
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 329 +--
+ drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 556 +++--
+ drivers/gpu/drm/amd/amdkfd/kfd_pasid.c | 7 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c | 513 ----
+ drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h | 330 ++-
+ drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h | 583 -----
+ drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h | 97 +
+ drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h | 140 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 546 +---
+ drivers/gpu/drm/amd/amdkfd/kfd_process.c | 919 +------
+ .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 290 +--
+ drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 12 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_rdma.c | 294 ---
+ drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 1177 ++++-----
+ drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 41 +-
+ drivers/gpu/drm/amd/amdkfd/soc15_int.h | 84 -
+ drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 231 +-
+ drivers/gpu/drm/drm_pci.c | 1 +
+ drivers/gpu/drm/radeon/radeon_kfd.c | 19 +-
+ include/drm/drm_drv.h | 2 +
+ include/drm/drm_pci.h | 7 +
+ 82 files changed, 3407 insertions(+), 20703 deletions(-)
+ delete mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
+ mode change 100755 => 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
+ mode change 100755 => 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
+ delete mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+ delete mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+ mode change 100755 => 100644 drivers/gpu/drm/amd/amdkfd/Makefile
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/backport/Makefile
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/backport/backport.h
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_ipc.c
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_ipc.h
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
+ mode change 100755 => 100644 drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_rdma.c
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/soc15_int.h
+ mode change 100755 => 100644 drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
+index 6b373d0..bc6f49e 100755
+--- a/drivers/gpu/drm/amd/amdgpu/Makefile
++++ b/drivers/gpu/drm/amd/amdgpu/Makefile
+@@ -32,7 +32,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
+ amdgpu_prime.o amdgpu_vm.o amdgpu_ib.o amdgpu_pll.o \
+ amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
+ amdgpu_gtt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o amdgpu_atomfirmware.o \
+- amdgpu_queue_mgr.o amdgpu_vf_error.o amdgpu_sem.o amdgpu_amdkfd_fence.o
++ amdgpu_queue_mgr.o amdgpu_vf_error.o amdgpu_sem.o
+
+ # add asic specific block
+ amdgpu-$(CONFIG_DRM_AMDGPU_CIK)+= cik.o cik_ih.o kv_smc.o kv_dpm.o \
+@@ -110,8 +110,6 @@ amdgpu-y += \
+ amdgpu_amdkfd.o \
+ amdgpu_amdkfd_gfx_v7.o \
+ amdgpu_amdkfd_gfx_v8.o \
+- amdgpu_amdkfd_gfx_v9.o \
+- amdgpu_amdkfd_gpuvm.o
+
+ # add cgs
+ amdgpu-y += amdgpu_cgs.o
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+index bcf95e7..b07c90e 100755
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+@@ -67,7 +67,6 @@
+ #include "amdgpu_vce.h"
+ #include "amdgpu_vcn.h"
+ #include "amdgpu_dm.h"
+-#include "amdgpu_mn.h"
+
+ #include "gpu_scheduler.h"
+ #include "amdgpu_virt.h"
+@@ -125,7 +124,6 @@ extern int amdgpu_cntl_sb_buf_per_se;
+ extern int amdgpu_param_buf_per_se;
+ extern int amdgpu_job_hang_limit;
+ extern int amdgpu_lbpw;
+-extern int amdgpu_compute_multipipe;
+
+ #ifdef CONFIG_DRM_AMDGPU_SI
+ extern int amdgpu_si_support;
+@@ -184,8 +182,7 @@ struct amdgpu_cs_parser;
+ struct amdgpu_job;
+ struct amdgpu_irq_src;
+ struct amdgpu_fpriv;
+-struct kfd_vm_fault_info;
+-struct amdgpu_bo_va_mapping;
++struct kfd_process_device;
+
+ enum amdgpu_cp_irq {
+ AMDGPU_CP_IRQ_GFX_EOP = 0,
+@@ -300,25 +297,14 @@ struct amdgpu_buffer_funcs {
+
+ /* provided by hw blocks that can write ptes, e.g., sdma */
+ struct amdgpu_vm_pte_funcs {
+- /* number of dw to reserve per operation */
+- unsigned copy_pte_num_dw;
+-
+ /* copy pte entries from GART */
+ void (*copy_pte)(struct amdgpu_ib *ib,
+ uint64_t pe, uint64_t src,
+ unsigned count);
+-
+ /* write pte one entry at a time with addr mapping */
+ void (*write_pte)(struct amdgpu_ib *ib, uint64_t pe,
+ uint64_t value, unsigned count,
+ uint32_t incr);
+-
+- /* maximum nums of PTEs/PDEs in a single operation */
+- uint32_t set_max_nums_pte_pde;
+-
+- /* number of dw to reserve per operation */
+- unsigned set_pte_pde_num_dw;
+-
+ /* for linear pte/pde updates without addr mapping */
+ void (*set_pte_pde)(struct amdgpu_ib *ib,
+ uint64_t pe,
+@@ -397,15 +383,7 @@ struct amdgpu_clock {
+ */
+
+ #define AMDGPU_GEM_DOMAIN_MAX 0x3
+-
+-struct amdgpu_gem_object {
+- struct drm_gem_object base;
+- struct list_head list;
+- struct amdgpu_bo *bo;
+-};
+-
+-struct kgd_mem;
+-#define gem_to_amdgpu_bo(gobj) container_of((gobj), struct amdgpu_gem_object, base)->bo
++#define gem_to_amdgpu_bo(gobj) container_of((gobj), struct amdgpu_bo, gem_base)
+
+ void amdgpu_gem_object_free(struct drm_gem_object *obj);
+ int amdgpu_gem_object_open(struct drm_gem_object *obj,
+@@ -421,8 +399,6 @@ amdgpu_gem_prime_import_sg_table(struct drm_device *dev,
+ struct dma_buf *amdgpu_gem_prime_export(struct drm_device *dev,
+ struct drm_gem_object *gobj,
+ int flags);
+-struct drm_gem_object *
+-amdgpu_gem_prime_foreign_bo(struct amdgpu_device *adev, struct amdgpu_bo *bo);
+ int amdgpu_gem_prime_pin(struct drm_gem_object *obj);
+ void amdgpu_gem_prime_unpin(struct drm_gem_object *obj);
+ struct reservation_object *amdgpu_gem_prime_res_obj(struct drm_gem_object *);
+@@ -484,10 +460,9 @@ struct amdgpu_sa_bo {
+ */
+ void amdgpu_gem_force_release(struct amdgpu_device *adev);
+ int amdgpu_gem_object_create(struct amdgpu_device *adev, unsigned long size,
+- int alignment, u32 initial_domain,
+- u64 flags, bool kernel,
+- struct reservation_object *resv,
+- struct drm_gem_object **obj);
++ int alignment, u32 initial_domain,
++ u64 flags, bool kernel,
++ struct drm_gem_object **obj);
+
+ int amdgpu_mode_dumb_create(struct drm_file *file_priv,
+ struct drm_device *dev,
+@@ -545,9 +520,6 @@ struct amdgpu_mc {
+ u64 private_aperture_end;
+ /* protects concurrent invalidation */
+ spinlock_t invalidate_lock;
+-
+- struct kfd_vm_fault_info *vm_fault_info;
+- atomic_t vm_fault_info_updated;
+ };
+
+ /*
+@@ -730,7 +702,7 @@ int amdgpu_queue_mgr_fini(struct amdgpu_device *adev,
+ struct amdgpu_queue_mgr *mgr);
+ int amdgpu_queue_mgr_map(struct amdgpu_device *adev,
+ struct amdgpu_queue_mgr *mgr,
+- u32 hw_ip, u32 instance, u32 ring,
++ int hw_ip, int instance, int ring,
+ struct amdgpu_ring **out_ring);
+
+ /*
+@@ -966,7 +938,6 @@ struct amdgpu_gfx_config {
+ };
+
+ struct amdgpu_cu_info {
+- uint32_t simd_per_cu;
+ uint32_t max_waves_per_simd;
+ uint32_t wave_front_size;
+ uint32_t max_scratch_slots_per_cu;
+@@ -1094,7 +1065,6 @@ struct amdgpu_cs_parser {
+ /* buffer objects */
+ struct ww_acquire_ctx ticket;
+ struct amdgpu_bo_list *bo_list;
+- struct amdgpu_mn *mn;
+ struct amdgpu_bo_list_entry vm_pd;
+ struct list_head validated;
+ struct dma_fence *fence;
+@@ -1236,6 +1206,20 @@ void amdgpu_benchmark(struct amdgpu_device *adev, int test_number);
+ void amdgpu_test_moves(struct amdgpu_device *adev);
+
+ /*
++ * MMU Notifier
++ */
++#if defined(CONFIG_MMU_NOTIFIER)
++int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr);
++void amdgpu_mn_unregister(struct amdgpu_bo *bo);
++#else
++static inline int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr)
++{
++ return -ENODEV;
++}
++static inline void amdgpu_mn_unregister(struct amdgpu_bo *bo) {}
++#endif
++
++/*
+ * Debugfs
+ */
+ struct amdgpu_debugfs {
+@@ -1435,7 +1419,10 @@ struct amdgpu_direct_gma {
+ };
+
+ #if defined(CONFIG_ZONE_DEVICE) && \
+- (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) || defined(OS_NAME_RHEL_7_3) || defined(OS_NAME_SLE))
++ (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) || \
++ defined(OS_NAME_RHEL_7_3) || \
++ defined(OS_NAME_RHEL_7_4) || \
++ defined(OS_NAME_SLE))
+ #define CONFIG_ENABLE_SSG
+ #endif
+
+@@ -1603,14 +1590,18 @@ struct amdgpu_device {
+ /* sdma */
+ struct amdgpu_sdma sdma;
+
+- /* uvd */
+- struct amdgpu_uvd uvd;
++ union {
++ struct {
++ /* uvd */
++ struct amdgpu_uvd uvd;
+
+- /* vce */
+- struct amdgpu_vce vce;
++ /* vce */
++ struct amdgpu_vce vce;
++ };
+
+- /* vcn */
+- struct amdgpu_vcn vcn;
++ /* vcn */
++ struct amdgpu_vcn vcn;
++ };
+
+ /* firmwares */
+ struct amdgpu_firmware firmware;
+@@ -1655,7 +1646,6 @@ struct amdgpu_device {
+ /* record hw reset is performed */
+ bool has_hw_reset;
+ u8 reset_magic[AMDGPU_RESET_MAGIC_NUM];
+- spinlock_t tlb_invalidation_lock;
+
+ /* record last mm index being written through WREG32*/
+ unsigned long last_mm_index;
+@@ -1861,6 +1851,18 @@ void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes,
+ u64 num_vis_bytes);
+ void amdgpu_ttm_placement_from_domain(struct amdgpu_bo *abo, u32 domain);
+ bool amdgpu_ttm_bo_is_amdgpu_bo(struct ttm_buffer_object *bo);
++int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages);
++int amdgpu_ttm_tt_set_userptr(struct ttm_tt *ttm, uint64_t addr,
++ uint32_t flags);
++bool amdgpu_ttm_tt_has_userptr(struct ttm_tt *ttm);
++struct mm_struct *amdgpu_ttm_tt_get_usermm(struct ttm_tt *ttm);
++bool amdgpu_ttm_tt_affect_userptr(struct ttm_tt *ttm, unsigned long start,
++ unsigned long end);
++bool amdgpu_ttm_tt_userptr_invalidated(struct ttm_tt *ttm,
++ int *last_invalidated);
++bool amdgpu_ttm_tt_is_readonly(struct ttm_tt *ttm);
++uint64_t amdgpu_ttm_tt_pte_flags(struct amdgpu_device *adev, struct ttm_tt *ttm,
++ struct ttm_mem_reg *mem);
+ void amdgpu_vram_location(struct amdgpu_device *adev, struct amdgpu_mc *mc, u64 base);
+ void amdgpu_gart_location(struct amdgpu_device *adev, struct amdgpu_mc *mc);
+ void amdgpu_ttm_set_active_vram_size(struct amdgpu_device *adev, u64 size);
+@@ -1943,9 +1945,10 @@ static inline int amdgpu_acpi_init(struct amdgpu_device *adev) { return 0; }
+ static inline void amdgpu_acpi_fini(struct amdgpu_device *adev) { }
+ #endif
+
+-int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser,
+- uint64_t addr, struct amdgpu_bo **bo,
+- struct amdgpu_bo_va_mapping **mapping);
++struct amdgpu_bo_va_mapping *
++amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser,
++ uint64_t addr, struct amdgpu_bo **bo);
++int amdgpu_cs_sysvm_access_required(struct amdgpu_cs_parser *parser);
+
+ #if defined(CONFIG_DRM_AMD_DC)
+ int amdgpu_dm_display_resume(struct amdgpu_device *adev );
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+index ec8141f..ef56352 100755
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+@@ -20,29 +20,23 @@
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+-#undef pr_fmt
+-#define pr_fmt(fmt) "kfd2kgd: " fmt
+-
+ #include "amdgpu_amdkfd.h"
+-#include <linux/dma-buf.h>
++#include "amd_shared.h"
+ #include <drm/drmP.h>
+ #include "amdgpu.h"
+ #include "amdgpu_gfx.h"
+ #include <linux/module.h>
+
+-#define AMDKFD_SKIP_UNCOMPILED_CODE 1
+-
++const struct kfd2kgd_calls *kfd2kgd;
+ const struct kgd2kfd_calls *kgd2kfd;
+-bool (*kgd2kfd_init_p)(unsigned int, const struct kgd2kfd_calls**);
+-
+-unsigned int global_compute_vmid_bitmap = 0xFF00;
++bool (*kgd2kfd_init_p)(unsigned, const struct kgd2kfd_calls**);
+
+ int amdgpu_amdkfd_init(void)
+ {
+ int ret;
+
+ #if defined(CONFIG_HSA_AMD_MODULE)
+- int (*kgd2kfd_init_p)(unsigned int, const struct kgd2kfd_calls**);
++ int (*kgd2kfd_init_p)(unsigned, const struct kgd2kfd_calls**);
+
+ kgd2kfd_init_p = symbol_request(kgd2kfd_init);
+
+@@ -63,68 +57,56 @@ int amdgpu_amdkfd_init(void)
+ #else
+ ret = -ENOENT;
+ #endif
+- amdgpu_amdkfd_gpuvm_init_mem_limits();
+- return ret;
+-}
+
+-void amdgpu_amdkfd_fini(void)
+-{
+- if (kgd2kfd) {
+- kgd2kfd->exit();
+- symbol_put(kgd2kfd_init);
+- }
++ return ret;
+ }
+
+-void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev)
++bool amdgpu_amdkfd_load_interface(struct amdgpu_device *adev)
+ {
+- const struct kfd2kgd_calls *kfd2kgd;
+-
+- if (!kgd2kfd)
+- return;
+-
+ switch (adev->asic_type) {
+ #ifdef CONFIG_DRM_AMDGPU_CIK
+ case CHIP_KAVERI:
+- case CHIP_HAWAII:
+ kfd2kgd = amdgpu_amdkfd_gfx_7_get_functions();
+ break;
+ #endif
+ case CHIP_CARRIZO:
+- case CHIP_TONGA:
+- case CHIP_FIJI:
+- case CHIP_POLARIS10:
+- case CHIP_POLARIS11:
+ kfd2kgd = amdgpu_amdkfd_gfx_8_0_get_functions();
+ break;
+- case CHIP_VEGA10:
+- case CHIP_RAVEN:
+- kfd2kgd = amdgpu_amdkfd_gfx_9_0_get_functions();
+- break;
+ default:
+- dev_info(adev->dev, "kfd not supported on this ASIC\n");
+- return;
++ return false;
++ }
++
++ return true;
++}
++
++void amdgpu_amdkfd_fini(void)
++{
++ if (kgd2kfd) {
++ kgd2kfd->exit();
++ symbol_put(kgd2kfd_init);
+ }
++}
+
+- adev->kfd = kgd2kfd->probe((struct kgd_dev *)adev,
+- adev->pdev, kfd2kgd);
++void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev)
++{
++ if (kgd2kfd)
++ adev->kfd = kgd2kfd->probe((struct kgd_dev *)adev,
++ adev->pdev, kfd2kgd);
+ }
+
+ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
+ {
+ int i;
+ int last_valid_bit;
+-
+ if (adev->kfd) {
+ struct kgd2kfd_shared_resources gpu_resources = {
+- .compute_vmid_bitmap = global_compute_vmid_bitmap,
++ .compute_vmid_bitmap = 0xFF00,
+ .num_pipe_per_mec = adev->gfx.mec.num_pipe_per_mec,
+- .num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe,
+- .gpuvm_size = (uint64_t)amdgpu_vm_size << 30
++ .num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe
+ };
+
+ /* this is going to have a few of the MSBs set that we need to
+- * clear
+- */
++ * clear */
+ bitmap_complement(gpu_resources.queue_bitmap,
+ adev->gfx.mec.queue_bitmap,
+ KGD_MAX_QUEUES);
+@@ -138,8 +120,7 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
+ gpu_resources.queue_bitmap);
+
+ /* According to linux/bitmap.h we shouldn't use bitmap_clear if
+- * nbits is not compile time constant
+- */
++ * nbits is not compile time constant */
+ last_valid_bit = 1 /* only first MEC can have compute queues */
+ * adev->gfx.mec.num_pipe_per_mec
+ * adev->gfx.mec.num_queue_per_pipe;
+@@ -150,28 +131,6 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
+ &gpu_resources.doorbell_physical_address,
+ &gpu_resources.doorbell_aperture_size,
+ &gpu_resources.doorbell_start_offset);
+- if (adev->asic_type >= CHIP_VEGA10) {
+- /* On SOC15 the BIF is involved in routing
+- * doorbells using the low 12 bits of the
+- * address. Communicate the assignments to
+- * KFD. KFD uses two doorbell pages per
+- * process in case of 64-bit doorbells so we
+- * can use each doorbell assignment twice.
+- */
+- gpu_resources.sdma_doorbell[0][0] =
+- AMDGPU_DOORBELL64_sDMA_ENGINE0;
+- gpu_resources.sdma_doorbell[0][1] =
+- AMDGPU_DOORBELL64_sDMA_ENGINE0 + 0x200;
+- gpu_resources.sdma_doorbell[1][0] =
+- AMDGPU_DOORBELL64_sDMA_ENGINE1;
+- gpu_resources.sdma_doorbell[1][1] =
+- AMDGPU_DOORBELL64_sDMA_ENGINE1 + 0x200;
+- /* Doorbells 0x0f0-0ff and 0x2f0-2ff are reserved for
+- * SDMA, IH and VCN. So don't use them for the CP.
+- */
+- gpu_resources.reserved_doorbell_mask = 0x1f0;
+- gpu_resources.reserved_doorbell_val = 0x0f0;
+- }
+
+ kgd2kfd->device_init(adev->kfd, &gpu_resources);
+ }
+@@ -208,81 +167,24 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev)
+ return r;
+ }
+
+-int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
+- uint32_t vmid, uint64_t gpu_addr,
+- uint32_t *ib_cmd, uint32_t ib_len)
+-{
+- struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+- struct amdgpu_job *job;
+- struct amdgpu_ib *ib;
+- struct amdgpu_ring *ring;
+- struct dma_fence *f = NULL;
+- int ret;
+-
+- switch (engine) {
+- case KGD_ENGINE_MEC1:
+- ring = &adev->gfx.compute_ring[0];
+- break;
+- case KGD_ENGINE_SDMA1:
+- ring = &adev->sdma.instance[0].ring;
+- break;
+- case KGD_ENGINE_SDMA2:
+- ring = &adev->sdma.instance[1].ring;
+- break;
+- default:
+- pr_err("Invalid engine in IB submission: %d\n", engine);
+- ret = -EINVAL;
+- goto err;
+- }
+-
+- ret = amdgpu_job_alloc(adev, 1, &job, NULL);
+- if (ret)
+- goto err;
+-
+- ib = &job->ibs[0];
+- memset(ib, 0, sizeof(struct amdgpu_ib));
+-
+- ib->gpu_addr = gpu_addr;
+- ib->ptr = ib_cmd;
+- ib->length_dw = ib_len;
+- /* This works for NO_HWS. TODO: need to handle without knowing VMID */
+- job->vm_id = vmid;
+-
+- ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);
+- if (ret) {
+- DRM_ERROR("amdgpu: failed to schedule IB.\n");
+- goto err_ib_sched;
+- }
+-
+- ret = dma_fence_wait(f, false);
+-
+-err_ib_sched:
+- dma_fence_put(f);
+- amdgpu_job_free(job);
+-err:
+- return ret;
+-}
+-
+-u32 pool_to_domain(enum kgd_memory_pool p)
+-{
+- switch (p) {
+- case KGD_POOL_FRAMEBUFFER: return AMDGPU_GEM_DOMAIN_VRAM;
+- default: return AMDGPU_GEM_DOMAIN_GTT;
+- }
+-}
+-
+ int alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
+ void **mem_obj, uint64_t *gpu_addr,
+ void **cpu_ptr)
+ {
+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+- struct amdgpu_bo *bo = NULL;
++ struct kgd_mem **mem = (struct kgd_mem **) mem_obj;
+ int r;
+- uint64_t gpu_addr_tmp = 0;
+- void *cpu_ptr_tmp = NULL;
++
++ BUG_ON(kgd == NULL);
++ BUG_ON(gpu_addr == NULL);
++ BUG_ON(cpu_ptr == NULL);
++
++ *mem = kmalloc(sizeof(struct kgd_mem), GFP_KERNEL);
++ if ((*mem) == NULL)
++ return -ENOMEM;
+
+ r = amdgpu_bo_create(adev, size, PAGE_SIZE, true, AMDGPU_GEM_DOMAIN_GTT,
+- AMDGPU_GEM_CREATE_CPU_GTT_USWC, NULL, NULL, 0, &bo);
++ AMDGPU_GEM_CREATE_CPU_GTT_USWC, NULL, NULL, 0, &(*mem)->bo);
+ if (r) {
+ dev_err(adev->dev,
+ "failed to allocate BO for amdkfd (%d)\n", r);
+@@ -290,87 +192,64 @@ int alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
+ }
+
+ /* map the buffer */
+- r = amdgpu_bo_reserve(bo, true);
++ r = amdgpu_bo_reserve((*mem)->bo, true);
+ if (r) {
+ dev_err(adev->dev, "(%d) failed to reserve bo for amdkfd\n", r);
+ goto allocate_mem_reserve_bo_failed;
+ }
+
+- r = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT,
+- &gpu_addr_tmp);
++ r = amdgpu_bo_pin((*mem)->bo, AMDGPU_GEM_DOMAIN_GTT,
++ &(*mem)->gpu_addr);
+ if (r) {
+ dev_err(adev->dev, "(%d) failed to pin bo for amdkfd\n", r);
+ goto allocate_mem_pin_bo_failed;
+ }
++ *gpu_addr = (*mem)->gpu_addr;
+
+- r = amdgpu_bo_kmap(bo, &cpu_ptr_tmp);
++ r = amdgpu_bo_kmap((*mem)->bo, &(*mem)->cpu_ptr);
+ if (r) {
+ dev_err(adev->dev,
+ "(%d) failed to map bo to kernel for amdkfd\n", r);
+ goto allocate_mem_kmap_bo_failed;
+ }
++ *cpu_ptr = (*mem)->cpu_ptr;
+
+- *mem_obj = bo;
+- *gpu_addr = gpu_addr_tmp;
+- *cpu_ptr = cpu_ptr_tmp;
+-
+- amdgpu_bo_unreserve(bo);
++ amdgpu_bo_unreserve((*mem)->bo);
+
+ return 0;
+
+ allocate_mem_kmap_bo_failed:
+- amdgpu_bo_unpin(bo);
++ amdgpu_bo_unpin((*mem)->bo);
+ allocate_mem_pin_bo_failed:
+- amdgpu_bo_unreserve(bo);
++ amdgpu_bo_unreserve((*mem)->bo);
+ allocate_mem_reserve_bo_failed:
+- amdgpu_bo_unref(&bo);
++ amdgpu_bo_unref(&(*mem)->bo);
+
+ return r;
+ }
+
+ void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj)
+ {
+- struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj;
++ struct kgd_mem *mem = (struct kgd_mem *) mem_obj;
+
+- amdgpu_bo_reserve(bo, true);
+- amdgpu_bo_kunmap(bo);
+- amdgpu_bo_unpin(bo);
+- amdgpu_bo_unreserve(bo);
+- amdgpu_bo_unref(&(bo));
++ BUG_ON(mem == NULL);
++
++ amdgpu_bo_reserve(mem->bo, true);
++ amdgpu_bo_kunmap(mem->bo);
++ amdgpu_bo_unpin(mem->bo);
++ amdgpu_bo_unreserve(mem->bo);
++ amdgpu_bo_unref(&(mem->bo));
++ kfree(mem);
+ }
+
+-void get_local_mem_info(struct kgd_dev *kgd,
+- struct kfd_local_mem_info *mem_info)
++uint64_t get_vmem_size(struct kgd_dev *kgd)
+ {
+- uint64_t address_mask;
+- resource_size_t aper_limit;
+- struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
++ struct amdgpu_device *adev =
++ (struct amdgpu_device *)kgd;
+
+- address_mask = adev->dev->dma_mask ? ~*adev->dev->dma_mask :
+- ~((1ULL << 32) - 1);
+- aper_limit = adev->mc.aper_base + adev->mc.aper_size;
+-
+- memset(mem_info, 0, sizeof(*mem_info));
+- if (!(adev->mc.aper_base & address_mask ||
+- aper_limit & address_mask)) {
+- mem_info->local_mem_size_public = adev->mc.visible_vram_size;
+- mem_info->local_mem_size_private = adev->mc.real_vram_size -
+- adev->mc.visible_vram_size;
+- } else {
+- mem_info->local_mem_size_public = 0;
+- mem_info->local_mem_size_private = adev->mc.real_vram_size;
+- }
+- mem_info->vram_width = adev->mc.vram_width;
++ BUG_ON(kgd == NULL);
+
+- pr_debug("Address base: 0x%llx limit 0x%llx public 0x%llx private 0x%llx\n",
+- adev->mc.aper_base, aper_limit,
+- mem_info->local_mem_size_public,
+- mem_info->local_mem_size_private);
+-
+- if (amdgpu_sriov_vf(adev))
+- mem_info->mem_clk_max = adev->clock.default_mclk / 100;
+- else
+- mem_info->mem_clk_max = amdgpu_dpm_get_mclk(adev, false) / 100;
++ return adev->mc.real_vram_size;
+ }
+
+ uint64_t get_gpu_clock_counter(struct kgd_dev *kgd)
+@@ -385,113 +264,6 @@ uint64_t get_gpu_clock_counter(struct kgd_dev *kgd)
+ uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd)
+ {
+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+-
+- /* the sclk is in quantas of 10kHz */
+- if (amdgpu_sriov_vf(adev))
+- return adev->clock.default_sclk / 100;
+-
+- return amdgpu_dpm_get_sclk(adev, false) / 100;
+-}
+-
+-void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info)
+-{
+- struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+- struct amdgpu_cu_info acu_info = adev->gfx.cu_info;
+-
+- memset(cu_info, 0, sizeof(*cu_info));
+- if (sizeof(cu_info->cu_bitmap) != sizeof(acu_info.bitmap))
+- return;
+-
+- cu_info->cu_active_number = acu_info.number;
+- cu_info->cu_ao_mask = acu_info.ao_cu_mask;
+- memcpy(&cu_info->cu_bitmap[0], &acu_info.bitmap[0],
+- sizeof(acu_info.bitmap));
+- cu_info->num_shader_engines = adev->gfx.config.max_shader_engines;
+- cu_info->num_shader_arrays_per_engine = adev->gfx.config.max_sh_per_se;
+- cu_info->num_cu_per_sh = adev->gfx.config.max_cu_per_sh;
+- cu_info->simd_per_cu = acu_info.simd_per_cu;
+- cu_info->max_waves_per_simd = acu_info.max_waves_per_simd;
+- cu_info->wave_front_size = acu_info.wave_front_size;
+- cu_info->max_scratch_slots_per_cu = acu_info.max_scratch_slots_per_cu;
+- cu_info->lds_size = acu_info.lds_size;
+-}
+-
+-int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd,
+- struct kgd_dev **dma_buf_kgd,
+- uint64_t *bo_size, void *metadata_buffer,
+- size_t buffer_size, uint32_t *metadata_size,
+- uint32_t *flags)
+-{
+- struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+- struct dma_buf *dma_buf;
+- struct drm_gem_object *obj;
+- struct amdgpu_bo *bo;
+- uint64_t metadata_flags;
+- int r = -EINVAL;
+-
+- dma_buf = dma_buf_get(dma_buf_fd);
+- if (IS_ERR(dma_buf))
+- return PTR_ERR(dma_buf);
+-
+- if (dma_buf->ops != &drm_gem_prime_dmabuf_ops)
+- /* Can't handle non-graphics buffers */
+- goto out_put;
+-
+- obj = dma_buf->priv;
+- if (obj->dev->driver != adev->ddev->driver)
+- /* Can't handle buffers from different drivers */
+- goto out_put;
+-
+- adev = obj->dev->dev_private;
+- bo = gem_to_amdgpu_bo(obj);
+- if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM |
+- AMDGPU_GEM_DOMAIN_GTT |
+- AMDGPU_GEM_DOMAIN_DGMA)))
+- /* Only VRAM, GTT and DGMA BOs are supported */
+- goto out_put;
+-
+- r = 0;
+- if (dma_buf_kgd)
+- *dma_buf_kgd = (struct kgd_dev *)adev;
+- if (bo_size)
+- *bo_size = amdgpu_bo_size(bo);
+- if (metadata_size)
+- *metadata_size = bo->metadata_size;
+- if (metadata_buffer)
+- r = amdgpu_bo_get_metadata(bo, metadata_buffer, buffer_size,
+- metadata_size, &metadata_flags);
+- if (flags) {
+- /* If the preferred domain is DGMA, set flags to VRAM because
+- * KFD doesn't support allocating DGMA memory
+- */
+- *flags = (bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM |
+- AMDGPU_GEM_DOMAIN_DGMA)) ?
+- ALLOC_MEM_FLAGS_VRAM : ALLOC_MEM_FLAGS_GTT;
+-
+- if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)
+- *flags |= ALLOC_MEM_FLAGS_PUBLIC;
+- }
+-
+-out_put:
+- dma_buf_put(dma_buf);
+- return r;
+-}
+-
+-uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd)
+-{
+- struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+- uint64_t usage =
+- amdgpu_vram_mgr_usage(&adev->mman.bdev.man[TTM_PL_VRAM]);
+- return usage;
+-}
+-
+-bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev,
+- u32 vmid)
+-{
+- if (adev->kfd) {
+- if ((1 << vmid) & global_compute_vmid_bitmap)
+- return true;
+- }
+-
+- return false;
++ /* The sclk is in quantas of 10kHz */
++ return adev->pm.dpm.dyn_state.max_clock_voltage_on_ac.sclk / 100;
+ }
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+index b259ba7..8e8c10e 100755
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+@@ -27,109 +27,21 @@
+
+ #include <linux/types.h>
+ #include <linux/mm.h>
+-#include <linux/workqueue.h>
+-#include <linux/mmu_context.h>
+ #include <kgd_kfd_interface.h>
+-#include "amdgpu.h"
+-
+-extern const struct kgd2kfd_calls *kgd2kfd;
+
+ struct amdgpu_device;
+
+-struct kfd_bo_va_list {
+- struct list_head bo_list;
+- struct amdgpu_bo_va *bo_va;
+- void *kgd_dev;
+- bool is_mapped;
+- bool map_fail;
+- uint64_t va;
+- uint64_t pte_flags;
+-};
+-
+ struct kgd_mem {
+- struct mutex lock;
+ struct amdgpu_bo *bo;
+- struct list_head bo_va_list;
+- /* protected by amdkfd_process_info.lock */
+- struct ttm_validate_buffer validate_list;
+- struct ttm_validate_buffer resv_list;
+- uint32_t domain;
+- unsigned int mapped_to_gpu_memory;
+- void *kptr;
+- uint64_t va;
+-
+- uint32_t mapping_flags;
+-
+- atomic_t invalid;
+- struct amdkfd_process_info *process_info;
+- struct page **user_pages;
+-
+- struct amdgpu_sync sync;
+-
+- /* flags bitfield */
+- bool coherent : 1;
+- bool no_substitute : 1;
+- bool aql_queue : 1;
+-};
+-
+-/* KFD Memory Eviction */
+-struct amdgpu_amdkfd_fence {
+- struct dma_fence base;
+- void *mm;
+- spinlock_t lock;
+- char timeline_name[TASK_COMM_LEN];
+-};
+-
+-struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context,
+- void *mm);
+-bool amd_kfd_fence_check_mm(struct dma_fence *f, void *mm);
+-struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f);
+-
+-struct amdkfd_process_info {
+- /* List head of all VMs that belong to a KFD process */
+- struct list_head vm_list_head;
+- /* List head for all KFD BOs that belong to a KFD process. */
+- struct list_head kfd_bo_list;
+- /* List of userptr BOs that are valid or invalid */
+- struct list_head userptr_valid_list;
+- struct list_head userptr_inval_list;
+- /* Lock to protect kfd_bo_list */
+- struct mutex lock;
+-
+- /* Number of VMs */
+- unsigned int n_vms;
+- /* Eviction Fence */
+- struct amdgpu_amdkfd_fence *eviction_fence;
+-
+- /* MMU-notifier related fields */
+- atomic_t evicted_bos;
+- struct delayed_work work;
+- struct pid *pid;
+-};
+-
+-/* struct amdkfd_vm -
+- * For Memory Eviction KGD requires a mechanism to keep track of all KFD BOs
+- * belonging to a KFD process. All the VMs belonging to the same process point
+- * to the same amdkfd_process_info.
+- */
+-struct amdkfd_vm {
+- /* Keep base as the first parameter for pointer compatibility between
+- * amdkfd_vm and amdgpu_vm.
+- */
+- struct amdgpu_vm base;
+-
+- /* List node in amdkfd_process_info.vm_list_head*/
+- struct list_head vm_list_node;
+-
+- struct amdgpu_device *adev;
+- /* Points to the KFD process VM info*/
+- struct amdkfd_process_info *process_info;
++ uint64_t gpu_addr;
++ void *cpu_ptr;
+ };
+
+
+ int amdgpu_amdkfd_init(void);
+ void amdgpu_amdkfd_fini(void);
+
++bool amdgpu_amdkfd_load_interface(struct amdgpu_device *adev);
+
+ void amdgpu_amdkfd_suspend(struct amdgpu_device *adev);
+ int amdgpu_amdkfd_resume(struct amdgpu_device *adev);
+@@ -139,105 +51,17 @@ void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev);
+ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev);
+ void amdgpu_amdkfd_device_fini(struct amdgpu_device *adev);
+
+-int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, struct mm_struct *mm);
+-int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
+- uint32_t vmid, uint64_t gpu_addr,
+- uint32_t *ib_cmd, uint32_t ib_len);
+-int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
+- struct dma_fence **ef);
+ struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void);
+ struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void);
+-struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void);
+-int amdgpu_amdkfd_copy_mem_to_mem(struct kgd_dev *kgd, struct kgd_mem *src_mem,
+- uint64_t src_offset, struct kgd_mem *dst_mem,
+- uint64_t dest_offset, uint64_t size, struct dma_fence **f,
+- uint64_t *actual_size);
+-
+-bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev,
+- u32 vmid);
+
+ /* Shared API */
+-int map_bo(struct amdgpu_device *rdev, uint64_t va, void *vm,
+- struct amdgpu_bo *bo, struct amdgpu_bo_va **bo_va);
+ int alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
+ void **mem_obj, uint64_t *gpu_addr,
+ void **cpu_ptr);
+ void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj);
+-void get_local_mem_info(struct kgd_dev *kgd,
+- struct kfd_local_mem_info *mem_info);
++uint64_t get_vmem_size(struct kgd_dev *kgd);
+ uint64_t get_gpu_clock_counter(struct kgd_dev *kgd);
+
+ uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd);
+-void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info);
+-int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd,
+- struct kgd_dev **dmabuf_kgd,
+- uint64_t *bo_size, void *metadata_buffer,
+- size_t buffer_size, uint32_t *metadata_size,
+- uint32_t *flags);
+-uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd);
+-
+-#define read_user_wptr(mmptr, wptr, dst) \
+- ({ \
+- bool valid = false; \
+- if ((mmptr) && (wptr)) { \
+- if ((mmptr) == current->mm) { \
+- valid = !get_user((dst), (wptr)); \
+- } else if (current->mm == NULL) { \
+- use_mm(mmptr); \
+- valid = !get_user((dst), (wptr)); \
+- unuse_mm(mmptr); \
+- } \
+- } \
+- valid; \
+- })
+-
+-/* GPUVM API */
+-int amdgpu_amdkfd_gpuvm_sync_memory(
+- struct kgd_dev *kgd, struct kgd_mem *mem, bool intr);
+-int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
+- struct kgd_dev *kgd, uint64_t va, uint64_t size,
+- void *vm, struct kgd_mem **mem,
+- uint64_t *offset, uint32_t flags);
+-int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
+- struct kgd_dev *kgd, struct kgd_mem *mem, void *vm);
+-int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
+- struct kgd_dev *kgd, struct kgd_mem *mem, void *vm);
+-int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
+- struct kgd_dev *kgd, struct kgd_mem *mem, void *vm);
+
+-int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm,
+- void **process_info,
+- struct dma_fence **ef);
+-void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm);
+-
+-uint32_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm);
+-
+-int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd,
+- struct kfd_vm_fault_info *info);
+-
+-int amdgpu_amdkfd_gpuvm_mmap_bo(
+- struct kgd_dev *kgd, struct vm_area_struct *vma);
+-
+-int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd,
+- struct kgd_mem *mem, void **kptr);
+-
+-int amdgpu_amdkfd_gpuvm_pin_get_sg_table(struct kgd_dev *kgd,
+- struct kgd_mem *mem, uint64_t offset,
+- uint64_t size, struct sg_table **ret_sg);
+-void amdgpu_amdkfd_gpuvm_unpin_put_sg_table(
+- struct kgd_mem *mem, struct sg_table *sg);
+-int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd,
+- struct dma_buf *dmabuf,
+- uint64_t va, void *vm,
+- struct kgd_mem **mem, uint64_t *size,
+- uint64_t *mmap_offset);
+-int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_dev *kgd, void *vm,
+- struct kgd_mem *mem,
+- struct dma_buf **dmabuf);
+-int amdgpu_amdkfd_gpuvm_evict_mem(struct kgd_mem *mem, struct mm_struct *mm);
+-int amdgpu_amdkfd_gpuvm_restore_mem(struct kgd_mem *mem, struct mm_struct *mm);
+-
+-void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
+-void amdgpu_amdkfd_unreserve_system_memory_limit(struct amdgpu_bo *bo);
+ #endif /* AMDGPU_AMDKFD_H_INCLUDED */
+-
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
+deleted file mode 100644
+index 3961937..0000000
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
++++ /dev/null
+@@ -1,196 +0,0 @@
+-/*
+- * Copyright 2016 Advanced Micro Devices, Inc.
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a
+- * copy of this software and associated documentation files (the "Software"),
+- * to deal in the Software without restriction, including without limitation
+- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+- * and/or sell copies of the Software, and to permit persons to whom the
+- * Software is furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in
+- * all copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+- * OTHER DEALINGS IN THE SOFTWARE.
+- */
+-
+-#include <linux/spinlock.h>
+-#include <linux/atomic.h>
+-#include <linux/stacktrace.h>
+-#include <linux/sched.h>
+-#include <linux/slab.h>
+-#include "amdgpu_amdkfd.h"
+-
+-const struct dma_fence_ops amd_kfd_fence_ops;
+-static atomic_t fence_seq = ATOMIC_INIT(0);
+-
+-static int amd_kfd_fence_signal(struct dma_fence *f);
+-
+-/* Eviction Fence
+- * Fence helper functions to deal with KFD memory eviction.
+- * Big Idea - Since KFD submissions are done by user queues, a BO cannot be
+- * evicted unless all the user queues for that process are evicted.
+- *
+- * All the BOs in a process share an eviction fence. When process X wants
+- * to map VRAM memory but TTM can't find enough space, TTM will attempt to
+- * evict BOs from its LRU list. TTM checks if the BO is valuable to evict
+- * by calling ttm_bo_driver->eviction_valuable().
+- *
+- * ttm_bo_driver->eviction_valuable() - will return false if the BO belongs
+- * to process X. Otherwise, it will return true to indicate BO can be
+- * evicted by TTM.
+- *
+- * If ttm_bo_driver->eviction_valuable returns true, then TTM will continue
+- * the evcition process for that BO by calling ttm_bo_evict --> amdgpu_bo_move
+- * --> amdgpu_copy_buffer(). This sets up job in GPU scheduler.
+- *
+- * GPU Scheduler (amd_sched_main) - sets up a cb (fence_add_callback) to
+- * nofity when the BO is free to move. fence_add_callback --> enable_signaling
+- * --> amdgpu_amdkfd_fence.enable_signaling
+- *
+- * amdgpu_amdkfd_fence.enable_signaling - Start a work item that will quiesce
+- * user queues and signal fence. The work item will also start another delayed
+- * work item to restore BOs
+- */
+-
+-struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context,
+- void *mm)
+-{
+- struct amdgpu_amdkfd_fence *fence = NULL;
+-
+- fence = kzalloc(sizeof(*fence), GFP_KERNEL);
+- if (fence == NULL)
+- return NULL;
+-
+- /* mm_struct mm is used as void pointer to identify the parent
+- * KFD process. Don't dereference it. Fence and any threads using
+- * mm is guranteed to be released before process termination.
+- */
+- fence->mm = mm;
+- get_task_comm(fence->timeline_name, current);
+- spin_lock_init(&fence->lock);
+-
+- dma_fence_init(&fence->base, &amd_kfd_fence_ops, &fence->lock,
+- context, atomic_inc_return(&fence_seq));
+-
+- return fence;
+-}
+-
+-struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f)
+-{
+- struct amdgpu_amdkfd_fence *fence;
+-
+- if (!f)
+- return NULL;
+-
+- fence = container_of(f, struct amdgpu_amdkfd_fence, base);
+- if (fence && f->ops == &amd_kfd_fence_ops)
+- return fence;
+-
+- return NULL;
+-}
+-
+-static const char *amd_kfd_fence_get_driver_name(struct dma_fence *f)
+-{
+- return "amdgpu_amdkfd_fence";
+-}
+-
+-static const char *amd_kfd_fence_get_timeline_name(struct dma_fence *f)
+-{
+- struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
+-
+- return fence->timeline_name;
+-}
+-
+-/**
+- * amd_kfd_fence_enable_signaling - This gets called when TTM wants to evict
+- * a KFD BO and schedules a job to move the BO.
+- * If fence is already signaled return true.
+- * If fence is not signaled schedule a evict KFD process work item.
+- */
+-static bool amd_kfd_fence_enable_signaling(struct dma_fence *f)
+-{
+- struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
+-
+- if (!fence)
+- return false;
+-
+- if (dma_fence_is_signaled(f))
+- return true;
+-
+- if (!kgd2kfd->schedule_evict_and_restore_process(
+- (struct mm_struct *)fence->mm, f))
+- return true;
+-
+- return false;
+-}
+-
+-static int amd_kfd_fence_signal(struct dma_fence *f)
+-{
+- unsigned long flags;
+- int ret;
+-
+- spin_lock_irqsave(f->lock, flags);
+- /* Set enabled bit so cb will called */
+- set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &f->flags);
+- ret = dma_fence_signal_locked(f);
+- spin_unlock_irqrestore(f->lock, flags);
+-
+- return ret;
+-}
+-
+-/**
+- * amd_kfd_fence_release - callback that fence can be freed
+- *
+- * @fence: fence
+- *
+- * This function is called when the reference count becomes zero.
+- * It just RCU schedules freeing up the fence.
+-*/
+-static void amd_kfd_fence_release(struct dma_fence *f)
+-{
+- struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
+- /* Unconditionally signal the fence. The process is getting
+- * terminated.
+- */
+- if (WARN_ON(!fence))
+- return; /* Not an amdgpu_amdkfd_fence */
+-
+- amd_kfd_fence_signal(f);
+- kfree_rcu(f, rcu);
+-}
+-
+-/**
+- * amd_kfd_fence_check_mm - Check if @mm is same as that of the fence @f
+- * if same return TRUE else return FALSE.
+- *
+- * @f: [IN] fence
+- * @mm: [IN] mm that needs to be verified
+-*/
+-bool amd_kfd_fence_check_mm(struct dma_fence *f, void *mm)
+-{
+- struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
+-
+- if (!fence)
+- return false;
+- else if (fence->mm == mm)
+- return true;
+-
+- return false;
+-}
+-
+-const struct dma_fence_ops amd_kfd_fence_ops = {
+- .get_driver_name = amd_kfd_fence_get_driver_name,
+- .get_timeline_name = amd_kfd_fence_get_timeline_name,
+- .enable_signaling = amd_kfd_fence_enable_signaling,
+- .signaled = NULL,
+- .wait = dma_fence_default_wait,
+- .release = amd_kfd_fence_release,
+-};
+-
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
+old mode 100755
+new mode 100644
+index 6964ece..f6acf48
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
+@@ -20,9 +20,6 @@
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+-#undef pr_fmt
+-#define pr_fmt(fmt) "kfd2kgd: " fmt
+-
+ #include <linux/fdtable.h>
+ #include <linux/uaccess.h>
+ #include <linux/firmware.h>
+@@ -42,14 +39,6 @@
+ #include "gmc/gmc_7_1_sh_mask.h"
+ #include "cik_structs.h"
+
+-#define AMDKFD_SKIP_UNCOMPILED_CODE 1
+-
+-enum hqd_dequeue_request_type {
+- NO_ACTION = 0,
+- DRAIN_PIPE,
+- RESET_WAVES
+-};
+-
+ enum {
+ MAX_TRAPID = 8, /* 3 bits in the bitfield. */
+ MAX_WATCH_ADDRESSES = 4
+@@ -66,8 +55,8 @@ enum {
+ ADDRESS_WATCH_REG_CNTL_ATC_BIT = 0x10000000UL,
+ ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK = 0x00FFFFFF,
+- ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENTION = 0x03000000,
+- /* extend the mask to 26 bits in order to match the low address field */
++ ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENSION = 0x03000000,
++ /* extend the mask to 26 bits to match the low address field */
+ ADDRESS_WATCH_REG_ADDLOW_SHIFT = 6,
+ ADDRESS_WATCH_REG_ADDHIGH_MASK = 0xFFFF
+ };
+@@ -92,42 +81,30 @@ union TCP_WATCH_CNTL_BITS {
+ float f32All;
+ };
+
+-static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm,
+- int fd, uint32_t handle, struct kgd_mem **mem);
+-
+-static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type);
+-
+ /*
+ * Register access functions
+ */
+
+ static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
+- uint32_t sh_mem_config, uint32_t sh_mem_ape1_base,
+- uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases);
++ uint32_t sh_mem_config, uint32_t sh_mem_ape1_base,
++ uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases);
++
+ static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
+- unsigned int vmid);
++ unsigned int vmid);
++
+ static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id,
+- uint32_t hpd_size, uint64_t hpd_gpu_addr);
++ uint32_t hpd_size, uint64_t hpd_gpu_addr);
+ static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id);
+ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
+- uint32_t queue_id, uint32_t __user *wptr,
+- uint32_t wptr_shift, uint32_t wptr_mask,
+- struct mm_struct *mm);
+-static int kgd_hqd_dump(struct kgd_dev *kgd,
+- uint32_t pipe_id, uint32_t queue_id,
+- uint32_t (**dump)[2], uint32_t *n_regs);
+-static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
+- uint32_t __user *wptr, struct mm_struct *mm);
+-static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
+- uint32_t engine_id, uint32_t queue_id,
+- uint32_t (**dump)[2], uint32_t *n_regs);
++ uint32_t queue_id, uint32_t __user *wptr);
++static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd);
+ static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
+- uint32_t pipe_id, uint32_t queue_id);
+-static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd);
+-static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
+- enum kfd_preempt_type reset_type,
++ uint32_t pipe_id, uint32_t queue_id);
++
++static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type,
+ unsigned int utimeout, uint32_t pipe_id,
+ uint32_t queue_id);
++static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd);
+ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
+ unsigned int utimeout);
+ static int kgd_address_watch_disable(struct kgd_dev *kgd);
+@@ -147,60 +124,21 @@ static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, uint8_t vmid);
+ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd,
+ uint8_t vmid);
+ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid);
+-static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid);
+-static void set_num_of_requests(struct kgd_dev *dev, uint8_t num_of_req);
+-static int alloc_memory_of_scratch(struct kgd_dev *kgd,
+- uint64_t va, uint32_t vmid);
+-static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable,
+- uint8_t element_size, uint8_t index_stride, uint8_t mtype);
+-static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
+- uint32_t page_table_base);
+-static uint32_t read_vmid_from_vmfault_reg(struct kgd_dev *kgd);
+-
+-/* Because of REG_GET_FIELD() being used, we put this function in the
+- * asic specific file.
+- */
+-static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd,
+- struct tile_config *config)
+-{
+- struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+-
+- config->gb_addr_config = adev->gfx.config.gb_addr_config;
+- config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg,
+- MC_ARB_RAMCFG, NOOFBANK);
+- config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg,
+- MC_ARB_RAMCFG, NOOFRANKS);
+
+- config->tile_config_ptr = adev->gfx.config.tile_mode_array;
+- config->num_tile_configs =
+- ARRAY_SIZE(adev->gfx.config.tile_mode_array);
+- config->macro_tile_config_ptr =
+- adev->gfx.config.macrotile_mode_array;
+- config->num_macro_tile_configs =
+- ARRAY_SIZE(adev->gfx.config.macrotile_mode_array);
+-
+-
+- return 0;
+-}
++static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type);
+
+ static const struct kfd2kgd_calls kfd2kgd = {
+ .init_gtt_mem_allocation = alloc_gtt_mem,
+ .free_gtt_mem = free_gtt_mem,
+- .get_local_mem_info = get_local_mem_info,
++ .get_vmem_size = get_vmem_size,
+ .get_gpu_clock_counter = get_gpu_clock_counter,
+ .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz,
+- .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm,
+- .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm,
+- .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir,
+- .open_graphic_handle = open_graphic_handle,
+ .program_sh_mem_settings = kgd_program_sh_mem_settings,
+ .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
+ .init_pipeline = kgd_init_pipeline,
+ .init_interrupts = kgd_init_interrupts,
+ .hqd_load = kgd_hqd_load,
+ .hqd_sdma_load = kgd_hqd_sdma_load,
+- .hqd_dump = kgd_hqd_dump,
+- .hqd_sdma_dump = kgd_hqd_sdma_dump,
+ .hqd_is_occupied = kgd_hqd_is_occupied,
+ .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
+ .hqd_destroy = kgd_hqd_destroy,
+@@ -209,50 +147,17 @@ static const struct kfd2kgd_calls kfd2kgd = {
+ .address_watch_execute = kgd_address_watch_execute,
+ .wave_control_execute = kgd_wave_control_execute,
+ .address_watch_get_offset = kgd_address_watch_get_offset,
+- .get_atc_vmid_pasid_mapping_pasid =
+- get_atc_vmid_pasid_mapping_pasid,
+- .get_atc_vmid_pasid_mapping_valid =
+- get_atc_vmid_pasid_mapping_valid,
+- .read_vmid_from_vmfault_reg = read_vmid_from_vmfault_reg,
++ .get_atc_vmid_pasid_mapping_pasid = get_atc_vmid_pasid_mapping_pasid,
++ .get_atc_vmid_pasid_mapping_valid = get_atc_vmid_pasid_mapping_valid,
+ .write_vmid_invalidate_request = write_vmid_invalidate_request,
+- .invalidate_tlbs = invalidate_tlbs,
+- .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory,
+- .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu,
+- .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu,
+- .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu,
+- .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu,
+- .get_fw_version = get_fw_version,
+- .set_num_of_requests = set_num_of_requests,
+- .get_cu_info = get_cu_info,
+- .alloc_memory_of_scratch = alloc_memory_of_scratch,
+- .write_config_static_mem = write_config_static_mem,
+- .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo,
+- .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel,
+- .set_vm_context_page_table_base = set_vm_context_page_table_base,
+- .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table,
+- .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table,
+- .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info,
+- .import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf,
+- .export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf,
+- .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info,
+- .submit_ib = amdgpu_amdkfd_submit_ib,
+- .get_tile_config = amdgpu_amdkfd_get_tile_config,
+- .restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos,
+- .copy_mem_to_mem = amdgpu_amdkfd_copy_mem_to_mem,
+- .get_vram_usage = amdgpu_amdkfd_get_vram_usage
++ .get_fw_version = get_fw_version
+ };
+
+-struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions()
++struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void)
+ {
+ return (struct kfd2kgd_calls *)&kfd2kgd;
+ }
+
+-static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm,
+- int fd, uint32_t handle, struct kgd_mem **mem)
+-{
+- return 0;
+-}
+-
+ static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd)
+ {
+ return (struct amdgpu_device *)kgd;
+@@ -281,7 +186,7 @@ static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id,
+ {
+ struct amdgpu_device *adev = get_amdgpu_device(kgd);
+
+- uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
++ uint32_t mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
+ uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
+
+ lock_srbm(kgd, mec, pipe, queue_id, 0);
+@@ -317,12 +222,12 @@ static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
+
+ /*
+ * We have to assume that there is no outstanding mapping.
+- * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because a
+- * mapping is in progress or because a mapping finished and the SW
+- * cleared it. So the protocol is to always wait & clear.
++ * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because
++ * a mapping is in progress or because a mapping finished and the
++ * SW cleared it. So the protocol is to always wait & clear.
+ */
+- uint32_t pasid_mapping = (pasid == 0) ? 0 :
+- (uint32_t)pasid | ATC_VMID0_PASID_MAPPING__VALID_MASK;
++ uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid |
++ ATC_VMID0_PASID_MAPPING__VALID_MASK;
+
+ WREG32(mmATC_VMID0_PASID_MAPPING + vmid, pasid_mapping);
+
+@@ -368,7 +273,8 @@ static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m)
+
+ retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET +
+ m->sdma_queue_id * KFD_CIK_SDMA_QUEUE_OFFSET;
+- pr_debug("sdma base address: 0x%x\n", retval);
++
++ pr_debug("kfd: sdma base address: 0x%x\n", retval);
+
+ return retval;
+ }
+@@ -384,138 +290,42 @@ static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd)
+ }
+
+ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
+- uint32_t queue_id, uint32_t __user *wptr,
+- uint32_t wptr_shift, uint32_t wptr_mask,
+- struct mm_struct *mm)
++ uint32_t queue_id, uint32_t __user *wptr)
+ {
+ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ uint32_t wptr_shadow, is_wptr_shadow_valid;
+ struct cik_mqd *m;
+- uint32_t *mqd_hqd;
+- uint32_t reg, wptr_val, data;
+- bool valid_wptr = false;
+
+ m = get_mqd(mqd);
+
+- acquire_queue(kgd, pipe_id, queue_id);
+-
+- /* HQD registers extend from CP_MQD_BASE_ADDR to CP_MQD_CONTROL. */
+- mqd_hqd = &m->cp_mqd_base_addr_lo;
+-
+- for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_MQD_CONTROL; reg++)
+- WREG32(reg, mqd_hqd[reg - mmCP_MQD_BASE_ADDR]);
+-
+- /* Copy userspace write pointer value to register.
+- * Activate doorbell logic to monitor subsequent changes.
+- */
+- data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control,
+- CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
+- WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, data);
+-
+- /* read_user_ptr may take the mm->mmap_sem.
+- * release srbm_mutex to avoid circular dependency between
+- * srbm_mutex->mm_sem->reservation_ww_class_mutex->srbm_mutex.
+- */
+- release_queue(kgd);
+- valid_wptr = read_user_wptr(mm, wptr, wptr_val);
++ is_wptr_shadow_valid = !get_user(wptr_shadow, wptr);
++ if (is_wptr_shadow_valid)
++ m->cp_hqd_pq_wptr = wptr_shadow;
+
+ acquire_queue(kgd, pipe_id, queue_id);
+- if (valid_wptr)
+- WREG32(mmCP_HQD_PQ_WPTR, (wptr_val << wptr_shift) & wptr_mask);
+-
+- data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);
+- WREG32(mmCP_HQD_ACTIVE, data);
+-
+
++ gfx_v7_0_mqd_commit(adev, m);
+ release_queue(kgd);
+
+ return 0;
+ }
+
+-static int kgd_hqd_dump(struct kgd_dev *kgd,
+- uint32_t pipe_id, uint32_t queue_id,
+- uint32_t (**dump)[2], uint32_t *n_regs)
+-{
+- struct amdgpu_device *adev = get_amdgpu_device(kgd);
+- uint32_t i = 0, reg;
+-#define HQD_N_REGS (35+4)
+-#define DUMP_REG(addr) do { \
+- if (WARN_ON_ONCE(i >= HQD_N_REGS)) \
+- break; \
+- (*dump)[i][0] = (addr) << 2; \
+- (*dump)[i++][1] = RREG32(addr); \
+- } while (0)
+-
+- *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
+- if (*dump == NULL)
+- return -ENOMEM;
+-
+- acquire_queue(kgd, pipe_id, queue_id);
+-
+- DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE0);
+- DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE1);
+- DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE2);
+- DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE3);
+-
+- for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_MQD_CONTROL; reg++)
+- DUMP_REG(reg);
+-
+- release_queue(kgd);
+-
+- WARN_ON_ONCE(i != HQD_N_REGS);
+- *n_regs = i;
+-
+- return 0;
+-}
+-
+-static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
+- uint32_t __user *wptr, struct mm_struct *mm)
++static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd)
+ {
+ struct amdgpu_device *adev = get_amdgpu_device(kgd);
+ struct cik_sdma_rlc_registers *m;
+ unsigned long end_jiffies;
+ uint32_t sdma_base_addr;
+- uint32_t data;
+
+ m = get_sdma_mqd(mqd);
+ sdma_base_addr = get_sdma_base_addr(m);
+
+- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
+- m->sdma_rlc_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));
+-
+- while (true) {
+- temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
+- if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
+- break;
+- if (timeout == 0)
+- return -ETIME;
+- msleep(10);
+- timeout -= 10;
+- }
+- if (m->sdma_engine_id) {
+- data = RREG32(mmSDMA1_GFX_CONTEXT_CNTL);
+- data = REG_SET_FIELD(data, SDMA1_GFX_CONTEXT_CNTL,
+- RESUME_CTX, 0);
+- WREG32(mmSDMA1_GFX_CONTEXT_CNTL, data);
+- } else {
+- data = RREG32(mmSDMA0_GFX_CONTEXT_CNTL);
+- data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL,
+- RESUME_CTX, 0);
+- WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data);
+- }
+-
+- data = REG_SET_FIELD(m->sdma_rlc_doorbell, SDMA0_RLC0_DOORBELL,
+- ENABLE, 1);
+- WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data);
+- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdma_rlc_rb_rptr);
+- if (read_user_wptr(mm, wptr, data))
+- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, data);
+- else
+- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
+- m->sdma_rlc_rb_rptr);
+-
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR,
++ m->sdma_rlc_virtual_addr);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE,
++ m->sdma_rlc_rb_base);
+ WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR,
+ m->sdma_rlc_virtual_addr);
+- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdma_rlc_rb_base);
+
+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI,
+ m->sdma_rlc_rb_base_hi);
+@@ -523,35 +333,11 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
+ m->sdma_rlc_rb_rptr_addr_lo);
+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
+ m->sdma_rlc_rb_rptr_addr_hi);
+- data = REG_SET_FIELD(m->sdma_rlc_rb_cntl, SDMA0_RLC0_RB_CNTL,
+- RB_ENABLE, 1);
+- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data);
+- return 0;
+-}
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL,
++ m->sdma_rlc_doorbell);
+
+-static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
+- uint32_t engine_id, uint32_t queue_id,
+- uint32_t (**dump)[2], uint32_t *n_regs)
+-{
+- struct amdgpu_device *adev = get_amdgpu_device(kgd);
+- uint32_t sdma_offset = engine_id * SDMA1_REGISTER_OFFSET +
+- queue_id * KFD_CIK_SDMA_QUEUE_OFFSET;
+- uint32_t i = 0, reg;
+-#undef HQD_N_REGS
+-#define HQD_N_REGS (19+4)
+-
+- *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
+- if (*dump == NULL)
+- return -ENOMEM;
+-
+- for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
+- DUMP_REG(sdma_offset + reg);
+- for (reg = mmSDMA0_RLC0_VIRTUAL_ADDR; reg <= mmSDMA0_RLC0_WATERMARK;
+- reg++)
+- DUMP_REG(sdma_offset + reg);
+-
+- WARN_ON_ONCE(i != HQD_N_REGS);
+- *n_regs = i;
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
++ m->sdma_rlc_rb_cntl);
+
+ return 0;
+ }
+@@ -596,99 +382,30 @@ static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd)
+ return false;
+ }
+
+-static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
+- enum kfd_preempt_type reset_type,
++static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type,
+ unsigned int utimeout, uint32_t pipe_id,
+ uint32_t queue_id)
+ {
+ struct amdgpu_device *adev = get_amdgpu_device(kgd);
+ uint32_t temp;
+- enum hqd_dequeue_request_type type;
+- unsigned long flags, end_jiffies;
+- int retry;
++ int timeout = utimeout;
+
+ acquire_queue(kgd, pipe_id, queue_id);
+ WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, 0);
+
+- switch (reset_type) {
+- case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:
+- type = DRAIN_PIPE;
+- break;
+- case KFD_PREEMPT_TYPE_WAVEFRONT_RESET:
+- type = RESET_WAVES;
+- break;
+- default:
+- type = DRAIN_PIPE;
+- break;
+- }
+-
+- /* Workaround: If IQ timer is active and the wait time is close to or
+- * equal to 0, dequeueing is not safe. Wait until either the wait time
+- * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is
+- * cleared before continuing. Also, ensure wait times are set to at
+- * least 0x3.
+- */
+- local_irq_save(flags);
+- preempt_disable();
+- retry = 5000; /* wait for 500 usecs at maximum */
+- while (true) {
+- temp = RREG32(mmCP_HQD_IQ_TIMER);
+- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) {
+- pr_debug("HW is processing IQ\n");
+- goto loop;
+- }
+- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) {
+- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE)
+- == 3) /* SEM-rearm is safe */
+- break;
+- /* Wait time 3 is safe for CP, but our MMIO read/write
+- * time is close to 1 microsecond, so check for 10 to
+- * leave more buffer room
+- */
+- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME)
+- >= 10)
+- break;
+- pr_debug("IQ timer is active\n");
+- } else
+- break;
+-loop:
+- if (!retry) {
+- pr_err("CP HQD IQ timer status time out\n");
+- break;
+- }
+- ndelay(100);
+- --retry;
+- }
+- retry = 1000;
+- while (true) {
+- temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST);
+- if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK))
+- break;
+- pr_debug("Dequeue request is pending\n");
+-
+- if (!retry) {
+- pr_err("CP HQD dequeue request time out\n");
+- break;
+- }
+- ndelay(100);
+- --retry;
+- }
+- local_irq_restore(flags);
+- preempt_enable();
+-
+- WREG32(mmCP_HQD_DEQUEUE_REQUEST, type);
++ WREG32(mmCP_HQD_DEQUEUE_REQUEST, reset_type);
+
+- end_jiffies = (utimeout * HZ / 1000) + jiffies;
+ while (true) {
+ temp = RREG32(mmCP_HQD_ACTIVE);
+- if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
++ if (temp & CP_HQD_ACTIVE__ACTIVE_MASK)
+ break;
+- if (time_after(jiffies, end_jiffies)) {
+- pr_err("cp queue preemption time out\n");
++ if (timeout <= 0) {
++ pr_err("kfd: cp queue preemption time out.\n");
+ release_queue(kgd);
+ return -ETIME;
+ }
+- usleep_range(500, 1000);
++ msleep(20);
++ timeout -= 20;
+ }
+
+ release_queue(kgd);
+@@ -702,7 +419,7 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
+ struct cik_sdma_rlc_registers *m;
+ uint32_t sdma_base_addr;
+ uint32_t temp;
+- unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;
++ int timeout = utimeout;
+
+ m = get_sdma_mqd(mqd);
+ sdma_base_addr = get_sdma_base_addr(m);
+@@ -713,19 +430,18 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
+
+ while (true) {
+ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
+- if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
++ if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT)
+ break;
+- if (time_after(jiffies, end_jiffies))
++ if (timeout <= 0)
+ return -ETIME;
+- usleep_range(500, 1000);
++ msleep(20);
++ timeout -= 20;
+ }
+
+ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0);
+- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
+- RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) |
+- SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK);
+-
+- m->sdma_rlc_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, 0);
+
+ return 0;
+ }
+@@ -744,9 +460,8 @@ static int kgd_address_watch_disable(struct kgd_dev *kgd)
+
+ /* Turning off this address until we set all the registers */
+ for (i = 0; i < MAX_WATCH_ADDRESSES; i++)
+- WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX
+- + ADDRESS_WATCH_REG_CNTL],
+- cntl.u32All);
++ WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX +
++ ADDRESS_WATCH_REG_CNTL], cntl.u32All);
+
+ return 0;
+ }
+@@ -764,24 +479,20 @@ static int kgd_address_watch_execute(struct kgd_dev *kgd,
+
+ /* Turning off this watch point until we set all the registers */
+ cntl.bitfields.valid = 0;
+-
WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX +- + ADDRESS_WATCH_REG_CNTL], +- cntl.u32All); ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ++ ADDRESS_WATCH_REG_CNTL], cntl.u32All); + +- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX +- + ADDRESS_WATCH_REG_ADDR_HI], +- addr_hi); ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ++ ADDRESS_WATCH_REG_ADDR_HI], addr_hi); + +- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX +- + ADDRESS_WATCH_REG_ADDR_LO], +- addr_lo); ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ++ ADDRESS_WATCH_REG_ADDR_LO], addr_lo); + + /* Enable the watch point */ + cntl.bitfields.valid = 1; + +- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX +- + ADDRESS_WATCH_REG_CNTL], +- cntl.u32All); ++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ++ ADDRESS_WATCH_REG_CNTL], cntl.u32All); + + return 0; + } +@@ -835,7 +546,7 @@ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + + reg = RREG32(mmATC_VMID0_PASID_MAPPING + vmid); +- return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; ++ return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; + } + + static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) +@@ -845,90 +556,52 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) + WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); + } + +-static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid) +-{ +- struct amdgpu_device *adev = (struct amdgpu_device *) kgd; +- int vmid; +- +- for (vmid = 0; vmid < 16; vmid++) { +- if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) +- continue; +- if (RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & +- ATC_VMID0_PASID_MAPPING__VALID_MASK) { +- if ((RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & +- ATC_VMID0_PASID_MAPPING__PASID_MASK) == pasid) { +- WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); +- break; +- } +- } +- } +- +- return 0; +-} +- +-static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, +- uint8_t element_size, uint8_t index_stride, uint8_t mtype) +-{ +- uint32_t reg; +- struct amdgpu_device *adev = (struct amdgpu_device *) kgd; +- +- reg = swizzle_enable << SH_STATIC_MEM_CONFIG__SWIZZLE_ENABLE__SHIFT | +- element_size << SH_STATIC_MEM_CONFIG__ELEMENT_SIZE__SHIFT | +- index_stride << SH_STATIC_MEM_CONFIG__INDEX_STRIDE__SHIFT | +- mtype << SH_STATIC_MEM_CONFIG__PRIVATE_MTYPE__SHIFT; +- +- WREG32(mmSH_STATIC_MEM_CONFIG, reg); +- return 0; +-} +-static int alloc_memory_of_scratch(struct kgd_dev *kgd, +- uint64_t va, uint32_t vmid) +-{ +- struct amdgpu_device *adev = (struct amdgpu_device *) kgd; +- +- lock_srbm(kgd, 0, 0, 0, vmid); +- WREG32(mmSH_HIDDEN_PRIVATE_BASE_VMID, va); +- unlock_srbm(kgd); +- +- return 0; +-} +- +- + static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) + { + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + const union amdgpu_firmware_header *hdr; + ++ BUG_ON(kgd == NULL); ++ + switch (type) { + case KGD_ENGINE_PFP: +- hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data; ++ hdr = (const union amdgpu_firmware_header *) ++ adev->gfx.pfp_fw->data; + break; + + case KGD_ENGINE_ME: +- hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data; ++ hdr = (const union amdgpu_firmware_header *) ++ adev->gfx.me_fw->data; + break; + + case KGD_ENGINE_CE: +- hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data; ++ hdr = (const union amdgpu_firmware_header 
*) ++ adev->gfx.ce_fw->data; + break; + + case KGD_ENGINE_MEC1: +- hdr = (const union amdgpu_firmware_header *)adev->gfx.mec_fw->data; ++ hdr = (const union amdgpu_firmware_header *) ++ adev->gfx.mec_fw->data; + break; + + case KGD_ENGINE_MEC2: +- hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data; ++ hdr = (const union amdgpu_firmware_header *) ++ adev->gfx.mec2_fw->data; + break; + + case KGD_ENGINE_RLC: +- hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data; ++ hdr = (const union amdgpu_firmware_header *) ++ adev->gfx.rlc_fw->data; + break; + + case KGD_ENGINE_SDMA1: +- hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data; ++ hdr = (const union amdgpu_firmware_header *) ++ adev->sdma.instance[0].fw->data; + break; + + case KGD_ENGINE_SDMA2: +- hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data; ++ hdr = (const union amdgpu_firmware_header *) ++ adev->sdma.instance[1].fw->data; + break; + + default: +@@ -942,42 +615,3 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) + return hdr->common.ucode_version; + } + +-static void set_num_of_requests(struct kgd_dev *dev, uint8_t num_of_req) +-{ +- uint32_t value; +- struct amdgpu_device *adev = get_amdgpu_device(dev); +- +- value = RREG32(mmATC_ATS_DEBUG); +- value &= ~ATC_ATS_DEBUG__NUM_REQUESTS_AT_ERR_MASK; +- value |= (num_of_req << ATC_ATS_DEBUG__NUM_REQUESTS_AT_ERR__SHIFT); +- +- WREG32(mmATC_ATS_DEBUG, value); +-} +- +-static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, +- uint32_t page_table_base) +-{ +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- /* TODO: Don't use hardcoded VMIDs */ +- if (vmid < 8 || vmid > 15) { +- pr_err("trying to set page table base for wrong VMID\n"); +- return; +- } +- WREG32(mmVM_CONTEXT8_PAGE_TABLE_BASE_ADDR + vmid - 8, page_table_base); +-} +- +- /** +- * read_vmid_from_vmfault_reg - read vmid from register +- * +- * adev: amdgpu_device pointer +- * @vmid: vmid pointer +- * read vmid from register (CIK). +- */ +-static uint32_t read_vmid_from_vmfault_reg(struct kgd_dev *kgd) +-{ +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- +- uint32_t status = RREG32(mmVM_CONTEXT1_PROTECTION_FAULT_STATUS); +- +- return REG_GET_FIELD(status, VM_CONTEXT1_PROTECTION_FAULT_STATUS, VMID); +-} +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +old mode 100755 +new mode 100644 +index 2ff10e9..133d066 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +@@ -20,9 +20,6 @@ + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +-#undef pr_fmt +-#define pr_fmt(fmt) "kfd2kgd: " fmt +- + #include <linux/module.h> + #include <linux/fdtable.h> + #include <linux/uaccess.h> +@@ -31,7 +28,7 @@ + #include "amdgpu.h" + #include "amdgpu_amdkfd.h" + #include "amdgpu_ucode.h" +-#include "amdgpu_amdkfd_gfx_v8.h" ++#include "gfx_v8_0.h" + #include "gca/gfx_8_0_sh_mask.h" + #include "gca/gfx_8_0_d.h" + #include "gca/gfx_8_0_enum.h" +@@ -42,31 +39,7 @@ + #include "vi_structs.h" + #include "vid.h" + +-enum hqd_dequeue_request_type { +- NO_ACTION = 0, +- DRAIN_PIPE, +- RESET_WAVES, +- SAVE_WAVES +-}; +- +-static const uint32_t watchRegs[MAX_WATCH_ADDRESSES * ADDRESS_WATCH_REG_MAX] = { +- mmTCP_WATCH0_ADDR_H, mmTCP_WATCH0_ADDR_L, mmTCP_WATCH0_CNTL, +- mmTCP_WATCH1_ADDR_H, mmTCP_WATCH1_ADDR_L, mmTCP_WATCH1_CNTL, +- mmTCP_WATCH2_ADDR_H, mmTCP_WATCH2_ADDR_L, mmTCP_WATCH2_CNTL, +- mmTCP_WATCH3_ADDR_H, mmTCP_WATCH3_ADDR_L, mmTCP_WATCH3_CNTL +-}; +- +- +-struct vi_sdma_mqd; +- +-static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, +- void *vm, struct kgd_mem **mem); +-static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem); +- +-static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, +- int fd, uint32_t handle, struct kgd_mem **mem); +- +-static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); ++struct cik_sdma_rlc_registers; + + /* + * Register access functions +@@ -82,26 +55,17 @@ static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, + uint32_t hpd_size, uint64_t hpd_gpu_addr); + static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); + static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, +- uint32_t queue_id, uint32_t __user *wptr, +- uint32_t wptr_shift, uint32_t wptr_mask, +- struct mm_struct *mm); +-static int kgd_hqd_dump(struct kgd_dev *kgd, +- uint32_t pipe_id, uint32_t queue_id, +- uint32_t (**dump)[2], uint32_t *n_regs); +-static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, +- uint32_t __user *wptr, struct mm_struct *mm); +-static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, +- uint32_t engine_id, uint32_t queue_id, +- uint32_t (**dump)[2], uint32_t *n_regs); ++ uint32_t queue_id, uint32_t __user *wptr); ++static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd); + static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, + uint32_t pipe_id, uint32_t queue_id); + static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); +-static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, +- enum kfd_preempt_type reset_type, ++static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, + unsigned int utimeout, uint32_t pipe_id, + uint32_t queue_id); + static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, + unsigned int utimeout); ++static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); + static int kgd_address_watch_disable(struct kgd_dev *kgd); + static int kgd_address_watch_execute(struct kgd_dev *kgd, + unsigned int watch_point_id, +@@ -120,61 +84,20 @@ static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, + static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, + uint8_t vmid); + static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); +-static void set_num_of_requests(struct kgd_dev *kgd, +- uint8_t num_of_requests); +-static int alloc_memory_of_scratch(struct kgd_dev *kgd, +- uint64_t va, uint32_t vmid); +-static int write_config_static_mem(struct kgd_dev 
*kgd, bool swizzle_enable, +- uint8_t element_size, uint8_t index_stride, uint8_t mtype); +-static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, +- uint32_t page_table_base); +-static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid); +- +-/* Because of REG_GET_FIELD() being used, we put this function in the +- * asic specific file. +- */ +-static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, +- struct tile_config *config) +-{ +- struct amdgpu_device *adev = (struct amdgpu_device *)kgd; +- +- config->gb_addr_config = adev->gfx.config.gb_addr_config; +- config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, +- MC_ARB_RAMCFG, NOOFBANK); +- config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, +- MC_ARB_RAMCFG, NOOFRANKS); +- +- config->tile_config_ptr = adev->gfx.config.tile_mode_array; +- config->num_tile_configs = +- ARRAY_SIZE(adev->gfx.config.tile_mode_array); +- config->macro_tile_config_ptr = +- adev->gfx.config.macrotile_mode_array; +- config->num_macro_tile_configs = +- ARRAY_SIZE(adev->gfx.config.macrotile_mode_array); +- +- return 0; +-} ++static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); + + static const struct kfd2kgd_calls kfd2kgd = { + .init_gtt_mem_allocation = alloc_gtt_mem, + .free_gtt_mem = free_gtt_mem, +- .get_local_mem_info = get_local_mem_info, ++ .get_vmem_size = get_vmem_size, + .get_gpu_clock_counter = get_gpu_clock_counter, + .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, +- .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm, +- .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm, +- .create_process_gpumem = create_process_gpumem, +- .destroy_process_gpumem = destroy_process_gpumem, +- .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir, +- .open_graphic_handle = open_graphic_handle, + .program_sh_mem_settings = kgd_program_sh_mem_settings, + .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, + .init_pipeline = kgd_init_pipeline, + .init_interrupts = kgd_init_interrupts, + .hqd_load = kgd_hqd_load, + .hqd_sdma_load = kgd_hqd_sdma_load, +- .hqd_dump = kgd_hqd_dump, +- .hqd_sdma_dump = kgd_hqd_sdma_dump, + .hqd_is_occupied = kgd_hqd_is_occupied, + .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, + .hqd_destroy = kgd_hqd_destroy, +@@ -188,56 +111,14 @@ static const struct kfd2kgd_calls kfd2kgd = { + .get_atc_vmid_pasid_mapping_valid = + get_atc_vmid_pasid_mapping_valid, + .write_vmid_invalidate_request = write_vmid_invalidate_request, +- .invalidate_tlbs = invalidate_tlbs, +- .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory, +- .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, +- .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, +- .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, +- .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu, +- .get_fw_version = get_fw_version, +- .set_num_of_requests = set_num_of_requests, +- .get_cu_info = get_cu_info, +- .alloc_memory_of_scratch = alloc_memory_of_scratch, +- .write_config_static_mem = write_config_static_mem, +- .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo, +- .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel, +- .set_vm_context_page_table_base = set_vm_context_page_table_base, +- .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table, +- .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table, +- .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info, +- .import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf, +- 
.export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf, +- .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info, +- .submit_ib = amdgpu_amdkfd_submit_ib, +- .get_tile_config = amdgpu_amdkfd_get_tile_config, +- .restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos, +- .copy_mem_to_mem = amdgpu_amdkfd_copy_mem_to_mem, +- .get_vram_usage = amdgpu_amdkfd_get_vram_usage ++ .get_fw_version = get_fw_version + }; + +-struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions() ++struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void) + { + return (struct kfd2kgd_calls *)&kfd2kgd; + } + +-static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, +- void *vm, struct kgd_mem **mem) +-{ +- return 0; +-} +- +-/* Destroys the GPU allocation and frees the kgd_mem structure */ +-static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem) +-{ +- +-} +- +-static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, +- int fd, uint32_t handle, struct kgd_mem **mem) +-{ +- return 0; +-} +- + static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) + { + return (struct amdgpu_device *)kgd; +@@ -266,7 +147,7 @@ static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, + { + struct amdgpu_device *adev = get_amdgpu_device(kgd); + +- uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; ++ uint32_t mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; + uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); + + lock_srbm(kgd, mec, pipe, queue_id, 0); +@@ -335,28 +216,21 @@ static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) + uint32_t mec; + uint32_t pipe; + +- mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; ++ mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; + pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); + + lock_srbm(kgd, mec, pipe, 0, 0); + +- WREG32(mmCPC_INT_CNTL, CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | +- CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); ++ WREG32(mmCPC_INT_CNTL, CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK); + + unlock_srbm(kgd); + + return 0; + } + +-static inline uint32_t get_sdma_base_addr(struct vi_sdma_mqd *m) ++static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m) + { +- uint32_t retval; +- +- retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET + +- m->sdma_queue_id * KFD_VI_SDMA_QUEUE_OFFSET; +- pr_debug("sdma base address: 0x%x\n", retval); +- +- return retval; ++ return 0; + } + + static inline struct vi_mqd *get_mqd(void *mqd) +@@ -364,224 +238,33 @@ static inline struct vi_mqd *get_mqd(void *mqd) + return (struct vi_mqd *)mqd; + } + +-static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) ++static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) + { +- return (struct vi_sdma_mqd *)mqd; ++ return (struct cik_sdma_rlc_registers *)mqd; + } + + static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, +- uint32_t queue_id, uint32_t __user *wptr, +- uint32_t wptr_shift, uint32_t wptr_mask, +- struct mm_struct *mm) ++ uint32_t queue_id, uint32_t __user *wptr) + { +- struct amdgpu_device *adev = get_amdgpu_device(kgd); + struct vi_mqd *m; +- uint32_t *mqd_hqd; +- uint32_t reg, wptr_val, data; +- bool valid_wptr = false; ++ uint32_t shadow_wptr, valid_wptr; ++ struct amdgpu_device *adev = get_amdgpu_device(kgd); + + m = get_mqd(mqd); + +- acquire_queue(kgd, pipe_id, queue_id); +- +- /* HIQ is set during driver init period with vmid set to 0. 
For SRIOV +- * world switching support let the RLC know about the HIQ. +- * +- * Workaround: This causes reboots on CZ. Disable this on CZ, which +- * doesn't support SRIOV anyway. +- */ +- if (m->cp_hqd_vmid == 0 && +- adev->asic_type != CHIP_CARRIZO) { +- uint32_t value, mec, pipe; +- +- mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; +- pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); +- +- pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n", +- mec, pipe, queue_id); +- value = RREG32(mmRLC_CP_SCHEDULERS); +- value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1, +- ((mec << 5) | (pipe << 3) | queue_id | 0x80)); +- WREG32(mmRLC_CP_SCHEDULERS, value); +- } +- +- /* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */ +- mqd_hqd = &m->cp_mqd_base_addr_lo; ++ valid_wptr = copy_from_user(&shadow_wptr, wptr, sizeof(shadow_wptr)); ++ if (valid_wptr == 0) ++ m->cp_hqd_pq_wptr = shadow_wptr; + +- for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_HQD_EOP_CONTROL; reg++) +- WREG32(reg, mqd_hqd[reg - mmCP_MQD_BASE_ADDR]); +- +- /* Tonga errata: EOP RPTR/WPTR should be left unmodified. +- * This is safe since EOP RPTR==WPTR for any inactive HQD +- * on ASICs that do not support context-save. +- * EOP writes/reads can start anywhere in the ring. +- */ +- if (get_amdgpu_device(kgd)->asic_type != CHIP_TONGA) { +- WREG32(mmCP_HQD_EOP_RPTR, m->cp_hqd_eop_rptr); +- WREG32(mmCP_HQD_EOP_WPTR, m->cp_hqd_eop_wptr); +- WREG32(mmCP_HQD_EOP_WPTR_MEM, m->cp_hqd_eop_wptr_mem); +- } +- +- for (reg = mmCP_HQD_EOP_EVENTS; reg <= mmCP_HQD_ERROR; reg++) +- WREG32(reg, mqd_hqd[reg - mmCP_MQD_BASE_ADDR]); +- +- /* Copy userspace write pointer value to register. +- * Activate doorbell logic to monitor subsequent changes. +- */ +- data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control, +- CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); +- WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, data); +- +- /* read_user_ptr may take the mm->mmap_sem. +- * release srbm_mutex to avoid circular dependency between +- * srbm_mutex->mm_sem->reservation_ww_class_mutex->srbm_mutex. 
+- */ +- release_queue(kgd); +- valid_wptr = read_user_wptr(mm, wptr, wptr_val); + acquire_queue(kgd, pipe_id, queue_id); +- if (valid_wptr) +- WREG32(mmCP_HQD_PQ_WPTR, (wptr_val << wptr_shift) & wptr_mask); +- +- data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); +- WREG32(mmCP_HQD_ACTIVE, data); +- ++ gfx_v8_0_mqd_commit(adev, mqd); + release_queue(kgd); + + return 0; + } + +-static int kgd_hqd_dump(struct kgd_dev *kgd, +- uint32_t pipe_id, uint32_t queue_id, +- uint32_t (**dump)[2], uint32_t *n_regs) ++static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) + { +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- uint32_t i = 0, reg; +-#define HQD_N_REGS (54+4) +-#define DUMP_REG(addr) do { \ +- if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ +- break; \ +- (*dump)[i][0] = (addr) << 2; \ +- (*dump)[i++][1] = RREG32(addr); \ +- } while (0) +- +- *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); +- if (*dump == NULL) +- return -ENOMEM; +- +- acquire_queue(kgd, pipe_id, queue_id); +- +- DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE0); +- DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE1); +- DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE2); +- DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE3); +- +- for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_HQD_EOP_DONES; reg++) +- DUMP_REG(reg); +- +- release_queue(kgd); +- +- WARN_ON_ONCE(i != HQD_N_REGS); +- *n_regs = i; +- +- return 0; +-} +- +-static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, +- uint32_t __user *wptr, struct mm_struct *mm) +-{ +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- struct vi_sdma_mqd *m; +- uint32_t sdma_base_addr; +- uint32_t temp, timeout = 2000; +- uint32_t data; +- +- m = get_sdma_mqd(mqd); +- sdma_base_addr = get_sdma_base_addr(m); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, +- m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); +- +- while (true) { +- temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); +- if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) +- break; +- if (timeout == 0) +- return -ETIME; +- msleep(10); +- timeout -= 10; +- } +- if (m->sdma_engine_id) { +- data = RREG32(mmSDMA1_GFX_CONTEXT_CNTL); +- data = REG_SET_FIELD(data, SDMA1_GFX_CONTEXT_CNTL, +- RESUME_CTX, 0); +- WREG32(mmSDMA1_GFX_CONTEXT_CNTL, data); +- } else { +- data = RREG32(mmSDMA0_GFX_CONTEXT_CNTL); +- data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, +- RESUME_CTX, 0); +- WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data); +- } +- +- data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL, +- ENABLE, 1); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr); +- +- if (read_user_wptr(mm, wptr, data)) +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, data); +- else +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, +- m->sdmax_rlcx_rb_rptr); +- +- WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, +- m->sdmax_rlcx_virtual_addr); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, +- m->sdmax_rlcx_rb_base_hi); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, +- m->sdmax_rlcx_rb_rptr_addr_lo); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, +- m->sdmax_rlcx_rb_rptr_addr_hi); +- +- data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL, +- RB_ENABLE, 1); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data); +- +- return 0; +-} +- +-static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, +- uint32_t engine_id, uint32_t queue_id, +- 
uint32_t (**dump)[2], uint32_t *n_regs) +-{ +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- uint32_t sdma_offset = engine_id * SDMA1_REGISTER_OFFSET + +- queue_id * KFD_VI_SDMA_QUEUE_OFFSET; +- uint32_t i = 0, reg; +-#undef HQD_N_REGS +-#define HQD_N_REGS (19+4+2+3+7) +- +- *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); +- if (*dump == NULL) +- return -ENOMEM; +- +- for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) +- DUMP_REG(sdma_offset + reg); +- for (reg = mmSDMA0_RLC0_VIRTUAL_ADDR; reg <= mmSDMA0_RLC0_WATERMARK; +- reg++) +- DUMP_REG(sdma_offset + reg); +- for (reg = mmSDMA0_RLC0_CSA_ADDR_LO; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; +- reg++) +- DUMP_REG(sdma_offset + reg); +- for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; reg <= mmSDMA0_RLC0_DUMMY_REG; +- reg++) +- DUMP_REG(sdma_offset + reg); +- for (reg = mmSDMA0_RLC0_MIDCMD_DATA0; reg <= mmSDMA0_RLC0_MIDCMD_CNTL; +- reg++) +- DUMP_REG(sdma_offset + reg); +- +- WARN_ON_ONCE(i != HQD_N_REGS); +- *n_regs = i; +- + return 0; + } + +@@ -610,7 +293,7 @@ static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, + static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) + { + struct amdgpu_device *adev = get_amdgpu_device(kgd); +- struct vi_sdma_mqd *m; ++ struct cik_sdma_rlc_registers *m; + uint32_t sdma_base_addr; + uint32_t sdma_rlc_rb_cntl; + +@@ -625,102 +308,29 @@ static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) + return false; + } + +-static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, +- enum kfd_preempt_type reset_type, ++static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, + unsigned int utimeout, uint32_t pipe_id, + uint32_t queue_id) + { + struct amdgpu_device *adev = get_amdgpu_device(kgd); + uint32_t temp; +- enum hqd_dequeue_request_type type; +- unsigned long flags, end_jiffies; +- int retry; +- struct vi_mqd *m = get_mqd(mqd); ++ int timeout = utimeout; + + acquire_queue(kgd, pipe_id, queue_id); + +- if (m->cp_hqd_vmid == 0) +- WREG32_FIELD(RLC_CP_SCHEDULERS, scheduler1, 0); +- +- switch (reset_type) { +- case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: +- type = DRAIN_PIPE; +- break; +- case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: +- type = RESET_WAVES; +- break; +- default: +- type = DRAIN_PIPE; +- break; +- } ++ WREG32(mmCP_HQD_DEQUEUE_REQUEST, reset_type); + +- /* Workaround: If IQ timer is active and the wait time is close to or +- * equal to 0, dequeueing is not safe. Wait until either the wait time +- * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is +- * cleared before continuing. Also, ensure wait times are set to at +- * least 0x3. 
+- */ +- local_irq_save(flags); +- preempt_disable(); +- retry = 5000; /* wait for 500 usecs at maximum */ +- while (true) { +- temp = RREG32(mmCP_HQD_IQ_TIMER); +- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) { +- pr_debug("HW is processing IQ\n"); +- goto loop; +- } +- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) { +- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE) +- == 3) /* SEM-rearm is safe */ +- break; +- /* Wait time 3 is safe for CP, but our MMIO read/write +- * time is close to 1 microsecond, so check for 10 to +- * leave more buffer room +- */ +- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME) +- >= 10) +- break; +- pr_debug("IQ timer is active\n"); +- } else +- break; +-loop: +- if (!retry) { +- pr_err("CP HQD IQ timer status time out\n"); +- break; +- } +- ndelay(100); +- --retry; +- } +- retry = 1000; +- while (true) { +- temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST); +- if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK)) +- break; +- pr_debug("Dequeue request is pending\n"); +- +- if (!retry) { +- pr_err("CP HQD dequeue request time out\n"); +- break; +- } +- ndelay(100); +- --retry; +- } +- local_irq_restore(flags); +- preempt_enable(); +- +- WREG32(mmCP_HQD_DEQUEUE_REQUEST, type); +- +- end_jiffies = (utimeout * HZ / 1000) + jiffies; + while (true) { + temp = RREG32(mmCP_HQD_ACTIVE); +- if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) ++ if (temp & CP_HQD_ACTIVE__ACTIVE_MASK) + break; +- if (time_after(jiffies, end_jiffies)) { +- pr_err("cp queue preemption time out.\n"); ++ if (timeout <= 0) { ++ pr_err("kfd: cp queue preemption time out.\n"); + release_queue(kgd); + return -ETIME; + } +- usleep_range(500, 1000); ++ msleep(20); ++ timeout -= 20; + } + + release_queue(kgd); +@@ -731,10 +341,10 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, + unsigned int utimeout) + { + struct amdgpu_device *adev = get_amdgpu_device(kgd); +- struct vi_sdma_mqd *m; ++ struct cik_sdma_rlc_registers *m; + uint32_t sdma_base_addr; + uint32_t temp; +- unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; ++ int timeout = utimeout; + + m = get_sdma_mqd(mqd); + sdma_base_addr = get_sdma_base_addr(m); +@@ -745,19 +355,18 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, + + while (true) { + temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); +- if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) ++ if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT) + break; +- if (time_after(jiffies, end_jiffies)) ++ if (timeout <= 0) + return -ETIME; +- usleep_range(500, 1000); ++ msleep(20); ++ timeout -= 20; + } + + WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, +- RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | +- SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); +- +- m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0); ++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, 0); + + return 0; + } +@@ -779,7 +388,7 @@ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + + reg = RREG32(mmATC_VMID0_PASID_MAPPING + vmid); +- return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; ++ return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; + } + + static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) +@@ -789,83 +398,8 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) + 
WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); + } + +-/* +- * FIXME: Poliars test failed with this package, FIJI works fine +- * From the CP spec it does not official support the invalidation +- * with the specified pasid in the package, so disable it for V8 +- * +- */ +-#ifdef V8_SUPPORT_IT_OFFICIAL +-static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid) +-{ +- signed long r; +- struct dma_fence *f; +- struct amdgpu_ring *ring = &adev->gfx.kiq.ring; +- +- mutex_lock(&adev->gfx.kiq.ring_mutex); +- amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs package*/ +- amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0)); +- amdgpu_ring_write(ring, +- PACKET3_INVALIDATE_TLBS_DST_SEL(1) | +- PACKET3_INVALIDATE_TLBS_PASID(pasid)); +- amdgpu_fence_emit(ring, &f); +- amdgpu_ring_commit(ring); +- mutex_unlock(&adev->gfx.kiq.ring_mutex); +- +- r = dma_fence_wait(f, false); +- if (r) +- DRM_ERROR("wait for kiq fence error: %ld.\n", r); +- dma_fence_put(f); +- +- return r; +-} +-#endif +-static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid) +-{ +- struct amdgpu_device *adev = (struct amdgpu_device *) kgd; +- int vmid; +- +-#ifdef V8_SUPPORT_IT_OFFICIAL +- struct amdgpu_ring *ring = &adev->gfx.kiq.ring; +- +- if (ring->ready) +- return invalidate_tlbs_with_kiq(adev, pasid); +-#endif +- +- for (vmid = 0; vmid < 16; vmid++) { +- if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) +- continue; +- if (RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & +- ATC_VMID0_PASID_MAPPING__VALID_MASK) { +- if ((RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & +- ATC_VMID0_PASID_MAPPING__PASID_MASK) == pasid) { +- WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); +- break; +- } +- } +- } +- +- return 0; +-} +- + static int kgd_address_watch_disable(struct kgd_dev *kgd) + { +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- union TCP_WATCH_CNTL_BITS cntl; +- unsigned int i; +- +- cntl.u32All = 0; +- +- cntl.bitfields.valid = 0; +- cntl.bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK; +- cntl.bitfields.atc = 1; +- +- /* Turning off this address until we set all the registers */ +- for (i = 0; i < MAX_WATCH_ADDRESSES; i++) +- WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX +- + ADDRESS_WATCH_REG_CNTL], +- cntl.u32All); +- + return 0; + } + +@@ -875,32 +409,6 @@ static int kgd_address_watch_execute(struct kgd_dev *kgd, + uint32_t addr_hi, + uint32_t addr_lo) + { +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- union TCP_WATCH_CNTL_BITS cntl; +- +- cntl.u32All = cntl_val; +- +- /* Turning off this watch point until we set all the registers */ +- cntl.bitfields.valid = 0; +- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX +- + ADDRESS_WATCH_REG_CNTL], +- cntl.u32All); +- +- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX +- + ADDRESS_WATCH_REG_ADDR_HI], +- addr_hi); +- +- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX +- + ADDRESS_WATCH_REG_ADDR_LO], +- addr_lo); +- +- /* Enable the watch point */ +- cntl.bitfields.valid = 1; +- +- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX +- + ADDRESS_WATCH_REG_CNTL], +- cntl.u32All); +- + return 0; + } + +@@ -933,32 +441,6 @@ static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, + unsigned int watch_point_id, + unsigned int reg_offset) + { +- return watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + reg_offset]; +-} +- +-static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, +- uint8_t element_size, uint8_t index_stride, uint8_t mtype) +-{ +- uint32_t reg; +- struct amdgpu_device *adev 
= (struct amdgpu_device *) kgd; +- +- reg = swizzle_enable << SH_STATIC_MEM_CONFIG__SWIZZLE_ENABLE__SHIFT | +- element_size << SH_STATIC_MEM_CONFIG__ELEMENT_SIZE__SHIFT | +- index_stride << SH_STATIC_MEM_CONFIG__INDEX_STRIDE__SHIFT | +- mtype << SH_STATIC_MEM_CONFIG__PRIVATE_MTYPE__SHIFT; +- +- WREG32(mmSH_STATIC_MEM_CONFIG, reg); +- return 0; +-} +-static int alloc_memory_of_scratch(struct kgd_dev *kgd, +- uint64_t va, uint32_t vmid) +-{ +- struct amdgpu_device *adev = (struct amdgpu_device *) kgd; +- +- lock_srbm(kgd, 0, 0, 0, vmid); +- WREG32(mmSH_HIDDEN_PRIVATE_BASE_VMID, va); +- unlock_srbm(kgd); +- + return 0; + } + +@@ -967,45 +449,47 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + const union amdgpu_firmware_header *hdr; + ++ BUG_ON(kgd == NULL); ++ + switch (type) { + case KGD_ENGINE_PFP: + hdr = (const union amdgpu_firmware_header *) +- adev->gfx.pfp_fw->data; ++ adev->gfx.pfp_fw->data; + break; + + case KGD_ENGINE_ME: + hdr = (const union amdgpu_firmware_header *) +- adev->gfx.me_fw->data; ++ adev->gfx.me_fw->data; + break; + + case KGD_ENGINE_CE: + hdr = (const union amdgpu_firmware_header *) +- adev->gfx.ce_fw->data; ++ adev->gfx.ce_fw->data; + break; + + case KGD_ENGINE_MEC1: + hdr = (const union amdgpu_firmware_header *) +- adev->gfx.mec_fw->data; ++ adev->gfx.mec_fw->data; + break; + + case KGD_ENGINE_MEC2: + hdr = (const union amdgpu_firmware_header *) +- adev->gfx.mec2_fw->data; ++ adev->gfx.mec2_fw->data; + break; + + case KGD_ENGINE_RLC: + hdr = (const union amdgpu_firmware_header *) +- adev->gfx.rlc_fw->data; ++ adev->gfx.rlc_fw->data; + break; + + case KGD_ENGINE_SDMA1: + hdr = (const union amdgpu_firmware_header *) +- adev->sdma.instance[0].fw->data; ++ adev->sdma.instance[0].fw->data; + break; + + case KGD_ENGINE_SDMA2: + hdr = (const union amdgpu_firmware_header *) +- adev->sdma.instance[1].fw->data; ++ adev->sdma.instance[1].fw->data; + break; + + default: +@@ -1018,21 +502,3 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) + /* Only 12 bit in use*/ + return hdr->common.ucode_version; + } +- +-static void set_num_of_requests(struct kgd_dev *kgd, +- uint8_t num_of_requests) +-{ +- pr_debug("This is a stub\n"); +-} +- +-static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, +- uint32_t page_table_base) +-{ +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- /* TODO: Don't use hardcoded VMIDs */ +- if (vmid < 8 || vmid > 15) { +- pr_err("trying to set page table base for wrong VMID\n"); +- return; +- } +- WREG32(mmVM_CONTEXT8_PAGE_TABLE_BASE_ADDR + vmid - 8, page_table_base); +-} +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +deleted file mode 100644 +index edbae19..0000000 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c ++++ /dev/null +@@ -1,1227 +0,0 @@ +-/* +- * Copyright 2014 Advanced Micro Devices, Inc. 
+- * +- * Permission is hereby granted, free of charge, to any person obtaining a +- * copy of this software and associated documentation files (the "Software"), +- * to deal in the Software without restriction, including without limitation +- * the rights to use, copy, modify, merge, publish, distribute, sublicense, +- * and/or sell copies of the Software, and to permit persons to whom the +- * Software is furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in +- * all copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +- * OTHER DEALINGS IN THE SOFTWARE. +- */ +-#undef pr_fmt +-#define pr_fmt(fmt) "kfd2kgd: " fmt +- +-#include <linux/module.h> +-#include <linux/fdtable.h> +-#include <linux/uaccess.h> +-#include <linux/firmware.h> +-#include <drm/drmP.h> +-#include "amdgpu.h" +-#include "amdgpu_amdkfd.h" +-#include "amdgpu_ucode.h" +-#include "amdgpu_amdkfd_gfx_v8.h" +-#include "vega10/soc15ip.h" +-#include "vega10/GC/gc_9_0_offset.h" +-#include "vega10/GC/gc_9_0_sh_mask.h" +-#include "vega10/vega10_enum.h" +-#include "vega10/SDMA0/sdma0_4_0_offset.h" +-#include "vega10/SDMA0/sdma0_4_0_sh_mask.h" +-#include "vega10/SDMA1/sdma1_4_0_offset.h" +-#include "vega10/SDMA1/sdma1_4_0_sh_mask.h" +-#include "vega10/ATHUB/athub_1_0_offset.h" +-#include "vega10/ATHUB/athub_1_0_sh_mask.h" +-#include "vega10/OSSSYS/osssys_4_0_offset.h" +-#include "vega10/OSSSYS/osssys_4_0_sh_mask.h" +-#include "soc15_common.h" +-#include "v9_structs.h" +-#include "soc15.h" +-#include "soc15d.h" +- +-/* HACK: MMHUB and GC both have VM-related register with the same +- * names but different offsets. Define the MMHUB register we need here +- * with a prefix. A proper solution would be to move the functions +- * programming these registers into gfx_v9_0.c and mmhub_v1_0.c +- * respectively. 
+- */ +-#define mmMMHUB_VM_INVALIDATE_ENG16_REQ 0x06f3 +-#define mmMMHUB_VM_INVALIDATE_ENG16_REQ_BASE_IDX 0 +- +-#define mmMMHUB_VM_INVALIDATE_ENG16_ACK 0x0705 +-#define mmMMHUB_VM_INVALIDATE_ENG16_ACK_BASE_IDX 0 +- +-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32 0x072b +-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32_BASE_IDX 0 +-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32 0x072c +-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32_BASE_IDX 0 +- +-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32 0x074b +-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32_BASE_IDX 0 +-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32 0x074c +-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32_BASE_IDX 0 +- +-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32 0x076b +-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32_BASE_IDX 0 +-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32 0x076c +-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32_BASE_IDX 0 +- +-#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32 0x0727 +-#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32_BASE_IDX 0 +-#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32 0x0728 +-#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32_BASE_IDX 0 +- +-enum hqd_dequeue_request_type { +- NO_ACTION = 0, +- DRAIN_PIPE, +- RESET_WAVES, +- SAVE_WAVES +-}; +- +-static const uint32_t watchRegs[MAX_WATCH_ADDRESSES * ADDRESS_WATCH_REG_MAX] = { +- mmTCP_WATCH0_ADDR_H, mmTCP_WATCH0_ADDR_L, mmTCP_WATCH0_CNTL, +- mmTCP_WATCH1_ADDR_H, mmTCP_WATCH1_ADDR_L, mmTCP_WATCH1_CNTL, +- mmTCP_WATCH2_ADDR_H, mmTCP_WATCH2_ADDR_L, mmTCP_WATCH2_CNTL, +- mmTCP_WATCH3_ADDR_H, mmTCP_WATCH3_ADDR_L, mmTCP_WATCH3_CNTL +-}; +- +- +-static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, +- void *vm, struct kgd_mem **mem); +-static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem); +- +-static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, +- int fd, uint32_t handle, struct kgd_mem **mem); +- +-static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); +- +-/* +- * Register access functions +- */ +- +-static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, +- uint32_t sh_mem_config, +- uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit, +- uint32_t sh_mem_bases); +-static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, +- unsigned int vmid); +-static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, +- uint32_t hpd_size, uint64_t hpd_gpu_addr); +-static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); +-static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, +- uint32_t queue_id, uint32_t __user *wptr, +- uint32_t wptr_shift, uint32_t wptr_mask, +- struct mm_struct *mm); +-static int kgd_hqd_dump(struct kgd_dev *kgd, +- uint32_t pipe_id, uint32_t queue_id, +- uint32_t (**dump)[2], uint32_t *n_regs); +-static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, +- uint32_t __user *wptr, struct mm_struct *mm); +-static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, +- uint32_t engine_id, uint32_t queue_id, +- uint32_t (**dump)[2], uint32_t *n_regs); +-static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, +- uint32_t pipe_id, uint32_t queue_id); +-static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); +-static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, +- enum kfd_preempt_type reset_type, +- unsigned int utimeout, 
uint32_t pipe_id, +- uint32_t queue_id); +-static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, +- unsigned int utimeout); +-static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); +-static uint32_t get_watch_base_addr(void); +-static int kgd_address_watch_disable(struct kgd_dev *kgd); +-static int kgd_address_watch_execute(struct kgd_dev *kgd, +- unsigned int watch_point_id, +- uint32_t cntl_val, +- uint32_t addr_hi, +- uint32_t addr_lo); +-static int kgd_wave_control_execute(struct kgd_dev *kgd, +- uint32_t gfx_index_val, +- uint32_t sq_cmd); +-static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, +- unsigned int watch_point_id, +- unsigned int reg_offset); +- +-static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, +- uint8_t vmid); +-static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, +- uint8_t vmid); +-static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); +-static void set_num_of_requests(struct kgd_dev *kgd, +- uint8_t num_of_requests); +-static int alloc_memory_of_scratch(struct kgd_dev *kgd, +- uint64_t va, uint32_t vmid); +-static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, +- uint8_t element_size, uint8_t index_stride, uint8_t mtype); +-static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, +- uint32_t page_table_base); +-static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid); +- +-/* Because of REG_GET_FIELD() being used, we put this function in the +- * asic specific file. +- */ +-static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, +- struct tile_config *config) +-{ +- struct amdgpu_device *adev = (struct amdgpu_device *)kgd; +- +- config->gb_addr_config = adev->gfx.config.gb_addr_config; +-#if 0 +-/* TODO - confirm REG_GET_FIELD x2, should be OK as is... 
but +- * MC_ARB_RAMCFG register doesn't exist on Vega10 - initial amdgpu +- * changes commented out related code, doing the same here for now but +- * need to sync with Ken et al +- */ +- config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, +- MC_ARB_RAMCFG, NOOFBANK); +- config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, +- MC_ARB_RAMCFG, NOOFRANKS); +-#endif +- +- config->tile_config_ptr = adev->gfx.config.tile_mode_array; +- config->num_tile_configs = +- ARRAY_SIZE(adev->gfx.config.tile_mode_array); +- config->macro_tile_config_ptr = +- adev->gfx.config.macrotile_mode_array; +- config->num_macro_tile_configs = +- ARRAY_SIZE(adev->gfx.config.macrotile_mode_array); +- +- return 0; +-} +- +-static const struct kfd2kgd_calls kfd2kgd = { +- .init_gtt_mem_allocation = alloc_gtt_mem, +- .free_gtt_mem = free_gtt_mem, +- .get_local_mem_info = get_local_mem_info, +- .get_gpu_clock_counter = get_gpu_clock_counter, +- .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, +- .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm, +- .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm, +- .create_process_gpumem = create_process_gpumem, +- .destroy_process_gpumem = destroy_process_gpumem, +- .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir, +- .open_graphic_handle = open_graphic_handle, +- .program_sh_mem_settings = kgd_program_sh_mem_settings, +- .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, +- .init_pipeline = kgd_init_pipeline, +- .init_interrupts = kgd_init_interrupts, +- .hqd_load = kgd_hqd_load, +- .hqd_sdma_load = kgd_hqd_sdma_load, +- .hqd_dump = kgd_hqd_dump, +- .hqd_sdma_dump = kgd_hqd_sdma_dump, +- .hqd_is_occupied = kgd_hqd_is_occupied, +- .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, +- .hqd_destroy = kgd_hqd_destroy, +- .hqd_sdma_destroy = kgd_hqd_sdma_destroy, +- .address_watch_disable = kgd_address_watch_disable, +- .address_watch_execute = kgd_address_watch_execute, +- .wave_control_execute = kgd_wave_control_execute, +- .address_watch_get_offset = kgd_address_watch_get_offset, +- .get_atc_vmid_pasid_mapping_pasid = +- get_atc_vmid_pasid_mapping_pasid, +- .get_atc_vmid_pasid_mapping_valid = +- get_atc_vmid_pasid_mapping_valid, +- .write_vmid_invalidate_request = write_vmid_invalidate_request, +- .invalidate_tlbs = invalidate_tlbs, +- .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory, +- .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, +- .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, +- .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, +- .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu, +- .get_fw_version = get_fw_version, +- .set_num_of_requests = set_num_of_requests, +- .get_cu_info = get_cu_info, +- .alloc_memory_of_scratch = alloc_memory_of_scratch, +- .write_config_static_mem = write_config_static_mem, +- .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo, +- .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel, +- .set_vm_context_page_table_base = set_vm_context_page_table_base, +- .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table, +- .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table, +- .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info, +- .import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf, +- .export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf, +- .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info, +- .submit_ib = amdgpu_amdkfd_submit_ib, +- .get_tile_config = amdgpu_amdkfd_get_tile_config, +- .restore_process_bos = 
amdgpu_amdkfd_gpuvm_restore_process_bos, +- .copy_mem_to_mem = amdgpu_amdkfd_copy_mem_to_mem, +- .get_vram_usage = amdgpu_amdkfd_get_vram_usage +-}; +- +-struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions() +-{ +- return (struct kfd2kgd_calls *)&kfd2kgd; +-} +- +-static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, +- void *vm, struct kgd_mem **mem) +-{ +- return 0; +-} +- +-/* Destroys the GPU allocation and frees the kgd_mem structure */ +-static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem) +-{ +- +-} +- +-static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, +- int fd, uint32_t handle, struct kgd_mem **mem) +-{ +- return 0; +-} +- +-static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) +-{ +- return (struct amdgpu_device *)kgd; +-} +- +-static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe, +- uint32_t queue, uint32_t vmid) +-{ +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- +- mutex_lock(&adev->srbm_mutex); +- soc15_grbm_select(adev, mec, pipe, queue, vmid); +-} +- +-static void unlock_srbm(struct kgd_dev *kgd) +-{ +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- +- soc15_grbm_select(adev, 0, 0, 0, 0); +- mutex_unlock(&adev->srbm_mutex); +-} +- +-static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, +- uint32_t queue_id) +-{ +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- +- uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; +- uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); +- +- lock_srbm(kgd, mec, pipe, queue_id, 0); +-} +- +-static uint32_t get_queue_mask(struct amdgpu_device *adev, +- uint32_t pipe_id, uint32_t queue_id) +-{ +- unsigned int bit = (pipe_id * adev->gfx.mec.num_pipe_per_mec + +- queue_id) & 31; +- +- return ((uint32_t)1) << bit; +-} +- +-static void release_queue(struct kgd_dev *kgd) +-{ +- unlock_srbm(kgd); +-} +- +-static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, +- uint32_t sh_mem_config, +- uint32_t sh_mem_ape1_base, +- uint32_t sh_mem_ape1_limit, +- uint32_t sh_mem_bases) +-{ +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- +- lock_srbm(kgd, 0, 0, 0, vmid); +- +- WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config); +- WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases); +- /* APE1 no longer exists on GFX9 */ +- +- unlock_srbm(kgd); +-} +- +-static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, +- unsigned int vmid) +-{ +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- +- /* +- * We have to assume that there is no outstanding mapping. +- * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because +- * a mapping is in progress or because a mapping finished +- * and the SW cleared it. +- * So the protocol is to always wait & clear. +- */ +- uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | +- ATC_VMID0_PASID_MAPPING__VALID_MASK; +- +- /* +- * need to do this twice, once for gfx and once for mmhub +- * for ATC add 16 to VMID for mmhub, for IH different registers. +- * ATC_VMID0..15 registers are separate from ATC_VMID16..31. 
+- */ +- +- WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid, +- pasid_mapping); +- +- while (!(RREG32(SOC15_REG_OFFSET( +- ATHUB, 0, +- mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & +- (1U << vmid))) +- cpu_relax(); +- +- WREG32(SOC15_REG_OFFSET(ATHUB, 0, +- mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), +- 1U << vmid); +- +- /* Mapping vmid to pasid also for IH block */ +- WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid, +- pasid_mapping); +- +- WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid, +- pasid_mapping); +- +- while (!(RREG32(SOC15_REG_OFFSET( +- ATHUB, 0, +- mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & +- (1U << (vmid + 16)))) +- cpu_relax(); +- +- WREG32(SOC15_REG_OFFSET(ATHUB, 0, +- mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), +- 1U << (vmid + 16)); +- +- /* Mapping vmid to pasid also for IH block */ +- WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid, +- pasid_mapping); +- return 0; +-} +- +-static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, +- uint32_t hpd_size, uint64_t hpd_gpu_addr) +-{ +- /* amdgpu owns the per-pipe state */ +- return 0; +-} +- +-/* TODO - RING0 form of field is obsolete, seems to date back to SI +- * but still works +- */ +- +-static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) +-{ +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- uint32_t mec; +- uint32_t pipe; +- +- mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; +- pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); +- +- lock_srbm(kgd, mec, pipe, 0, 0); +- +- WREG32(SOC15_REG_OFFSET(GC, 0, mmCPC_INT_CNTL), +- CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | +- CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); +- +- unlock_srbm(kgd); +- +- return 0; +-} +- +-static uint32_t get_sdma_base_addr(unsigned int engine_id, +- unsigned int queue_id) +-{ +- static const uint32_t base[2] = { +- SOC15_REG_OFFSET(SDMA0, 0, +- mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL, +- SOC15_REG_OFFSET(SDMA1, 0, +- mmSDMA1_RLC0_RB_CNTL) - mmSDMA1_RLC0_RB_CNTL +- }; +- uint32_t retval; +- +- retval = base[engine_id] + queue_id * (mmSDMA0_RLC1_RB_CNTL - +- mmSDMA0_RLC0_RB_CNTL); +- +- pr_debug("sdma base address: 0x%x\n", retval); +- +- return retval; +-} +- +-static uint32_t get_watch_base_addr(void) +-{ +- uint32_t retval = SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) - +- mmTCP_WATCH0_ADDR_H; +- +- pr_debug("kfd: reg watch base address: 0x%x\n", retval); +- +- return retval; +-} +- +-static inline struct v9_mqd *get_mqd(void *mqd) +-{ +- return (struct v9_mqd *)mqd; +-} +- +-static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) +-{ +- return (struct v9_sdma_mqd *)mqd; +-} +- +-static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, +- uint32_t queue_id, uint32_t __user *wptr, +- uint32_t wptr_shift, uint32_t wptr_mask, +- struct mm_struct *mm) +-{ +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- struct v9_mqd *m; +- uint32_t *mqd_hqd; +- uint32_t reg, hqd_base, data; +- +- m = get_mqd(mqd); +- +- acquire_queue(kgd, pipe_id, queue_id); +- +- /* HIQ is set during driver init period with vmid set to 0*/ +- if (m->cp_hqd_vmid == 0) { +- uint32_t value, mec, pipe; +- +- mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; +- pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); +- +- pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n", +- mec, pipe, queue_id); +- value = RREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS)); +- value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1, +- ((mec << 5) | 
(pipe << 3) | queue_id | 0x80)); +- WREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS), value); +- } +- +- /* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */ +- mqd_hqd = &m->cp_mqd_base_addr_lo; +- hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); +- +- for (reg = hqd_base; +- reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) +- WREG32(reg, mqd_hqd[reg - hqd_base]); +- +- +- /* Activate doorbell logic before triggering WPTR poll. */ +- data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control, +- CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); +- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data); +- +- if (wptr) { +- /* Don't read wptr with get_user because the user +- * context may not be accessible (if this function +- * runs in a work queue). Instead trigger a one-shot +- * polling read from memory in the CP. This assumes +- * that wptr is GPU-accessible in the queue's VMID via +- * ATC or SVM. WPTR==RPTR before starting the poll so +- * the CP starts fetching new commands from the right +- * place. +- * +- * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit +- * tricky. Assume that the queue didn't overflow. The +- * number of valid bits in the 32-bit RPTR depends on +- * the queue size. The remaining bits are taken from +- * the saved 64-bit WPTR. If the WPTR wrapped, add the +- * queue size. +- */ +- uint32_t queue_size = +- 2 << REG_GET_FIELD(m->cp_hqd_pq_control, +- CP_HQD_PQ_CONTROL, QUEUE_SIZE); +- uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1); +- +- if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr) +- guessed_wptr += queue_size; +- guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1); +- guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32; +- +- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO), +- lower_32_bits(guessed_wptr)); +- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI), +- upper_32_bits(guessed_wptr)); +- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR), +- lower_32_bits((uint64_t)wptr)); +- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI), +- upper_32_bits((uint64_t)wptr)); +- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1), +- get_queue_mask(adev, pipe_id, queue_id)); +- } +- +- /* Start the EOP fetcher */ +- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR), +- REG_SET_FIELD(m->cp_hqd_eop_rptr, +- CP_HQD_EOP_RPTR, INIT_FETCHER, 1)); +- +- data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); +- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data); +- +- release_queue(kgd); +- +- return 0; +-} +- +-static int kgd_hqd_dump(struct kgd_dev *kgd, +- uint32_t pipe_id, uint32_t queue_id, +- uint32_t (**dump)[2], uint32_t *n_regs) +-{ +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- uint32_t i = 0, reg; +-#define HQD_N_REGS 56 +-#define DUMP_REG(addr) do { \ +- if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ +- break; \ +- (*dump)[i][0] = (addr) << 2; \ +- (*dump)[i++][1] = RREG32(addr); \ +- } while (0) +- +- *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); +- if (*dump == NULL) +- return -ENOMEM; +- +- acquire_queue(kgd, pipe_id, queue_id); +- +- for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); +- reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) +- DUMP_REG(reg); +- +- release_queue(kgd); +- +- WARN_ON_ONCE(i != HQD_N_REGS); +- *n_regs = i; +- +- return 0; +-} +- +-static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, +- uint32_t __user *wptr, struct mm_struct *mm) +-{ +- struct amdgpu_device *adev = 
get_amdgpu_device(kgd); +- struct v9_sdma_mqd *m; +- uint32_t sdma_base_addr, sdmax_gfx_context_cntl; +- uint32_t temp, timeout = 2000; +- uint32_t data; +- uint64_t data64; +- uint64_t __user *wptr64 = (uint64_t __user *)wptr; +- +- m = get_sdma_mqd(mqd); +- sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id, +- m->sdma_queue_id); +- sdmax_gfx_context_cntl = m->sdma_engine_id ? +- SOC15_REG_OFFSET(SDMA1, 0, mmSDMA1_GFX_CONTEXT_CNTL) : +- SOC15_REG_OFFSET(SDMA0, 0, mmSDMA0_GFX_CONTEXT_CNTL); +- +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, +- m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); +- +- while (true) { +- temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); +- if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) +- break; +- if (timeout == 0) +- return -ETIME; +- msleep(10); +- timeout -= 10; +- } +- data = RREG32(sdmax_gfx_context_cntl); +- data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, +- RESUME_CTX, 0); +- WREG32(sdmax_gfx_context_cntl, data); +- +- WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL_OFFSET, +- m->sdmax_rlcx_doorbell_offset); +- +- data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL, +- ENABLE, 1); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI, +- m->sdmax_rlcx_rb_rptr_hi); +- +- WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1); +- if (read_user_wptr(mm, wptr64, data64)) { +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, +- lower_32_bits(data64)); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI, +- upper_32_bits(data64)); +- } else { +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, +- m->sdmax_rlcx_rb_rptr); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI, +- m->sdmax_rlcx_rb_rptr_hi); +- } +- WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0); +- +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, +- m->sdmax_rlcx_rb_base_hi); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, +- m->sdmax_rlcx_rb_rptr_addr_lo); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, +- m->sdmax_rlcx_rb_rptr_addr_hi); +- +- data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL, +- RB_ENABLE, 1); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data); +- +- return 0; +-} +- +-static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, +- uint32_t engine_id, uint32_t queue_id, +- uint32_t (**dump)[2], uint32_t *n_regs) +-{ +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- uint32_t sdma_base_addr = get_sdma_base_addr(engine_id, queue_id); +- uint32_t i = 0, reg; +-#undef HQD_N_REGS +-#define HQD_N_REGS (19+6+7+10) +- +- *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); +- if (*dump == NULL) +- return -ENOMEM; +- +- for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) +- DUMP_REG(sdma_base_addr + reg); +- for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++) +- DUMP_REG(sdma_base_addr + reg); +- for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; +- reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++) +- DUMP_REG(sdma_base_addr + reg); +- for (reg = mmSDMA0_RLC0_MIDCMD_DATA0; +- reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++) +- DUMP_REG(sdma_base_addr + reg); +- +- WARN_ON_ONCE(i != HQD_N_REGS); +- *n_regs = i; +- +- return 0; +-} +- +-static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, +- uint32_t pipe_id, uint32_t queue_id) +-{ +- struct amdgpu_device *adev = 
get_amdgpu_device(kgd); +- uint32_t act; +- bool retval = false; +- uint32_t low, high; +- +- acquire_queue(kgd, pipe_id, queue_id); +- act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); +- if (act) { +- low = lower_32_bits(queue_address >> 8); +- high = upper_32_bits(queue_address >> 8); +- +- if (low == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE)) && +- high == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE_HI))) +- retval = true; +- } +- release_queue(kgd); +- return retval; +-} +- +-static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) +-{ +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- struct v9_sdma_mqd *m; +- uint32_t sdma_base_addr; +- uint32_t sdma_rlc_rb_cntl; +- +- m = get_sdma_mqd(mqd); +- sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id, +- m->sdma_queue_id); +- +- sdma_rlc_rb_cntl = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL); +- +- if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK) +- return true; +- +- return false; +-} +- +-static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, +- enum kfd_preempt_type reset_type, +- unsigned int utimeout, uint32_t pipe_id, +- uint32_t queue_id) +-{ +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- enum hqd_dequeue_request_type type; +- unsigned long end_jiffies; +- uint32_t temp; +- struct v9_mqd *m = get_mqd(mqd); +- +-#if 0 +- unsigned long flags; +- int retry; +-#endif +- +- acquire_queue(kgd, pipe_id, queue_id); +- +- if (m->cp_hqd_vmid == 0) +- WREG32_FIELD15(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0); +- +- switch (reset_type) { +- case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: +- type = DRAIN_PIPE; +- break; +- case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: +- type = RESET_WAVES; +- break; +- default: +- type = DRAIN_PIPE; +- break; +- } +- +-#if 0 /* Is this still needed? */ +- /* Workaround: If IQ timer is active and the wait time is close to or +- * equal to 0, dequeueing is not safe. Wait until either the wait time +- * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is +- * cleared before continuing. Also, ensure wait times are set to at +- * least 0x3. 
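
The SDMA idle wait earlier in this hunk and the CP_HQD_ACTIVE wait just below both follow the same bounded-poll shape: read a status register, test a busy bit, give up after a deadline. A minimal standalone sketch of that pattern; reg_read(), the tick counter, and the -1 return are stand-ins for RREG32(), jiffies, and -ETIME.

#include <stdint.h>
#include <stdio.h>

static uint32_t fake_status = 1u;	/* pretend the ACTIVE bit is set */
static unsigned int ticks;

static uint32_t reg_read(void)		/* stands in for RREG32() */
{
	if (++ticks > 3)		/* the queue drains eventually */
		fake_status = 0;
	return fake_status;
}

static int poll_until_idle(uint32_t busy_mask, unsigned int max_ticks)
{
	unsigned int waited = 0;

	while (reg_read() & busy_mask) {
		if (++waited > max_ticks)
			return -1;	/* the driver returns -ETIME */
		/* the driver sleeps here: usleep_range(500, 1000) */
	}
	return 0;
}

int main(void)
{
	printf("idle poll: %d\n", poll_until_idle(1u, 10));
	return 0;
}
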
+- */ +- local_irq_save(flags); +- preempt_disable(); +- retry = 5000; /* wait for 500 usecs at maximum */ +- while (true) { +- temp = RREG32(mmCP_HQD_IQ_TIMER); +- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) { +- pr_debug("HW is processing IQ\n"); +- goto loop; +- } +- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) { +- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE) +- == 3) /* SEM-rearm is safe */ +- break; +- /* Wait time 3 is safe for CP, but our MMIO read/write +- * time is close to 1 microsecond, so check for 10 to +- * leave more buffer room +- */ +- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME) +- >= 10) +- break; +- pr_debug("IQ timer is active\n"); +- } else +- break; +-loop: +- if (!retry) { +- pr_err("CP HQD IQ timer status time out\n"); +- break; +- } +- ndelay(100); +- --retry; +- } +- retry = 1000; +- while (true) { +- temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST); +- if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK)) +- break; +- pr_debug("Dequeue request is pending\n"); +- +- if (!retry) { +- pr_err("CP HQD dequeue request time out\n"); +- break; +- } +- ndelay(100); +- --retry; +- } +- local_irq_restore(flags); +- preempt_enable(); +-#endif +- +- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type); +- +- end_jiffies = (utimeout * HZ / 1000) + jiffies; +- while (true) { +- temp = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); +- if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) +- break; +- if (time_after(jiffies, end_jiffies)) { +- pr_err("cp queue preemption time out.\n"); +- release_queue(kgd); +- return -ETIME; +- } +- usleep_range(500, 1000); +- } +- +- release_queue(kgd); +- return 0; +-} +- +-static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, +- unsigned int utimeout) +-{ +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- struct v9_sdma_mqd *m; +- uint32_t sdma_base_addr; +- uint32_t temp; +- unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; +- +- m = get_sdma_mqd(mqd); +- sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id, +- m->sdma_queue_id); +- +- temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL); +- temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK; +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, temp); +- +- while (true) { +- temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); +- if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) +- break; +- if (time_after(jiffies, end_jiffies)) +- return -ETIME; +- usleep_range(500, 1000); +- } +- +- WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); +- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, +- RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | +- SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); +- +- m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); +- m->sdmax_rlcx_rb_rptr_hi = +- RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI); +- +- return 0; +-} +- +-static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, +- uint8_t vmid) +-{ +- uint32_t reg; +- struct amdgpu_device *adev = (struct amdgpu_device *) kgd; +- +- reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) +- + vmid); +- return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; +-} +- +-static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, +- uint8_t vmid) +-{ +- uint32_t reg; +- struct amdgpu_device *adev = (struct amdgpu_device *) kgd; +- +- reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) +- + vmid); +- return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; +-} +- +-static void write_vmid_invalidate_request(struct kgd_dev 
*kgd, uint8_t vmid) +-{ +- struct amdgpu_device *adev = (struct amdgpu_device *) kgd; +- uint32_t req = (1 << vmid) | +- (1 << VM_INVALIDATE_ENG16_REQ__FLUSH_TYPE__SHIFT) | /* light */ +- VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PTES_MASK | +- VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE0_MASK | +- VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE1_MASK | +- VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE2_MASK | +- VM_INVALIDATE_ENG16_REQ__INVALIDATE_L1_PTES_MASK; +- +- spin_lock(&adev->tlb_invalidation_lock); +- +- /* Use light weight invalidation. +- * +- * TODO 1: agree on the right set of invalidation registers for +- * KFD use. Use the last one for now. Invalidate both GC and +- * MMHUB. +- * +- * TODO 2: support range-based invalidation, requires kfg2kgd +- * interface change +- */ +- WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_LO32), +- 0xffffffff); +- WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_HI32), +- 0x0000001f); +- +- WREG32(SOC15_REG_OFFSET(MMHUB, 0, +- mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32), +- 0xffffffff); +- WREG32(SOC15_REG_OFFSET(MMHUB, 0, +- mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32), +- 0x0000001f); +- +- WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_REQ), req); +- +- WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_INVALIDATE_ENG16_REQ), +- req); +- +- while (!(RREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ACK)) & +- (1 << vmid))) +- cpu_relax(); +- +- while (!(RREG32(SOC15_REG_OFFSET(MMHUB, 0, +- mmMMHUB_VM_INVALIDATE_ENG16_ACK)) & +- (1 << vmid))) +- cpu_relax(); +- +- spin_unlock(&adev->tlb_invalidation_lock); +- +-} +- +-static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid) +-{ +- signed long r; +- struct dma_fence *f; +- struct amdgpu_ring *ring = &adev->gfx.kiq.ring; +- +- mutex_lock(&adev->gfx.kiq.ring_mutex); +- amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs package*/ +- amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0)); +- amdgpu_ring_write(ring, +- PACKET3_INVALIDATE_TLBS_DST_SEL(1) | +- PACKET3_INVALIDATE_TLBS_ALL_HUB(1) | +- PACKET3_INVALIDATE_TLBS_PASID(pasid) | +- PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(2)); +- amdgpu_fence_emit(ring, &f); +- amdgpu_ring_commit(ring); +- mutex_unlock(&adev->gfx.kiq.ring_mutex); +- +- r = dma_fence_wait(f, false); +- if (r) +- DRM_ERROR("wait for kiq fence error: %ld.\n", r); +- dma_fence_put(f); +- +- return r; +-} +- +-static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid) +-{ +- struct amdgpu_device *adev = (struct amdgpu_device *) kgd; +- int vmid; +- struct amdgpu_ring *ring = &adev->gfx.kiq.ring; +- +- if (ring->ready) +- return invalidate_tlbs_with_kiq(adev, pasid); +- +- for (vmid = 0; vmid < 16; vmid++) { +- if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) +- continue; +- if (get_atc_vmid_pasid_mapping_valid(kgd, vmid)) { +- if (get_atc_vmid_pasid_mapping_pasid(kgd, vmid) +- == pasid) { +- write_vmid_invalidate_request(kgd, vmid); +- break; +- } +- } +- } +- +- return 0; +-} +- +-static int kgd_address_watch_disable(struct kgd_dev *kgd) +-{ +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- union TCP_WATCH_CNTL_BITS cntl; +- unsigned int i; +- uint32_t watch_base_addr; +- +- cntl.u32All = 0; +- +- cntl.bitfields.valid = 0; +- cntl.bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK; +- cntl.bitfields.atc = 1; +- +- watch_base_addr = get_watch_base_addr(); +- /* Turning off this address until we set all the registers */ +- for (i = 0; i < MAX_WATCH_ADDRESSES; i++) +- WREG32(watch_base_addr + +- watchRegs[i * 
ADDRESS_WATCH_REG_MAX + +- ADDRESS_WATCH_REG_CNTL], +- cntl.u32All); +- +- return 0; +-} +- +-static int kgd_address_watch_execute(struct kgd_dev *kgd, +- unsigned int watch_point_id, +- uint32_t cntl_val, +- uint32_t addr_hi, +- uint32_t addr_lo) +-{ +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- union TCP_WATCH_CNTL_BITS cntl; +- uint32_t watch_base_addr; +- +- watch_base_addr = get_watch_base_addr(); +- cntl.u32All = cntl_val; +- +- /* Turning off this watch point until we set all the registers */ +- cntl.bitfields.valid = 0; +- WREG32(watch_base_addr + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], +- cntl.u32All); +- +- WREG32(watch_base_addr + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_HI], +- addr_hi); +- +- WREG32(watch_base_addr + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_LO], +- addr_lo); +- +- /* Enable the watch point */ +- cntl.bitfields.valid = 1; +- +- WREG32(watch_base_addr + +- watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + +- ADDRESS_WATCH_REG_CNTL], +- cntl.u32All); +- +- return 0; +-} +- +-static int kgd_wave_control_execute(struct kgd_dev *kgd, +- uint32_t gfx_index_val, +- uint32_t sq_cmd) +-{ +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- uint32_t data = 0; +- +- mutex_lock(&adev->grbm_idx_mutex); +- +- WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), gfx_index_val); +- WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CMD), sq_cmd); +- +- data = REG_SET_FIELD(data, GRBM_GFX_INDEX, +- INSTANCE_BROADCAST_WRITES, 1); +- data = REG_SET_FIELD(data, GRBM_GFX_INDEX, +- SH_BROADCAST_WRITES, 1); +- data = REG_SET_FIELD(data, GRBM_GFX_INDEX, +- SE_BROADCAST_WRITES, 1); +- +- WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data); +- mutex_unlock(&adev->grbm_idx_mutex); +- +- return 0; +-} +- +-static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, +- unsigned int watch_point_id, +- unsigned int reg_offset) +-{ +- return get_watch_base_addr() + +- watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + reg_offset]; +-} +- +-static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, +- uint8_t element_size, uint8_t index_stride, uint8_t mtype) +-{ +- /* No longer needed on GFXv9. These values are now hard-coded, +- * except for the MTYPE which comes from the page table. +- */ +- +- return 0; +-} +-static int alloc_memory_of_scratch(struct kgd_dev *kgd, +- uint64_t va, uint32_t vmid) +-{ +- /* No longer needed on GFXv9. The scratch base address is +- * passed to the shader by the CP. It's the user mode driver's +- * responsibility. +- */ +- +- return 0; +-} +- +-/* FIXME: Does this need to be ASIC-specific code? 
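
The address-watch helpers above all index one flattened table as watchRegs[point * ADDRESS_WATCH_REG_MAX + reg]. A self-contained sketch of that indexing scheme; the enum names and offset values here are invented, only the layout mirrors the driver.

#include <stdint.h>
#include <stdio.h>

enum { REG_CNTL, REG_ADDR_HI, REG_ADDR_LO, ADDRESS_WATCH_REG_MAX_SK };

static const uint32_t watch_regs_sk[2 * ADDRESS_WATCH_REG_MAX_SK] = {
	0x10, 0x11, 0x12,	/* watch point 0 */
	0x18, 0x19, 0x1a,	/* watch point 1 */
};

static uint32_t watch_reg(unsigned int point, unsigned int reg)
{
	return watch_regs_sk[point * ADDRESS_WATCH_REG_MAX_SK + reg];
}

int main(void)
{
	printf("wp1 CNTL at 0x%x\n", watch_reg(1, REG_CNTL));
	return 0;
}
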
*/ +-static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) +-{ +- struct amdgpu_device *adev = (struct amdgpu_device *) kgd; +- const union amdgpu_firmware_header *hdr; +- +- switch (type) { +- case KGD_ENGINE_PFP: +- hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data; +- break; +- +- case KGD_ENGINE_ME: +- hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data; +- break; +- +- case KGD_ENGINE_CE: +- hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data; +- break; +- +- case KGD_ENGINE_MEC1: +- hdr = (const union amdgpu_firmware_header *)adev->gfx.mec_fw->data; +- break; +- +- case KGD_ENGINE_MEC2: +- hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data; +- break; +- +- case KGD_ENGINE_RLC: +- hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data; +- break; +- +- case KGD_ENGINE_SDMA1: +- hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data; +- break; +- +- case KGD_ENGINE_SDMA2: +- hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data; +- break; +- +- default: +- return 0; +- } +- +- if (hdr == NULL) +- return 0; +- +- /* Only 12 bit in use*/ +- return hdr->common.ucode_version; +-} +- +-static void set_num_of_requests(struct kgd_dev *kgd, +- uint8_t num_of_requests) +-{ +- pr_debug("This is a stub\n"); +-} +- +-static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, +- uint32_t page_table_base) +-{ +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- uint64_t base = (uint64_t)page_table_base << PAGE_SHIFT | +- AMDGPU_PTE_VALID; +- +- /* TODO: Don't use hardcoded VMIDs */ +- if (vmid < 8 || vmid > 15) { +- pr_err("trying to set page table base for wrong VMID %u\n", +- vmid); +- return; +- } +- +- /* TODO: take advantage of per-process address space size. For +- * now, all processes share the same address space size, like +- * on GFX8 and older. 
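
set_vm_context_page_table_base() above builds the 64-bit base it programs by shifting a page frame number into a byte address and OR-ing in the PTE valid bit. A compilable sketch of just that encoding; PAGE_SHIFT_SKETCH and PTE_VALID_SKETCH are stand-ins for the kernel's PAGE_SHIFT and AMDGPU_PTE_VALID.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT_SKETCH 12		/* stand-in for PAGE_SHIFT */
#define PTE_VALID_SKETCH  (1ULL << 0)	/* stand-in for AMDGPU_PTE_VALID */

static uint64_t encode_pt_base(uint32_t page_table_base)
{
	return ((uint64_t)page_table_base << PAGE_SHIFT_SKETCH) |
	       PTE_VALID_SKETCH;
}

int main(void)
{
	printf("base=0x%llx\n",
	       (unsigned long long)encode_pt_base(0x12345));
	return 0;
}
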
+- */ +- WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0); +- WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0); +- +- WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2), +- lower_32_bits(adev->vm_manager.max_pfn - 1)); +- WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2), +- upper_32_bits(adev->vm_manager.max_pfn - 1)); +- +- WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base)); +- WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base)); +- +- WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0); +- WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0); +- +- WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2), +- lower_32_bits(adev->vm_manager.max_pfn - 1)); +- WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2), +- upper_32_bits(adev->vm_manager.max_pfn - 1)); +- +- WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base)); +- WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base)); +-} +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +deleted file mode 100644 +index 7df892d..0000000 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c ++++ /dev/null +@@ -1,2578 +0,0 @@ +-/* +- * Copyright 2014 Advanced Micro Devices, Inc. +- * +- * Permission is hereby granted, free of charge, to any person obtaining a +- * copy of this software and associated documentation files (the "Software"), +- * to deal in the Software without restriction, including without limitation +- * the rights to use, copy, modify, merge, publish, distribute, sublicense, +- * and/or sell copies of the Software, and to permit persons to whom the +- * Software is furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in +- * all copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +- * OTHER DEALINGS IN THE SOFTWARE. +- */ +- +-#undef pr_fmt +-#define pr_fmt(fmt) "kfd2kgd: " fmt +- +-#include <linux/module.h> +-#include <linux/fdtable.h> +-#include <linux/uaccess.h> +-#include <linux/firmware.h> +-#include <linux/list.h> +-#include <linux/sched/mm.h> +-#include <drm/drmP.h> +-#include <linux/dma-buf.h> +-#include <linux/pagemap.h> +-#include "amdgpu_amdkfd.h" +-#include "amdgpu_ucode.h" +-#include "gca/gfx_8_0_sh_mask.h" +-#include "gca/gfx_8_0_d.h" +-#include "gca/gfx_8_0_enum.h" +-#include "oss/oss_3_0_sh_mask.h" +-#include "oss/oss_3_0_d.h" +-#include "gmc/gmc_8_1_sh_mask.h" +-#include "gmc/gmc_8_1_d.h" +- +-/* Special VM and GART address alignment needed for VI pre-Fiji due to +- * a HW bug. 
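
The constant defined just below (0x8000, i.e. 32 KB) is the alignment the comment refers to. A small sketch of the power-of-two round-up such an alignment implies when sizing a BO; align_up() is a local helper, not a kernel macro.

#include <stdint.h>
#include <stdio.h>

#define VI_BO_SIZE_ALIGN_SKETCH 0x8000u	/* 32 KB, must be a power of two */

static uint64_t align_up(uint64_t size, uint64_t align)
{
	return (size + align - 1) & ~(align - 1);
}

int main(void)
{
	printf("0x%llx\n", (unsigned long long)
	       align_up(0x8123, VI_BO_SIZE_ALIGN_SKETCH));	/* 0x10000 */
	return 0;
}
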
+- */ +-#define VI_BO_SIZE_ALIGN (0x8000) +- +-/* BO flag to indicate a KFD userptr BO */ +-#define AMDGPU_AMDKFD_USERPTR_BO (1ULL << 63) +- +-/* Impose limit on how much memory KFD can use */ +-struct kfd_mem_usage_limit { +- uint64_t max_system_mem_limit; +- uint64_t max_userptr_mem_limit; +- int64_t system_mem_used; +- int64_t userptr_mem_used; +- spinlock_t mem_limit_lock; +-}; +- +-static struct kfd_mem_usage_limit kfd_mem_limit; +- +-/* Struct used for amdgpu_amdkfd_bo_validate */ +-struct amdgpu_vm_parser { +- uint32_t domain; +- bool wait; +-}; +- +-static const char * const domain_bit_to_string[] = { +- "CPU", +- "GTT", +- "VRAM", +- "GDS", +- "GWS", +- "OA" +-}; +- +-#define domain_string(domain) domain_bit_to_string[ffs(domain)-1] +- +-static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work); +- +- +-static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) +-{ +- return (struct amdgpu_device *)kgd; +-} +- +-static bool check_if_add_bo_to_vm(struct amdgpu_vm *avm, +- struct kgd_mem *mem) +-{ +- struct kfd_bo_va_list *entry; +- +- list_for_each_entry(entry, &mem->bo_va_list, bo_list) +- if (entry->bo_va->base.vm == avm) +- return false; +- +- return true; +-} +- +-/* Set memory usage limits. Current, limits are +- * System (kernel) memory - 15/16th System RAM +- * Userptr memory - 15/16th System RAM +- */ +-void amdgpu_amdkfd_gpuvm_init_mem_limits(void) +-{ +- struct sysinfo si; +- uint64_t mem; +- +- si_meminfo(&si); +- mem = si.totalram - si.totalhigh; +- mem *= si.mem_unit; +- +- spin_lock_init(&kfd_mem_limit.mem_limit_lock); +- kfd_mem_limit.max_system_mem_limit = mem - (mem >> 4); /* 15/16 */ +- kfd_mem_limit.max_userptr_mem_limit = mem - (mem >> 4); /* 15/16 */ +- pr_debug("Kernel memory limit %lluM, userptr limit %lluM\n", +- (kfd_mem_limit.max_system_mem_limit >> 20), +- (kfd_mem_limit.max_userptr_mem_limit >> 20)); +-} +- +-static int amdgpu_amdkfd_reserve_system_mem_limit(struct amdgpu_device *adev, +- uint64_t size, u32 domain) +-{ +- size_t acc_size; +- int ret = 0; +- +- acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size, +- sizeof(struct amdgpu_bo)); +- +- spin_lock(&kfd_mem_limit.mem_limit_lock); +- if (domain == AMDGPU_GEM_DOMAIN_GTT) { +- if (kfd_mem_limit.system_mem_used + (acc_size + size) > +- kfd_mem_limit.max_system_mem_limit) { +- ret = -ENOMEM; +- goto err_no_mem; +- } +- kfd_mem_limit.system_mem_used += (acc_size + size); +- } else if (domain == AMDGPU_GEM_DOMAIN_CPU) { +- if ((kfd_mem_limit.system_mem_used + acc_size > +- kfd_mem_limit.max_system_mem_limit) || +- (kfd_mem_limit.userptr_mem_used + (size + acc_size) > +- kfd_mem_limit.max_userptr_mem_limit)) { +- ret = -ENOMEM; +- goto err_no_mem; +- } +- kfd_mem_limit.system_mem_used += acc_size; +- kfd_mem_limit.userptr_mem_used += size; +- } +-err_no_mem: +- spin_unlock(&kfd_mem_limit.mem_limit_lock); +- return ret; +-} +- +-static void unreserve_system_mem_limit(struct amdgpu_device *adev, +- uint64_t size, u32 domain) +-{ +- size_t acc_size; +- +- acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size, +- sizeof(struct amdgpu_bo)); +- +- spin_lock(&kfd_mem_limit.mem_limit_lock); +- if (domain == AMDGPU_GEM_DOMAIN_GTT) { +- kfd_mem_limit.system_mem_used -= (acc_size + size); +- } else if (domain == AMDGPU_GEM_DOMAIN_CPU) { +- kfd_mem_limit.system_mem_used -= acc_size; +- kfd_mem_limit.userptr_mem_used -= size; +- } +- WARN_ONCE(kfd_mem_limit.system_mem_used < 0, +- "kfd system memory accounting unbalanced"); +- WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0, +- "kfd 
userptr memory accounting unbalanced"); +- +- spin_unlock(&kfd_mem_limit.mem_limit_lock); +-} +- +-void amdgpu_amdkfd_unreserve_system_memory_limit(struct amdgpu_bo *bo) +-{ +- spin_lock(&kfd_mem_limit.mem_limit_lock); +- +- if (bo->flags & AMDGPU_AMDKFD_USERPTR_BO) { +- kfd_mem_limit.system_mem_used -= bo->tbo.acc_size; +- kfd_mem_limit.userptr_mem_used -= amdgpu_bo_size(bo); +- } else if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT) { +- kfd_mem_limit.system_mem_used -= +- (bo->tbo.acc_size + amdgpu_bo_size(bo)); +- } +- WARN_ONCE(kfd_mem_limit.system_mem_used < 0, +- "kfd system memory accounting unbalanced"); +- WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0, +- "kfd userptr memory accounting unbalanced"); +- +- spin_unlock(&kfd_mem_limit.mem_limit_lock); +-} +- +- +-/* amdgpu_amdkfd_remove_eviction_fence - Removes eviction fence(s) from BO's +- * reservation object. +- * +- * @bo: [IN] Remove eviction fence(s) from this BO +- * @ef: [IN] If ef is specified, then this eviction fence is removed if it +- * is present in the shared list. +- * @ef_list: [OUT] Returns list of eviction fences. These fences are removed +- * from BO's reservation object shared list. +- * @ef_count: [OUT] Number of fences in ef_list. +- * +- * NOTE: If called with ef_list, then amdgpu_amdkfd_add_eviction_fence must be +- * called to restore the eviction fences and to avoid memory leak. This is +- * useful for shared BOs. +- * NOTE: Must be called with BO reserved i.e. bo->tbo.resv->lock held. +- */ +-static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo, +- struct amdgpu_amdkfd_fence *ef, +- struct amdgpu_amdkfd_fence ***ef_list, +- unsigned int *ef_count) +-{ +- struct reservation_object_list *fobj; +- struct reservation_object *resv; +- unsigned int i = 0, j = 0, k = 0, shared_count; +- unsigned int count = 0; +- struct amdgpu_amdkfd_fence **fence_list; +- +- if (!ef && !ef_list) +- return -EINVAL; +- +- if (ef_list) { +- *ef_list = NULL; +- *ef_count = 0; +- } +- +- resv = bo->tbo.resv; +- fobj = reservation_object_get_list(resv); +- +- if (!fobj) +- return 0; +- +- preempt_disable(); +- write_seqcount_begin(&resv->seq); +- +- /* Go through all the shared fences in the resevation object. If +- * ef is specified and it exists in the list, remove it and reduce the +- * count. If ef is not specified, then get the count of eviction fences +- * present. +- */ +- shared_count = fobj->shared_count; +- for (i = 0; i < shared_count; ++i) { +- struct dma_fence *f; +- +- f = rcu_dereference_protected(fobj->shared[i], +- reservation_object_held(resv)); +- +- if (ef) { +- if (f->context == ef->base.context) { +- dma_fence_put(f); +- fobj->shared_count--; +- } else +- RCU_INIT_POINTER(fobj->shared[j++], f); +- +- } else if (to_amdgpu_amdkfd_fence(f)) +- count++; +- } +- write_seqcount_end(&resv->seq); +- preempt_enable(); +- +- if (ef || !count) +- return 0; +- +- /* Alloc memory for count number of eviction fence pointers. 
Fill the +- * ef_list array and ef_count +- */ +- +- fence_list = kcalloc(count, sizeof(struct amdgpu_amdkfd_fence *), +- GFP_KERNEL); +- if (!fence_list) +- return -ENOMEM; +- +- preempt_disable(); +- write_seqcount_begin(&resv->seq); +- +- j = 0; +- for (i = 0; i < shared_count; ++i) { +- struct dma_fence *f; +- struct amdgpu_amdkfd_fence *efence; +- +- f = rcu_dereference_protected(fobj->shared[i], +- reservation_object_held(resv)); +- +- efence = to_amdgpu_amdkfd_fence(f); +- if (efence) { +- fence_list[k++] = efence; +- fobj->shared_count--; +- } else +- RCU_INIT_POINTER(fobj->shared[j++], f); +- } +- +- write_seqcount_end(&resv->seq); +- preempt_enable(); +- +- *ef_list = fence_list; +- *ef_count = k; +- +- return 0; +-} +- +-/* amdgpu_amdkfd_add_eviction_fence - Adds eviction fence(s) back into BO's +- * reservation object. +- * +- * @bo: [IN] Add eviction fences to this BO +- * @ef_list: [IN] List of eviction fences to be added +- * @ef_count: [IN] Number of fences in ef_list. +- * +- * NOTE: Must call amdgpu_amdkfd_remove_eviction_fence before calling this +- * function. +- */ +-static void amdgpu_amdkfd_add_eviction_fence(struct amdgpu_bo *bo, +- struct amdgpu_amdkfd_fence **ef_list, +- unsigned int ef_count) +-{ +- int i; +- +- if (!ef_list || !ef_count) +- return; +- +- for (i = 0; i < ef_count; i++) { +- amdgpu_bo_fence(bo, &ef_list[i]->base, true); +- /* Readding the fence takes an additional reference. Drop that +- * reference. +- */ +- dma_fence_put(&ef_list[i]->base); +- } +- +- kfree(ef_list); +-} +- +-static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain, +- bool wait) +-{ +- int ret; +- +- if (WARN(amdgpu_ttm_tt_get_usermm(bo->tbo.ttm), +- "Called with userptr BO")) +- return -EINVAL; +- +- amdgpu_ttm_placement_from_domain(bo, domain); +- +- ret = ttm_bo_validate(&bo->tbo, &bo->placement, false, false); +- if (ret) +- goto validate_fail; +- if (wait) { +- struct amdgpu_amdkfd_fence **ef_list; +- unsigned int ef_count; +- +- ret = amdgpu_amdkfd_remove_eviction_fence(bo, NULL, &ef_list, +- &ef_count); +- if (ret) +- goto validate_fail; +- +- ttm_bo_wait(&bo->tbo, false, false); +- amdgpu_amdkfd_add_eviction_fence(bo, ef_list, ef_count); +- } +- +-validate_fail: +- return ret; +-} +- +-static int amdgpu_amdkfd_validate(void *param, struct amdgpu_bo *bo) +-{ +- struct amdgpu_vm_parser *p = param; +- +- return amdgpu_amdkfd_bo_validate(bo, p->domain, p->wait); +-} +- +-/* vm_validate_pt_pd_bos - Validate page table and directory BOs +- * +- * Also updates page directory entries so we don't need to do this +- * again later until the page directory is validated again (e.g. after +- * an eviction or allocating new page tables). +- */ +-static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm) +-{ +- struct amdgpu_bo *pd = vm->root.base.bo; +- struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev); +- struct amdgpu_vm_parser param; +- int ret; +- +- param.domain = AMDGPU_GEM_DOMAIN_VRAM; +- param.wait = false; +- +- ret = amdgpu_vm_validate_pt_bos(adev, vm, amdgpu_amdkfd_validate, +- ¶m); +- if (ret) { +- pr_err("amdgpu: failed to validate PT BOs\n"); +- return ret; +- } +- +- ret = amdgpu_amdkfd_validate(¶m, pd); +- if (ret) { +- pr_err("amdgpu: failed to validate PD\n"); +- return ret; +- } +- +- ret = amdgpu_vm_update_directories(adev, vm); +- if (ret != 0) +- return ret; +- +- return 0; +-} +- +-/* add_bo_to_vm - Add a BO to a VM +- * +- * Everything that needs to bo done only once when a BO is first added +- * to a VM. 
It can later be mapped and unmapped many times without +- * repeating these steps. +- * +- * 1. Allocate and initialize BO VA entry data structure +- * 2. Add BO to the VM +- * 3. Determine ASIC-specific PTE flags +- * 4. Alloc page tables and directories if needed +- * 4a. Validate new page tables and directories and update directories +- */ +-static int add_bo_to_vm(struct amdgpu_device *adev, struct kgd_mem *mem, +- struct amdgpu_vm *avm, bool is_aql, +- struct kfd_bo_va_list **p_bo_va_entry) +-{ +- int ret; +- struct kfd_bo_va_list *bo_va_entry; +- struct amdkfd_vm *kvm = container_of(avm, +- struct amdkfd_vm, base); +- struct amdgpu_bo *pd = avm->root.base.bo; +- struct amdgpu_bo *bo = mem->bo; +- uint64_t va = mem->va; +- struct list_head *list_bo_va = &mem->bo_va_list; +- unsigned long bo_size = bo->tbo.mem.size; +- +- if (!va) { +- pr_err("Invalid VA when adding BO to VM\n"); +- return -EINVAL; +- } +- +- if (is_aql) +- va += bo_size; +- +- bo_va_entry = kzalloc(sizeof(*bo_va_entry), GFP_KERNEL); +- if (!bo_va_entry) +- return -ENOMEM; +- +- pr_debug("\t add VA 0x%llx - 0x%llx to vm %p\n", va, +- va + bo_size, avm); +- +- /* Add BO to VM internal data structures*/ +- bo_va_entry->bo_va = amdgpu_vm_bo_add(adev, avm, bo); +- if (bo_va_entry->bo_va == NULL) { +- ret = -EINVAL; +- pr_err("Failed to add BO object to VM. ret == %d\n", +- ret); +- goto err_vmadd; +- } +- +- bo_va_entry->va = va; +- bo_va_entry->pte_flags = amdgpu_vm_get_pte_flags(adev, +- mem->mapping_flags); +- bo_va_entry->kgd_dev = (void *)adev; +- list_add(&bo_va_entry->bo_list, list_bo_va); +- +- if (p_bo_va_entry) +- *p_bo_va_entry = bo_va_entry; +- +- /* Allocate new page tables if neeeded and validate +- * them. Clearing of new page tables and validate need to wait +- * on move fences. We don't want that to trigger the eviction +- * fence, so remove it temporarily. +- */ +- amdgpu_amdkfd_remove_eviction_fence(pd, +- kvm->process_info->eviction_fence, +- NULL, NULL); +- +- ret = amdgpu_vm_alloc_pts(adev, avm, va, amdgpu_bo_size(bo)); +- if (ret) { +- pr_err("Failed to allocate pts, err=%d\n", ret); +- goto err_alloc_pts; +- } +- +- ret = vm_validate_pt_pd_bos(avm); +- if (ret != 0) { +- pr_err("validate_pt_pd_bos() failed\n"); +- goto err_alloc_pts; +- } +- +- /* Add the eviction fence back */ +- amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true); +- +- return 0; +- +-err_alloc_pts: +- amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true); +- amdgpu_vm_bo_rmv(adev, bo_va_entry->bo_va); +- list_del(&bo_va_entry->bo_list); +-err_vmadd: +- kfree(bo_va_entry); +- return ret; +-} +- +-static void remove_bo_from_vm(struct amdgpu_device *adev, +- struct kfd_bo_va_list *entry, unsigned long size) +-{ +- pr_debug("\t remove VA 0x%llx - 0x%llx in entry %p\n", +- entry->va, +- entry->va + size, entry); +- amdgpu_vm_bo_rmv(adev, entry->bo_va); +- list_del(&entry->bo_list); +- kfree(entry); +-} +- +-static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem, +- struct amdkfd_process_info *process_info, +- bool userptr) +-{ +- struct ttm_validate_buffer *entry = &mem->validate_list; +- struct amdgpu_bo *bo = mem->bo; +- +- INIT_LIST_HEAD(&entry->head); +- entry->shared = true; +- entry->bo = &bo->tbo; +- mutex_lock(&process_info->lock); +- if (userptr) +- list_add_tail(&entry->head, &process_info->userptr_valid_list); +- else +- list_add_tail(&entry->head, &process_info->kfd_bo_list); +- mutex_unlock(&process_info->lock); +-} +- +-/* Initializes user pages. 
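
add_bo_to_vm() above brackets the page-table allocation with a remove/re-add of the per-process eviction fence so that waiting on move fences cannot trigger an eviction. The bracket, reduced to a standalone shape; the three stub functions stand in for the amdgpu_amdkfd_remove_eviction_fence(), PT allocation/validation, and amdgpu_bo_fence() calls.

#include <stdio.h>

static void remove_eviction_fence(void) { puts("eviction fence detached"); }
static void add_eviction_fence(void)    { puts("eviction fence re-attached"); }
static int  alloc_and_validate_pts(void){ return 0; /* may fail with -errno */ }

static int add_bo_to_vm_sketch(void)
{
	int ret;

	remove_eviction_fence();	/* waiting on move fences is now safe */
	ret = alloc_and_validate_pts();
	add_eviction_fence();		/* re-attach on success and on failure */
	return ret;
}

int main(void)
{
	return add_bo_to_vm_sketch();
}
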
It registers the MMU notifier and validates +- * the userptr BO in the GTT domain. +- * +- * The BO must already be on the userptr_valid_list. Otherwise an +- * eviction and restore may happen that leaves the new BO unmapped +- * with the user mode queues running. +- * +- * Takes the process_info->lock to protect against concurrent restore +- * workers. +- * +- * Returns 0 for success, negative errno for errors. +- */ +-static int init_user_pages(struct kgd_mem *mem, struct mm_struct *mm, +- uint64_t user_addr) +-{ +- struct amdkfd_process_info *process_info = mem->process_info; +- struct amdgpu_bo *bo = mem->bo; +- int ret = 0; +- +- mutex_lock(&process_info->lock); +- +- ret = amdgpu_ttm_tt_set_userptr(bo->tbo.ttm, user_addr, 0); +- if (ret) { +- pr_err("%s: Failed to set userptr: %d\n", __func__, ret); +- goto out; +- } +- +- ret = amdgpu_mn_register(bo, user_addr); +- if (ret) { +- pr_err("%s: Failed to register MMU notifier: %d\n", +- __func__, ret); +- goto out; +- } +- +- /* If no restore worker is running concurrently, user_pages +- * should not be allocated +- */ +- WARN(mem->user_pages, "Leaking user_pages array"); +- +-#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) +- mem->user_pages = drm_calloc_large(bo->tbo.ttm->num_pages, +- sizeof(struct page *)); +-#else +- mem->user_pages = kvmalloc_array(bo->tbo.ttm->num_pages, +- sizeof(struct page *), +- GFP_KERNEL | __GFP_ZERO); +-#endif +- if (!mem->user_pages) { +- pr_err("%s: Failed to allocate pages array\n", __func__); +- ret = -ENOMEM; +- goto unregister_out; +- } +- +- ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, mem->user_pages); +- if (ret) { +- pr_err("%s: Failed to get user pages: %d\n", __func__, ret); +- goto free_out; +- } +- +- amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, mem->user_pages); +- +- ret = amdgpu_bo_reserve(bo, true); +- if (ret) { +- pr_err("%s: Failed to reserve BO\n", __func__); +- goto release_out; +- } +- amdgpu_ttm_placement_from_domain(bo, mem->domain); +- ret = ttm_bo_validate(&bo->tbo, &bo->placement, +- true, false); +- if (ret) +- pr_err("%s: failed to validate BO\n", __func__); +- amdgpu_bo_unreserve(bo); +- +-release_out: +- if (ret) +- release_pages(mem->user_pages, bo->tbo.ttm->num_pages, 0); +-free_out: +-#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) +- drm_free_large(mem->user_pages); +-#else +- kvfree(mem->user_pages); +-#endif +- mem->user_pages = NULL; +-unregister_out: +- if (ret) +- amdgpu_mn_unregister(bo); +-out: +- mutex_unlock(&process_info->lock); +- return ret; +-} +- +-static int __map_bo_to_kernel(struct amdgpu_bo *bo, u32 domain, void **kptr) +-{ +- int ret; +- +- ret = amdgpu_bo_reserve(bo, true); +- if (ret) { +- pr_err("Failed to reserve bo. ret %d\n", ret); +- return ret; +- } +- +- ret = amdgpu_bo_pin(bo, domain, NULL); +- if (ret) { +- pr_err("Failed to pin bo. ret %d\n", ret); +- goto pin_failed; +- } +- +- ret = amdgpu_bo_kmap(bo, kptr); +- if (ret) { +- pr_err("Failed to map bo to kernel. 
ret %d\n", ret); +- goto kmap_failed; +- } +- +- amdgpu_bo_unreserve(bo); +- +- return ret; +- +-kmap_failed: +- amdgpu_bo_unpin(bo); +-pin_failed: +- amdgpu_bo_unreserve(bo); +- +- return ret; +-} +- +-static int __alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va, +- uint64_t size, void *vm, struct kgd_mem **mem, +- uint64_t *offset, u32 domain, u64 flags, +- struct sg_table *sg, bool aql_queue, +- bool readonly, bool execute, bool coherent, bool no_sub, +- bool userptr) +-{ +- struct amdgpu_device *adev; +- int ret; +- struct amdgpu_bo *bo; +- uint64_t user_addr = 0; +- int byte_align; +- u32 alloc_domain; +- uint32_t mapping_flags; +- struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm; +- +- if (aql_queue) +- size = size >> 1; +- if (userptr) { +- if (!offset || !*offset) +- return -EINVAL; +- user_addr = *offset; +- } +- +- adev = get_amdgpu_device(kgd); +- byte_align = (adev->family == AMDGPU_FAMILY_VI && +- adev->asic_type != CHIP_FIJI && +- adev->asic_type != CHIP_POLARIS10 && +- adev->asic_type != CHIP_POLARIS11) ? +- VI_BO_SIZE_ALIGN : 1; +- +- *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); +- if (*mem == NULL) { +- ret = -ENOMEM; +- goto err; +- } +- INIT_LIST_HEAD(&(*mem)->bo_va_list); +- mutex_init(&(*mem)->lock); +- (*mem)->coherent = coherent; +- (*mem)->no_substitute = no_sub; +- (*mem)->aql_queue = aql_queue; +- +- mapping_flags = AMDGPU_VM_PAGE_READABLE; +- if (!readonly) +- mapping_flags |= AMDGPU_VM_PAGE_WRITEABLE; +- if (execute) +- mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE; +- if (coherent) +- mapping_flags |= AMDGPU_VM_MTYPE_UC; +- else +- mapping_flags |= AMDGPU_VM_MTYPE_NC; +- +- (*mem)->mapping_flags = mapping_flags; +- +- alloc_domain = userptr ? AMDGPU_GEM_DOMAIN_CPU : domain; +- +- amdgpu_sync_create(&(*mem)->sync); +- +- ret = amdgpu_amdkfd_reserve_system_mem_limit(adev, size, alloc_domain); +- if (ret) { +- pr_err("Insufficient system memory\n"); +- goto err_bo_create; +- } +- +- pr_debug("\t create BO VA 0x%llx size 0x%llx domain %s\n", +- va, size, domain_string(alloc_domain)); +- +- /* Allocate buffer object. Userptr objects need to start out +- * in the CPU domain, get moved to GTT when pinned. +- */ +- ret = amdgpu_bo_create(adev, size, byte_align, false, +- alloc_domain, +- flags, sg, NULL, 0, &bo); +- if (ret != 0) { +- pr_err("Failed to create BO on domain %s. ret %d\n", +- domain_string(alloc_domain), ret); +- unreserve_system_mem_limit(adev, size, alloc_domain); +- goto err_bo_create; +- } +- bo->kfd_bo = *mem; +- (*mem)->bo = bo; +- if (userptr) +- bo->flags |= AMDGPU_AMDKFD_USERPTR_BO; +- +- (*mem)->va = va; +- (*mem)->domain = domain; +- (*mem)->mapped_to_gpu_memory = 0; +- (*mem)->process_info = kfd_vm->process_info; +- add_kgd_mem_to_kfd_bo_list(*mem, kfd_vm->process_info, userptr); +- +- if (userptr) { +- ret = init_user_pages(*mem, current->mm, user_addr); +- if (ret) { +- mutex_lock(&kfd_vm->process_info->lock); +- list_del(&(*mem)->validate_list.head); +- mutex_unlock(&kfd_vm->process_info->lock); +- goto allocate_init_user_pages_failed; +- } +- } +- +- if (offset) +- *offset = amdgpu_bo_mmap_offset(bo); +- +- return 0; +- +-allocate_init_user_pages_failed: +- amdgpu_bo_unref(&bo); +-err_bo_create: +- kfree(*mem); +-err: +- return ret; +-} +- +-/* Reserving a BO and its page table BOs must happen atomically to +- * avoid deadlocks. When updating userptrs we need to temporarily +- * back-off the reservation and then reacquire it. Track all the +- * reservation info in a context structure. 
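
__alloc_memory_of_gpu() above translates the KFD allocation flags into per-mapping PTE attributes (readable, writeable, executable, cache type). A sketch of that translation; the bit positions here are illustrative only, the real values are the AMDGPU_VM_PAGE_* and AMDGPU_VM_MTYPE_* flags.

#include <stdint.h>
#include <stdio.h>

#define PAGE_READABLE	(1u << 0)
#define PAGE_WRITEABLE	(1u << 1)
#define PAGE_EXECUTABLE	(1u << 2)
#define MTYPE_UC	(1u << 3)	/* uncached, for coherent BOs */
#define MTYPE_NC	(1u << 4)	/* noncached */

static uint32_t build_mapping_flags(int readonly, int execute, int coherent)
{
	uint32_t flags = PAGE_READABLE;		/* always readable */

	if (!readonly)
		flags |= PAGE_WRITEABLE;
	if (execute)
		flags |= PAGE_EXECUTABLE;
	flags |= coherent ? MTYPE_UC : MTYPE_NC;
	return flags;
}

int main(void)
{
	printf("flags=0x%x\n", build_mapping_flags(0, 1, 0));
	return 0;
}
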
Buffers can be mapped to +- * multiple VMs simultaneously (buffers being restored on multiple +- * GPUs). +- */ +-struct bo_vm_reservation_context { +- struct amdgpu_bo_list_entry kfd_bo; +- unsigned int n_vms; +- struct amdgpu_bo_list_entry *vm_pd; +- struct ww_acquire_ctx ticket; +- struct list_head list, duplicates; +- struct amdgpu_sync *sync; +- bool reserved; +-}; +- +-/** +- * reserve_bo_and_vm - reserve a BO and a VM unconditionally. +- * @mem: KFD BO structure. +- * @vm: the VM to reserve. +- * @ctx: the struct that will be used in unreserve_bo_and_vms(). +- */ +-static int reserve_bo_and_vm(struct kgd_mem *mem, +- struct amdgpu_vm *vm, +- struct bo_vm_reservation_context *ctx) +-{ +- struct amdgpu_bo *bo = mem->bo; +- int ret; +- +- WARN_ON(!vm); +- +- ctx->reserved = false; +- ctx->n_vms = 1; +- ctx->sync = &mem->sync; +- +- INIT_LIST_HEAD(&ctx->list); +- INIT_LIST_HEAD(&ctx->duplicates); +- +- ctx->vm_pd = kzalloc(sizeof(struct amdgpu_bo_list_entry) +- * ctx->n_vms, GFP_KERNEL); +- if (ctx->vm_pd == NULL) +- return -ENOMEM; +- +- ctx->kfd_bo.robj = bo; +- ctx->kfd_bo.priority = 0; +- ctx->kfd_bo.tv.bo = &bo->tbo; +- ctx->kfd_bo.tv.shared = true; +- ctx->kfd_bo.user_pages = NULL; +- list_add(&ctx->kfd_bo.tv.head, &ctx->list); +- +- amdgpu_vm_get_pd_bo(vm, &ctx->list, &ctx->vm_pd[0]); +- +- ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list, +- false, &ctx->duplicates); +- if (!ret) +- ctx->reserved = true; +- else +- pr_err("Failed to reserve buffers in ttm\n"); +- +- if (ret) { +- kfree(ctx->vm_pd); +- ctx->vm_pd = NULL; +- } +- +- return ret; +-} +- +-enum VA_TYPE { +- VA_NOT_MAPPED = 0, +- VA_MAPPED, +- VA_DO_NOT_CARE, +-}; +- +-/** +- * reserve_bo_and_vm - reserve a BO and some VMs that the BO has been added +- * to, conditionally based on map_type. +- * @mem: KFD BO structure. +- * @vm: the VM to reserve. If NULL, then all VMs associated with the BO +- * is used. Otherwise, a single VM associated with the BO. +- * @map_type: the mapping status that will be used to filter the VMs. +- * @ctx: the struct that will be used in unreserve_bo_and_vms(). 
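
reserve_bo_and_cond_vms() below uses a two-pass scheme: first count the BO/VM mappings that match the filter, then allocate the page-directory entry array to that size and fill it in a second walk. The same two-pass shape in miniature, with a simplified entry type standing in for kfd_bo_va_list.

#include <stdio.h>
#include <stdlib.h>

struct entry_sketch { int is_mapped; };

/* Pass 1: count the entries that match the requested mapping state. */
static size_t count_matching(const struct entry_sketch *e, size_t n, int want)
{
	size_t i, cnt = 0;

	for (i = 0; i < n; i++)
		if (e[i].is_mapped == want)
			cnt++;
	return cnt;
}

int main(void)
{
	struct entry_sketch entries[] = { {1}, {0}, {1} };
	size_t n = count_matching(entries, 3, 1);
	size_t *pd = calloc(n, sizeof(*pd));	/* the vm_pd array */
	size_t i, j = 0;

	if (!pd)
		return 1;
	/* Pass 2: fill the array sized by pass 1. */
	for (i = 0; i < 3; i++)
		if (entries[i].is_mapped == 1)
			pd[j++] = i;
	printf("reserved %zu page directories\n", j);
	free(pd);
	return 0;
}
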
+- */ +-static int reserve_bo_and_cond_vms(struct kgd_mem *mem, +- struct amdgpu_vm *vm, enum VA_TYPE map_type, +- struct bo_vm_reservation_context *ctx) +-{ +- struct amdgpu_bo *bo = mem->bo; +- struct kfd_bo_va_list *entry; +- unsigned int i; +- int ret; +- +- ctx->reserved = false; +- ctx->n_vms = 0; +- ctx->vm_pd = NULL; +- ctx->sync = &mem->sync; +- +- INIT_LIST_HEAD(&ctx->list); +- INIT_LIST_HEAD(&ctx->duplicates); +- +- list_for_each_entry(entry, &mem->bo_va_list, bo_list) { +- if ((vm && vm != entry->bo_va->base.vm) || +- (entry->is_mapped != map_type +- && map_type != VA_DO_NOT_CARE)) +- continue; +- +- ctx->n_vms++; +- } +- +- if (ctx->n_vms != 0) { +- ctx->vm_pd = kzalloc(sizeof(struct amdgpu_bo_list_entry) +- * ctx->n_vms, GFP_KERNEL); +- if (ctx->vm_pd == NULL) +- return -ENOMEM; +- } +- +- ctx->kfd_bo.robj = bo; +- ctx->kfd_bo.priority = 0; +- ctx->kfd_bo.tv.bo = &bo->tbo; +- ctx->kfd_bo.tv.shared = true; +- ctx->kfd_bo.user_pages = NULL; +- list_add(&ctx->kfd_bo.tv.head, &ctx->list); +- +- i = 0; +- list_for_each_entry(entry, &mem->bo_va_list, bo_list) { +- if ((vm && vm != entry->bo_va->base.vm) || +- (entry->is_mapped != map_type +- && map_type != VA_DO_NOT_CARE)) +- continue; +- +- amdgpu_vm_get_pd_bo(entry->bo_va->base.vm, &ctx->list, +- &ctx->vm_pd[i]); +- i++; +- } +- +- ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list, +- false, &ctx->duplicates); +- if (!ret) +- ctx->reserved = true; +- else +- pr_err("Failed to reserve buffers in ttm.\n"); +- +- if (ret) { +- kfree(ctx->vm_pd); +- ctx->vm_pd = NULL; +- } +- +- return ret; +-} +- +-static int unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx, +- bool wait, bool intr) +-{ +- int ret = 0; +- +- if (wait) +- ret = amdgpu_sync_wait(ctx->sync, intr); +- +- if (ctx->reserved) +- ttm_eu_backoff_reservation(&ctx->ticket, &ctx->list); +- kfree(ctx->vm_pd); +- +- ctx->sync = NULL; +- +- ctx->reserved = false; +- ctx->vm_pd = NULL; +- +- return ret; +-} +- +-static int unmap_bo_from_gpuvm(struct amdgpu_device *adev, +- struct kfd_bo_va_list *entry, +- struct amdgpu_sync *sync) +-{ +- struct amdgpu_bo_va *bo_va = entry->bo_va; +- struct amdgpu_vm *vm = bo_va->base.vm; +- struct amdkfd_vm *kvm = container_of(vm, struct amdkfd_vm, base); +- struct amdgpu_bo *pd = vm->root.base.bo; +- +- /* Remove eviction fence from PD (and thereby from PTs too as they +- * share the resv. object. Otherwise during PT update job (see +- * amdgpu_vm_bo_update_mapping), eviction fence will get added to +- * job->sync object +- */ +- amdgpu_amdkfd_remove_eviction_fence(pd, +- kvm->process_info->eviction_fence, +- NULL, NULL); +- amdgpu_vm_bo_unmap(adev, bo_va, entry->va); +- +- amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update); +- +- /* Add the eviction fence back */ +- amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true); +- +- amdgpu_sync_fence(adev, sync, bo_va->last_pt_update); +- +- /* Sync objects can't handle multiple GPUs (contexts) updating +- * sync->last_vm_update. Fortunately we don't need it for +- * KFD's purposes, so we can just drop that fence. 
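
Dropping sync->last_vm_update, as the comment above describes, is the common put-and-clear idiom. A standalone version with a toy refcounted fence; fence_put() stands in for dma_fence_put().

#include <stdio.h>
#include <stdlib.h>

struct fence_sketch { int refcount; };

static void fence_put(struct fence_sketch *f)	/* dma_fence_put() stand-in */
{
	if (--f->refcount == 0)
		free(f);
}

static void drop_last_vm_update(struct fence_sketch **last)
{
	if (*last) {
		fence_put(*last);
		*last = NULL;	/* KFD doesn't need this fence */
	}
}

int main(void)
{
	struct fence_sketch *f = malloc(sizeof(*f));

	if (!f)
		return 1;
	f->refcount = 1;
	drop_last_vm_update(&f);
	printf("last_vm_update is %s\n", f ? "set" : "cleared");
	return 0;
}
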
+- */ +- if (sync->last_vm_update) { +- dma_fence_put(sync->last_vm_update); +- sync->last_vm_update = NULL; +- } +- +- return 0; +-} +- +-static int update_gpuvm_pte(struct amdgpu_device *adev, +- struct kfd_bo_va_list *entry, +- struct amdgpu_sync *sync) +-{ +- int ret; +- struct amdgpu_vm *vm; +- struct amdgpu_bo_va *bo_va; +- struct amdgpu_bo *bo; +- +- bo_va = entry->bo_va; +- vm = bo_va->base.vm; +- bo = bo_va->base.bo; +- +- /* Update the page tables */ +- ret = amdgpu_vm_bo_update(adev, bo_va, false); +- if (ret != 0) { +- pr_err("amdgpu_vm_bo_update failed\n"); +- return ret; +- } +- +- amdgpu_sync_fence(adev, sync, bo_va->last_pt_update); +- +- /* Sync objects can't handle multiple GPUs (contexts) updating +- * sync->last_vm_update. Fortunately we don't need it for +- * KFD's purposes, so we can just drop that fence. +- */ +- if (sync->last_vm_update) { +- dma_fence_put(sync->last_vm_update); +- sync->last_vm_update = NULL; +- } +- +- return 0; +-} +- +-static int map_bo_to_gpuvm(struct amdgpu_device *adev, +- struct kfd_bo_va_list *entry, struct amdgpu_sync *sync, +- bool no_update_pte) +-{ +- int ret; +- +- /* Set virtual address for the allocation */ +- ret = amdgpu_vm_bo_map(adev, entry->bo_va, entry->va, 0, +- amdgpu_bo_size(entry->bo_va->base.bo), entry->pte_flags); +- if (ret != 0) { +- pr_err("Failed to map VA 0x%llx in vm. ret %d\n", +- entry->va, ret); +- return ret; +- } +- +- if (no_update_pte) +- return 0; +- +- ret = update_gpuvm_pte(adev, entry, sync); +- if (ret != 0) { +- pr_err("update_gpuvm_pte() failed\n"); +- goto update_gpuvm_pte_failed; +- } +- +- return 0; +- +-update_gpuvm_pte_failed: +- unmap_bo_from_gpuvm(adev, entry, sync); +- return ret; +-} +- +-static struct sg_table *create_doorbell_sg(uint64_t addr, uint32_t size) +-{ +- struct sg_table *sg = kmalloc(sizeof(*sg), GFP_KERNEL); +- +- if (!sg) +- return NULL; +- if (sg_alloc_table(sg, 1, GFP_KERNEL)) { +- kfree(sg); +- return NULL; +- } +- sg->sgl->dma_address = addr; +- sg->sgl->length = size; +-#ifdef CONFIG_NEED_SG_DMA_LENGTH +- sg->sgl->dma_length = size; +-#endif +- return sg; +-} +- +-int amdgpu_amdkfd_gpuvm_sync_memory( +- struct kgd_dev *kgd, struct kgd_mem *mem, bool intr) +-{ +- int ret = 0; +- struct amdgpu_sync sync; +- struct amdgpu_device *adev; +- +- adev = get_amdgpu_device(kgd); +- amdgpu_sync_create(&sync); +- +- mutex_lock(&mem->lock); +- amdgpu_sync_clone(adev, &mem->sync, &sync); +- mutex_unlock(&mem->lock); +- +- ret = amdgpu_sync_wait(&sync, intr); +- amdgpu_sync_free(&sync); +- return ret; +-} +- +-#define BOOL_TO_STR(b) (b == true) ? "true" : "false" +- +-int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( +- struct kgd_dev *kgd, uint64_t va, uint64_t size, +- void *vm, struct kgd_mem **mem, +- uint64_t *offset, uint32_t flags) +-{ +- bool aql_queue, public, readonly, execute, coherent, no_sub, userptr; +- u64 alloc_flag; +- uint32_t domain; +- uint64_t *temp_offset; +- struct sg_table *sg = NULL; +- +- if (!(flags & ALLOC_MEM_FLAGS_NONPAGED)) { +- pr_err("current hw doesn't support paged memory\n"); +- return -EINVAL; +- } +- +- domain = 0; +- alloc_flag = 0; +- temp_offset = NULL; +- +- aql_queue = (flags & ALLOC_MEM_FLAGS_AQL_QUEUE_MEM) ? true : false; +- public = (flags & ALLOC_MEM_FLAGS_PUBLIC) ? true : false; +- readonly = (flags & ALLOC_MEM_FLAGS_READONLY) ? true : false; +- execute = (flags & ALLOC_MEM_FLAGS_EXECUTE_ACCESS) ? true : false; +- coherent = (flags & ALLOC_MEM_FLAGS_COHERENT) ? true : false; +- no_sub = (flags & ALLOC_MEM_FLAGS_NO_SUBSTITUTE) ? 
true : false; +- userptr = (flags & ALLOC_MEM_FLAGS_USERPTR) ? true : false; +- +- /* +- * Check on which domain to allocate BO +- */ +- if (flags & ALLOC_MEM_FLAGS_VRAM) { +- domain = AMDGPU_GEM_DOMAIN_VRAM; +- alloc_flag = AMDGPU_GEM_CREATE_NO_CPU_ACCESS; +- if (public) { +- alloc_flag = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; +- temp_offset = offset; +- } +- alloc_flag |= AMDGPU_GEM_CREATE_VRAM_CLEARED; +- } else if (flags & (ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_USERPTR)) { +- domain = AMDGPU_GEM_DOMAIN_GTT; +- alloc_flag = 0; +- temp_offset = offset; +- } else if (flags & ALLOC_MEM_FLAGS_DOORBELL) { +- domain = AMDGPU_GEM_DOMAIN_GTT; +- alloc_flag = 0; +- temp_offset = offset; +- if (size > UINT_MAX) +- return -EINVAL; +- sg = create_doorbell_sg(*offset, size); +- if (!sg) +- return -ENOMEM; +- } +- +- if (offset && !userptr) +- *offset = 0; +- +- pr_debug("Allocate VA 0x%llx - 0x%llx domain %s aql %s\n", +- va, va + size, domain_string(domain), +- BOOL_TO_STR(aql_queue)); +- +- pr_debug("\t alloc_flag 0x%llx public %s readonly %s execute %s coherent %s no_sub %s\n", +- alloc_flag, BOOL_TO_STR(public), +- BOOL_TO_STR(readonly), BOOL_TO_STR(execute), +- BOOL_TO_STR(coherent), BOOL_TO_STR(no_sub)); +- +- return __alloc_memory_of_gpu(kgd, va, size, vm, mem, +- temp_offset, domain, +- alloc_flag, sg, +- aql_queue, readonly, execute, +- coherent, no_sub, userptr); +-} +- +-int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( +- struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) +-{ +- struct amdgpu_device *adev; +- struct kfd_bo_va_list *entry, *tmp; +- struct bo_vm_reservation_context ctx; +- int ret = 0; +- struct ttm_validate_buffer *bo_list_entry; +- struct amdkfd_process_info *process_info; +- unsigned long bo_size; +- +- adev = get_amdgpu_device(kgd); +- process_info = ((struct amdkfd_vm *)vm)->process_info; +- +- bo_size = mem->bo->tbo.mem.size; +- +- mutex_lock(&mem->lock); +- +- if (mem->mapped_to_gpu_memory > 0) { +- pr_err("BO VA 0x%llx size 0x%lx is already mapped to vm %p.\n", +- mem->va, bo_size, vm); +- mutex_unlock(&mem->lock); +- return -EBUSY; +- } +- +- mutex_unlock(&mem->lock); +- /* lock is not needed after this, since mem is unused and will +- * be freed anyway +- */ +- +- /* No more MMU notifiers */ +- amdgpu_mn_unregister(mem->bo); +- +- /* Make sure restore workers don't access the BO any more */ +- bo_list_entry = &mem->validate_list; +- mutex_lock(&process_info->lock); +- list_del(&bo_list_entry->head); +- mutex_unlock(&process_info->lock); +- +- /* Free user pages if necessary */ +- if (mem->user_pages) { +- pr_debug("%s: Freeing user_pages array\n", __func__); +- if (mem->user_pages[0]) +- release_pages(mem->user_pages, +- mem->bo->tbo.ttm->num_pages, 0); +-#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) +- drm_free_large(mem->user_pages); +-#else +- kvfree(mem->user_pages); +-#endif +- } +- +- ret = reserve_bo_and_cond_vms(mem, NULL, VA_DO_NOT_CARE, &ctx); +- if (unlikely(ret != 0)) +- return ret; +- +- /* The eviction fence should be removed by the last unmap. 
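
The flag tests above reduce to a small decision table: VRAM allocations go to the VRAM domain (CPU-visible only when the PUBLIC flag is set), while GTT, userptr, and doorbell allocations all land in GTT. A sketch of that selection; the F_* bit positions are invented, only the precedence mirrors the function.

#include <stdint.h>
#include <stdio.h>

enum domain_sketch { DOM_NONE, DOM_VRAM, DOM_GTT };

#define F_VRAM		(1u << 0)
#define F_GTT		(1u << 1)
#define F_USERPTR	(1u << 2)
#define F_DOORBELL	(1u << 3)

static enum domain_sketch pick_domain(uint32_t flags)
{
	if (flags & F_VRAM)
		return DOM_VRAM;	/* CPU-visible only when PUBLIC */
	if (flags & (F_GTT | F_USERPTR | F_DOORBELL))
		return DOM_GTT;		/* doorbells also get an SG table */
	return DOM_NONE;
}

int main(void)
{
	printf("userptr -> %d\n", pick_domain(F_USERPTR));
	return 0;
}
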
+- * TODO: Log an error condition if the bo still has the eviction fence +- * attached +- */ +- amdgpu_amdkfd_remove_eviction_fence(mem->bo, +- process_info->eviction_fence, +- NULL, NULL); +- pr_debug("Release VA 0x%llx - 0x%llx\n", mem->va, +- mem->va + bo_size * (1 + mem->aql_queue)); +- +- /* Remove from VM internal data structures */ +- list_for_each_entry_safe(entry, tmp, &mem->bo_va_list, bo_list) { +- remove_bo_from_vm((struct amdgpu_device *)entry->kgd_dev, +- entry, bo_size); +- } +- +- ret = unreserve_bo_and_vms(&ctx, false, false); +- +- /* Free the sync object */ +- amdgpu_sync_free(&mem->sync); +- +- /* If the SG is not NULL, it's one we created for a doorbell +- * BO. We need to free it. +- */ +- if (mem->bo->tbo.sg) { +- sg_free_table(mem->bo->tbo.sg); +- kfree(mem->bo->tbo.sg); +- } +- +- /* Free the BO*/ +- amdgpu_bo_unref(&mem->bo); +- kfree(mem); +- +- return ret; +-} +- +-int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( +- struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) +-{ +- struct amdgpu_device *adev; +- int ret; +- struct amdgpu_bo *bo; +- uint32_t domain; +- struct kfd_bo_va_list *entry; +- struct bo_vm_reservation_context ctx; +- struct kfd_bo_va_list *bo_va_entry = NULL; +- struct kfd_bo_va_list *bo_va_entry_aql = NULL; +- struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm; +- unsigned long bo_size; +- bool is_invalid_userptr; +- +- adev = get_amdgpu_device(kgd); +- +- /* Make sure restore is not running concurrently. Since we +- * don't map invalid userptr BOs, we rely on the next restore +- * worker to do the mapping +- */ +- mutex_lock(&mem->process_info->lock); +- +- /* Lock mmap-sem. If we find an invalid userptr BO, we can be +- * sure that the MMU notifier is no longer running +- * concurrently and the queues are actually stopped +- */ +- down_read(¤t->mm->mmap_sem); +- is_invalid_userptr = atomic_read(&mem->invalid); +- up_read(¤t->mm->mmap_sem); +- +- mutex_lock(&mem->lock); +- +- bo = mem->bo; +- +- if (!bo) { +- pr_err("Invalid BO when mapping memory to GPU\n"); +- return -EINVAL; +- } +- +- domain = mem->domain; +- bo_size = bo->tbo.mem.size; +- +- pr_debug("Map VA 0x%llx - 0x%llx to vm %p domain %s\n", +- mem->va, +- mem->va + bo_size * (1 + mem->aql_queue), +- vm, domain_string(domain)); +- +- ret = reserve_bo_and_vm(mem, vm, &ctx); +- if (unlikely(ret != 0)) +- goto bo_reserve_failed; +- +- /* Userptr can be marked as "not invalid", but not actually be +- * validated yet (still in the system domain). In that case +- * the queues are still stopped and we can leave mapping for +- * the next restore worker +- */ +- if (bo->tbo.mem.mem_type == TTM_PL_SYSTEM) +- is_invalid_userptr = true; +- +- if (check_if_add_bo_to_vm((struct amdgpu_vm *)vm, mem)) { +- ret = add_bo_to_vm(adev, mem, (struct amdgpu_vm *)vm, false, +- &bo_va_entry); +- if (ret != 0) +- goto add_bo_to_vm_failed; +- if (mem->aql_queue) { +- ret = add_bo_to_vm(adev, mem, (struct amdgpu_vm *)vm, +- true, &bo_va_entry_aql); +- if (ret != 0) +- goto add_bo_to_vm_failed_aql; +- } +- } +- +- if (mem->mapped_to_gpu_memory == 0 && +- !amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) { +- /* Validate BO only once. The eviction fence gets added to BO +- * the first time it is mapped. Validate will wait for all +- * background evictions to complete. 
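
The surrounding map path validates the BO only on its first mapping and counts each per-VM mapping in mapped_to_gpu_memory. That validate-once/refcount shape, isolated with stub callbacks for the validation and PTE-update steps.

#include <stdio.h>

struct mem_sketch { unsigned int mapped_to_gpu_memory; };

static int validate_stub(void) { return 0; }	/* full validation + waits */
static int map_stub(void)      { return 0; }	/* per-VM PTE update */

static int map_to_gpu_sketch(struct mem_sketch *mem)
{
	int ret;

	if (mem->mapped_to_gpu_memory == 0) {
		ret = validate_stub();	/* only before the first mapping */
		if (ret)
			return ret;
	}
	ret = map_stub();
	if (!ret)
		mem->mapped_to_gpu_memory++;	/* one count per VM mapping */
	return ret;
}

int main(void)
{
	struct mem_sketch mem = { 0 };

	map_to_gpu_sketch(&mem);
	map_to_gpu_sketch(&mem);	/* second VM: no re-validation */
	printf("mapped %u times\n", mem.mapped_to_gpu_memory);
	return 0;
}
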
+- */ +- ret = amdgpu_amdkfd_bo_validate(bo, domain, true); +- if (ret) { +- pr_debug("Validate failed\n"); +- goto map_bo_to_gpuvm_failed; +- } +- } +- +- list_for_each_entry(entry, &mem->bo_va_list, bo_list) { +- if (entry->bo_va->base.vm == vm && !entry->is_mapped) { +- pr_debug("\t map VA 0x%llx - 0x%llx in entry %p\n", +- entry->va, entry->va + bo_size, +- entry); +- +- ret = map_bo_to_gpuvm(adev, entry, ctx.sync, +- is_invalid_userptr); +- if (ret != 0) { +- pr_err("Failed to map radeon bo to gpuvm\n"); +- goto map_bo_to_gpuvm_failed; +- } +- entry->is_mapped = true; +- mem->mapped_to_gpu_memory++; +- pr_debug("\t INC mapping count %d\n", +- mem->mapped_to_gpu_memory); +- } +- } +- +- if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm) == NULL) +- amdgpu_bo_fence(bo, +- &kfd_vm->process_info->eviction_fence->base, +- true); +- ret = unreserve_bo_and_vms(&ctx, false, false); +- +- mutex_unlock(&mem->process_info->lock); +- mutex_unlock(&mem->lock); +- return ret; +- +-map_bo_to_gpuvm_failed: +- if (bo_va_entry_aql) +- remove_bo_from_vm(adev, bo_va_entry_aql, bo_size); +-add_bo_to_vm_failed_aql: +- if (bo_va_entry) +- remove_bo_from_vm(adev, bo_va_entry, bo_size); +-add_bo_to_vm_failed: +- unreserve_bo_and_vms(&ctx, false, false); +-bo_reserve_failed: +- mutex_unlock(&mem->process_info->lock); +- mutex_unlock(&mem->lock); +- return ret; +-} +- +-static u64 get_vm_pd_gpu_offset(void *vm) +-{ +- struct amdgpu_vm *avm = (struct amdgpu_vm *) vm; +- struct amdgpu_device *adev = +- amdgpu_ttm_adev(avm->root.base.bo->tbo.bdev); +- u64 offset; +- +- BUG_ON(avm == NULL); +- +- amdgpu_bo_reserve(avm->root.base.bo, false); +- +- offset = amdgpu_bo_gpu_offset(avm->root.base.bo); +- +- amdgpu_bo_unreserve(avm->root.base.bo); +- +- /* On some ASICs the FB doesn't start at 0. Adjust FB offset +- * to an actual MC address. 
+- */ +- if (adev->gart.gart_funcs->get_vm_pde) +- offset = amdgpu_gart_get_vm_pde(adev, offset); +- +- return offset; +-} +- +-int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm, +- void **process_info, +- struct dma_fence **ef) +-{ +- int ret; +- struct amdkfd_vm *new_vm; +- struct amdkfd_process_info *info; +- struct amdgpu_device *adev = get_amdgpu_device(kgd); +- +- new_vm = kzalloc(sizeof(*new_vm), GFP_KERNEL); +- if (new_vm == NULL) +- return -ENOMEM; +- +- /* Initialize the VM context, allocate the page directory and zero it */ +- ret = amdgpu_vm_init(adev, &new_vm->base, AMDGPU_VM_CONTEXT_COMPUTE); +- if (ret != 0) { +- pr_err("Failed init vm ret %d\n", ret); +- /* Undo everything related to the new VM context */ +- goto vm_init_fail; +- } +- new_vm->adev = adev; +- +- if (!*process_info) { +- info = kzalloc(sizeof(*info), GFP_KERNEL); +- if (!info) { +- pr_err("Failed to create amdkfd_process_info"); +- ret = -ENOMEM; +- goto alloc_process_info_fail; +- } +- +- mutex_init(&info->lock); +- INIT_LIST_HEAD(&info->vm_list_head); +- INIT_LIST_HEAD(&info->kfd_bo_list); +- INIT_LIST_HEAD(&info->userptr_valid_list); +- INIT_LIST_HEAD(&info->userptr_inval_list); +- +- info->eviction_fence = +- amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1), +- current->mm); +- if (info->eviction_fence == NULL) { +- pr_err("Failed to create eviction fence\n"); +- goto create_evict_fence_fail; +- } +- +- info->pid = get_task_pid(current->group_leader, +- PIDTYPE_PID); +- atomic_set(&info->evicted_bos, 0); +- INIT_DELAYED_WORK(&info->work, +- amdgpu_amdkfd_restore_userptr_worker); +- +- *process_info = info; +- *ef = dma_fence_get(&info->eviction_fence->base); +- } +- +- new_vm->process_info = *process_info; +- +- mutex_lock(&new_vm->process_info->lock); +- list_add_tail(&new_vm->vm_list_node, +- &(new_vm->process_info->vm_list_head)); +- new_vm->process_info->n_vms++; +- mutex_unlock(&new_vm->process_info->lock); +- +- *vm = (void *) new_vm; +- +- pr_debug("Created process vm %p\n", *vm); +- +- return ret; +- +-create_evict_fence_fail: +- kfree(info); +-alloc_process_info_fail: +- amdgpu_vm_fini(adev, &new_vm->base); +-vm_init_fail: +- kfree(new_vm); +- return ret; +- +-} +- +-void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm) +-{ +- struct amdgpu_device *adev = (struct amdgpu_device *) kgd; +- struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *) vm; +- struct amdgpu_vm *avm = &kfd_vm->base; +- struct amdgpu_bo *pd; +- struct amdkfd_process_info *process_info; +- +- if (WARN_ON(!kgd || !vm)) +- return; +- +- pr_debug("Destroying process vm %p\n", vm); +- /* Release eviction fence from PD */ +- pd = avm->root.base.bo; +- amdgpu_bo_reserve(pd, false); +- amdgpu_bo_fence(pd, NULL, false); +- amdgpu_bo_unreserve(pd); +- +- process_info = kfd_vm->process_info; +- +- mutex_lock(&process_info->lock); +- process_info->n_vms--; +- list_del(&kfd_vm->vm_list_node); +- mutex_unlock(&process_info->lock); +- +- /* Release per-process resources */ +- if (!process_info->n_vms) { +- WARN_ON(!list_empty(&process_info->kfd_bo_list)); +- WARN_ON(!list_empty(&process_info->userptr_valid_list)); +- WARN_ON(!list_empty(&process_info->userptr_inval_list)); +- +- dma_fence_put(&process_info->eviction_fence->base); +- cancel_delayed_work_sync(&process_info->work); +- put_pid(process_info->pid); +- kfree(process_info); +- } +- +- /* Release the VM context */ +- amdgpu_vm_fini(adev, avm); +- kfree(vm); +-} +- +-uint32_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm) +-{ +- return 
get_vm_pd_gpu_offset(vm) >> AMDGPU_GPU_PAGE_SHIFT;
+-}
+-
+-int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd,
+-					      struct kfd_vm_fault_info *mem)
+-{
+-	struct amdgpu_device *adev;
+-
+-	adev = (struct amdgpu_device *) kgd;
+-	if (atomic_read(&adev->mc.vm_fault_info_updated) == 1) {
+-		*mem = *adev->mc.vm_fault_info;
+-		mb();
+-		atomic_set(&adev->mc.vm_fault_info_updated, 0);
+-	}
+-	return 0;
+-}
+-
+-static bool is_mem_on_local_device(struct kgd_dev *kgd,
+-		struct list_head *bo_va_list, void *vm)
+-{
+-	struct kfd_bo_va_list *entry;
+-
+-	list_for_each_entry(entry, bo_va_list, bo_list) {
+-		if (entry->kgd_dev == kgd && entry->bo_va->base.vm == vm)
+-			return true;
+-	}
+-
+-	return false;
+-}
+-
+-int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
+-		struct kgd_dev *kgd, struct kgd_mem *mem, void *vm)
+-{
+-	struct kfd_bo_va_list *entry;
+-	struct amdgpu_device *adev;
+-	unsigned int mapped_before;
+-	int ret = 0;
+-	struct bo_vm_reservation_context ctx;
+-	struct amdkfd_process_info *process_info;
+-	unsigned long bo_size;
+-
+-	adev = (struct amdgpu_device *) kgd;
+-	process_info = ((struct amdkfd_vm *)vm)->process_info;
+-
+-	bo_size = mem->bo->tbo.mem.size;
+-
+-	mutex_lock(&mem->lock);
+-
+-	/*
+-	 * Make sure that this BO is mapped on KGD before unmapping it
+-	 */
+-	if (!is_mem_on_local_device(kgd, &mem->bo_va_list, vm)) {
+-		ret = -EINVAL;
+-		goto out;
+-	}
+-
+-	if (mem->mapped_to_gpu_memory == 0) {
+-		pr_debug("BO VA 0x%llx size 0x%lx is not mapped to vm %p\n",
+-			 mem->va, bo_size, vm);
+-		ret = -EINVAL;
+-		goto out;
+-	}
+-	mapped_before = mem->mapped_to_gpu_memory;
+-
+-	ret = reserve_bo_and_cond_vms(mem, vm, VA_MAPPED, &ctx);
+-	if (unlikely(ret != 0))
+-		goto out;
+-
+-	pr_debug("Unmap VA 0x%llx - 0x%llx from vm %p\n",
+-		 mem->va,
+-		 mem->va + bo_size * (1 + mem->aql_queue),
+-		 vm);
+-
+-	list_for_each_entry(entry, &mem->bo_va_list, bo_list) {
+-		if (entry->bo_va->base.vm == vm && entry->is_mapped) {
+-			pr_debug("\t unmap VA 0x%llx - 0x%llx from entry %p\n",
+-				 entry->va,
+-				 entry->va + bo_size,
+-				 entry);
+-
+-			ret = unmap_bo_from_gpuvm(adev, entry, ctx.sync);
+-			if (ret == 0) {
+-				entry->is_mapped = false;
+-			} else {
+-				pr_err("failed to unmap VA 0x%llx\n",
+-				       mem->va);
+-				goto unreserve_out;
+-			}
+-
+-			mem->mapped_to_gpu_memory--;
+-			pr_debug("\t DEC mapping count %d\n",
+-				 mem->mapped_to_gpu_memory);
+-		}
+-	}
+-
+-	/* If BO is unmapped from all VMs, unfence it. It can be evicted if
+-	 * required.
+- */ +- if (mem->mapped_to_gpu_memory == 0 && +- !amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm)) +- amdgpu_amdkfd_remove_eviction_fence(mem->bo, +- process_info->eviction_fence, +- NULL, NULL); +- +- if (mapped_before == mem->mapped_to_gpu_memory) { +- pr_debug("BO VA 0x%llx size 0x%lx is not mapped to vm %p\n", +- mem->va, bo_size, vm); +- ret = -EINVAL; +- } +- +-unreserve_out: +- unreserve_bo_and_vms(&ctx, false, false); +-out: +- mutex_unlock(&mem->lock); +- return ret; +-} +- +-int amdgpu_amdkfd_gpuvm_mmap_bo(struct kgd_dev *kgd, struct vm_area_struct *vma) +-{ +- struct amdgpu_device *adev; +- +- adev = get_amdgpu_device(kgd); +- if (!adev) { +- pr_err("Could not get amdgpu device in %s\n", __func__); +- return -ENODEV; +- } +- +- return amdgpu_bo_mmap(NULL, vma, &adev->mman.bdev); +-} +- +-int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd, +- struct kgd_mem *mem, void **kptr) +-{ +- int ret; +- struct amdgpu_bo *bo = mem->bo; +- +- if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) { +- pr_err("userptr can't be mapped to kernel\n"); +- return -EINVAL; +- } +- +- /* delete kgd_mem from kfd_bo_list to avoid re-validating +- * this BO in BO's restoring after eviction. +- */ +- mutex_lock(&mem->process_info->lock); +- +- list_del_init(&mem->validate_list.head); +- +- ret = __map_bo_to_kernel(bo, AMDGPU_GEM_DOMAIN_GTT, kptr); +- if (!ret) +- mem->kptr = *kptr; +- +- mutex_unlock(&mem->process_info->lock); +- +- return ret; +-} +- +-static int pin_bo_wo_map(struct kgd_mem *mem) +-{ +- struct amdgpu_bo *bo = mem->bo; +- int ret = 0; +- +- ret = amdgpu_bo_reserve(bo, false); +- if (unlikely(ret != 0)) +- return ret; +- +- ret = amdgpu_bo_pin(bo, mem->domain, NULL); +- amdgpu_bo_unreserve(bo); +- +- return ret; +-} +- +-static void unpin_bo_wo_map(struct kgd_mem *mem) +-{ +- struct amdgpu_bo *bo = mem->bo; +- int ret = 0; +- +- ret = amdgpu_bo_reserve(bo, false); +- if (unlikely(ret != 0)) +- return; +- +- amdgpu_bo_unpin(bo); +- amdgpu_bo_unreserve(bo); +-} +- +-#define AMD_GPU_PAGE_SHIFT PAGE_SHIFT +-#define AMD_GPU_PAGE_SIZE (_AC(1, UL) << AMD_GPU_PAGE_SHIFT) +- +-static int get_sg_table(struct amdgpu_device *adev, +- struct kgd_mem *mem, uint64_t offset, +- uint64_t size, struct sg_table **ret_sg) +-{ +- struct amdgpu_bo *bo = mem->bo; +- struct sg_table *sg = NULL; +- unsigned long bus_addr; +- unsigned int chunks; +- unsigned int i; +- struct scatterlist *s; +- uint64_t offset_in_page; +- unsigned int page_size; +- int ret; +- +- sg = kmalloc(sizeof(*sg), GFP_KERNEL); +- if (!sg) { +- ret = -ENOMEM; +- goto out; +- } +- +- if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM) +- page_size = AMD_GPU_PAGE_SIZE; +- else +- page_size = PAGE_SIZE; +- +- +- offset_in_page = offset & (page_size - 1); +- chunks = (size + offset_in_page + page_size - 1) +- / page_size; +- +- ret = sg_alloc_table(sg, chunks, GFP_KERNEL); +- if (unlikely(ret)) +- goto out; +- +- if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM) { +- bus_addr = bo->tbo.offset + adev->mc.aper_base + offset; +- +- for_each_sg(sg->sgl, s, sg->orig_nents, i) { +- uint64_t chunk_size, length; +- +- chunk_size = page_size - offset_in_page; +- length = min(size, chunk_size); +- +- sg_set_page(s, NULL, length, offset_in_page); +- s->dma_address = bus_addr; +- s->dma_length = length; +- +- size -= length; +- offset_in_page = 0; +- bus_addr += length; +- } +- } else { +- struct page **pages; +- unsigned int cur_page; +- +- pages = bo->tbo.ttm->pages; +- +- cur_page = offset / page_size; +- for_each_sg(sg->sgl, s, sg->orig_nents, i) { 
+- uint64_t chunk_size, length; +- +- chunk_size = page_size - offset_in_page; +- length = min(size, chunk_size); +- +- sg_set_page(s, pages[cur_page], length, offset_in_page); +- s->dma_address = page_to_phys(pages[cur_page]); +- s->dma_length = length; +- +- size -= length; +- offset_in_page = 0; +- cur_page++; +- } +- } +- +- *ret_sg = sg; +- return 0; +-out: +- kfree(sg); +- *ret_sg = NULL; +- return ret; +-} +- +-int amdgpu_amdkfd_gpuvm_pin_get_sg_table(struct kgd_dev *kgd, +- struct kgd_mem *mem, uint64_t offset, +- uint64_t size, struct sg_table **ret_sg) +-{ +- int ret; +- struct amdgpu_device *adev; +- +- ret = pin_bo_wo_map(mem); +- if (unlikely(ret != 0)) +- return ret; +- +- adev = get_amdgpu_device(kgd); +- +- ret = get_sg_table(adev, mem, offset, size, ret_sg); +- if (ret) +- unpin_bo_wo_map(mem); +- +- return ret; +-} +- +-void amdgpu_amdkfd_gpuvm_unpin_put_sg_table( +- struct kgd_mem *mem, struct sg_table *sg) +-{ +- sg_free_table(sg); +- kfree(sg); +- +- unpin_bo_wo_map(mem); +-} +- +-int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd, +- struct dma_buf *dma_buf, +- uint64_t va, void *vm, +- struct kgd_mem **mem, uint64_t *size, +- uint64_t *mmap_offset) +-{ +- struct amdgpu_device *adev = (struct amdgpu_device *)kgd; +- struct drm_gem_object *obj; +- struct amdgpu_bo *bo; +- struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm; +- +- if (dma_buf->ops != &drm_gem_prime_dmabuf_ops) +- /* Can't handle non-graphics buffers */ +- return -EINVAL; +- +- obj = dma_buf->priv; +- if (obj->dev->dev_private != adev) +- /* Can't handle buffers from other devices */ +- return -EINVAL; +- +- bo = gem_to_amdgpu_bo(obj); +- if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | +- AMDGPU_GEM_DOMAIN_GTT | +- AMDGPU_GEM_DOMAIN_DGMA))) +- /* Only VRAM and GTT BOs are supported */ +- return -EINVAL; +- +- *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); +- if (*mem == NULL) +- return -ENOMEM; +- +- if (size) +- *size = amdgpu_bo_size(bo); +- +- if (mmap_offset) +- *mmap_offset = amdgpu_bo_mmap_offset(bo); +- +- INIT_LIST_HEAD(&(*mem)->bo_va_list); +- mutex_init(&(*mem)->lock); +- (*mem)->mapping_flags = +- AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE | +- AMDGPU_VM_PAGE_EXECUTABLE | AMDGPU_VM_MTYPE_NC; +- +- (*mem)->bo = amdgpu_bo_ref(bo); +- (*mem)->va = va; +- if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) +- (*mem)->domain = AMDGPU_GEM_DOMAIN_VRAM; +- else if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_GTT) +- (*mem)->domain = AMDGPU_GEM_DOMAIN_GTT; +- else +- (*mem)->domain = AMDGPU_GEM_DOMAIN_DGMA; +- (*mem)->mapped_to_gpu_memory = 0; +- (*mem)->process_info = kfd_vm->process_info; +- add_kgd_mem_to_kfd_bo_list(*mem, kfd_vm->process_info, false); +- amdgpu_sync_create(&(*mem)->sync); +- +- return 0; +-} +- +-int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_dev *kgd, void *vm, +- struct kgd_mem *mem, +- struct dma_buf **dmabuf) +-{ +- struct amdgpu_device *adev = NULL; +- struct amdgpu_bo *bo = NULL; +- struct drm_gem_object *gobj = NULL; +- +- if (!dmabuf || !kgd || !vm || !mem) +- return -EINVAL; +- +- adev = get_amdgpu_device(kgd); +- bo = mem->bo; +- +- gobj = amdgpu_gem_prime_foreign_bo(adev, bo); +- if (gobj == NULL) { +- pr_err("Export BO failed. 
Unable to find/create GEM object\n"); +- return -EINVAL; +- } +- +- *dmabuf = amdgpu_gem_prime_export(adev->ddev, gobj, 0); +- return 0; +-} +- +-static int process_validate_vms(struct amdkfd_process_info *process_info) +-{ +- struct amdkfd_vm *peer_vm; +- int ret; +- +- list_for_each_entry(peer_vm, &process_info->vm_list_head, +- vm_list_node) { +- ret = vm_validate_pt_pd_bos(&peer_vm->base); +- if (ret) +- return ret; +- } +- +- return 0; +-} +- +-/* Evict a userptr BO by stopping the queues if necessary +- * +- * Runs in MMU notifier, may be in RECLAIM_FS context. This means it +- * cannot do any memory allocations, and cannot take any locks that +- * are held elsewhere while allocating memory. Therefore this is as +- * simple as possible, using atomic counters. +- * +- * It doesn't do anything to the BO itself. The real work happens in +- * restore, where we get updated page addresses. This function only +- * ensures that GPU access to the BO is stopped. +- */ +-int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, +- struct mm_struct *mm) +-{ +- struct amdkfd_process_info *process_info = mem->process_info; +- int invalid, evicted_bos; +- int r = 0; +- +- invalid = atomic_inc_return(&mem->invalid); +- evicted_bos = atomic_inc_return(&process_info->evicted_bos); +- if (evicted_bos == 1) { +- /* First eviction, stop the queues */ +- r = kgd2kfd->quiesce_mm(NULL, mm); +- if (r != 0) +- pr_err("Failed to quiesce KFD\n"); +- schedule_delayed_work(&process_info->work, 1); +- } +- +- return r; +-} +- +-/* Update invalid userptr BOs +- * +- * Moves invalidated (evicted) userptr BOs from userptr_valid_list to +- * userptr_inval_list and updates user pages for all BOs that have +- * been invalidated since their last update. +- */ +-static int update_invalid_user_pages(struct amdkfd_process_info *process_info, +- struct mm_struct *mm) +-{ +- struct kgd_mem *mem, *tmp_mem; +- struct amdgpu_bo *bo; +- int invalid, ret; +- +- /* Move all invalidated BOs to the userptr_inval_list and +- * release their user pages by migration to the CPU domain +- */ +- list_for_each_entry_safe(mem, tmp_mem, +- &process_info->userptr_valid_list, +- validate_list.head) { +- if (!atomic_read(&mem->invalid)) +- continue; /* BO is still valid */ +- +- bo = mem->bo; +- +- if (amdgpu_bo_reserve(bo, true)) +- return -EAGAIN; +- amdgpu_ttm_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU); +- ret = ttm_bo_validate(&bo->tbo, &bo->placement, false, false); +- amdgpu_bo_unreserve(bo); +- if (ret) { +- pr_err("%s: Failed to invalidate userptr BO\n", +- __func__); +- return -EAGAIN; +- } +- +- list_move_tail(&mem->validate_list.head, +- &process_info->userptr_inval_list); +- } +- +- if (list_empty(&process_info->userptr_inval_list)) +- return 0; /* All evicted userptr BOs were freed */ +- +- /* Go through userptr_inval_list and update any invalid user_pages */ +- list_for_each_entry(mem, &process_info->userptr_inval_list, +- validate_list.head) { +- invalid = atomic_read(&mem->invalid); +- if (!invalid) +- /* BO hasn't been invalidated since the last +- * revalidation attempt. Keep its BO list. 
+- */ +- continue; +- +- bo = mem->bo; +- +- if (!mem->user_pages) { +-#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) +- mem->user_pages = +- drm_calloc_large(bo->tbo.ttm->num_pages, +- sizeof(struct page *)); +-#else +- mem->user_pages = +- kvmalloc_array(bo->tbo.ttm->num_pages, +- sizeof(struct page *), +- GFP_KERNEL | __GFP_ZERO); +-#endif +- if (!mem->user_pages) { +- pr_err("%s: Failed to allocate pages array\n", +- __func__); +- return -ENOMEM; +- } +- } else if (mem->user_pages[0]) { +- release_pages(mem->user_pages, +- bo->tbo.ttm->num_pages, 0); +- } +- +- /* Get updated user pages */ +- ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, +- mem->user_pages); +- if (ret) { +- mem->user_pages[0] = NULL; +- pr_info("%s: Failed to get user pages: %d\n", +- __func__, ret); +- /* Pretend it succeeded. It will fail later +- * with a VM fault if the GPU tries to access +- * it. Better than hanging indefinitely with +- * stalled user mode queues. +- */ +- } +- +- /* Mark the BO as valid unless it was invalidated +- * again concurrently +- */ +- if (atomic_cmpxchg(&mem->invalid, invalid, 0) != invalid) +- return -EAGAIN; +- } +- return 0; +-} +- +-/* Validate invalid userptr BOs +- * +- * Validates BOs on the userptr_inval_list, and moves them back to the +- * userptr_valid_list. Also updates GPUVM page tables with new page +- * addresses and waits for the page table updates to complete. +- */ +-static int validate_invalid_user_pages(struct amdkfd_process_info *process_info) +-{ +- struct amdgpu_bo_list_entry *pd_bo_list_entries; +- struct list_head resv_list, duplicates; +- struct ww_acquire_ctx ticket; +- struct amdgpu_sync sync; +- +- struct amdkfd_vm *peer_vm; +- struct kgd_mem *mem, *tmp_mem; +- struct amdgpu_bo *bo; +- int i, ret; +- +- pd_bo_list_entries = kcalloc(process_info->n_vms, +- sizeof(struct amdgpu_bo_list_entry), +- GFP_KERNEL); +- if (!pd_bo_list_entries) { +- pr_err("%s: Failed to allocate PD BO list entries\n", __func__); +- return -ENOMEM; +- } +- +- INIT_LIST_HEAD(&resv_list); +- INIT_LIST_HEAD(&duplicates); +- +- /* Get all the page directory BOs that need to be reserved */ +- i = 0; +- list_for_each_entry(peer_vm, &process_info->vm_list_head, +- vm_list_node) +- amdgpu_vm_get_pd_bo(&peer_vm->base, &resv_list, +- &pd_bo_list_entries[i++]); +- /* Add the userptr_inval_list entries to resv_list */ +- list_for_each_entry(mem, &process_info->userptr_inval_list, +- validate_list.head) { +- list_add_tail(&mem->resv_list.head, &resv_list); +- mem->resv_list.bo = mem->validate_list.bo; +- mem->resv_list.shared = mem->validate_list.shared; +- } +- +- /* Reserve all BOs and page tables for validation */ +- ret = ttm_eu_reserve_buffers(&ticket, &resv_list, false, &duplicates); +- WARN(!list_empty(&duplicates), "Duplicates should be empty"); +- if (ret) +- goto out; +- +- amdgpu_sync_create(&sync); +- +- /* Avoid triggering eviction fences when unmapping invalid +- * userptr BOs (waits for all fences, doesn't use +- * FENCE_OWNER_VM) +- */ +- list_for_each_entry(peer_vm, &process_info->vm_list_head, +- vm_list_node) +- amdgpu_amdkfd_remove_eviction_fence(peer_vm->base.root.base.bo, +- process_info->eviction_fence, +- NULL, NULL); +- +- ret = process_validate_vms(process_info); +- if (ret) +- goto unreserve_out; +- +- /* Validate BOs and update GPUVM page tables */ +- list_for_each_entry_safe(mem, tmp_mem, +- &process_info->userptr_inval_list, +- validate_list.head) { +- struct kfd_bo_va_list *bo_va_entry; +- +- bo = mem->bo; +- +- /* Copy pages array and validate the BO if we 
got user pages */
+-		if (mem->user_pages[0]) {
+-			amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm,
+-						     mem->user_pages);
+-			amdgpu_ttm_placement_from_domain(bo, mem->domain);
+-			ret = ttm_bo_validate(&bo->tbo, &bo->placement,
+-					      false, false);
+-			if (ret) {
+-				pr_err("%s: failed to validate BO\n", __func__);
+-				goto unreserve_out;
+-			}
+-		}
+-
+-		/* Validate succeeded, now the BO owns the pages, free
+-		 * our copy of the pointer array. Put this BO back on
+-		 * the userptr_valid_list. If we need to revalidate
+-		 * it, we need to start from scratch.
+-		 */
+-#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0)
+-		drm_free_large(mem->user_pages);
+-#else
+-		kvfree(mem->user_pages);
+-#endif
+-		mem->user_pages = NULL;
+-		list_move_tail(&mem->validate_list.head,
+-			       &process_info->userptr_valid_list);
+-
+-		/* Update mapping. If the BO was not validated
+-		 * (because we couldn't get user pages), this will
+-		 * clear the page table entries, which will result in
+-		 * VM faults if the GPU tries to access the invalid
+-		 * memory.
+-		 */
+-		list_for_each_entry(bo_va_entry, &mem->bo_va_list, bo_list) {
+-			if (!bo_va_entry->is_mapped)
+-				continue;
+-
+-			ret = update_gpuvm_pte((struct amdgpu_device *)
+-					       bo_va_entry->kgd_dev,
+-					       bo_va_entry, &sync);
+-			if (ret) {
+-				pr_err("%s: update PTE failed\n", __func__);
+-				/* make sure this gets validated again */
+-				atomic_inc(&mem->invalid);
+-				goto unreserve_out;
+-			}
+-		}
+-	}
+-unreserve_out:
+-	list_for_each_entry(peer_vm, &process_info->vm_list_head,
+-			    vm_list_node)
+-		amdgpu_bo_fence(peer_vm->base.root.base.bo,
+-				&process_info->eviction_fence->base, true);
+-	ttm_eu_backoff_reservation(&ticket, &resv_list);
+-	amdgpu_sync_wait(&sync, false);
+-	amdgpu_sync_free(&sync);
+-out:
+-	kfree(pd_bo_list_entries);
+-
+-	return ret;
+-}
+-
+-/* Worker callback to restore evicted userptr BOs
+- *
+- * Tries to update and validate all userptr BOs. If successful and no
+- * concurrent evictions happened, the queues are restarted. Otherwise,
+- * reschedule for another attempt later.
+- */
+-static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
+-{
+-	struct delayed_work *dwork = to_delayed_work(work);
+-	struct amdkfd_process_info *process_info =
+-		container_of(dwork, struct amdkfd_process_info, work);
+-	struct task_struct *usertask;
+-	struct mm_struct *mm;
+-	int evicted_bos;
+-
+-	evicted_bos = atomic_read(&process_info->evicted_bos);
+-	if (!evicted_bos)
+-		return;
+-
+-	/* Reference task and mm in case of concurrent process termination */
+-	usertask = get_pid_task(process_info->pid, PIDTYPE_PID);
+-	if (!usertask)
+-		return;
+-	mm = get_task_mm(usertask);
+-	if (!mm) {
+-		put_task_struct(usertask);
+-		return;
+-	}
+-
+-	mutex_lock(&process_info->lock);
+-
+-	if (update_invalid_user_pages(process_info, mm))
+-		goto unlock_out;
+-	/* userptr_inval_list can be empty if all evicted userptr BOs
+-	 * have been freed. In that case there is nothing to validate
+-	 * and we can just restart the queues.
+-	 */
+-	if (!list_empty(&process_info->userptr_inval_list)) {
+-		if (atomic_read(&process_info->evicted_bos) != evicted_bos)
+-			goto unlock_out; /* Concurrent eviction, try again */
+-
+-		if (validate_invalid_user_pages(process_info))
+-			goto unlock_out;
+-	}
+-	/* Final check for concurrent eviction and atomic update. If
+-	 * another eviction happens after successful update, it will
+-	 * be a first eviction that calls quiesce_mm. The eviction
+-	 * reference counting inside KFD will handle this case.
+- */
+-	if (atomic_cmpxchg(&process_info->evicted_bos, evicted_bos, 0) !=
+-	    evicted_bos)
+-		goto unlock_out;
+-	evicted_bos = 0;
+-	if (kgd2kfd->resume_mm(NULL, mm)) {
+-		pr_err("%s: Failed to resume KFD\n", __func__);
+-		/* No recovery from this failure. Probably the CP is
+-		 * hanging. No point trying again.
+-		 */
+-	}
+-unlock_out:
+-	mutex_unlock(&process_info->lock);
+-	mmput(mm);
+-	put_task_struct(usertask);
+-
+-	/* If validation failed, reschedule another attempt */
+-	if (evicted_bos)
+-		schedule_delayed_work(&process_info->work, 1);
+-}
+-
+-/** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given
+- * KFD process identified by process_info
+- *
+- * @process_info: amdkfd_process_info of the KFD process
+- *
+- * After memory eviction, restore thread calls this function. The function
+- * should be called when the process is still valid. BO restore involves -
+- *
+- * 1. Release old eviction fence and create new one
+- * 2. Get two copies of PD BO list from all the VMs. Keep one copy as pd_list.
+- * 3. Use the second PD list and kfd_bo_list to create a list (ctx.list) of
+- *    BOs that need to be reserved.
+- * 4. Reserve all the BOs
+- * 5. Validate PD and PT BOs.
+- * 6. Validate all KFD BOs using kfd_bo_list and map them and add a new fence
+- * 7. Add fence to all PD and PT BOs.
+- * 8. Unreserve all BOs
+- */
+-
+-int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
+-{
+-	struct amdgpu_bo_list_entry *pd_bo_list;
+-	struct amdkfd_process_info *process_info = info;
+-	struct amdkfd_vm *peer_vm;
+-	struct kgd_mem *mem;
+-	struct bo_vm_reservation_context ctx;
+-	struct amdgpu_amdkfd_fence *new_fence;
+-	int ret = 0, i;
+-	struct list_head duplicate_save;
+-	struct amdgpu_sync sync_obj;
+-
+-	INIT_LIST_HEAD(&duplicate_save);
+-	INIT_LIST_HEAD(&ctx.list);
+-	INIT_LIST_HEAD(&ctx.duplicates);
+-
+-	pd_bo_list = kcalloc(process_info->n_vms,
+-			     sizeof(struct amdgpu_bo_list_entry),
+-			     GFP_KERNEL);
+-	if (pd_bo_list == NULL)
+-		return -ENOMEM;
+-
+-	i = 0;
+-	mutex_lock(&process_info->lock);
+-	list_for_each_entry(peer_vm, &process_info->vm_list_head,
+-			    vm_list_node)
+-		amdgpu_vm_get_pd_bo(&peer_vm->base, &ctx.list,
+-				    &pd_bo_list[i++]);
+-
+-	/* Reserve all BOs and page tables/directory. Add all BOs from
+-	 * kfd_bo_list to ctx.list
+-	 */
+-	list_for_each_entry(mem, &process_info->kfd_bo_list,
+-			    validate_list.head) {
+-
+-		list_add_tail(&mem->resv_list.head, &ctx.list);
+-		mem->resv_list.bo = mem->validate_list.bo;
+-		mem->resv_list.shared = mem->validate_list.shared;
+-	}
+-
+-	ret = ttm_eu_reserve_buffers(&ctx.ticket, &ctx.list,
+-				     false, &duplicate_save);
+-	if (ret) {
+-		pr_debug("Memory eviction: TTM Reserve Failed. Try again\n");
+-		goto ttm_reserve_fail;
+-	}
+-
+-	amdgpu_sync_create(&sync_obj);
+-	ctx.sync = &sync_obj;
+-
+-	/* Validate PDs and PTs */
+-	ret = process_validate_vms(process_info);
+-	if (ret)
+-		goto validate_map_fail;
+-
+-	/* Wait for PD/PTs validate to finish */
+-	/* FIXME: I think this isn't needed */
+-	list_for_each_entry(peer_vm, &process_info->vm_list_head,
+-			    vm_list_node) {
+-		struct amdgpu_bo *bo = peer_vm->base.root.base.bo;
+-
+-		ttm_bo_wait(&bo->tbo, false, false);
+-	}
+-
+-	/* Validate BOs and map them to GPUVM (update VM page tables).
*/ +- list_for_each_entry(mem, &process_info->kfd_bo_list, +- validate_list.head) { +- +- struct amdgpu_bo *bo = mem->bo; +- uint32_t domain = mem->domain; +- struct kfd_bo_va_list *bo_va_entry; +- +- ret = amdgpu_amdkfd_bo_validate(bo, domain, false); +- if (ret) { +- pr_debug("Memory eviction: Validate BOs failed. Try again\n"); +- goto validate_map_fail; +- } +- +- list_for_each_entry(bo_va_entry, &mem->bo_va_list, +- bo_list) { +- ret = update_gpuvm_pte((struct amdgpu_device *) +- bo_va_entry->kgd_dev, +- bo_va_entry, +- ctx.sync); +- if (ret) { +- pr_debug("Memory eviction: update PTE failed. Try again\n"); +- goto validate_map_fail; +- } +- } +- } +- +- amdgpu_sync_wait(ctx.sync, false); +- +- /* Release old eviction fence and create new one, because fence only +- * goes from unsignaled to signaled, fence cannot be reused. +- * Use context and mm from the old fence. +- */ +- new_fence = amdgpu_amdkfd_fence_create( +- process_info->eviction_fence->base.context, +- process_info->eviction_fence->mm); +- if (!new_fence) { +- pr_err("Failed to create eviction fence\n"); +- ret = -ENOMEM; +- goto validate_map_fail; +- } +- dma_fence_put(&process_info->eviction_fence->base); +- process_info->eviction_fence = new_fence; +- *ef = dma_fence_get(&new_fence->base); +- +- /* Wait for validate to finish and attach new eviction fence */ +- list_for_each_entry(mem, &process_info->kfd_bo_list, +- validate_list.head) +- ttm_bo_wait(&mem->bo->tbo, false, false); +- list_for_each_entry(mem, &process_info->kfd_bo_list, +- validate_list.head) +- amdgpu_bo_fence(mem->bo, +- &process_info->eviction_fence->base, true); +- +- /* Attach eviction fence to PD / PT BOs */ +- list_for_each_entry(peer_vm, &process_info->vm_list_head, +- vm_list_node) { +- struct amdgpu_bo *bo = peer_vm->base.root.base.bo; +- +- amdgpu_bo_fence(bo, &process_info->eviction_fence->base, true); +- } +-validate_map_fail: +- ttm_eu_backoff_reservation(&ctx.ticket, &ctx.list); +- amdgpu_sync_free(&sync_obj); +-ttm_reserve_fail: +- mutex_unlock(&process_info->lock); +-evict_fence_fail: +- kfree(pd_bo_list); +- return ret; +-} +- +-int amdgpu_amdkfd_copy_mem_to_mem(struct kgd_dev *kgd, struct kgd_mem *src_mem, +- uint64_t src_offset, struct kgd_mem *dst_mem, +- uint64_t dst_offset, uint64_t size, +- struct dma_fence **f, uint64_t *actual_size) +-{ +- struct amdgpu_device *adev = NULL; +- struct ttm_mem_reg *src = NULL, *dst = NULL; +- struct ttm_buffer_object *src_ttm_bo, *dst_ttm_bo; +- struct drm_mm_node *src_mm, *dst_mm; +- struct amdgpu_ring *ring; +- struct ww_acquire_ctx ticket; +- struct list_head list; +- struct ttm_validate_buffer resv_list[2]; +- uint64_t src_start, dst_start; +- uint64_t src_left, dst_left, cur_copy_size, total_copy_size = 0; +- struct dma_fence *fence = NULL; +- int r; +- +- if (!kgd || !src_mem || !dst_mem) +- return -EINVAL; +- +- if (actual_size) +- *actual_size = 0; +- +- adev = get_amdgpu_device(kgd); +- src_ttm_bo = &src_mem->bo->tbo; +- dst_ttm_bo = &dst_mem->bo->tbo; +- src = &src_ttm_bo->mem; +- dst = &dst_ttm_bo->mem; +- src_mm = (struct drm_mm_node *)src->mm_node; +- dst_mm = (struct drm_mm_node *)dst->mm_node; +- +- ring = adev->mman.buffer_funcs_ring; +- +- INIT_LIST_HEAD(&list); +- +- resv_list[0].bo = src_ttm_bo; +- resv_list[0].shared = true; +- resv_list[1].bo = dst_ttm_bo; +- resv_list[1].shared = true; +- +- list_add_tail(&resv_list[0].head, &list); +- list_add_tail(&resv_list[1].head, &list); +- +- if (!ring->ready) { +- pr_err("Trying to move memory with ring turned off.\n"); +- return 
-EINVAL; +- } +- +- r = ttm_eu_reserve_buffers(&ticket, &list, false, NULL); +- if (r) { +- pr_err("Copy buffer failed. Unable to reserve bo (%d)\n", r); +- return r; +- } +- +- switch (src->mem_type) { +- case TTM_PL_TT: +- r = amdgpu_ttm_bind(src_ttm_bo, src); +- if (r) { +- DRM_ERROR("Copy failed. Cannot bind to gart\n"); +- goto copy_fail; +- } +- break; +- case TTM_PL_VRAM: +- /* VRAM could be scattered. Find the node in which the offset +- * belongs to +- */ +- while (src_offset >= (src_mm->size << PAGE_SHIFT)) { +- src_offset -= (src_mm->size << PAGE_SHIFT); +- ++src_mm; +- } +- break; +- default: +- DRM_ERROR("Unknown placement %d\n", src->mem_type); +- r = -EINVAL; +- goto copy_fail; +- } +- src_start = src_mm->start << PAGE_SHIFT; +- src_start += src_ttm_bo->bdev->man[src->mem_type].gpu_offset; +- src_start += src_offset; +- src_left = (src_mm->size << PAGE_SHIFT) - src_offset; +- +- switch (dst->mem_type) { +- case TTM_PL_TT: +- r = amdgpu_ttm_bind(dst_ttm_bo, dst); +- if (r) { +- DRM_ERROR("Copy failed. Cannot bind to gart\n"); +- goto copy_fail; +- } +- break; +- case TTM_PL_VRAM: +- while (dst_offset >= (dst_mm->size << PAGE_SHIFT)) { +- dst_offset -= (dst_mm->size << PAGE_SHIFT); +- ++dst_mm; +- } +- break; +- default: +- DRM_ERROR("Unknown placement %d\n", dst->mem_type); +- r = -EINVAL; +- goto copy_fail; +- } +- dst_start = dst_mm->start << PAGE_SHIFT; +- dst_start += dst_ttm_bo->bdev->man[dst->mem_type].gpu_offset; +- dst_start += dst_offset; +- dst_left = (dst_mm->size << PAGE_SHIFT) - dst_offset; +- +- do { +- struct dma_fence *next; +- +- /* src_left/dst_left: amount of space left in the current node +- * Copy minimum of (src_left, dst_left, amount of bytes left to +- * copy) +- */ +- cur_copy_size = min3(src_left, dst_left, +- (size - total_copy_size)); +- +- r = amdgpu_copy_buffer(ring, src_start, dst_start, +- cur_copy_size, NULL, &next, false, false); +- if (r) +- break; +- +- /* Just keep the last fence */ +- dma_fence_put(fence); +- fence = next; +- +- total_copy_size += cur_copy_size; +- /* Required amount of bytes copied. Done. */ +- if (total_copy_size >= size) +- break; +- +- /* If end of src or dst node is reached, move to next node */ +- src_left -= cur_copy_size; +- if (!src_left) { +- ++src_mm; +- src_start = src_mm->start << PAGE_SHIFT; +- src_start += +- src_ttm_bo->bdev->man[src->mem_type].gpu_offset; +- src_left = src_mm->size << PAGE_SHIFT; +- } else +- src_start += cur_copy_size; +- +- dst_left -= cur_copy_size; +- if (!dst_left) { +- ++dst_mm; +- dst_start = dst_mm->start << PAGE_SHIFT; +- dst_start += +- dst_ttm_bo->bdev->man[dst->mem_type].gpu_offset; +- dst_left = dst_mm->size << PAGE_SHIFT; +- } else +- dst_start += cur_copy_size; +- +- } while (total_copy_size < size); +- +- /* Failure could occur after partial copy. 
So fill in amount copied +- * and fence, still fill-in +- */ +- if (actual_size) +- *actual_size = total_copy_size; +- +- if (fence) { +- amdgpu_bo_fence(src_mem->bo, fence, true); +- amdgpu_bo_fence(dst_mem->bo, fence, true); +- } +- +- if (f) +- *f = fence; +- +-copy_fail: +- ttm_eu_backoff_reservation(&ticket, &list); +- return r; +-} +- +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +index ff6f90a..5ad0580 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +@@ -27,7 +27,9 @@ + #include <linux/pagemap.h> + #include <drm/drmP.h> + #include <drm/amdgpu_drm.h> ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) + #include <drm/drm_syncobj.h> ++#endif + #include "amdgpu.h" + #include "amdgpu_trace.h" + +@@ -38,7 +40,7 @@ static int amdgpu_cs_user_fence_chunk(struct amdgpu_cs_parser *p, + struct drm_gem_object *gobj; + unsigned long size; + +- gobj = drm_gem_object_lookup(p->filp, data->handle); ++ gobj = kcl_drm_gem_object_lookup(p->adev->ddev, p->filp, data->handle); + if (gobj == NULL) + return -EINVAL; + +@@ -54,7 +56,7 @@ static int amdgpu_cs_user_fence_chunk(struct amdgpu_cs_parser *p, + + *offset = data->offset; + +- drm_gem_object_put_unlocked(gobj); ++ kcl_drm_gem_object_put_unlocked(gobj); + + if (amdgpu_ttm_tt_get_usermm(p->uf_entry.robj->tbo.ttm)) { + amdgpu_bo_unref(&p->uf_entry.robj); +@@ -90,7 +92,7 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data) + } + + /* get chunks */ +- chunk_array_user = u64_to_user_ptr(cs->in.chunks); ++ chunk_array_user = kcl_u64_to_user_ptr(cs->in.chunks); + if (copy_from_user(chunk_array, chunk_array_user, + sizeof(uint64_t)*cs->in.num_chunks)) { + ret = -EFAULT; +@@ -110,7 +112,7 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data) + struct drm_amdgpu_cs_chunk user_chunk; + uint32_t __user *cdata; + +- chunk_ptr = u64_to_user_ptr(chunk_array[i]); ++ chunk_ptr = kcl_u64_to_user_ptr(chunk_array[i]); + if (copy_from_user(&user_chunk, chunk_ptr, + sizeof(struct drm_amdgpu_cs_chunk))) { + ret = -EFAULT; +@@ -121,9 +123,13 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data) + p->chunks[i].length_dw = user_chunk.length_dw; + + size = p->chunks[i].length_dw; +- cdata = u64_to_user_ptr(user_chunk.chunk_data); ++ cdata = kcl_u64_to_user_ptr(user_chunk.chunk_data); + ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) ++ p->chunks[i].kdata = drm_malloc_ab(size, sizeof(uint32_t)); ++#else + p->chunks[i].kdata = kvmalloc_array(size, sizeof(uint32_t), GFP_KERNEL); ++#endif + if (p->chunks[i].kdata == NULL) { + ret = -ENOMEM; + i--; +@@ -155,8 +161,10 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data) + break; + + case AMDGPU_CHUNK_ID_DEPENDENCIES: ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) + case AMDGPU_CHUNK_ID_SYNCOBJ_IN: + case AMDGPU_CHUNK_ID_SYNCOBJ_OUT: ++#endif + break; + + default: +@@ -178,7 +186,11 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data) + i = p->nchunks - 1; + free_partial_kdata: + for (; i >= 0; i--) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) ++ drm_free_large(p->chunks[i].kdata); ++#else + kvfree(p->chunks[i].kdata); ++#endif + kfree(p->chunks); + p->chunks = NULL; + p->nchunks = 0; +@@ -477,16 +489,11 @@ static int amdgpu_cs_list_validate(struct amdgpu_cs_parser *p, + return -EPERM; + + /* Check if we have user pages and nobody bound the BO already */ +- if (amdgpu_ttm_tt_userptr_needs_pages(bo->tbo.ttm) 
&& +- lobj->user_pages) { +- amdgpu_ttm_placement_from_domain(bo, +- AMDGPU_GEM_DOMAIN_CPU); +- r = ttm_bo_validate(&bo->tbo, &bo->placement, true, +- false); +- if (r) +- return r; +- amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, +- lobj->user_pages); ++ if (lobj->user_pages && bo->tbo.ttm->state != tt_bound) { ++ size_t size = sizeof(struct page *); ++ ++ size *= bo->tbo.ttm->num_pages; ++ memcpy(bo->tbo.ttm->pages, lobj->user_pages, size); + binding_userptr = true; + } + +@@ -498,7 +505,11 @@ static int amdgpu_cs_list_validate(struct amdgpu_cs_parser *p, + return r; + + if (binding_userptr) { ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) ++ drm_free_large(lobj->user_pages); ++#else + kvfree(lobj->user_pages); ++#endif + lobj->user_pages = NULL; + } + } +@@ -511,6 +522,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, + struct amdgpu_fpriv *fpriv = p->filp->driver_priv; + struct amdgpu_bo_list_entry *e; + struct list_head duplicates; ++ bool need_mmap_lock = false; + unsigned i, tries = 10; + int r; + +@@ -518,9 +530,9 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, + + p->bo_list = amdgpu_bo_list_get(fpriv, cs->in.bo_list_handle); + if (p->bo_list) { ++ need_mmap_lock = p->bo_list->first_userptr != ++ p->bo_list->num_entries; + amdgpu_bo_list_get_list(p->bo_list, &p->validated); +- if (p->bo_list->first_userptr != p->bo_list->num_entries) +- p->mn = amdgpu_mn_get(p->adev); + } + + INIT_LIST_HEAD(&duplicates); +@@ -529,6 +541,9 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, + if (p->uf_entry.robj && !p->uf_entry.robj->parent) + list_add(&p->uf_entry.tv.head, &p->validated); + ++ if (need_mmap_lock) ++ down_read(¤t->mm->mmap_sem); ++ + while (1) { + struct list_head need_pages; + unsigned i; +@@ -548,25 +563,27 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, + INIT_LIST_HEAD(&need_pages); + for (i = p->bo_list->first_userptr; + i < p->bo_list->num_entries; ++i) { +- struct amdgpu_bo *bo; + + e = &p->bo_list->array[i]; +- bo = e->robj; +- +- if (amdgpu_ttm_tt_userptr_invalidated(bo->tbo.ttm, ++ ++ if (amdgpu_ttm_tt_userptr_invalidated(e->robj->tbo.ttm, + &e->user_invalidated) && e->user_pages) { + + /* We acquired a page array, but somebody + * invalidated it. 
Free it and try again + */ + release_pages(e->user_pages, +- bo->tbo.ttm->num_pages, ++ e->robj->tbo.ttm->num_pages, + false); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) ++ drm_free_large(e->user_pages); ++#else + kvfree(e->user_pages); ++#endif + e->user_pages = NULL; + } + +- if (amdgpu_ttm_tt_userptr_needs_pages(bo->tbo.ttm) && ++ if (e->robj->tbo.ttm->state != tt_bound && + !e->user_pages) { + list_del(&e->tv.head); + list_add(&e->tv.head, &need_pages); +@@ -592,9 +609,14 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, + list_for_each_entry(e, &need_pages, tv.head) { + struct ttm_tt *ttm = e->robj->tbo.ttm; + ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) ++ e->user_pages = drm_calloc_large(ttm->num_pages, ++ sizeof(struct page*)); ++#else + e->user_pages = kvmalloc_array(ttm->num_pages, + sizeof(struct page*), + GFP_KERNEL | __GFP_ZERO); ++#endif + if (!e->user_pages) { + r = -ENOMEM; + DRM_ERROR("calloc failure in %s\n", __func__); +@@ -604,7 +626,11 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, + r = amdgpu_ttm_tt_get_user_pages(ttm, e->user_pages); + if (r) { + DRM_ERROR("amdgpu_ttm_tt_get_user_pages failed.\n"); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) ++ drm_free_large(e->user_pages); ++#else + kvfree(e->user_pages); ++#endif + e->user_pages = NULL; + goto error_free_pages; + } +@@ -643,6 +669,9 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, + + amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved, + p->bytes_moved_vis); ++ fpriv->vm.last_eviction_counter = ++ atomic64_read(&p->adev->num_evictions); ++ + if (p->bo_list) { + struct amdgpu_bo *gds = p->bo_list->gds_obj; + struct amdgpu_bo *gws = p->bo_list->gws_obj; +@@ -683,6 +712,9 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, + + error_free_pages: + ++ if (need_mmap_lock) ++ up_read(¤t->mm->mmap_sem); ++ + if (p->bo_list) { + for (i = p->bo_list->first_userptr; + i < p->bo_list->num_entries; ++i) { +@@ -694,7 +726,11 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, + release_pages(e->user_pages, + e->robj->tbo.ttm->num_pages, + false); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) ++ drm_free_large(e->user_pages); ++#else + kvfree(e->user_pages); ++#endif + } + } + +@@ -729,13 +765,19 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error, + { + unsigned i; + +- if (error && backoff) ++ if (!error) ++ ttm_eu_fence_buffer_objects(&parser->ticket, ++ &parser->validated, ++ parser->fence); ++ else if (backoff) + ttm_eu_backoff_reservation(&parser->ticket, + &parser->validated); + ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) + for (i = 0; i < parser->num_post_dep_syncobjs; i++) + drm_syncobj_put(parser->post_dep_syncobjs[i]); + kfree(parser->post_dep_syncobjs); ++#endif + + dma_fence_put(parser->fence); + +@@ -745,7 +787,11 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error, + amdgpu_bo_list_put(parser->bo_list); + + for (i = 0; i < parser->nchunks; i++) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) ++ drm_free_large(parser->chunks[i].kdata); ++#else + kvfree(parser->chunks[i].kdata); ++#endif + kfree(parser->chunks); + if (parser->job) + amdgpu_job_free(parser->job); +@@ -765,6 +811,10 @@ static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p) + if (r) + return r; + ++ r = amdgpu_sync_fence(adev, &p->job->sync, vm->last_dir_update); ++ if (r) ++ return r; ++ + r = amdgpu_vm_clear_freed(adev, vm, NULL); + if (r) + return r; +@@ -818,13 +868,7 @@ 
static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p) + + } + +- r = amdgpu_vm_handle_moved(adev, vm); +- if (r) +- return r; +- +- r = amdgpu_sync_fence(adev, &p->job->sync, vm->last_update); +- if (r) +- return r; ++ r = amdgpu_vm_clear_moved(adev, vm, &p->job->sync); + + if (amdgpu_vm_debug && p->bo_list) { + /* Invalidate all BOs to test for userspace bugs */ +@@ -834,7 +878,7 @@ static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p) + if (!bo) + continue; + +- amdgpu_vm_bo_invalidate(adev, bo, false); ++ amdgpu_vm_bo_invalidate(adev, bo); + } + } + +@@ -859,7 +903,7 @@ static int amdgpu_cs_ib_vm_chunk(struct amdgpu_device *adev, + } + + if (p->job->vm) { +- p->job->vm_pd_addr = amdgpu_bo_gpu_offset(vm->root.base.bo); ++ p->job->vm_pd_addr = amdgpu_bo_gpu_offset(vm->root.bo); + + r = amdgpu_bo_vm_update_pte(p); + if (r) +@@ -927,11 +971,11 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev, + uint64_t offset; + uint8_t *kptr; + +- r = amdgpu_cs_find_mapping(parser, chunk_ib->va_start, +- &aobj, &m); +- if (r) { ++ m = amdgpu_cs_find_mapping(parser, chunk_ib->va_start, ++ &aobj); ++ if (!aobj) { + DRM_ERROR("IB va_start is invalid\n"); +- return r; ++ return -EINVAL; + } + + if ((chunk_ib->va_start + chunk_ib->ib_bytes) > +@@ -1029,12 +1073,13 @@ static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p, + return 0; + } + ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) + static int amdgpu_syncobj_lookup_and_add_to_sync(struct amdgpu_cs_parser *p, + uint32_t handle) + { + int r; + struct dma_fence *fence; +- r = drm_syncobj_find_fence(p->filp, handle, &fence); ++ r = drm_syncobj_fence_get(p->filp, handle, &fence); + if (r) + return r; + +@@ -1089,6 +1134,7 @@ static int amdgpu_cs_process_syncobj_out_dep(struct amdgpu_cs_parser *p, + } + return 0; + } ++#endif + + static int amdgpu_cs_dependencies(struct amdgpu_device *adev, + struct amdgpu_cs_parser *p) +@@ -1104,6 +1150,7 @@ static int amdgpu_cs_dependencies(struct amdgpu_device *adev, + r = amdgpu_cs_process_fence_dep(p, chunk); + if (r) + return r; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) + } else if (chunk->chunk_id == AMDGPU_CHUNK_ID_SYNCOBJ_IN) { + r = amdgpu_cs_process_syncobj_in_dep(p, chunk); + if (r) +@@ -1112,12 +1159,14 @@ static int amdgpu_cs_dependencies(struct amdgpu_device *adev, + r = amdgpu_cs_process_syncobj_out_dep(p, chunk); + if (r) + return r; ++#endif + } + } + + return amdgpu_sem_add_cs(p->ctx, p->job->ring, &p->job->dep_sync); + } + ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) + static void amdgpu_cs_post_dependencies(struct amdgpu_cs_parser *p) + { + int i; +@@ -1125,6 +1174,7 @@ static void amdgpu_cs_post_dependencies(struct amdgpu_cs_parser *p) + for (i = 0; i < p->num_post_dep_syncobjs; ++i) + drm_syncobj_replace_fence(p->post_dep_syncobjs[i], p->fence); + } ++#endif + + static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, + union drm_amdgpu_cs *cs) +@@ -1132,29 +1182,14 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, + struct amdgpu_ring *ring = p->job->ring; + struct amd_sched_entity *entity = &p->ctx->rings[ring->idx].entity; + struct amdgpu_job *job; +- unsigned i; + int r; + +- amdgpu_mn_lock(p->mn); +- if (p->bo_list) { +- for (i = p->bo_list->first_userptr; +- i < p->bo_list->num_entries; ++i) { +- struct amdgpu_bo *bo = p->bo_list->array[i].robj; +- +- if (amdgpu_ttm_tt_userptr_needs_pages(bo->tbo.ttm)) { +- amdgpu_mn_unlock(p->mn); +- return -ERESTARTSYS; +- } +- } +- } +- + job = p->job; + p->job = NULL; + + r = 
amd_sched_job_init(&job->base, &ring->sched, entity, p->filp); + if (r) { + amdgpu_job_free(job); +- amdgpu_mn_unlock(p->mn); + return r; + } + +@@ -1162,18 +1197,17 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, + job->fence_ctx = entity->fence_context; + p->fence = dma_fence_get(&job->base.s_fence->finished); + ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) + amdgpu_cs_post_dependencies(p); ++#endif + + cs->out.handle = amdgpu_ctx_add_fence(p->ctx, ring, p->fence); + job->uf_sequence = cs->out.handle; + amdgpu_job_free_resources(job); ++ amdgpu_cs_parser_fini(p, 0, true); + + trace_amdgpu_cs_ioctl(job); + amd_sched_entity_push_job(&job->base); +- +- ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence); +- amdgpu_mn_unlock(p->mn); +- + return 0; + } + +@@ -1228,7 +1262,10 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) + goto out; + + r = amdgpu_cs_submit(&parser, cs); ++ if (r) ++ goto out; + ++ return 0; + out: + amdgpu_cs_parser_fini(&parser, r, reserved_buffers); + return r; +@@ -1274,7 +1311,7 @@ int amdgpu_cs_wait_ioctl(struct drm_device *dev, void *data, + if (IS_ERR(fence)) + r = PTR_ERR(fence); + else if (fence) { +- r = dma_fence_wait_timeout(fence, true, timeout); ++ r = kcl_fence_wait_timeout(fence, true, timeout); + dma_fence_put(fence); + } else + r = 1; +@@ -1349,7 +1386,7 @@ static int amdgpu_cs_wait_all_fences(struct amdgpu_device *adev, + else if (!fence) + continue; + +- r = dma_fence_wait_timeout(fence, true, timeout); ++ r = kcl_fence_wait_timeout(fence, true, timeout); + dma_fence_put(fence); + if (r < 0) + return r; +@@ -1401,13 +1438,12 @@ static int amdgpu_cs_wait_any_fence(struct amdgpu_device *adev, + array[i] = fence; + } else { /* NULL, the fence has been already signaled */ + r = 1; +- first = i; + goto out; + } + } + +- r = dma_fence_wait_any_timeout(array, fence_count, true, timeout, +- &first); ++ r = kcl_fence_wait_any_timeout(array, fence_count, true, timeout, ++ &first); + if (r < 0) + goto err_free_fence_array; + +@@ -1452,7 +1488,7 @@ int amdgpu_cs_wait_fences_ioctl(struct drm_device *dev, void *data, + if (fences == NULL) + return -ENOMEM; + +- fences_user = u64_to_user_ptr(wait->in.fences); ++ fences_user = kcl_u64_to_user_ptr(wait->in.fences); + if (copy_from_user(fences, fences_user, + sizeof(struct drm_amdgpu_fence) * fence_count)) { + r = -EFAULT; +@@ -1481,36 +1517,78 @@ int amdgpu_cs_wait_fences_ioctl(struct drm_device *dev, void *data, + * virtual memory address. Returns allocation structure when found, NULL + * otherwise. 
+ */ +-int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser, +- uint64_t addr, struct amdgpu_bo **bo, +- struct amdgpu_bo_va_mapping **map) ++struct amdgpu_bo_va_mapping * ++amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser, ++ uint64_t addr, struct amdgpu_bo **bo) + { +- struct amdgpu_fpriv *fpriv = parser->filp->driver_priv; +- struct amdgpu_vm *vm = &fpriv->vm; + struct amdgpu_bo_va_mapping *mapping; +- int r; ++ unsigned i; ++ ++ if (!parser->bo_list) ++ return NULL; + + addr /= AMDGPU_GPU_PAGE_SIZE; + +- mapping = amdgpu_vm_bo_lookup_mapping(vm, addr); +- if (!mapping || !mapping->bo_va || !mapping->bo_va->base.bo) +- return -EINVAL; ++ for (i = 0; i < parser->bo_list->num_entries; i++) { ++ struct amdgpu_bo_list_entry *lobj; + +- *bo = mapping->bo_va->base.bo; +- *map = mapping; ++ lobj = &parser->bo_list->array[i]; ++ if (!lobj->bo_va) ++ continue; + +- /* Double check that the BO is reserved by this CS */ +- if (READ_ONCE((*bo)->tbo.resv->lock.ctx) != &parser->ticket) +- return -EINVAL; ++ list_for_each_entry(mapping, &lobj->bo_va->valids, list) { ++ if (mapping->start > addr || ++ addr > mapping->last) ++ continue; + +- r = amdgpu_ttm_bind(&(*bo)->tbo, &(*bo)->tbo.mem); +- if (unlikely(r)) +- return r; ++ *bo = lobj->bo_va->base.bo; ++ return mapping; ++ } ++ ++ list_for_each_entry(mapping, &lobj->bo_va->invalids, list) { ++ if (mapping->start > addr || ++ addr > mapping->last) ++ continue; + +- if ((*bo)->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) ++ *bo = lobj->bo_va->base.bo; ++ return mapping; ++ } ++ } ++ ++ return NULL; ++} ++ ++/** ++ * amdgpu_cs_sysvm_access_required - make BOs accessible by the system VM ++ * ++ * @parser: command submission parser context ++ * ++ * Helper for UVD/VCE VM emulation, make sure BOs are accessible by the system VM. 
++ */ ++int amdgpu_cs_sysvm_access_required(struct amdgpu_cs_parser *parser) ++{ ++ unsigned i; ++ int r; ++ ++ if (!parser->bo_list) + return 0; + +- (*bo)->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; +- amdgpu_ttm_placement_from_domain(*bo, (*bo)->allowed_domains); +- return ttm_bo_validate(&(*bo)->tbo, &(*bo)->placement, false, false); ++ for (i = 0; i < parser->bo_list->num_entries; i++) { ++ struct amdgpu_bo *bo = parser->bo_list->array[i].robj; ++ ++ r = amdgpu_ttm_bind(&bo->tbo, &bo->tbo.mem); ++ if (unlikely(r)) ++ return r; ++ ++ if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) ++ continue; ++ ++ bo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; ++ amdgpu_ttm_placement_from_domain(bo, bo->allowed_domains); ++ r = ttm_bo_validate(&bo->tbo, &bo->placement, false, false); ++ if (unlikely(r)) ++ return r; ++ } ++ ++ return 0; + } +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +index f032e87..37398e3 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +@@ -404,15 +404,6 @@ void amdgpu_pci_config_reset(struct amdgpu_device *adev) + */ + static int amdgpu_doorbell_init(struct amdgpu_device *adev) + { +- /* No doorbell on SI hardware generation */ +- if (adev->asic_type < CHIP_BONAIRE) { +- adev->doorbell.base = 0; +- adev->doorbell.size = 0; +- adev->doorbell.num_doorbells = 0; +- adev->doorbell.ptr = NULL; +- return 0; +- } +- + /* doorbell bar mapping */ + adev->doorbell.base = pci_resource_start(adev->pdev, 2); + adev->doorbell.size = pci_resource_len(adev->pdev, 2); +@@ -2130,8 +2121,9 @@ int amdgpu_device_init(struct amdgpu_device *adev, + DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); + DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); + +- /* doorbell bar mapping */ +- amdgpu_doorbell_init(adev); ++ if (adev->asic_type >= CHIP_BONAIRE) ++ /* doorbell bar mapping */ ++ amdgpu_doorbell_init(adev); + + /* io port mapping */ + for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { +@@ -2348,7 +2340,9 @@ void amdgpu_device_fini(struct amdgpu_device *adev) + amdgpu_atombios_fini(adev); + kfree(adev->bios); + adev->bios = NULL; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 12, 0) + if (!pci_is_thunderbolt_attached(adev->pdev)) ++#endif + vga_switcheroo_unregister_client(adev->pdev); + if (adev->flags & AMD_IS_PX) + vga_switcheroo_fini_domain_pm_ops(adev->dev); +@@ -2358,7 +2352,8 @@ void amdgpu_device_fini(struct amdgpu_device *adev) + adev->rio_mem = NULL; + iounmap(adev->rmmio); + adev->rmmio = NULL; +- amdgpu_doorbell_fini(adev); ++ if (adev->asic_type >= CHIP_BONAIRE) ++ amdgpu_doorbell_fini(adev); + amdgpu_debugfs_regs_cleanup(adev); + } + +@@ -3159,6 +3154,27 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev, + return 0; + } + ++#if defined(BUILD_AS_DKMS) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0) ++void amdgpu_debugfs_cleanup(struct drm_minor *minor) ++{ ++ struct drm_info_node *node, *tmp; ++ ++ if (!&minor->debugfs_root) ++ return 0; ++ ++ mutex_lock(&minor->debugfs_lock); ++ list_for_each_entry_safe(node, tmp, ++ &minor->debugfs_list, list) { ++ debugfs_remove(node->dent); ++ list_del(&node->list); ++ kfree(node); ++ } ++ mutex_unlock(&minor->debugfs_lock); ++ ++ return 0; ++} ++#endif ++ + #if defined(CONFIG_DEBUG_FS) + + static ssize_t amdgpu_debugfs_regs_read(struct file *f, char __user *buf, +@@ -3570,7 +3586,10 @@ static ssize_t amdgpu_debugfs_sensor_read(struct file *f, char __user *buf, + + valuesize = sizeof(values); 
+ if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->read_sensor) +- r = amdgpu_dpm_read_sensor(adev, idx, &values[0], &valuesize); ++ r = adev->powerplay.pp_funcs->read_sensor(adev->powerplay.pp_handle, idx, &values[0], &valuesize); ++ else if (adev->pm.funcs && adev->pm.funcs->read_sensor) ++ r = adev->pm.funcs->read_sensor(adev, idx, &values[0], ++ &valuesize); + else + return -EINVAL; + +@@ -3594,7 +3613,7 @@ static ssize_t amdgpu_debugfs_sensor_read(struct file *f, char __user *buf, + static ssize_t amdgpu_debugfs_wave_read(struct file *f, char __user *buf, + size_t size, loff_t *pos) + { +- struct amdgpu_device *adev = f->f_inode->i_private; ++ struct amdgpu_device *adev = file_inode(f)->i_private; + int r, x; + ssize_t result=0; + uint32_t offset, se, sh, cu, wave, simd, data[32]; +@@ -3644,7 +3663,8 @@ static ssize_t amdgpu_debugfs_wave_read(struct file *f, char __user *buf, + static ssize_t amdgpu_debugfs_gpr_read(struct file *f, char __user *buf, + size_t size, loff_t *pos) + { +- struct amdgpu_device *adev = f->f_inode->i_private; ++ struct amdgpu_device *adev = file_inode(f)->i_private; ++ + int r; + ssize_t result = 0; + uint32_t offset, se, sh, cu, wave, simd, thread, bank, *data; +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h +index 0d22259..12a4a78 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h +@@ -427,6 +427,7 @@ struct amdgpu_pm { + struct amdgpu_dpm dpm; + const struct firmware *fw; /* SMC firmware */ + uint32_t fw_version; ++ const struct amdgpu_dpm_funcs *funcs; + uint32_t pcie_gen_mask; + uint32_t pcie_mlw_mask; + struct amd_pp_display_configuration pm_display_cfg;/* set by dc */ +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +index 2be2e05..0720358 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +@@ -69,10 +69,9 @@ + * - 3.17.0 - Add AMDGPU_NUM_VRAM_CPU_PAGE_FAULTS. 
+ * - 3.18.0 - Export gpu always on cu bitmap + * - 3.19.0 - Add support for UVD MJPEG decode +- * - 3.20.0 - Add support for local BOs + */ + #define KMS_DRIVER_MAJOR 3 +-#define KMS_DRIVER_MINOR 20 ++#define KMS_DRIVER_MINOR 19 + #define KMS_DRIVER_PATCHLEVEL 0 + + int amdgpu_vram_limit = 0; +@@ -124,7 +123,6 @@ int amdgpu_cntl_sb_buf_per_se = 0; + int amdgpu_param_buf_per_se = 0; + int amdgpu_job_hang_limit = 0; + int amdgpu_lbpw = -1; +-int amdgpu_compute_multipipe = -1; + + MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes"); + module_param_named(vramlimit, amdgpu_vram_limit, int, 0600); +@@ -274,9 +272,6 @@ module_param_named(job_hang_limit, amdgpu_job_hang_limit, int ,0444); + MODULE_PARM_DESC(lbpw, "Load Balancing Per Watt (LBPW) support (1 = enable, 0 = disable, -1 = auto)"); + module_param_named(lbpw, amdgpu_lbpw, int, 0444); + +-MODULE_PARM_DESC(compute_multipipe, "Force compute queues to be spread across pipes (1 = enable, 0 = disable, -1 = auto)"); +-module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444); +- + #ifdef CONFIG_DRM_AMDGPU_SI + + int amdgpu_si_support = 1; +@@ -822,10 +817,8 @@ static struct drm_driver kms_driver = { + .open = amdgpu_driver_open_kms, + .postclose = amdgpu_driver_postclose_kms, + .lastclose = amdgpu_driver_lastclose_kms, ++ .set_busid = drm_pci_set_busid, + .unload = amdgpu_driver_unload_kms, +- .get_vblank_counter = amdgpu_get_vblank_counter_kms, +- .enable_vblank = amdgpu_enable_vblank_kms, +- .disable_vblank = amdgpu_disable_vblank_kms, + .get_vblank_timestamp = drm_calc_vbltimestamp_from_scanoutpos, + .get_scanout_position = amdgpu_get_crtc_scanout_position, + #if defined(CONFIG_DEBUG_FS) +@@ -841,6 +834,7 @@ static struct drm_driver kms_driver = { + .gem_close_object = amdgpu_gem_object_close, + .dumb_create = amdgpu_mode_dumb_create, + .dumb_map_offset = amdgpu_mode_dumb_mmap, ++ .dumb_destroy = drm_gem_dumb_destroy, + .fops = &amdgpu_driver_kms_fops, + + .prime_handle_to_fd = drm_gem_prime_handle_to_fd, +@@ -931,4 +925,3 @@ module_exit(amdgpu_exit); + MODULE_AUTHOR(DRIVER_AUTHOR); + MODULE_DESCRIPTION(DRIVER_DESC); + MODULE_LICENSE("GPL and additional rights"); +-MODULE_VERSION("17.50.2.13"); +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +index 3d08c6f..fdb9d85 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +@@ -44,12 +44,20 @@ + * This is the main unload function for KMS (all asics). + * Returns 0 on success. 
+ */ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0) ++int amdgpu_driver_unload_kms(struct drm_device *dev) ++#else + void amdgpu_driver_unload_kms(struct drm_device *dev) ++#endif + { + struct amdgpu_device *adev = dev->dev_private; + + if (adev == NULL) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0) ++ return 0; ++#else + return; ++#endif + + if (adev->rmmio == NULL) + goto done_free; +@@ -71,6 +79,9 @@ void amdgpu_driver_unload_kms(struct drm_device *dev) + done_free: + kfree(adev); + dev->dev_private = NULL; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0) ++ return 0; ++#endif + } + + /** +@@ -129,8 +140,12 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags) + amdgpu_has_atpx() && + (amdgpu_is_atpx_hybrid() || + amdgpu_has_atpx_dgpu_power_cntl()) && ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) ++ ((flags & AMD_IS_APU) == 0)) ++#else + ((flags & AMD_IS_APU) == 0) && + !pci_is_thunderbolt_attached(dev->pdev)) ++#endif + flags |= AMD_IS_PX; + + /* amdgpu_device_init should report only fatal error +@@ -1052,6 +1067,72 @@ void amdgpu_disable_vblank_kms(struct drm_device *dev, unsigned int pipe) + amdgpu_irq_put(adev, &adev->crtc_irq, idx); + } + ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0) ++/** ++ * amdgpu_get_vblank_timestamp_kms - get vblank timestamp ++ * ++ * @dev: drm dev pointer ++ * @crtc: crtc to get the timestamp for ++ * @max_error: max error ++ * @vblank_time: time value ++ * @flags: flags passed to the driver ++ * ++ * Gets the timestamp on the requested crtc based on the ++ * scanout position. (all asics). ++ * Returns postive status flags on success, negative error on failure. ++ */ ++int amdgpu_get_vblank_timestamp_kms(struct drm_device *dev, unsigned int pipe, ++ int *max_error, ++ struct timeval *vblank_time, ++ unsigned flags) ++{ ++ struct drm_crtc *crtc; ++ struct amdgpu_device *adev = dev->dev_private; ++ ++ if (pipe >= dev->num_crtcs) { ++ DRM_ERROR("Invalid crtc %u\n", pipe); ++ return -EINVAL; ++ } ++ ++ /* Get associated drm_crtc: */ ++ crtc = &adev->mode_info.crtcs[pipe]->base; ++ if (!crtc) { ++ /* This can occur on driver load if some component fails to ++ * initialize completely and driver is unloaded */ ++ DRM_ERROR("Uninitialized crtc %d\n", pipe); ++ return -EINVAL; ++ } ++ ++ /* Helper routine in DRM core does all the work: */ ++ return kcl_drm_calc_vbltimestamp_from_scanoutpos(dev, pipe, max_error, ++ vblank_time, flags, ++ crtc, &crtc->hwmode); ++} ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) ++const struct drm_ioctl_desc amdgpu_ioctls_kms[] = { ++ DRM_IOCTL_DEF_DRV(AMDGPU_GEM_CREATE, amdgpu_gem_create_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), ++ DRM_IOCTL_DEF_DRV(AMDGPU_CTX, amdgpu_ctx_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), ++ DRM_IOCTL_DEF_DRV(AMDGPU_BO_LIST, amdgpu_bo_list_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), ++ /* KMS */ ++ DRM_IOCTL_DEF_DRV(AMDGPU_GEM_MMAP, amdgpu_gem_mmap_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), ++ DRM_IOCTL_DEF_DRV(AMDGPU_GEM_WAIT_IDLE, amdgpu_gem_wait_idle_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), ++ DRM_IOCTL_DEF_DRV(AMDGPU_CS, amdgpu_cs_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), ++ DRM_IOCTL_DEF_DRV(AMDGPU_INFO, amdgpu_info_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), ++ DRM_IOCTL_DEF_DRV(AMDGPU_WAIT_CS, amdgpu_cs_wait_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), ++ DRM_IOCTL_DEF_DRV(AMDGPU_WAIT_FENCES, amdgpu_cs_wait_fences_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), ++ DRM_IOCTL_DEF_DRV(AMDGPU_GEM_METADATA, 
amdgpu_gem_metadata_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), ++ DRM_IOCTL_DEF_DRV(AMDGPU_GEM_VA, amdgpu_gem_va_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), ++ DRM_IOCTL_DEF_DRV(AMDGPU_GEM_OP, amdgpu_gem_op_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), ++ DRM_IOCTL_DEF_DRV(AMDGPU_GEM_USERPTR, amdgpu_gem_userptr_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), ++ DRM_IOCTL_DEF_DRV(AMDGPU_GEM_FIND_BO, amdgpu_gem_find_bo_by_cpu_mapping_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), ++ DRM_IOCTL_DEF_DRV(AMDGPU_FREESYNC, amdgpu_freesync_ioctl, DRM_MASTER|DRM_UNLOCKED), ++ DRM_IOCTL_DEF_DRV(AMDGPU_GEM_FIND_BO, amdgpu_gem_find_bo_by_cpu_mapping_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), ++ DRM_IOCTL_DEF_DRV(AMDGPU_GEM_DGMA, amdgpu_gem_dgma_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), ++ DRM_IOCTL_DEF_DRV(AMDGPU_SEM, amdgpu_sem_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), ++}; ++#else + const struct drm_ioctl_desc amdgpu_ioctls_kms[] = { + DRM_IOCTL_DEF_DRV(AMDGPU_GEM_CREATE, amdgpu_gem_create_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(AMDGPU_CTX, amdgpu_ctx_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), +@@ -1073,6 +1154,7 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = { + DRM_IOCTL_DEF_DRV(AMDGPU_GEM_DGMA, amdgpu_gem_dgma_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(AMDGPU_SEM, amdgpu_sem_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), + }; ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) */ + const int amdgpu_max_kms_ioctl = ARRAY_SIZE(amdgpu_ioctls_kms); + + /* +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +index d25ec38..430c622 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +@@ -50,10 +50,8 @@ struct amdgpu_mn { + struct hlist_node node; + + /* objects protected by lock */ +- struct rw_semaphore lock; +- struct rb_root_cached objects; +- struct mutex read_lock; +- atomic_t recursion; ++ struct mutex lock; ++ struct rb_root objects; + }; + + struct amdgpu_mn_node { +@@ -76,17 +74,17 @@ static void amdgpu_mn_destroy(struct work_struct *work) + struct amdgpu_bo *bo, *next_bo; + + mutex_lock(&adev->mn_lock); +- down_write(&rmn->lock); ++ mutex_lock(&rmn->lock); + hash_del(&rmn->node); +- rbtree_postorder_for_each_entry_safe(node, next_node, +- &rmn->objects.rb_root, it.rb) { ++ rbtree_postorder_for_each_entry_safe(node, next_node, &rmn->objects, ++ it.rb) { + list_for_each_entry_safe(bo, next_bo, &node->bos, mn_list) { + bo->mn = NULL; + list_del_init(&bo->mn_list); + } + kfree(node); + } +- up_write(&rmn->lock); ++ mutex_unlock(&rmn->lock); + mutex_unlock(&adev->mn_lock); + mmu_notifier_unregister_no_release(&rmn->mn, rmn->mm); + kfree(rmn); +@@ -108,53 +106,6 @@ static void amdgpu_mn_release(struct mmu_notifier *mn, + schedule_work(&rmn->work); + } + +- +-/** +- * amdgpu_mn_lock - take the write side lock for this mn +- */ +-void amdgpu_mn_lock(struct amdgpu_mn *mn) +-{ +- if (mn) +- down_write(&mn->lock); +-} +- +-/** +- * amdgpu_mn_unlock - drop the write side lock for this mn +- */ +-void amdgpu_mn_unlock(struct amdgpu_mn *mn) +-{ +- if (mn) +- up_write(&mn->lock); +-} +- +-/** +- * amdgpu_mn_read_lock - take the rmn read lock +- * +- * @rmn: our notifier +- * +- * Take the rmn read side lock. 
+- */ +-static void amdgpu_mn_read_lock(struct amdgpu_mn *rmn) +-{ +- mutex_lock(&rmn->read_lock); +- if (atomic_inc_return(&rmn->recursion) == 1) +- down_read_non_owner(&rmn->lock); +- mutex_unlock(&rmn->read_lock); +-} +- +-/** +- * amdgpu_mn_read_unlock - drop the rmn read lock +- * +- * @rmn: our notifier +- * +- * Drop the rmn read side lock. +- */ +-static void amdgpu_mn_read_unlock(struct amdgpu_mn *rmn) +-{ +- if (atomic_dec_return(&rmn->recursion) == 0) +- up_read_non_owner(&rmn->lock); +-} +- + /** + * amdgpu_mn_invalidate_node - unmap all BOs of a node + * +@@ -175,12 +126,23 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node, + if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, start, end)) + continue; + +- r = reservation_object_wait_timeout_rcu(bo->tbo.resv, ++ r = amdgpu_bo_reserve(bo, true); ++ if (r) { ++ DRM_ERROR("(%ld) failed to reserve user bo\n", r); ++ continue; ++ } ++ ++ r = kcl_reservation_object_wait_timeout_rcu(bo->tbo.resv, + true, false, MAX_SCHEDULE_TIMEOUT); + if (r <= 0) + DRM_ERROR("(%ld) failed to wait for user bo\n", r); + +- amdgpu_ttm_tt_mark_user_pages(bo->tbo.ttm); ++ amdgpu_ttm_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU); ++ r = ttm_bo_validate(&bo->tbo, &bo->placement, false, false); ++ if (r) ++ DRM_ERROR("(%ld) failed to validate user bo\n", r); ++ ++ amdgpu_bo_unreserve(bo); + } + } + +@@ -206,7 +168,7 @@ static void amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn, + /* notification is exclusive, but interval is inclusive */ + end -= 1; + +- amdgpu_mn_read_lock(rmn); ++ mutex_lock(&rmn->lock); + + it = interval_tree_iter_first(&rmn->objects, start, end); + while (it) { +@@ -218,33 +180,12 @@ static void amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn, + amdgpu_mn_invalidate_node(node, start, end); + } + +- up_read(&rmn->lock); +-} +- +-/** +- * amdgpu_mn_invalidate_range_end - callback to notify about mm change +- * +- * @mn: our notifier +- * @mn: the mm this callback is about +- * @start: start of updated range +- * @end: end of updated range +- * +- * Release the lock again to allow new command submissions. +- */ +-static void amdgpu_mn_invalidate_range_end(struct mmu_notifier *mn, +- struct mm_struct *mm, +- unsigned long start, +- unsigned long end) +-{ +- struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn); +- +- amdgpu_mn_read_unlock(rmn); ++ mutex_unlock(&rmn->lock); + } + + static const struct mmu_notifier_ops amdgpu_mn_ops = { + .release = amdgpu_mn_release, + .invalidate_range_start = amdgpu_mn_invalidate_range_start, +- .invalidate_range_end = amdgpu_mn_invalidate_range_end, + }; + + /** +@@ -254,19 +195,30 @@ static const struct mmu_notifier_ops amdgpu_mn_ops = { + * + * Creates a notifier context for current->mm. 
+ */ +-struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) ++static struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) + { + struct mm_struct *mm = current->mm; + struct amdgpu_mn *rmn; + int r; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) ++ struct hlist_node *node; ++#endif + + mutex_lock(&adev->mn_lock); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 7, 0) ++ down_write(&mm->mmap_sem); ++#else + if (down_write_killable(&mm->mmap_sem)) { + mutex_unlock(&adev->mn_lock); + return ERR_PTR(-EINTR); + } ++#endif + ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) ++ hash_for_each_possible(adev->mn_hash, rmn, node, node, (unsigned long)mm) ++#else + hash_for_each_possible(adev->mn_hash, rmn, node, (unsigned long)mm) ++#endif + if (rmn->mm == mm) + goto release_locks; + +@@ -279,10 +231,8 @@ struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) + rmn->adev = adev; + rmn->mm = mm; + rmn->mn.ops = &amdgpu_mn_ops; +- init_rwsem(&rmn->lock); +- rmn->objects = RB_ROOT_CACHED; +- mutex_init(&rmn->read_lock); +- atomic_set(&rmn->recursion, 0); ++ mutex_init(&rmn->lock); ++ rmn->objects = RB_ROOT; + + r = __mmu_notifier_register(&rmn->mn, mm); + if (r) +@@ -328,7 +278,7 @@ int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr) + + INIT_LIST_HEAD(&bos); + +- down_write(&rmn->lock); ++ mutex_lock(&rmn->lock); + + while ((it = interval_tree_iter_first(&rmn->objects, addr, end))) { + kfree(node); +@@ -340,9 +290,9 @@ int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr) + } + + if (!node) { +- node = kmalloc(sizeof(struct amdgpu_mn_node), GFP_NOIO); ++ node = kmalloc(sizeof(struct amdgpu_mn_node), GFP_KERNEL); + if (!node) { +- up_write(&rmn->lock); ++ mutex_unlock(&rmn->lock); + return -ENOMEM; + } + } +@@ -357,7 +307,7 @@ int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr) + + interval_tree_insert(&node->it, &rmn->objects); + +- up_write(&rmn->lock); ++ mutex_unlock(&rmn->lock); + + return 0; + } +@@ -383,7 +333,7 @@ void amdgpu_mn_unregister(struct amdgpu_bo *bo) + return; + } + +- down_write(&rmn->lock); ++ mutex_lock(&rmn->lock); + + /* save the next list entry for later */ + head = bo->mn_list.next; +@@ -398,7 +348,6 @@ void amdgpu_mn_unregister(struct amdgpu_bo *bo) + kfree(node); + } + +- up_write(&rmn->lock); ++ mutex_unlock(&rmn->lock); + mutex_unlock(&adev->mn_lock); + } +- +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +index f421505..fb6c3d6 100755 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +@@ -36,7 +36,6 @@ + #include <drm/drm_cache.h> + #include "amdgpu.h" + #include "amdgpu_trace.h" +-#include "amdgpu_amdkfd.h" + + static void amdgpu_ttm_bo_destroy(struct ttm_buffer_object *tbo) + { +@@ -47,9 +46,8 @@ static void amdgpu_ttm_bo_destroy(struct ttm_buffer_object *tbo) + + if (bo->tbo.mem.mem_type == AMDGPU_PL_DGMA_IMPORT) + kfree(tbo->mem.bus.addr); +- if (bo->kfd_bo) +- amdgpu_amdkfd_unreserve_system_memory_limit(bo); + amdgpu_bo_kunmap(bo); ++ drm_gem_object_release(&bo->gem_base); + + if (bo->gem_base.import_attach) + drm_prime_gem_destroy(&bo->gem_base, bo->tbo.sg); +@@ -70,12 +68,11 @@ bool amdgpu_ttm_bo_is_amdgpu_bo(struct ttm_buffer_object *bo) + return false; + } + +-void amdgpu_ttm_placement_from_domain(struct amdgpu_bo *abo, u32 domain) ++static void amdgpu_ttm_placement_init(struct amdgpu_device *adev, ++ struct ttm_placement *placement, ++ struct ttm_place *places, ++ u32 domain, u64 flags) + { +- struct 
amdgpu_device *adev = amdgpu_ttm_adev(abo->tbo.bdev); +- struct ttm_placement *placement = &abo->placement; +- struct ttm_place *places = abo->placements; +- u64 flags = abo->flags; + u32 c = 0, i; + + if ((domain & AMDGPU_GEM_DOMAIN_DGMA) && amdgpu_direct_gma_size) { +@@ -178,6 +175,27 @@ void amdgpu_ttm_placement_from_domain(struct amdgpu_bo *abo, u32 domain) + placement->busy_placement = places; + } + ++void amdgpu_ttm_placement_from_domain(struct amdgpu_bo *abo, u32 domain) ++{ ++ struct amdgpu_device *adev = amdgpu_ttm_adev(abo->tbo.bdev); ++ ++ amdgpu_ttm_placement_init(adev, &abo->placement, abo->placements, ++ domain, abo->flags); ++} ++ ++static void amdgpu_fill_placement_to_bo(struct amdgpu_bo *bo, ++ struct ttm_placement *placement) ++{ ++ BUG_ON(placement->num_placement > (AMDGPU_GEM_DOMAIN_MAX + 1)); ++ ++ memcpy(bo->placements, placement->placement, ++ placement->num_placement * sizeof(struct ttm_place)); ++ bo->placement.num_placement = placement->num_placement; ++ bo->placement.num_busy_placement = placement->num_busy_placement; ++ bo->placement.placement = bo->placements; ++ bo->placement.busy_placement = bo->placements; ++} ++ + /** + * amdgpu_bo_create_reserved - create reserved BO for kernel use + * +@@ -309,13 +327,14 @@ void amdgpu_bo_free_kernel(struct amdgpu_bo **bo, u64 *gpu_addr, + *cpu_addr = NULL; + } + +-static int amdgpu_bo_do_create(struct amdgpu_device *adev, +- unsigned long size, int byte_align, +- bool kernel, u32 domain, u64 flags, +- struct sg_table *sg, +- struct reservation_object *resv, +- uint64_t init_value, +- struct amdgpu_bo **bo_ptr) ++int amdgpu_bo_create_restricted(struct amdgpu_device *adev, ++ unsigned long size, int byte_align, ++ bool kernel, u32 domain, u64 flags, ++ struct sg_table *sg, ++ struct ttm_placement *placement, ++ struct reservation_object *resv, ++ uint64_t init_value, ++ struct amdgpu_bo **bo_ptr) + { + struct amdgpu_bo *bo; + enum ttm_bo_type type; +@@ -342,10 +361,13 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev, + bo = kzalloc(sizeof(struct amdgpu_bo), GFP_KERNEL); + if (bo == NULL) + return -ENOMEM; +- ++ r = drm_gem_object_init(adev->ddev, &bo->gem_base, size); ++ if (unlikely(r)) { ++ kfree(bo); ++ return r; ++ } + INIT_LIST_HEAD(&bo->shadow_list); + INIT_LIST_HEAD(&bo->va); +- INIT_LIST_HEAD(&bo->gem_objects); + bo->preferred_domains = domain & (AMDGPU_GEM_DOMAIN_VRAM | + AMDGPU_GEM_DOMAIN_GTT | + AMDGPU_GEM_DOMAIN_CPU | +@@ -388,17 +410,13 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev, + bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC; + #endif + +- bo->tbo.bdev = &adev->mman.bdev; +- amdgpu_ttm_placement_from_domain(bo, domain); ++ amdgpu_fill_placement_to_bo(bo, placement); ++ /* Kernel allocation are uninterruptible */ + + initial_bytes_moved = atomic64_read(&adev->num_bytes_moved); +- /* Kernel allocation are uninterruptible */ + r = ttm_bo_init_reserved(&adev->mman.bdev, &bo->tbo, size, type, + &bo->placement, page_align, !kernel, NULL, + acc_size, sg, resv, &amdgpu_ttm_bo_destroy); +- if (unlikely(r != 0)) +- return r; +- + bytes_moved = atomic64_read(&adev->num_bytes_moved) - + initial_bytes_moved; + if (adev->mc.visible_vram_size < adev->mc.real_vram_size && +@@ -408,6 +426,9 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev, + else + amdgpu_cs_report_moved_bytes(adev, bytes_moved, 0); + ++ if (unlikely(r != 0)) ++ return r; ++ + if (domain & AMDGPU_GEM_DOMAIN_DGMA && adev->ssg.enabled) + bo->tbo.ssg_can_map = true; + +@@ -422,9 +443,13 @@ static int 
amdgpu_bo_do_create(struct amdgpu_device *adev, + if (unlikely(r)) + goto fail_unreserve; + ++#if defined(BUILD_AS_DKMS) ++ dma_fence_wait(fence, false); ++#else + amdgpu_bo_fence(bo, fence, false); + dma_fence_put(bo->tbo.moving); + bo->tbo.moving = dma_fence_get(fence); ++#endif + dma_fence_put(fence); + } + if (!resv) +@@ -459,17 +484,27 @@ static int amdgpu_bo_create_shadow(struct amdgpu_device *adev, + unsigned long size, int byte_align, + struct amdgpu_bo *bo) + { ++ struct ttm_placement placement = {0}; ++ struct ttm_place placements[AMDGPU_GEM_DOMAIN_MAX + 1]; + int r; + + if (bo->shadow) + return 0; + +- r = amdgpu_bo_do_create(adev, size, byte_align, true, +- AMDGPU_GEM_DOMAIN_GTT, +- AMDGPU_GEM_CREATE_CPU_GTT_USWC | +- AMDGPU_GEM_CREATE_SHADOW, +- NULL, bo->tbo.resv, 0, +- &bo->shadow); ++ memset(&placements, 0, sizeof(placements)); ++ amdgpu_ttm_placement_init(adev, &placement, placements, ++ AMDGPU_GEM_DOMAIN_GTT, ++ AMDGPU_GEM_CREATE_CPU_GTT_USWC | ++ AMDGPU_GEM_CREATE_SHADOW); ++ ++ r = amdgpu_bo_create_restricted(adev, size, byte_align, true, ++ AMDGPU_GEM_DOMAIN_GTT, ++ AMDGPU_GEM_CREATE_CPU_GTT_USWC | ++ AMDGPU_GEM_CREATE_SHADOW, ++ NULL, &placement, ++ bo->tbo.resv, ++ 0, ++ &bo->shadow); + if (!r) { + bo->shadow->parent = amdgpu_bo_ref(bo); + mutex_lock(&adev->shadow_list_lock); +@@ -491,11 +526,18 @@ int amdgpu_bo_create(struct amdgpu_device *adev, + uint64_t init_value, + struct amdgpu_bo **bo_ptr) + { ++ struct ttm_placement placement = {0}; ++ struct ttm_place placements[AMDGPU_GEM_DOMAIN_MAX + 1]; + uint64_t parent_flags = flags & ~AMDGPU_GEM_CREATE_SHADOW; + int r; + +- r = amdgpu_bo_do_create(adev, size, byte_align, kernel, domain, +- parent_flags, sg, resv, init_value, bo_ptr); ++ memset(&placements, 0, sizeof(placements)); ++ amdgpu_ttm_placement_init(adev, &placement, placements, ++ domain, parent_flags); ++ ++ r = amdgpu_bo_create_restricted(adev, size, byte_align, kernel, domain, ++ parent_flags, sg, &placement, resv, ++ init_value, bo_ptr); + if (r) + return r; + +@@ -931,7 +973,7 @@ void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, + return; + + abo = container_of(bo, struct amdgpu_bo, tbo); +- amdgpu_vm_bo_invalidate(adev, abo, evict); ++ amdgpu_vm_bo_invalidate(adev, abo); + + amdgpu_bo_kunmap(abo); + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h +index f73dba5..024e5cb 100755 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h +@@ -35,7 +35,6 @@ + + /* bo virtual addresses in a vm */ + struct amdgpu_bo_va_mapping { +- struct amdgpu_bo_va *bo_va; + struct list_head list; + struct rb_node rb; + uint64_t start; +@@ -50,17 +49,12 @@ struct amdgpu_bo_va { + struct amdgpu_vm_bo_base base; + + /* protected by bo being reserved */ +- unsigned ref_count; +- +- /* all other members protected by the VM PD being reserved */ + struct dma_fence *last_pt_update; ++ unsigned ref_count; + + /* mappings for this bo_va */ + struct list_head invalids; + struct list_head valids; +- +- /* If the mappings are cleared or filled */ +- bool cleared; + }; + + struct amdgpu_bo { +@@ -78,18 +72,16 @@ struct amdgpu_bo { + void *metadata; + u32 metadata_size; + unsigned prime_shared_count; +- /* GEM objects refereing to this BO */ +- struct list_head gem_objects; +- + /* list of all virtual address to which this bo is associated to */ + struct list_head va; + /* Constant after initialization */ ++ struct drm_gem_object gem_base; + struct amdgpu_bo *parent; + struct amdgpu_bo 
*shadow; + + struct ttm_bo_kmap_obj dma_buf_vmap; + struct amdgpu_mn *mn; +- struct kgd_mem *kfd_bo; ++ struct kfd_process_device *pdd; + + union { + struct list_head mn_list; +@@ -207,6 +199,14 @@ int amdgpu_bo_create(struct amdgpu_device *adev, + struct reservation_object *resv, + uint64_t init_value, + struct amdgpu_bo **bo_ptr); ++int amdgpu_bo_create_restricted(struct amdgpu_device *adev, ++ unsigned long size, int byte_align, ++ bool kernel, u32 domain, u64 flags, ++ struct sg_table *sg, ++ struct ttm_placement *placement, ++ struct reservation_object *resv, ++ uint64_t init_value, ++ struct amdgpu_bo **bo_ptr); + int amdgpu_bo_create_reserved(struct amdgpu_device *adev, + unsigned long size, int align, + u32 domain, struct amdgpu_bo **bo_ptr, +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +index 90adff8..06b824c 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +@@ -136,8 +136,7 @@ void amdgpu_ring_commit(struct amdgpu_ring *ring) + if (ring->funcs->end_use) + ring->funcs->end_use(ring); + +- if (ring->funcs->type != AMDGPU_RING_TYPE_KIQ) +- amdgpu_ring_lru_touch(ring->adev, ring); ++ amdgpu_ring_lru_touch(ring->adev, ring); + } + + /** +@@ -382,7 +381,7 @@ void amdgpu_ring_lru_touch(struct amdgpu_device *adev, struct amdgpu_ring *ring) + static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf, + size_t size, loff_t *pos) + { +- struct amdgpu_ring *ring = file_inode(f)->i_private; ++ struct amdgpu_ring *ring = (struct amdgpu_ring*)kcl_file_private(f); + int r, i; + uint32_t value, result, early[3]; + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +index af8e544..322d2529 100755 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +@@ -36,7 +36,6 @@ + /* some special values for the owner field */ + #define AMDGPU_FENCE_OWNER_UNDEFINED ((void*)0ul) + #define AMDGPU_FENCE_OWNER_VM ((void*)1ul) +-#define AMDGPU_FENCE_OWNER_KFD ((void *)2ul) + + #define AMDGPU_FENCE_FLAG_64BIT (1 << 0) + #define AMDGPU_FENCE_FLAG_INT (1 << 1) +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c +index 7ee8247..8492a26 100755 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c +@@ -31,7 +31,6 @@ + #include <drm/drmP.h> + #include "amdgpu.h" + #include "amdgpu_trace.h" +-#include "amdgpu_amdkfd.h" + + struct amdgpu_sync_entry { + struct hlist_node node; +@@ -85,20 +84,11 @@ static bool amdgpu_sync_same_dev(struct amdgpu_device *adev, + */ + static void *amdgpu_sync_get_owner(struct dma_fence *f) + { +- struct amd_sched_fence *s_fence; +- struct amdgpu_amdkfd_fence *kfd_fence; +- +- if (f == NULL) +- return AMDGPU_FENCE_OWNER_UNDEFINED; ++ struct amd_sched_fence *s_fence = to_amd_sched_fence(f); + +- s_fence = to_amd_sched_fence(f); + if (s_fence) + return s_fence->owner; + +- kfd_fence = to_amdgpu_amdkfd_fence(f); +- if (kfd_fence) +- return AMDGPU_FENCE_OWNER_KFD; +- + return AMDGPU_FENCE_OWNER_UNDEFINED; + } + +@@ -180,9 +170,7 @@ int amdgpu_sync_fence(struct amdgpu_device *adev, struct amdgpu_sync *sync, + * @sync: sync object to add fences from reservation object to + * @resv: reservation object with embedded fence + * @shared: true if we should only sync to the exclusive fence +- * +- * Sync to the fence except if it is KFD eviction fence and owner is +- * AMDGPU_FENCE_OWNER_VM. 
++ * Sync to the fence + */ + int amdgpu_sync_resv(struct amdgpu_device *adev, + struct amdgpu_sync *sync, +@@ -209,15 +197,12 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, + for (i = 0; i < flist->shared_count; ++i) { + f = rcu_dereference_protected(flist->shared[i], + reservation_object_held(resv)); +- fence_owner = amdgpu_sync_get_owner(f); +- if (fence_owner == AMDGPU_FENCE_OWNER_KFD && +- owner != AMDGPU_FENCE_OWNER_UNDEFINED) +- continue; + + if (amdgpu_sync_same_dev(adev, f)) { + /* VM updates are only interesting + * for other VM updates and moves. + */ ++ fence_owner = amdgpu_sync_get_owner(f); + if ((owner != AMDGPU_FENCE_OWNER_UNDEFINED) && + (fence_owner != AMDGPU_FENCE_OWNER_UNDEFINED) && + ((owner == AMDGPU_FENCE_OWNER_VM) != +diff --git a/drivers/gpu/drm/amd/amdgpu/vid.h b/drivers/gpu/drm/amd/amdgpu/vid.h +index d09592a..a648525 100755 +--- a/drivers/gpu/drm/amd/amdgpu/vid.h ++++ b/drivers/gpu/drm/amd/amdgpu/vid.h +@@ -27,8 +27,6 @@ + #define SDMA1_REGISTER_OFFSET 0x200 /* not a register */ + #define SDMA_MAX_INSTANCE 2 + +-#define KFD_VI_SDMA_QUEUE_OFFSET 0x80 /* not a register */ +- + /* crtc instance offsets */ + #define CRTC0_REGISTER_OFFSET (0x1b9c - 0x1b9c) + #define CRTC1_REGISTER_OFFSET (0x1d9c - 0x1b9c) +@@ -369,10 +367,6 @@ + * x=0: tmz_begin + * x=1: tmz_end + */ +-#define PACKET3_INVALIDATE_TLBS 0x98 +-# define PACKET3_INVALIDATE_TLBS_DST_SEL(x) ((x) << 0) +-# define PACKET3_INVALIDATE_TLBS_PASID(x) ((x) << 5) +- + #define PACKET3_SET_RESOURCES 0xA0 + /* 1. header + * 2. CONTROL +diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig +index 95be0dd..e13c67c 100644 +--- a/drivers/gpu/drm/amd/amdkfd/Kconfig ++++ b/drivers/gpu/drm/amd/amdkfd/Kconfig +@@ -4,7 +4,6 @@ + + config HSA_AMD + tristate "HSA kernel driver for AMD GPU devices" +- depends on (DRM_RADEON || DRM_AMDGPU) && (X86_64 || PPC64 || ARM64) +- select DRM_AMDGPU_USERPTR ++ depends on (DRM_RADEON || DRM_AMDGPU) && AMD_IOMMU_V2 && X86_64 + help + Enable this if you want to use HSA features on AMD GPU devices. +diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile +old mode 100755 +new mode 100644 +index dba08ec..b400d56 +--- a/drivers/gpu/drm/amd/amdkfd/Makefile ++++ b/drivers/gpu/drm/amd/amdkfd/Makefile +@@ -1,28 +1,19 @@ +-# SPDX-License-Identifier: GPL-2.0 + # + # Makefile for Heterogenous System Architecture support for AMD GPU devices + # + +-FULL_AMD_PATH=$(src)/.. 
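# Editorial note, not part of the patch: this revert swaps the
# $(src)-relative include paths for paths hard-coded relative to the
# kernel tree (-Idrivers/gpu/drm/amd/include/). Those resolve only when
# the compiler is invoked from the kernel build root, whereas the $(src)
# form being removed here also works for out-of-tree/DKMS builds, e.g.:
#
#   FULL_AMD_PATH = $(src)/..
#   ccflags-y := -I$(FULL_AMD_PATH)/include/ \
#                -I$(FULL_AMD_PATH)/include/asic_reg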
+- +-ccflags-y := -I$(FULL_AMD_PATH)/include/ \ +- -I$(FULL_AMD_PATH)/include/asic_reg ++ccflags-y := -Idrivers/gpu/drm/amd/include/ \ ++ -Idrivers/gpu/drm/amd/include/asic_reg + + amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \ + kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \ + kfd_process.o kfd_queue.o kfd_mqd_manager.o \ + kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \ +- kfd_mqd_manager_v9.o \ + kfd_kernel_queue.o kfd_kernel_queue_cik.o \ +- kfd_kernel_queue_vi.o kfd_kernel_queue_v9.o \ +- kfd_packet_manager.o kfd_process_queue_manager.o \ +- kfd_device_queue_manager.o kfd_device_queue_manager_cik.o \ +- kfd_device_queue_manager_vi.o kfd_device_queue_manager_v9.o \ +- kfd_interrupt.o kfd_events.o cik_event_interrupt.o kfd_int_process_v9.o \ +- kfd_dbgdev.o kfd_dbgmgr.o kfd_flat_memory.o kfd_crat.o kfd_rdma.o \ +- kfd_peerdirect.o kfd_ipc.o +- +-amdkfd-$(CONFIG_DEBUG_FS) += kfd_debugfs.o ++ kfd_kernel_queue_vi.o kfd_packet_manager.o \ ++ kfd_process_queue_manager.o kfd_device_queue_manager.o \ ++ kfd_device_queue_manager_cik.o kfd_device_queue_manager_vi.o \ ++ kfd_interrupt.o kfd_events.o cik_event_interrupt.o \ ++ kfd_dbgdev.o kfd_dbgmgr.o + + obj-$(CONFIG_HSA_AMD) += amdkfd.o +- +diff --git a/drivers/gpu/drm/amd/amdkfd/backport/Makefile b/drivers/gpu/drm/amd/amdkfd/backport/Makefile +deleted file mode 100644 +index 6a3845e..0000000 +--- a/drivers/gpu/drm/amd/amdkfd/backport/Makefile ++++ /dev/null +@@ -1,7 +0,0 @@ +- +- +-LINUXINCLUDE := $(DKMS_INCLUDE_PREFIX) $(LINUXINCLUDE) +- +-ccflags-y += \ +- -I$(AMDKFD_FULL_PATH) \ +- -include backport/backport.h +diff --git a/drivers/gpu/drm/amd/amdkfd/backport/backport.h b/drivers/gpu/drm/amd/amdkfd/backport/backport.h +deleted file mode 100644 +index e1f8c1d..0000000 +--- a/drivers/gpu/drm/amd/amdkfd/backport/backport.h ++++ /dev/null +@@ -1,6 +0,0 @@ +-#ifndef AMDKFD_BACKPORT_H +-#define AMDKFD_BACKPORT_H +- +-#include <linux/version.h> +- +-#endif +diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +index 00536a1..211fc48 100644 +--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c ++++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +@@ -24,89 +24,40 @@ + #include "kfd_events.h" + #include "cik_int.h" + +-static bool is_cpc_vm_fault(struct kfd_dev *dev, +- const uint32_t *ih_ring_entry) +-{ +- const struct cik_ih_ring_entry *ihre = +- (const struct cik_ih_ring_entry *)ih_ring_entry; +- +- if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || +- ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) && +- ihre->vmid >= dev->vm_info.first_vmid_kfd && +- ihre->vmid <= dev->vm_info.last_vmid_kfd) +- return true; +- return false; +-} +- + static bool cik_event_interrupt_isr(struct kfd_dev *dev, +- const uint32_t *ih_ring_entry, +- uint32_t *patched_ihre, +- bool *patched_flag) ++ const uint32_t *ih_ring_entry) + { ++ unsigned int pasid; + const struct cik_ih_ring_entry *ihre = + (const struct cik_ih_ring_entry *)ih_ring_entry; +- const struct kfd2kgd_calls *f2g = dev->kfd2kgd; +- struct cik_ih_ring_entry *tmp_ihre = +- (struct cik_ih_ring_entry *) patched_ihre; + +- /* This workaround is due to HW/FW limitation on Hawaii that +- * VMID and PASID are not written into ih_ring_entry +- */ +- if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || +- ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) && +- dev->device_info->asic_family == CHIP_HAWAII) { +- *patched_flag = true; +- *tmp_ihre = *ihre; ++ pasid = (ihre->ring_id & 0xffff0000) >> 16; + +- 
tmp_ihre->vmid = f2g->read_vmid_from_vmfault_reg(dev->kgd); +- tmp_ihre->pasid = f2g->get_atc_vmid_pasid_mapping_pasid( +- dev->kgd, tmp_ihre->vmid); +- return (tmp_ihre->pasid != 0) && +- tmp_ihre->vmid >= dev->vm_info.first_vmid_kfd && +- tmp_ihre->vmid <= dev->vm_info.last_vmid_kfd; +- } + /* Do not process in ISR, just request it to be forwarded to WQ. */ +- return (ihre->pasid != 0) && ++ return (pasid != 0) && + (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || +- ihre->source_id == CIK_INTSRC_SDMA_TRAP || + ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG || +- ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE || +- is_cpc_vm_fault(dev, ih_ring_entry)); ++ ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE); + } + + static void cik_event_interrupt_wq(struct kfd_dev *dev, + const uint32_t *ih_ring_entry) + { ++ unsigned int pasid; + const struct cik_ih_ring_entry *ihre = + (const struct cik_ih_ring_entry *)ih_ring_entry; + +- if (ihre->pasid == 0) ++ pasid = (ihre->ring_id & 0xffff0000) >> 16; ++ ++ if (pasid == 0) + return; + + if (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE) +- kfd_signal_event_interrupt(ihre->pasid, 0, 0); +- else if (ihre->source_id == CIK_INTSRC_SDMA_TRAP) +- kfd_signal_event_interrupt(ihre->pasid, 0, 0); ++ kfd_signal_event_interrupt(pasid, 0, 0); + else if (ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG) +- kfd_signal_event_interrupt(ihre->pasid, ihre->data & 0xFF, 8); ++ kfd_signal_event_interrupt(pasid, ihre->data & 0xFF, 8); + else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE) +- kfd_signal_hw_exception_event(ihre->pasid); +- else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || +- ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { +- struct kfd_vm_fault_info info; +- +- kfd_process_vm_fault(dev->dqm, ihre->pasid); +- +- memset(&info, 0, sizeof(info)); +- dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info); +- if (!info.page_addr && !info.status) +- return; +- +- if (info.vmid == ihre->vmid) +- kfd_signal_vm_fault_event(dev, ihre->pasid, &info); +- else +- kfd_signal_vm_fault_event(dev, ihre->pasid, NULL); +- } ++ kfd_signal_hw_exception_event(pasid); + } + + const struct kfd_event_interrupt_class event_interrupt_class_cik = { +diff --git a/drivers/gpu/drm/amd/amdkfd/cik_int.h b/drivers/gpu/drm/amd/amdkfd/cik_int.h +index ff8255d..79a16d2 100644 +--- a/drivers/gpu/drm/amd/amdkfd/cik_int.h ++++ b/drivers/gpu/drm/amd/amdkfd/cik_int.h +@@ -26,32 +26,16 @@ + #include <linux/types.h> + + struct cik_ih_ring_entry { +- uint32_t source_id:8; +- uint32_t reserved1:8; +- uint32_t reserved2:16; +- +- uint32_t data:28; +- uint32_t reserved3:4; +- +- /* pipeid, meid and unused3 are officially called RINGID, +- * but for our purposes, they always decode into pipe and ME. 
+- */ +- uint32_t pipeid:2; +- uint32_t meid:2; +- uint32_t reserved4:4; +- uint32_t vmid:8; +- uint32_t pasid:16; +- +- uint32_t reserved5; ++ uint32_t source_id; ++ uint32_t data; ++ uint32_t ring_id; ++ uint32_t reserved; + }; + + #define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6 + #define CIK_INTSRC_CP_END_OF_PIPE 0xB5 + #define CIK_INTSRC_CP_BAD_OPCODE 0xB7 + #define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF +-#define CIK_INTSRC_GFX_PAGE_INV_FAULT 0x92 +-#define CIK_INTSRC_GFX_MEM_PROT_FAULT 0x93 +-#define CIK_INTSRC_SDMA_TRAP 0xE0 + + #endif + +diff --git a/drivers/gpu/drm/amd/amdkfd/cik_regs.h b/drivers/gpu/drm/amd/amdkfd/cik_regs.h +index 37ce6dd..48769d1 100644 +--- a/drivers/gpu/drm/amd/amdkfd/cik_regs.h ++++ b/drivers/gpu/drm/amd/amdkfd/cik_regs.h +@@ -33,8 +33,7 @@ + #define APE1_MTYPE(x) ((x) << 7) + + /* valid for both DEFAULT_MTYPE and APE1_MTYPE */ +-#define MTYPE_CACHED_NV 0 +-#define MTYPE_CACHED 1 ++#define MTYPE_CACHED 0 + #define MTYPE_NONCACHED 3 + + #define DEFAULT_CP_HQD_PERSISTENT_STATE (0x33U << 8) +diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h +deleted file mode 100644 +index d5d1331..0000000 +--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h ++++ /dev/null +@@ -1,1384 +0,0 @@ +-/* +- * Copyright 2015 Advanced Micro Devices, Inc. +- * +- * Permission is hereby granted, free of charge, to any person obtaining a +- * copy of this software and associated documentation files (the "Software"), +- * to deal in the Software without restriction, including without limitation +- * the rights to use, copy, modify, merge, publish, distribute, sublicense, +- * and/or sell copies of the Software, and to permit persons to whom the +- * Software is furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in +- * all copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +- * OTHER DEALINGS IN THE SOFTWARE. +- */ +- +-#if 0 +-HW (VI) source code for CWSR trap handler +-#Version 18 + multiple trap handler +- +-// this performance-optimal version was originally from Seven Xu at SRDC +- +-// Revison #18 --... +-/* Rev History +-** #1. Branch from gc dv. //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) +-** #4. SR Memory Layout: +-** 1. VGPR-SGPR-HWREG-{LDS} +-** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern.. +-** #5. Update: 1. Accurate g8sr_ts_save_d timestamp +-** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation) +-** #7. Update: 1. don't barrier if noLDS +-** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version +-** 2. Fix SQ issue by s_sleep 2 +-** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last +-** 2. optimize s_buffer save by burst 16sgprs... 
+-** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs. +-** #11. Update 1. Add 2 more timestamp for debug version +-** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance +-** #13. Integ 1. Always use MUBUF for PV trap shader... +-** #14. Update 1. s_buffer_store soft clause... +-** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot. +-** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree +-** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part] +-** 2. PERF - Save LDS before save VGPR to cover LDS save long latency... +-** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32 +-** 2. FUNC - Handle non-CWSR traps +-*/ +- +-var G8SR_WDMEM_HWREG_OFFSET = 0 +-var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes +- +-// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore. +- +-var G8SR_DEBUG_TIMESTAMP = 0 +-var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset +-var s_g8sr_ts_save_s = s[34:35] // save start +-var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi +-var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ +-var s_g8sr_ts_save_d = s[40:41] // save end +-var s_g8sr_ts_restore_s = s[42:43] // restore start +-var s_g8sr_ts_restore_d = s[44:45] // restore end +- +-var G8SR_VGPR_SR_IN_DWX4 = 0 +-var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes +-var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 +- +- +-/*************************************************************************/ +-/* control on how to run the shader */ +-/*************************************************************************/ +-//any hack that needs to be made to run this code in EMU (either becasue various EMU code are not ready or no compute save & restore in EMU run) +-var EMU_RUN_HACK = 0 +-var EMU_RUN_HACK_RESTORE_NORMAL = 0 +-var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 +-var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 +-var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK +-var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK +-var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK +-var SAVE_LDS = 1 +-var WG_BASE_ADDR_LO = 0x9000a000 +-var WG_BASE_ADDR_HI = 0x0 +-var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem +-var CTX_SAVE_CONTROL = 0x0 +-var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL +-var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either becasue various RTL code are not ready or no compute save & restore in RTL run) +-var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write +-var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //becasue TC EMU curently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes +-var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing +- +-/**************************************************************************/ +-/* variables */ +-/**************************************************************************/ +-var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 +-var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 
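// Editorial note, not part of the patch: each *_MASK in this block is
// the field's bits placed at its *_SHIFT position, e.g. INST_ATC is
// bit 23 (1 << 23 == 0x00800000) and SPI_PRIO below is the two-bit
// priority field at bits [2:1] (0x00000006).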
+-var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 +- +-var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 +-var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 +-var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 +-var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 +-var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 +-var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits +- +-var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 +-var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask +-var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 +-var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 +-var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 +-var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF +-var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 +-var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 +-var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 +-var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 +-var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 +- +-var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME +-var SQ_WAVE_IB_STS_RCNT_SIZE = 4 //FIXME +-var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME +-var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1 //FIXME +-var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME +- +-var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 +-var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 +- +- +-/* Save */ +-var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes +-var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE +- +-var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit +-var S_SAVE_SPI_INIT_ATC_SHIFT = 27 +-var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype +-var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 +-var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG +-var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 +- +-var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used +-var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME +-var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME +-var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME +- +-var s_save_spi_init_lo = exec_lo +-var s_save_spi_init_hi = exec_hi +- +- //tba_lo and tba_hi need to be saved/restored +-var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3¡¯h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} +-var s_save_pc_hi = ttmp1 +-var s_save_exec_lo = ttmp2 +-var s_save_exec_hi = ttmp3 +-var s_save_status = ttmp4 +-var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine +-var s_save_xnack_mask_lo = ttmp6 +-var s_save_xnack_mask_hi = ttmp7 +-var s_save_buf_rsrc0 = ttmp8 +-var s_save_buf_rsrc1 = ttmp9 +-var s_save_buf_rsrc2 = ttmp10 +-var s_save_buf_rsrc3 = ttmp11 +- +-var s_save_mem_offset = tma_lo +-var s_save_alloc_size = s_save_trapsts //conflict +-var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time) +-var s_save_m0 = tma_hi +- +-/* Restore */ +-var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE +-var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC +- +-var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit +-var S_RESTORE_SPI_INIT_ATC_SHIFT = 27 +-var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype +-var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28 +-var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG +-var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 +- +-var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT 
+-var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK +-var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT +-var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK +- +-var s_restore_spi_init_lo = exec_lo +-var s_restore_spi_init_hi = exec_hi +- +-var s_restore_mem_offset = ttmp2 +-var s_restore_alloc_size = ttmp3 +-var s_restore_tmp = ttmp6 //tba_lo/hi need to be restored +-var s_restore_mem_offset_save = s_restore_tmp //no conflict +- +-var s_restore_m0 = s_restore_alloc_size //no conflict +- +-var s_restore_mode = ttmp7 +- +-var s_restore_pc_lo = ttmp0 +-var s_restore_pc_hi = ttmp1 +-var s_restore_exec_lo = tma_lo //no conflict +-var s_restore_exec_hi = tma_hi //no conflict +-var s_restore_status = ttmp4 +-var s_restore_trapsts = ttmp5 +-var s_restore_xnack_mask_lo = xnack_mask_lo +-var s_restore_xnack_mask_hi = xnack_mask_hi +-var s_restore_buf_rsrc0 = ttmp8 +-var s_restore_buf_rsrc1 = ttmp9 +-var s_restore_buf_rsrc2 = ttmp10 +-var s_restore_buf_rsrc3 = ttmp11 +- +-/**************************************************************************/ +-/* trap handler entry points */ +-/**************************************************************************/ +-/* Shader Main*/ +- +-shader main +- asic(VI) +- type(CS) +- +- +- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore +- //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC +- s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC +- s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. +- s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE +- //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE +- s_branch L_SKIP_RESTORE //NOT restore, SAVE actually +- else +- s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save +- end +- +-L_JUMP_TO_RESTORE: +- s_branch L_RESTORE //restore +- +-L_SKIP_RESTORE: +- +- s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC +- s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save +- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) +- s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save +- s_cbranch_scc1 L_SAVE //this is the operation for save +- +- // ********* Handle non-CWSR traps ******************* +-if (!EMU_RUN_HACK) +- /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */ +- s_load_dwordx4 [ttmp8,ttmp9,ttmp10, ttmp11], [tma_lo,tma_hi], 0 +- s_waitcnt lgkmcnt(0) +- s_or_b32 ttmp7, ttmp8, ttmp9 +- s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set +- s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) +- s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler +- +-L_NO_NEXT_TRAP: +- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) +- s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception +- s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. 
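// Editorial note, not part of the patch: for an S_TRAP the PC saved in
// {ttmp1, ttmp0} points at the trapping instruction itself, so the two
// instructions below step the 48-bit PC past the 4-byte S_TRAP encoding
// (s_addc_u32 propagates the carry into the high half) before s_rfe_b64
// returns control to the shader.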
+- s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 +- s_addc_u32 ttmp1, ttmp1, 0 +-L_EXCP_CASE: +- s_and_b32 ttmp1, ttmp1, 0xFFFF +- s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) +- s_rfe_b64 [ttmp0, ttmp1] +-end +- // ********* End handling of non-CWSR traps ******************* +- +-/**************************************************************************/ +-/* save routine */ +-/**************************************************************************/ +- +-L_SAVE: +- +-if G8SR_DEBUG_TIMESTAMP +- s_memrealtime s_g8sr_ts_save_s +- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? +-end +- +- //check whether there is mem_viol +- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) +- s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK +- s_cbranch_scc0 L_NO_PC_REWIND +- +- //if so, need rewind PC assuming GDS operation gets NACKed +- s_mov_b32 s_save_tmp, 0 //clear mem_viol bit +- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit +- s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] +- s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8 +- s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc +- +-L_NO_PC_REWIND: +- s_mov_b32 s_save_tmp, 0 //clear saveCtx bit +- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit +- +- s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK +- s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //save XNACK must before any memory operation +- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT +- s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT +- s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp +- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY +- s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT +- s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp +- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS +- s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG +- +- s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp +- +- /* inform SPI the readiness and wait for SPI's go signal */ +- s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI +- s_mov_b32 s_save_exec_hi, exec_hi +- s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive +- +-if G8SR_DEBUG_TIMESTAMP +- s_memrealtime s_g8sr_ts_sq_save_msg +- s_waitcnt lgkmcnt(0) +-end +- +- if (EMU_RUN_HACK) +- +- else +- s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC +- end +- +- L_SLEEP: +- s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 +- +- if (EMU_RUN_HACK) +- +- else +- s_cbranch_execz L_SLEEP +- end +- +-if G8SR_DEBUG_TIMESTAMP +- s_memrealtime s_g8sr_ts_spi_wrexec +- s_waitcnt lgkmcnt(0) +-end +- +- /* setup Resource Contants */ +- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) +- //calculate wd_addr using absolute thread id +- v_readlane_b32 s_save_tmp, v9, 0 +- s_lshr_b32 s_save_tmp, s_save_tmp, 6 +- s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE +- s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO +- s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI +- s_and_b32 
s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL +- else +- end +- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) +- s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO +- s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI +- s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL +- else +- end +- +- +- s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo +- s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi +- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE +- s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited +- s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC +- s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK +- s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position +- s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC +- s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK +- s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position +- s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE +- +- //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?) +- s_mov_b32 s_save_m0, m0 //save M0 +- +- /* global mem offset */ +- s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 +- +- +- +- +- /* save HW registers */ +- ////////////////////////////// +- +- L_SAVE_HWREG: +- // HWREG SR memory offset : size(VGPR)+size(SGPR) +- get_vgpr_size_bytes(s_save_mem_offset) +- get_sgpr_size_bytes(s_save_tmp) +- s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp +- +- +- s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes +- if (SWIZZLE_EN) +- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
+- else +- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes +- end +- +- +- write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0 +- +- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) +- s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 +- s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over +- s_mov_b32 tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO +- s_mov_b32 tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI +- end +- +- write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC +- write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) +- write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC +- write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) +- write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS +- +- //s_save_trapsts conflicts with s_save_alloc_size +- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) +- write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS +- +- write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO +- write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI +- +- //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 +- s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE +- write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) +- write_hwreg_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset) //TBA_LO +- write_hwreg_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset) //TBA_HI +- +- +- +- /* the first wave in the threadgroup */ +- // save fist_wave bits in tba_hi unused bit.26 +- s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract fisrt wave bit +- //s_or_b32 tba_hi, s_save_tmp, tba_hi // save first wave bit to tba_hi.bits[26] +- s_mov_b32 s_save_exec_hi, 0x0 +- s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26] +- +- +- /* save SGPRs */ +- // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... +- ////////////////////////////// +- +- // SGPR SR memory offset : size(VGPR) +- get_vgpr_size_bytes(s_save_mem_offset) +- // TODO, change RSRC word to rearrange memory layout for SGPRS +- +- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size +- s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 +- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) +- +- if (SGPR_SAVE_USE_SQC) +- s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes +- else +- s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) +- end +- +- if (SWIZZLE_EN) +- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
+- else
+- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+-
+- // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0
+- //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0
+- s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0
+- s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
+- s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0
+-
+- s_mov_b32 m0, 0x0 //SGPR initial index value =0
+- L_SAVE_SGPR_LOOP:
+- // SGPR is allocated in 16 SGPR granularity
+- s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0]
+- s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0]
+- s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0]
+- s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0]
+- s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0]
+- s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0]
+- s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0]
+- s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0]
+-
+- write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4
+- s_add_u32 m0, m0, 16 //next sgpr index
+- s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete?
+- // restore s_save_buf_rsrc0,1
+- //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo
+- s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo
+-
+-
+-
+-
+- /* save first 4 VGPR, then LDS save could use */
+- // each wave will alloc 4 vgprs at least...
+- /////////////////////////////////////////////////////////////////////////////////////
+-
+- s_mov_b32 s_save_mem_offset, 0
+- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+- s_mov_b32 exec_hi, 0xFFFFFFFF
+-
+- if (SWIZZLE_EN)
+- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+-
+- // VGPR Allocated in 4-GPR granularity
+-
+-if G8SR_VGPR_SR_IN_DWX4
+- // the const stride for DWx4 is 4*4 bytes
+- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes
+-
+- buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+-
+- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes
+-else
+- buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
+- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
+- buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
+-end
+-
+-
+-
+- /* save LDS */
+- //////////////////////////////
+-
+- L_SAVE_LDS:
+-
+- // Change EXEC to all threads...
+- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+- s_mov_b32 exec_hi, 0xFFFFFFFF
+-
+- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
+- s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero?
+- s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE
+-
+- s_barrier //LDS is used? wait for other waves in the same TG
+- //s_and_b32 s_save_tmp, tba_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
+- s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
+- s_cbranch_scc0 L_SAVE_LDS_DONE
+-
+- // first wave do LDS save;
+-
+- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
+- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes
+- s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes
+-
+- // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
+- //
+- get_vgpr_size_bytes(s_save_mem_offset)
+- get_sgpr_size_bytes(s_save_tmp)
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()
+-
+-
+- if (SWIZZLE_EN)
+- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+- s_mov_b32 m0, 0x0 //lds_offset initial value = 0
+-
+-
+-var LDS_DMA_ENABLE = 0
+-var UNROLL = 0
+-if UNROLL==0 && LDS_DMA_ENABLE==1
+- s_mov_b32 s3, 256*2
+- s_nop 0
+- s_nop 0
+- s_nop 0
+- L_SAVE_LDS_LOOP:
+- //TODO: looks like the 2 buffer_store/load clauses for s/r will hurt performance???
+- if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity
+- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW
+- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
+- end
+-
+- s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes
+- s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete?
+-
+-elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROLL, has icache miss
+- // store from highest LDS address to lowest
+- s_mov_b32 s3, 256*2
+- s_sub_u32 m0, s_save_alloc_size, s3
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, m0
+- s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128DW chunks...
+- s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from highest addr to lowest
+- s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction
+- s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //plus the 3 inst below: s_add, s_addc and s_setpc
+- s_nop 0
+- s_nop 0
+- s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes
+- s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved
+- s_add_u32 s0, s0,s_save_alloc_size
+- s_addc_u32 s1, s1, 0
+- s_setpc_b64 s[0:1]
+-
+-
+- for var i =0; i< 128; i++
+- // be careful to make here a 64Byte aligned address, which could improve performance...
+- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW
+- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
+-
+- if i!=127
+- s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance, i.e. pack more LDS_DMA inst to one cacheline
+- s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3
+- end
+- end
+-
+-else // BUFFER_STORE
+- v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0
+- v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid
+- v_mul_i32_i24 v2, v3, 8 // tid*8
+- v_mov_b32 v3, 256*2
+- s_mov_b32 m0, 0x10000
+- s_mov_b32 s0, s_save_buf_rsrc3
+- s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid
+- s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT
+-
+-L_SAVE_LDS_LOOP_VECTOR:
+- ds_read_b64 v[0:1], v2 //x =LDS[a], byte address
+- s_waitcnt lgkmcnt(0)
+- buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1
+-// s_waitcnt vmcnt(0)
+- v_add_u32 v2, vcc[0:1], v2, v3
+- v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
+- s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR
+-
+- // restore rsrc3
+- s_mov_b32 s_save_buf_rsrc3, s0
+-
+-end
+-
+-L_SAVE_LDS_DONE:
+-
+-
+- /* save VGPRs - set the Rest VGPRs */
+- //////////////////////////////////////////////////////////////////////////////////////
+- L_SAVE_VGPR:
+- // VGPR SR memory offset: 0
+- // TODO rearrange the RSRC words to use swizzle for VGPR save...
+-
+- s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs
+- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+- s_mov_b32 exec_hi, 0xFFFFFFFF
+-
+- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
+- s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
+- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible
+- s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
+- if (SWIZZLE_EN)
+- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+-
+- // VGPR Allocated in 4-GPR granularity
+-
+-if G8SR_VGPR_SR_IN_DWX4
+- // the const stride for DWx4 is 4*4 bytes
+- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes
+-
+- s_mov_b32 m0, 4 // skip first 4 VGPRs
+- s_cmp_lt_u32 m0, s_save_alloc_size
+- s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs
+-
+- s_set_gpr_idx_on m0, 0x1 // This will change M0
+- s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0
+-L_SAVE_VGPR_LOOP:
+- v_mov_b32 v0, v0 // v0 = v[0+m0]
+- v_mov_b32 v1, v1
+- v_mov_b32 v2, v2
+- v_mov_b32 v3, v3
+-
+-
+- buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+- s_add_u32 m0, m0, 4
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4
+- s_cmp_lt_u32 m0, s_save_alloc_size
+- s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
+- s_set_gpr_idx_off +-L_SAVE_VGPR_LOOP_END: +- +- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 +- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes +-else +- // VGPR store using dw burst +- s_mov_b32 m0, 0x4 //VGPR initial index value =0 +- s_cmp_lt_u32 m0, s_save_alloc_size +- s_cbranch_scc0 L_SAVE_VGPR_END +- +- +- s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 +- s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later +- +- L_SAVE_VGPR_LOOP: +- v_mov_b32 v0, v0 //v0 = v[0+m0] +- v_mov_b32 v1, v1 //v0 = v[0+m0] +- v_mov_b32 v2, v2 //v0 = v[0+m0] +- v_mov_b32 v3, v3 //v0 = v[0+m0] +- +- if(USE_MTBUF_INSTEAD_OF_MUBUF) +- tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 +- else +- buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 +- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 +- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 +- buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 +- end +- +- s_add_u32 m0, m0, 4 //next vgpr index +- s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes +- s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 +- s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? +- s_set_gpr_idx_off +-end +- +-L_SAVE_VGPR_END: +- +- +- +- +- +- +- /* S_PGM_END_SAVED */ //FIXME graphics ONLY +- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) +- s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] +- s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 +- s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over +- s_rfe_b64 s_save_pc_lo //Return to the main shader program +- else +- end +- +-// Save Done timestamp +-if G8SR_DEBUG_TIMESTAMP +- s_memrealtime s_g8sr_ts_save_d +- // SGPR SR memory offset : size(VGPR) +- get_vgpr_size_bytes(s_save_mem_offset) +- s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET +- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? +- // Need reset rsrc2?? +- s_mov_b32 m0, s_save_mem_offset +- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes +- s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1 +-end +- +- +- s_branch L_END_PGM +- +- +- +-/**************************************************************************/ +-/* restore routine */ +-/**************************************************************************/ +- +-L_RESTORE: +- /* Setup Resource Contants */ +- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) +- //calculate wd_addr using absolute thread id +- v_readlane_b32 s_restore_tmp, v9, 0 +- s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 +- s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE +- s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO +- s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI +- s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL +- else +- end +- +-if G8SR_DEBUG_TIMESTAMP +- s_memrealtime s_g8sr_ts_restore_s +- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? +- // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case... 
+- s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0] +- s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored.. +-end +- +- +- +- s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo +- s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi +- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE +- s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) +- s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC +- s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK +- s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position +- s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC +- s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK +- s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position +- s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE +- +- /* global mem offset */ +-// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0 +- +- /* the first wave in the threadgroup */ +- s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK +- s_cbranch_scc0 L_RESTORE_VGPR +- +- /* restore LDS */ +- ////////////////////////////// +- L_RESTORE_LDS: +- +- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead +- s_mov_b32 exec_hi, 0xFFFFFFFF +- +- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size +- s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? +- s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR +- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw +- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes +- s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes +- +- // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) +- // +- get_vgpr_size_bytes(s_restore_mem_offset) +- get_sgpr_size_bytes(s_restore_tmp) +- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp +- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? +- +- +- if (SWIZZLE_EN) +- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? +- else +- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes +- end +- s_mov_b32 m0, 0x0 //lds_offset initial value = 0 +- +- L_RESTORE_LDS_LOOP: +- if (SAVE_LDS) +- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW +- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW +- end +- s_add_u32 m0, m0, 256*2 // 128 DW +- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW +- s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 +- s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? 
+- +- +- /* restore VGPRs */ +- ////////////////////////////// +- L_RESTORE_VGPR: +- // VGPR SR memory offset : 0 +- s_mov_b32 s_restore_mem_offset, 0x0 +- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead +- s_mov_b32 exec_hi, 0xFFFFFFFF +- +- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size +- s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 +- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) +- s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) +- if (SWIZZLE_EN) +- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? +- else +- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes +- end +- +-if G8SR_VGPR_SR_IN_DWX4 +- get_vgpr_size_bytes(s_restore_mem_offset) +- s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 +- +- // the const stride for DWx4 is 4*4 bytes +- s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 +- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes +- +- s_mov_b32 m0, s_restore_alloc_size +- s_set_gpr_idx_on m0, 0x8 // Note.. This will change m0 +- +-L_RESTORE_VGPR_LOOP: +- buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 +- s_waitcnt vmcnt(0) +- s_sub_u32 m0, m0, 4 +- v_mov_b32 v0, v0 // v[0+m0] = v0 +- v_mov_b32 v1, v1 +- v_mov_b32 v2, v2 +- v_mov_b32 v3, v3 +- s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 +- s_cmp_eq_u32 m0, 0x8000 +- s_cbranch_scc0 L_RESTORE_VGPR_LOOP +- s_set_gpr_idx_off +- +- s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 +- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes +- +-else +- // VGPR load using dw burst +- s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last +- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 +- s_mov_b32 m0, 4 //VGPR initial index value = 1 +- s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 +- s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later +- +- L_RESTORE_VGPR_LOOP: +- if(USE_MTBUF_INSTEAD_OF_MUBUF) +- tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 +- else +- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 +- buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 +- buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 +- buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 +- end +- s_waitcnt vmcnt(0) //ensure data ready +- v_mov_b32 v0, v0 //v[0+m0] = v0 +- v_mov_b32 v1, v1 +- v_mov_b32 v2, v2 +- v_mov_b32 v3, v3 +- s_add_u32 m0, m0, 4 //next vgpr index +- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes +- s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 +- s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? 
+- s_set_gpr_idx_off +- /* VGPR restore on v0 */ +- if(USE_MTBUF_INSTEAD_OF_MUBUF) +- tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 +- else +- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 +- buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 +- buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 +- buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 +- end +- +-end +- +- /* restore SGPRs */ +- ////////////////////////////// +- +- // SGPR SR memory offset : size(VGPR) +- get_vgpr_size_bytes(s_restore_mem_offset) +- get_sgpr_size_bytes(s_restore_tmp) +- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp +- s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group +- // TODO, change RSRC word to rearrange memory layout for SGPRS +- +- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size +- s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 +- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) +- +- if (SGPR_SAVE_USE_SQC) +- s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes +- else +- s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) +- end +- if (SWIZZLE_EN) +- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? +- else +- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes +- end +- +- /* If 112 SGPRs ar allocated, 4 sgprs are not used TBA(108,109),TMA(110,111), +- However, we are safe to restore these 4 SGPRs anyway, since TBA,TMA will later be restored by HWREG +- */ +- s_mov_b32 m0, s_restore_alloc_size +- +- L_RESTORE_SGPR_LOOP: +- read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made +- s_waitcnt lgkmcnt(0) //ensure data ready +- +- s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] +- +- s_movreld_b64 s0, s0 //s[0+m0] = s0 +- s_movreld_b64 s2, s2 +- s_movreld_b64 s4, s4 +- s_movreld_b64 s6, s6 +- s_movreld_b64 s8, s8 +- s_movreld_b64 s10, s10 +- s_movreld_b64 s12, s12 +- s_movreld_b64 s14, s14 +- +- s_cmp_eq_u32 m0, 0 //scc = (m0 < s_restore_alloc_size) ? 1 : 0 +- s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete? +- +- /* restore HW registers */ +- ////////////////////////////// +- L_RESTORE_HWREG: +- +- +-if G8SR_DEBUG_TIMESTAMP +- s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo +- s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi +-end +- +- // HWREG SR memory offset : size(VGPR)+size(SGPR) +- get_vgpr_size_bytes(s_restore_mem_offset) +- get_sgpr_size_bytes(s_restore_tmp) +- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp +- +- +- s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes +- if (SWIZZLE_EN) +- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
+- else
+- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+- read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0
+- read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC
+- read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
+- read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC
+- read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
+- read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS
+- read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS
+- read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO
+- read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI
+- read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE
+- read_hwreg_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_LO
+- read_hwreg_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_HI
+-
+- s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS
+-
+- s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
+-
+- //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
+- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
+- s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore)
+- s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
+- end
+- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
+- s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal
+- s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
+- end
+-
+- s_mov_b32 m0, s_restore_m0
+- s_mov_b32 exec_lo, s_restore_exec_lo
+- s_mov_b32 exec_hi, s_restore_exec_hi
+-
+- s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
+- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
+- s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
+- s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
+- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
+- //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
+- s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode
+- //reuse s_restore_m0 as a temp register
+- s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
+- s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
+- s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
+- s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero
+- s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
+- s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
+- s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+- s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
+- s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
+- s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
+- s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
+- s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp
+-
+- s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
+- s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
+- s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu
+-
+- s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time
+-
+-if G8SR_DEBUG_TIMESTAMP
+- s_memrealtime s_g8sr_ts_restore_d
+- s_waitcnt lgkmcnt(0)
+-end
+-
+-// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution
+- s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc
+-
+-
+-/**************************************************************************/
+-/* the END */
+-/**************************************************************************/
+-L_END_PGM:
+- s_endpgm
+-
+-end
+-
+-
+-/**************************************************************************/
+-/* the helper functions */
+-/**************************************************************************/
+-
+-//Only for save hwreg to mem
+-function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
+- s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on
+- s_mov_b32 m0, s_mem_offset
+- s_buffer_store_dword s, s_rsrc, m0 glc:1
+- s_add_u32 s_mem_offset, s_mem_offset, 4
+- s_mov_b32 m0, exec_lo
+-end
+-
+-
+-// HWREG are saved before SGPRs, so all HWREG could be used.
+-function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
+-
+- s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1
+- s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1
+- s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1
+- s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1
+- s_add_u32 s_rsrc[0], s_rsrc[0], 4*16
+- s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc
+-end
+-
+-
+-function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
+- s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
+- s_add_u32 s_mem_offset, s_mem_offset, 4
+-end
+-
+-function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
+- s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1
+- s_sub_u32 s_mem_offset, s_mem_offset, 4*16
+-end
+-
+-
+-
+-function get_lds_size_bytes(s_lds_size_byte)
+- // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW
+- s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size
+- s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in bytes = lds_size * 64DW * 4Bytes // granularity 64DW
+-end
+-
+-function get_vgpr_size_bytes(s_vgpr_size_byte)
+- s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
+- s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1
+- s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible
+-end
+-
+-function get_sgpr_size_bytes(s_sgpr_size_byte)
+- s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
+- s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1
+- s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value)
+-end
+-
+-function get_hwreg_size_bytes
+- return 128 //HWREG size 128 bytes
+-end
+-
+-
+-#endif
+-
+-static const
uint32_t cwsr_trap_carrizo_hex[] = { +- 0xbf820001, 0xbf820123, +- 0xb8f4f802, 0x89748674, +- 0xb8f5f803, 0x8675ff75, +- 0x00000400, 0xbf850011, +- 0xc00a1e37, 0x00000000, +- 0xbf8c007f, 0x87777978, +- 0xbf840002, 0xb974f802, +- 0xbe801d78, 0xb8f5f803, +- 0x8675ff75, 0x000001ff, +- 0xbf850002, 0x80708470, +- 0x82718071, 0x8671ff71, +- 0x0000ffff, 0xb974f802, +- 0xbe801f70, 0xb8f5f803, +- 0x8675ff75, 0x00000100, +- 0xbf840006, 0xbefa0080, +- 0xb97a0203, 0x8671ff71, +- 0x0000ffff, 0x80f08870, +- 0x82f18071, 0xbefa0080, +- 0xb97a0283, 0xbef60068, +- 0xbef70069, 0xb8fa1c07, +- 0x8e7a9c7a, 0x87717a71, +- 0xb8fa03c7, 0x8e7a9b7a, +- 0x87717a71, 0xb8faf807, +- 0x867aff7a, 0x00007fff, +- 0xb97af807, 0xbef2007e, +- 0xbef3007f, 0xbefe0180, +- 0xbf900004, 0xbf8e0002, +- 0xbf88fffe, 0xbef8007e, +- 0x8679ff7f, 0x0000ffff, +- 0x8779ff79, 0x00040000, +- 0xbefa0080, 0xbefb00ff, +- 0x00807fac, 0x867aff7f, +- 0x08000000, 0x8f7a837a, +- 0x877b7a7b, 0x867aff7f, +- 0x70000000, 0x8f7a817a, +- 0x877b7a7b, 0xbeef007c, +- 0xbeee0080, 0xb8ee2a05, +- 0x806e816e, 0x8e6e8a6e, +- 0xb8fa1605, 0x807a817a, +- 0x8e7a867a, 0x806e7a6e, +- 0xbefa0084, 0xbefa00ff, +- 0x01000000, 0xbefe007c, +- 0xbefc006e, 0xc0611bfc, +- 0x0000007c, 0x806e846e, +- 0xbefc007e, 0xbefe007c, +- 0xbefc006e, 0xc0611c3c, +- 0x0000007c, 0x806e846e, +- 0xbefc007e, 0xbefe007c, +- 0xbefc006e, 0xc0611c7c, +- 0x0000007c, 0x806e846e, +- 0xbefc007e, 0xbefe007c, +- 0xbefc006e, 0xc0611cbc, +- 0x0000007c, 0x806e846e, +- 0xbefc007e, 0xbefe007c, +- 0xbefc006e, 0xc0611cfc, +- 0x0000007c, 0x806e846e, +- 0xbefc007e, 0xbefe007c, +- 0xbefc006e, 0xc0611d3c, +- 0x0000007c, 0x806e846e, +- 0xbefc007e, 0xb8f5f803, +- 0xbefe007c, 0xbefc006e, +- 0xc0611d7c, 0x0000007c, +- 0x806e846e, 0xbefc007e, +- 0xbefe007c, 0xbefc006e, +- 0xc0611dbc, 0x0000007c, +- 0x806e846e, 0xbefc007e, +- 0xbefe007c, 0xbefc006e, +- 0xc0611dfc, 0x0000007c, +- 0x806e846e, 0xbefc007e, +- 0xb8eff801, 0xbefe007c, +- 0xbefc006e, 0xc0611bfc, +- 0x0000007c, 0x806e846e, +- 0xbefc007e, 0xbefe007c, +- 0xbefc006e, 0xc0611b3c, +- 0x0000007c, 0x806e846e, +- 0xbefc007e, 0xbefe007c, +- 0xbefc006e, 0xc0611b7c, +- 0x0000007c, 0x806e846e, +- 0xbefc007e, 0x867aff7f, +- 0x04000000, 0xbef30080, +- 0x8773737a, 0xb8ee2a05, +- 0x806e816e, 0x8e6e8a6e, +- 0xb8f51605, 0x80758175, +- 0x8e758475, 0x8e7a8275, +- 0xbefa00ff, 0x01000000, +- 0xbef60178, 0x80786e78, +- 0x82798079, 0xbefc0080, +- 0xbe802b00, 0xbe822b02, +- 0xbe842b04, 0xbe862b06, +- 0xbe882b08, 0xbe8a2b0a, +- 0xbe8c2b0c, 0xbe8e2b0e, +- 0xc06b003c, 0x00000000, +- 0xc06b013c, 0x00000010, +- 0xc06b023c, 0x00000020, +- 0xc06b033c, 0x00000030, +- 0x8078c078, 0x82798079, +- 0x807c907c, 0xbf0a757c, +- 0xbf85ffeb, 0xbef80176, +- 0xbeee0080, 0xbefe00c1, +- 0xbeff00c1, 0xbefa00ff, +- 0x01000000, 0xe0724000, +- 0x6e1e0000, 0xe0724100, +- 0x6e1e0100, 0xe0724200, +- 0x6e1e0200, 0xe0724300, +- 0x6e1e0300, 0xbefe00c1, +- 0xbeff00c1, 0xb8f54306, +- 0x8675c175, 0xbf84002c, +- 0xbf8a0000, 0x867aff73, +- 0x04000000, 0xbf840028, +- 0x8e758675, 0x8e758275, +- 0xbefa0075, 0xb8ee2a05, +- 0x806e816e, 0x8e6e8a6e, +- 0xb8fa1605, 0x807a817a, +- 0x8e7a867a, 0x806e7a6e, +- 0x806eff6e, 0x00000080, +- 0xbefa00ff, 0x01000000, +- 0xbefc0080, 0xd28c0002, +- 0x000100c1, 0xd28d0003, +- 0x000204c1, 0xd1060002, +- 0x00011103, 0x7e0602ff, +- 0x00000200, 0xbefc00ff, +- 0x00010000, 0xbe80007b, +- 0x867bff7b, 0xff7fffff, +- 0x877bff7b, 0x00058000, +- 0xd8ec0000, 0x00000002, +- 0xbf8c007f, 0xe0765000, +- 0x6e1e0002, 0x32040702, +- 0xd0c9006a, 0x0000eb02, +- 0xbf87fff7, 0xbefb0000, +- 0xbeee00ff, 0x00000400, +- 
0xbefe00c1, 0xbeff00c1, +- 0xb8f52a05, 0x80758175, +- 0x8e758275, 0x8e7a8875, +- 0xbefa00ff, 0x01000000, +- 0xbefc0084, 0xbf0a757c, +- 0xbf840015, 0xbf11017c, +- 0x8075ff75, 0x00001000, +- 0x7e000300, 0x7e020301, +- 0x7e040302, 0x7e060303, +- 0xe0724000, 0x6e1e0000, +- 0xe0724100, 0x6e1e0100, +- 0xe0724200, 0x6e1e0200, +- 0xe0724300, 0x6e1e0300, +- 0x807c847c, 0x806eff6e, +- 0x00000400, 0xbf0a757c, +- 0xbf85ffef, 0xbf9c0000, +- 0xbf8200ca, 0xbef8007e, +- 0x8679ff7f, 0x0000ffff, +- 0x8779ff79, 0x00040000, +- 0xbefa0080, 0xbefb00ff, +- 0x00807fac, 0x8676ff7f, +- 0x08000000, 0x8f768376, +- 0x877b767b, 0x8676ff7f, +- 0x70000000, 0x8f768176, +- 0x877b767b, 0x8676ff7f, +- 0x04000000, 0xbf84001e, +- 0xbefe00c1, 0xbeff00c1, +- 0xb8f34306, 0x8673c173, +- 0xbf840019, 0x8e738673, +- 0x8e738273, 0xbefa0073, +- 0xb8f22a05, 0x80728172, +- 0x8e728a72, 0xb8f61605, +- 0x80768176, 0x8e768676, +- 0x80727672, 0x8072ff72, +- 0x00000080, 0xbefa00ff, +- 0x01000000, 0xbefc0080, +- 0xe0510000, 0x721e0000, +- 0xe0510100, 0x721e0000, +- 0x807cff7c, 0x00000200, +- 0x8072ff72, 0x00000200, +- 0xbf0a737c, 0xbf85fff6, +- 0xbef20080, 0xbefe00c1, +- 0xbeff00c1, 0xb8f32a05, +- 0x80738173, 0x8e738273, +- 0x8e7a8873, 0xbefa00ff, +- 0x01000000, 0xbef60072, +- 0x8072ff72, 0x00000400, +- 0xbefc0084, 0xbf11087c, +- 0x8073ff73, 0x00008000, +- 0xe0524000, 0x721e0000, +- 0xe0524100, 0x721e0100, +- 0xe0524200, 0x721e0200, +- 0xe0524300, 0x721e0300, +- 0xbf8c0f70, 0x7e000300, +- 0x7e020301, 0x7e040302, +- 0x7e060303, 0x807c847c, +- 0x8072ff72, 0x00000400, +- 0xbf0a737c, 0xbf85ffee, +- 0xbf9c0000, 0xe0524000, +- 0x761e0000, 0xe0524100, +- 0x761e0100, 0xe0524200, +- 0x761e0200, 0xe0524300, +- 0x761e0300, 0xb8f22a05, +- 0x80728172, 0x8e728a72, +- 0xb8f61605, 0x80768176, +- 0x8e768676, 0x80727672, +- 0x80f2c072, 0xb8f31605, +- 0x80738173, 0x8e738473, +- 0x8e7a8273, 0xbefa00ff, +- 0x01000000, 0xbefc0073, +- 0xc031003c, 0x00000072, +- 0x80f2c072, 0xbf8c007f, +- 0x80fc907c, 0xbe802d00, +- 0xbe822d02, 0xbe842d04, +- 0xbe862d06, 0xbe882d08, +- 0xbe8a2d0a, 0xbe8c2d0c, +- 0xbe8e2d0e, 0xbf06807c, +- 0xbf84fff1, 0xb8f22a05, +- 0x80728172, 0x8e728a72, +- 0xb8f61605, 0x80768176, +- 0x8e768676, 0x80727672, +- 0xbefa0084, 0xbefa00ff, +- 0x01000000, 0xc0211cfc, +- 0x00000072, 0x80728472, +- 0xc0211c3c, 0x00000072, +- 0x80728472, 0xc0211c7c, +- 0x00000072, 0x80728472, +- 0xc0211bbc, 0x00000072, +- 0x80728472, 0xc0211bfc, +- 0x00000072, 0x80728472, +- 0xc0211d3c, 0x00000072, +- 0x80728472, 0xc0211d7c, +- 0x00000072, 0x80728472, +- 0xc0211a3c, 0x00000072, +- 0x80728472, 0xc0211a7c, +- 0x00000072, 0x80728472, +- 0xc0211dfc, 0x00000072, +- 0x80728472, 0xc0211b3c, +- 0x00000072, 0x80728472, +- 0xc0211b7c, 0x00000072, +- 0x80728472, 0xbf8c007f, +- 0x8671ff71, 0x0000ffff, +- 0xbefc0073, 0xbefe006e, +- 0xbeff006f, 0x867375ff, +- 0x000003ff, 0xb9734803, +- 0x867375ff, 0xfffff800, +- 0x8f738b73, 0xb973a2c3, +- 0xb977f801, 0x8673ff71, +- 0xf0000000, 0x8f739c73, +- 0x8e739073, 0xbef60080, +- 0x87767376, 0x8673ff71, +- 0x08000000, 0x8f739b73, +- 0x8e738f73, 0x87767376, +- 0x8673ff74, 0x00800000, +- 0x8f739773, 0xb976f807, +- 0x86fe7e7e, 0x86ea6a6a, +- 0xb974f802, 0xbf8a0000, +- 0x95807370, 0xbf810000, +-}; +- +diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +deleted file mode 100644 +index ae2af3d..0000000 +--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm ++++ /dev/null +@@ -1,1388 +0,0 @@ +-/* +- * Copyright 2016 Advanced Micro Devices, Inc. 
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a
+- * copy of this software and associated documentation files (the "Software"),
+- * to deal in the Software without restriction, including without limitation
+- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+- * and/or sell copies of the Software, and to permit persons to whom the
+- * Software is furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in
+- * all copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+- * OTHER DEALINGS IN THE SOFTWARE.
+- */
+-
+-#if 0
+-HW (GFX9) source code for CWSR trap handler
+-#Version 18 + multiple trap handler
+-
+-// this performance-optimal version was originally from Seven Xu at SRDC
+-
+-// Revision #18 --...
+-/* Rev History
+-** #1. Branch from gc dv. //gfxip/gfx9/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(merged, skipped-already fixed by PV)
+-** #4. SR Memory Layout:
+-** 1. VGPR-SGPR-HWREG-{LDS}
+-** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern..
+-** #5. Update: 1. Accurate g8sr_ts_save_d timestamp
+-** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation)
+-** #7. Update: 1. don't barrier if noLDS
+-** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version
+-** 2. Fix SQ issue by s_sleep 2
+-** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last
+-** 2. optimize s_buffer save by burst 16sgprs...
+-** #10. Update 1. Optimize restore sgpr by burst 16 sgprs.
+-** #11. Update 1. Add 2 more timestamp for debug version
+-** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance
+-** #13. Integ 1. Always use MUBUF for PV trap shader...
+-** #14. Update 1. s_buffer_store soft clause...
+-** #15. Update 1. PERF - scalar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot.
+-** #16. Update 1. PERF - UNROLL LDS_DMA got 2500cycle save in IP tree
+-** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part]
+-** 2. PERF - Save LDS before save VGPR to cover LDS save long latency...
+-** #18. Update 1. FUNC - Implicitly restore STATUS.VCCZ, which is not writable by s_setreg_b32
+-** 2. FUNC - Handle non-CWSR traps
+-*/
+-
+-var G8SR_WDMEM_HWREG_OFFSET = 0
+-var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes
+-
+-// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should be before any Save and after restore.
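+-
+-// The SR memory layout above (VGPR-SGPR-HWREG-{LDS}) can be made concrete with a
+-// small worked example; the register counts here are hypothetical, chosen only to
+-// illustrate the arithmetic done by the get_*_size_bytes helper functions:
+-//   vgpr_size=3 -> VGPR bytes  = (3+1) * 4 regs * 64 lanes * 4B = 4096
+-//   sgpr_size=5 -> SGPR bytes  = (5+1) * 16 regs * 4B           = 384
+-//                  HWREG bytes = 128 (fixed)
+-//   lds_size=2  -> LDS bytes   = 2 * 64 DW * 4B                 = 512
+-// giving save-area offsets VGPR@0, SGPR@4096, HWREG@4480, LDS@4608.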
+-
+-var G8SR_DEBUG_TIMESTAMP = 0
+-var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset
+-var s_g8sr_ts_save_s = s[34:35] // save start
+-var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader sends SAVEWAVE msg to spi
+-var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI writes the sr address to SQ
+-var s_g8sr_ts_save_d = s[40:41] // save end
+-var s_g8sr_ts_restore_s = s[42:43] // restore start
+-var s_g8sr_ts_restore_d = s[44:45] // restore end
+-
+-var G8SR_VGPR_SR_IN_DWX4 = 0
+-var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes
+-var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4
+-
+-
+-/*************************************************************************/
+-/* control on how to run the shader */
+-/*************************************************************************/
+-//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run)
+-var EMU_RUN_HACK = 0
+-var EMU_RUN_HACK_RESTORE_NORMAL = 0
+-var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0
+-var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0
+-var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
+-var SAVE_LDS = 1
+-var WG_BASE_ADDR_LO = 0x9000a000
+-var WG_BASE_ADDR_HI = 0x0
+-var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem
+-var CTX_SAVE_CONTROL = 0x0
+-var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL
+-var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run)
+-var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write
+-var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes
+-var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing
+-
+-/**************************************************************************/
+-/* variables */
+-/**************************************************************************/
+-var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
+-var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
+-var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
+-var SQ_WAVE_STATUS_HALT_MASK = 0x2000
+-
+-var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
+-var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
+-var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
+-var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
+-var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
+-var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
+-
+-var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
+-var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask
+-var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
+-var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
+-var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
+-var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
+-var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
+-var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
+-var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
+-var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
+-var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
+-var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800
+-
+-var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME
+-var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME
+-var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME
+-
+-var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
+-var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27
+-
+-
+-/* Save */
+-var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes
+-var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
+-
+-var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
+-var S_SAVE_SPI_INIT_ATC_SHIFT = 27
+-var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
+-var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
+-var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
+-var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26
+-
+-var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
+-var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME
+-var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME
+-var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME
+-
+-var s_save_spi_init_lo = exec_lo
+-var s_save_spi_init_hi = exec_hi
+-
+-var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
+-var s_save_pc_hi = ttmp1
+-var s_save_exec_lo = ttmp2
+-var s_save_exec_hi = ttmp3
+-var s_save_status = ttmp4
+-var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine
+-var s_save_xnack_mask_lo = ttmp6
+-var s_save_xnack_mask_hi = ttmp7
+-var s_save_buf_rsrc0 = ttmp8
+-var s_save_buf_rsrc1 = ttmp9
+-var s_save_buf_rsrc2 = ttmp10
+-var s_save_buf_rsrc3 = ttmp11
+-
+-var s_save_mem_offset = ttmp14
+-var s_save_alloc_size = s_save_trapsts //conflict
+-var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time)
+-var s_save_m0 = ttmp15
+-
+-/* Restore */
+-var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
+-var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC
+-
+-var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
+-var S_RESTORE_SPI_INIT_ATC_SHIFT = 27
+-var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
+-var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28
+-var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
+-var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
+-
+-var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT
+-var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK
+-var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+-var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK
+-
+-var s_restore_spi_init_lo = exec_lo
+-var s_restore_spi_init_hi = exec_hi
+-
+-var s_restore_mem_offset = ttmp12
+-var s_restore_alloc_size = ttmp3
+-var s_restore_tmp = ttmp6
+-var s_restore_mem_offset_save = s_restore_tmp //no conflict
+-
+-var s_restore_m0 = s_restore_alloc_size //no conflict
+-
+-var s_restore_mode = ttmp7
+-
+-var s_restore_pc_lo = ttmp0
+-var s_restore_pc_hi = ttmp1
+-var s_restore_exec_lo = ttmp14
+-var s_restore_exec_hi = ttmp15
+-var s_restore_status = ttmp4
+-var s_restore_trapsts = ttmp5
+-var s_restore_xnack_mask_lo = xnack_mask_lo
+-var s_restore_xnack_mask_hi = xnack_mask_hi
+-var s_restore_buf_rsrc0 = ttmp8
+-var s_restore_buf_rsrc1 = ttmp9
+-var s_restore_buf_rsrc2 = ttmp10
+-var s_restore_buf_rsrc3 = ttmp11
+-
+-/**************************************************************************/
+-/* trap handler entry points */
+-/**************************************************************************/
+-/* Shader Main*/
+-
+-shader main
+- asic(GFX9)
+- type(CS)
+-
+-
+- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore
+- //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
+- s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC
+- s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
+- s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE
+- //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE
+- s_branch L_SKIP_RESTORE //NOT restore, SAVE actually
+- else
+- s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save
+- end
+-
+-L_JUMP_TO_RESTORE:
+- s_branch L_RESTORE //restore
+-
+-L_SKIP_RESTORE:
+-
+- s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
+- s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save
+- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+- s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save
+- s_cbranch_scc1 L_SAVE //this is the operation for save
+-
+- // ********* Handle non-CWSR traps *******************
+-if (!EMU_RUN_HACK)
+- // Illegal instruction is a non-maskable exception which blocks context save.
+- // Halt the wavefront and return from the trap.
+- s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK
+- s_cbranch_scc1 L_HALT_WAVE
+-
+- // If STATUS.MEM_VIOL is asserted then we cannot fetch from the TMA.
+- // Instead, halt the wavefront and return from the trap.
+- s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
+- s_cbranch_scc0 L_NO_MEM_VIOL
+-
+-L_HALT_WAVE:
+- s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK
+- s_branch L_EXCP_CASE
+-
+-L_NO_MEM_VIOL:
+- /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */
+- s_getreg_b32 ttmp14,hwreg(HW_REG_SQ_SHADER_TMA_LO)
+- s_getreg_b32 ttmp15,hwreg(HW_REG_SQ_SHADER_TMA_HI)
+- s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8
+- s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [ttmp14, ttmp15], 0
+- s_waitcnt lgkmcnt(0)
+- s_or_b32 ttmp7, ttmp8, ttmp9
+- s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler has not been set
+- s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC)
+- s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler
+-
+-L_NO_NEXT_TRAP:
+- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+- s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception
+- s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly.
+- s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0
+- s_addc_u32 ttmp1, ttmp1, 0
+-L_EXCP_CASE:
+- s_and_b32 ttmp1, ttmp1, 0xFFFF
+- s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC)
+- s_rfe_b64 [ttmp0, ttmp1]
+-end
+- // ********* End handling of non-CWSR traps *******************
+-
+-/**************************************************************************/
+-/* save routine */
+-/**************************************************************************/
+-
+-L_SAVE:
+-
+-if G8SR_DEBUG_TIMESTAMP
+- s_memrealtime s_g8sr_ts_save_s
+- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
+-end
+-
+- //check whether there is mem_viol
+- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+- s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
+- s_cbranch_scc0 L_NO_PC_REWIND
+-
+- //if so, need to rewind PC assuming GDS operation gets NACKed
+- s_mov_b32 s_save_tmp, 0 //clear mem_viol bit
+- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit
+- s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
+- s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8
+- s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc
+-
+-L_NO_PC_REWIND:
+- s_mov_b32 s_save_tmp, 0 //clear saveCtx bit
+- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit
+-
+- s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK
+- s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //save XNACK, must be before any memory operation
+- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT
+- s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
+- s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
+- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY
+- s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+- s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
+- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS
+- s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
+-
+- s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp
+-
+- /* inform SPI the readiness and wait for SPI's go signal */
+- s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
+- s_mov_b32 s_save_exec_hi, exec_hi
+- s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive
+-
+-if G8SR_DEBUG_TIMESTAMP
+- s_memrealtime s_g8sr_ts_sq_save_msg
+- s_waitcnt lgkmcnt(0)
+-end
+-
+- if (EMU_RUN_HACK)
+-
+- else
+- s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
+- end
+-
+- L_SLEEP:
+- s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0
+-
+- if (EMU_RUN_HACK)
+-
+- else
+- s_cbranch_execz L_SLEEP
+- end
+-
+-if G8SR_DEBUG_TIMESTAMP
+- s_memrealtime s_g8sr_ts_spi_wrexec
+- s_waitcnt lgkmcnt(0)
+-end
+-
+- /* setup Resource Constants */
+- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
+- //calculate wd_addr using absolute thread id
+- v_readlane_b32 s_save_tmp, v9, 0
+- s_lshr_b32 s_save_tmp, s_save_tmp, 6
+- s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
+- s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
+- s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
+- s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
+- else
+- end
+- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
+- s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
+- s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
+- s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
+- else
+- end
+-
+-
+- s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
+- s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
+- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
+- s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not necessarily inited
+- s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
+- s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
+- s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
+- s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC
+- s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
+- s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
+- s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE
+-
+- //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?)
+- s_mov_b32 s_save_m0, m0 //save M0
+-
+- /* global mem offset */
+- s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0
+-
+-
+-
+-
+- /* save HW registers */
+- //////////////////////////////
+-
+- L_SAVE_HWREG:
+- // HWREG SR memory offset : size(VGPR)+size(SGPR)
+- get_vgpr_size_bytes(s_save_mem_offset)
+- get_sgpr_size_bytes(s_save_tmp)
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
+-
+-
+- s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
+- if (SWIZZLE_EN)
+- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+-
+- write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0
+-
+- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
+- s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
+- s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
+- end
+-
+- write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC
+- write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
+- write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC
+- write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
+- write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS
+-
+- //s_save_trapsts conflicts with s_save_alloc_size
+- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+- write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS
+-
+- write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO
+- write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI
+-
+- //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2
+- s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE
+- write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
+-
+-
+-
+- /* the first wave in the threadgroup */
+- s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract first wave bit
+- s_mov_b32 s_save_exec_hi, 0x0
+- s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26]
+-
+-
+- /* save SGPRs */
+- // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save...
+- ////////////////////////////// +- +- // SGPR SR memory offset : size(VGPR) +- get_vgpr_size_bytes(s_save_mem_offset) +- // TODO, change RSRC word to rearrange memory layout for SGPRS +- +- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size +- s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 +- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) +- +- if (SGPR_SAVE_USE_SQC) +- s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes +- else +- s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) +- end +- +- if (SWIZZLE_EN) +- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? +- else +- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes +- end +- +- +- // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 +- //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0 +- s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0 +- s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset +- s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0 +- +- s_mov_b32 m0, 0x0 //SGPR initial index value =0 +- s_nop 0x0 //Manually inserted wait states +- L_SAVE_SGPR_LOOP: +- // SGPR is allocated in 16 SGPR granularity +- s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] +- s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] +- s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] +- s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] +- s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] +- s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] +- s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] +- s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] +- +- write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4 +- s_add_u32 m0, m0, 16 //next sgpr index +- s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 +- s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete? +- // restore s_save_buf_rsrc0,1 +- //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo +- s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo +- +- +- +- +- /* save first 4 VGPR, then LDS save could use */ +- // each wave will alloc 4 vgprs at least... +- ///////////////////////////////////////////////////////////////////////////////////// +- +- s_mov_b32 s_save_mem_offset, 0 +- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on +- s_mov_b32 exec_hi, 0xFFFFFFFF +- +- if (SWIZZLE_EN) +- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
+- else +- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes +- end +- +- +- // VGPR Allocated in 4-GPR granularity +- +-if G8SR_VGPR_SR_IN_DWX4 +- // the const stride for DWx4 is 4*4 bytes +- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 +- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes +- +- buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 +- +- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 +- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes +-else +- buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 +- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 +- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 +- buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 +-end +- +- +- +- /* save LDS */ +- ////////////////////////////// +- +- L_SAVE_LDS: +- +- // Change EXEC to all threads... +- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on +- s_mov_b32 exec_hi, 0xFFFFFFFF +- +- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size +- s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? +- s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE +- +- s_barrier //LDS is used? wait for other waves in the same TG +- s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here +- s_cbranch_scc0 L_SAVE_LDS_DONE +- +- // first wave do LDS save; +- +- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw +- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes +- s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes +- +- // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) +- // +- get_vgpr_size_bytes(s_save_mem_offset) +- get_sgpr_size_bytes(s_save_tmp) +- s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp +- s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() +- +- +- if (SWIZZLE_EN) +- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? +- else +- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes +- end +- +- s_mov_b32 m0, 0x0 //lds_offset initial value = 0 +- +- +-var LDS_DMA_ENABLE = 0 +-var UNROLL = 0 +-if UNROLL==0 && LDS_DMA_ENABLE==1 +- s_mov_b32 s3, 256*2 +- s_nop 0 +- s_nop 0 +- s_nop 0 +- L_SAVE_LDS_LOOP: +- //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.??? +- if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity +- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW +- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW +- end +- +- s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes +- s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes +- s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 +- s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete? 
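The loop just above and the LDS save variants below all size their transfers the same way: the HW_REG_LDS_ALLOC field counts 64-dword units, and each buffer_store_lds_dword moves 64 dwords (256 bytes), so the DMA loop advances m0 and the memory offset by 2*256 per pass. A minimal C sketch of that arithmetic, illustrative only and not patch content:

/* Illustrative: LDS sizing used by the save loops. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t lds_size_field = 2;                  /* example value */
	uint32_t bytes = (lds_size_field << 6) << 2;  /* *64 dw, *4 B  */
	uint32_t iters = bytes / (256 * 2);           /* 128 dw / pass */

	printf("LDS bytes=%u, save-loop iterations=%u\n", bytes, iters);
	return 0;
}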
+- +-elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL , has ichace miss +- // store from higest LDS address to lowest +- s_mov_b32 s3, 256*2 +- s_sub_u32 m0, s_save_alloc_size, s3 +- s_add_u32 s_save_mem_offset, s_save_mem_offset, m0 +- s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128 trunks... +- s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from higheset addr to lowest +- s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction +- s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2is the below 2 inst...//s_addc and s_setpc +- s_nop 0 +- s_nop 0 +- s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes +- s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved +- s_add_u32 s0, s0,s_save_alloc_size +- s_addc_u32 s1, s1, 0 +- s_setpc_b64 s[0:1] +- +- +- for var i =0; i< 128; i++ +- // be careful to make here a 64Byte aligned address, which could improve performance... +- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW +- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW +- +- if i!=127 +- s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e. pack more LDS_DMA inst to one Cacheline +- s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3 +- end +- end +- +-else // BUFFER_STORE +- v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 +- v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid +- v_mul_i32_i24 v2, v3, 8 // tid*8 +- v_mov_b32 v3, 256*2 +- s_mov_b32 m0, 0x10000 +- s_mov_b32 s0, s_save_buf_rsrc3 +- s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid +- s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT +- +-L_SAVE_LDS_LOOP_VECTOR: +- ds_read_b64 v[0:1], v2 //x =LDS[a], byte address +- s_waitcnt lgkmcnt(0) +- buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1 +-// s_waitcnt vmcnt(0) +-// v_add_u32 v2, vcc[0:1], v2, v3 +- v_add_u32 v2, v2, v3 +- v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size +- s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR +- +- // restore rsrc3 +- s_mov_b32 s_save_buf_rsrc3, s0 +- +-end +- +-L_SAVE_LDS_DONE: +- +- +- /* save VGPRs - set the Rest VGPRs */ +- ////////////////////////////////////////////////////////////////////////////////////// +- L_SAVE_VGPR: +- // VGPR SR memory offset: 0 +- // TODO rearrange the RSRC words to use swizzle for VGPR save... +- +- s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs +- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on +- s_mov_b32 exec_hi, 0xFFFFFFFF +- +- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size +- s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 +- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible +- s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) +- if (SWIZZLE_EN) +- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
+- else +- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes +- end +- +- +- // VGPR Allocated in 4-GPR granularity +- +-if G8SR_VGPR_SR_IN_DWX4 +- // the const stride for DWx4 is 4*4 bytes +- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 +- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes +- +- s_mov_b32 m0, 4 // skip first 4 VGPRs +- s_cmp_lt_u32 m0, s_save_alloc_size +- s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs +- +- s_set_gpr_idx_on m0, 0x1 // This will change M0 +- s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 +-L_SAVE_VGPR_LOOP: +- v_mov_b32 v0, v0 // v0 = v[0+m0] +- v_mov_b32 v1, v1 +- v_mov_b32 v2, v2 +- v_mov_b32 v3, v3 +- +- +- buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 +- s_add_u32 m0, m0, 4 +- s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 +- s_cmp_lt_u32 m0, s_save_alloc_size +- s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? +- s_set_gpr_idx_off +-L_SAVE_VGPR_LOOP_END: +- +- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 +- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes +-else +- // VGPR store using dw burst +- s_mov_b32 m0, 0x4 //VGPR initial index value =0 +- s_cmp_lt_u32 m0, s_save_alloc_size +- s_cbranch_scc0 L_SAVE_VGPR_END +- +- +- s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 +- s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later +- +- L_SAVE_VGPR_LOOP: +- v_mov_b32 v0, v0 //v0 = v[0+m0] +- v_mov_b32 v1, v1 //v0 = v[0+m0] +- v_mov_b32 v2, v2 //v0 = v[0+m0] +- v_mov_b32 v3, v3 //v0 = v[0+m0] +- +- if(USE_MTBUF_INSTEAD_OF_MUBUF) +- tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 +- else +- buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 +- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 +- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 +- buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 +- end +- +- s_add_u32 m0, m0, 4 //next vgpr index +- s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes +- s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 +- s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? +- s_set_gpr_idx_off +-end +- +-L_SAVE_VGPR_END: +- +- +- +- +- +- +- /* S_PGM_END_SAVED */ //FIXME graphics ONLY +- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) +- s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] +- s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 +- s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over +- s_rfe_b64 s_save_pc_lo //Return to the main shader program +- else +- end +- +-// Save Done timestamp +-if G8SR_DEBUG_TIMESTAMP +- s_memrealtime s_g8sr_ts_save_d +- // SGPR SR memory offset : size(VGPR) +- get_vgpr_size_bytes(s_save_mem_offset) +- s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET +- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? +- // Need reset rsrc2?? 
+- s_mov_b32 m0, s_save_mem_offset +- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes +- s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1 +-end +- +- +- s_branch L_END_PGM +- +- +- +-/**************************************************************************/ +-/* restore routine */ +-/**************************************************************************/ +- +-L_RESTORE: +- /* Setup Resource Contants */ +- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) +- //calculate wd_addr using absolute thread id +- v_readlane_b32 s_restore_tmp, v9, 0 +- s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 +- s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE +- s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO +- s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI +- s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL +- else +- end +- +-if G8SR_DEBUG_TIMESTAMP +- s_memrealtime s_g8sr_ts_restore_s +- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? +- // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case... +- s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0] +- s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored.. +-end +- +- +- +- s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo +- s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi +- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE +- s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) +- s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC +- s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK +- s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position +- s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC +- s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK +- s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position +- s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE +- +- /* global mem offset */ +-// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0 +- +- /* the first wave in the threadgroup */ +- s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK +- s_cbranch_scc0 L_RESTORE_VGPR +- +- /* restore LDS */ +- ////////////////////////////// +- L_RESTORE_LDS: +- +- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead +- s_mov_b32 exec_hi, 0xFFFFFFFF +- +- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size +- s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? +- s_cbranch_scc0 L_RESTORE_VGPR //no lds used? 
jump to L_RESTORE_VGPR +- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw +- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes +- s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes +- +- // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) +- // +- get_vgpr_size_bytes(s_restore_mem_offset) +- get_sgpr_size_bytes(s_restore_tmp) +- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp +- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? +- +- +- if (SWIZZLE_EN) +- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? +- else +- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes +- end +- s_mov_b32 m0, 0x0 //lds_offset initial value = 0 +- +- L_RESTORE_LDS_LOOP: +- if (SAVE_LDS) +- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW +- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW +- end +- s_add_u32 m0, m0, 256*2 // 128 DW +- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW +- s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 +- s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? +- +- +- /* restore VGPRs */ +- ////////////////////////////// +- L_RESTORE_VGPR: +- // VGPR SR memory offset : 0 +- s_mov_b32 s_restore_mem_offset, 0x0 +- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead +- s_mov_b32 exec_hi, 0xFFFFFFFF +- +- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size +- s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 +- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) +- s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) +- if (SWIZZLE_EN) +- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? +- else +- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes +- end +- +-if G8SR_VGPR_SR_IN_DWX4 +- get_vgpr_size_bytes(s_restore_mem_offset) +- s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 +- +- // the const stride for DWx4 is 4*4 bytes +- s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 +- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes +- +- s_mov_b32 m0, s_restore_alloc_size +- s_set_gpr_idx_on m0, 0x8 // Note.. 
This will change m0 +- +-L_RESTORE_VGPR_LOOP: +- buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 +- s_waitcnt vmcnt(0) +- s_sub_u32 m0, m0, 4 +- v_mov_b32 v0, v0 // v[0+m0] = v0 +- v_mov_b32 v1, v1 +- v_mov_b32 v2, v2 +- v_mov_b32 v3, v3 +- s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 +- s_cmp_eq_u32 m0, 0x8000 +- s_cbranch_scc0 L_RESTORE_VGPR_LOOP +- s_set_gpr_idx_off +- +- s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 +- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes +- +-else +- // VGPR load using dw burst +- s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last +- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 +- s_mov_b32 m0, 4 //VGPR initial index value = 1 +- s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 +- s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later +- +- L_RESTORE_VGPR_LOOP: +- if(USE_MTBUF_INSTEAD_OF_MUBUF) +- tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 +- else +- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 +- buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 +- buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 +- buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 +- end +- s_waitcnt vmcnt(0) //ensure data ready +- v_mov_b32 v0, v0 //v[0+m0] = v0 +- v_mov_b32 v1, v1 +- v_mov_b32 v2, v2 +- v_mov_b32 v3, v3 +- s_add_u32 m0, m0, 4 //next vgpr index +- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes +- s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 +- s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? 
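The s_set_gpr_idx_off that follows closes the loop above. The reason both VGPR loops add a constant to their bound (0x1000 on the save path, 0x8000 here) is that the index-enable nibble lives in M0[15:12] while gpr indexing is active, so the comparison constant carries the same bias. A small C model of this, illustrative only and not part of the patch:

/* Model of the biased VGPR-restore loop bound. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t num_vgprs = 24;                 /* example allocation   */
	uint32_t m0  = 4 | (0x8u << 12);         /* start at v4, mode 0x8 */
	uint32_t end = num_vgprs + (0x8u << 12); /* same bias on the bound */
	unsigned groups = 0;

	for (; m0 < end; m0 += 4)                /* one 4-VGPR group/pass */
		groups++;

	printf("restored %u groups of 4 VGPRs\n", groups);
	return 0;
}

With the example allocation of 24 VGPRs this restores five 4-register groups (v4 through v23); v0..v3 are reloaded separately right after the loop, as the code below shows.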
+- s_set_gpr_idx_off +- /* VGPR restore on v0 */ +- if(USE_MTBUF_INSTEAD_OF_MUBUF) +- tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 +- else +- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 +- buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 +- buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 +- buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 +- end +- +-end +- +- /* restore SGPRs */ +- ////////////////////////////// +- +- // SGPR SR memory offset : size(VGPR) +- get_vgpr_size_bytes(s_restore_mem_offset) +- get_sgpr_size_bytes(s_restore_tmp) +- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp +- s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group +- // TODO, change RSRC word to rearrange memory layout for SGPRS +- +- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size +- s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 +- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) +- +- if (SGPR_SAVE_USE_SQC) +- s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes +- else +- s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) +- end +- if (SWIZZLE_EN) +- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? +- else +- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes +- end +- +- s_mov_b32 m0, s_restore_alloc_size +- +- L_RESTORE_SGPR_LOOP: +- read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made +- s_waitcnt lgkmcnt(0) //ensure data ready +- +- s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] +- s_nop 0 // hazard SALU M0=> S_MOVREL +- +- s_movreld_b64 s0, s0 //s[0+m0] = s0 +- s_movreld_b64 s2, s2 +- s_movreld_b64 s4, s4 +- s_movreld_b64 s6, s6 +- s_movreld_b64 s8, s8 +- s_movreld_b64 s10, s10 +- s_movreld_b64 s12, s12 +- s_movreld_b64 s14, s14 +- +- s_cmp_eq_u32 m0, 0 //scc = (m0 < s_restore_alloc_size) ? 1 : 0 +- s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete? +- +- /* restore HW registers */ +- ////////////////////////////// +- L_RESTORE_HWREG: +- +- +-if G8SR_DEBUG_TIMESTAMP +- s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo +- s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi +-end +- +- // HWREG SR memory offset : size(VGPR)+size(SGPR) +- get_vgpr_size_bytes(s_restore_mem_offset) +- get_sgpr_size_bytes(s_restore_tmp) +- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp +- +- +- s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes +- if (SWIZZLE_EN) +- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
+- else +- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes +- end +- +- read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 +- read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC +- read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) +- read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC +- read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) +- read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS +- read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS +- read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO +- read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI +- read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE +- +- s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS +- +- s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS +- +- //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: +- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) +- s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) +- s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over +- end +- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) +- s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal +- s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over +- end +- +- s_mov_b32 m0, s_restore_m0 +- s_mov_b32 exec_lo, s_restore_exec_lo +- s_mov_b32 exec_hi, s_restore_exec_hi +- +- s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts +- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 +- s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts +- s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT +- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 +- //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore +- s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode +- //reuse s_restore_m0 as a temp register +- s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK +- s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT +- s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT +- s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero +- s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 +- s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK +- s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT +- s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT +- s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 +- s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK +- s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT +- s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp +- +- s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 +- s_and_b64 vcc, 
vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 +- s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu +- +- s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time +- +-if G8SR_DEBUG_TIMESTAMP +- s_memrealtime s_g8sr_ts_restore_d +- s_waitcnt lgkmcnt(0) +-end +- +-// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution +- s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc +- +- +-/**************************************************************************/ +-/* the END */ +-/**************************************************************************/ +-L_END_PGM: +- s_endpgm +- +-end +- +- +-/**************************************************************************/ +-/* the helper functions */ +-/**************************************************************************/ +- +-//Only for save hwreg to mem +-function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) +- s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on +- s_mov_b32 m0, s_mem_offset +- s_buffer_store_dword s, s_rsrc, m0 glc:1 +- s_add_u32 s_mem_offset, s_mem_offset, 4 +- s_mov_b32 m0, exec_lo +-end +- +- +-// HWREG are saved before SGPRs, so all HWREG could be use. +-function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) +- +- s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 +- s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 +- s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 +- s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 +- s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 +- s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc +-end +- +- +-function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) +- s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 +- s_add_u32 s_mem_offset, s_mem_offset, 4 +-end +- +-function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) +- s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 +- s_sub_u32 s_mem_offset, s_mem_offset, 4*16 +-end +- +- +- +-function get_lds_size_bytes(s_lds_size_byte) +- // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW +- s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size +- s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW +-end +- +-function get_vgpr_size_bytes(s_vgpr_size_byte) +- s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size +- s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 +- s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible +-end +- +-function get_sgpr_size_bytes(s_sgpr_size_byte) +- s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size +- s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1 +- s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value) +-end +- +-function get_hwreg_size_bytes +- return 128 //HWREG size 128 bytes +-end +- +- +- +-#endif +- +-static const uint32_t cwsr_trap_gfx9_hex[] = { +- 0xbf820001, 0xbf820124, +- 0xb8f0f802, 0x89708670, +- 0xb8f1f803, 0x8674ff71, +- 0x00000400, 0xbf85001d, +- 0x8674ff71, 0x00000800, +- 0xbf850003, 
0x8674ff71, +- 0x00000100, 0xbf840003, +- 0x8770ff70, 0x00002000, +- 0xbf820010, 0xb8faf812, +- 0xb8fbf813, 0x8efa887a, +- 0xc00a1d3d, 0x00000000, +- 0xbf8cc07f, 0x87737574, +- 0xbf840002, 0xb970f802, +- 0xbe801d74, 0xb8f1f803, +- 0x8671ff71, 0x000001ff, +- 0xbf850002, 0x806c846c, +- 0x826d806d, 0x866dff6d, +- 0x0000ffff, 0xb970f802, +- 0xbe801f6c, 0xb8f1f803, +- 0x8671ff71, 0x00000100, +- 0xbf840006, 0xbef60080, +- 0xb9760203, 0x866dff6d, +- 0x0000ffff, 0x80ec886c, +- 0x82ed806d, 0xbef60080, +- 0xb9760283, 0xbef20068, +- 0xbef30069, 0xb8f62407, +- 0x8e769c76, 0x876d766d, +- 0xb8f603c7, 0x8e769b76, +- 0x876d766d, 0xb8f6f807, +- 0x8676ff76, 0x00007fff, +- 0xb976f807, 0xbeee007e, +- 0xbeef007f, 0xbefe0180, +- 0xbf900004, 0xbf8e0002, +- 0xbf88fffe, 0xbef4007e, +- 0x8675ff7f, 0x0000ffff, +- 0x8775ff75, 0x00040000, +- 0xbef60080, 0xbef700ff, +- 0x00807fac, 0x8676ff7f, +- 0x08000000, 0x8f768376, +- 0x87777677, 0x8676ff7f, +- 0x70000000, 0x8f768176, +- 0x87777677, 0xbefb007c, +- 0xbefa0080, 0xb8fa2a05, +- 0x807a817a, 0x8e7a8a7a, +- 0xb8f61605, 0x80768176, +- 0x8e768676, 0x807a767a, +- 0xbef60084, 0xbef600ff, +- 0x01000000, 0xbefe007c, +- 0xbefc007a, 0xc0611efa, +- 0x0000007c, 0x807a847a, +- 0xbefc007e, 0xbefe007c, +- 0xbefc007a, 0xc0611b3a, +- 0x0000007c, 0x807a847a, +- 0xbefc007e, 0xbefe007c, +- 0xbefc007a, 0xc0611b7a, +- 0x0000007c, 0x807a847a, +- 0xbefc007e, 0xbefe007c, +- 0xbefc007a, 0xc0611bba, +- 0x0000007c, 0x807a847a, +- 0xbefc007e, 0xbefe007c, +- 0xbefc007a, 0xc0611bfa, +- 0x0000007c, 0x807a847a, +- 0xbefc007e, 0xbefe007c, +- 0xbefc007a, 0xc0611c3a, +- 0x0000007c, 0x807a847a, +- 0xbefc007e, 0xb8f1f803, +- 0xbefe007c, 0xbefc007a, +- 0xc0611c7a, 0x0000007c, +- 0x807a847a, 0xbefc007e, +- 0xbefe007c, 0xbefc007a, +- 0xc0611cba, 0x0000007c, +- 0x807a847a, 0xbefc007e, +- 0xbefe007c, 0xbefc007a, +- 0xc0611cfa, 0x0000007c, +- 0x807a847a, 0xbefc007e, +- 0xb8fbf801, 0xbefe007c, +- 0xbefc007a, 0xc0611efa, +- 0x0000007c, 0x807a847a, +- 0xbefc007e, 0x8676ff7f, +- 0x04000000, 0xbeef0080, +- 0x876f6f76, 0xb8fa2a05, +- 0x807a817a, 0x8e7a8a7a, +- 0xb8f11605, 0x80718171, +- 0x8e718471, 0x8e768271, +- 0xbef600ff, 0x01000000, +- 0xbef20174, 0x80747a74, +- 0x82758075, 0xbefc0080, +- 0xbf800000, 0xbe802b00, +- 0xbe822b02, 0xbe842b04, +- 0xbe862b06, 0xbe882b08, +- 0xbe8a2b0a, 0xbe8c2b0c, +- 0xbe8e2b0e, 0xc06b003a, +- 0x00000000, 0xc06b013a, +- 0x00000010, 0xc06b023a, +- 0x00000020, 0xc06b033a, +- 0x00000030, 0x8074c074, +- 0x82758075, 0x807c907c, +- 0xbf0a717c, 0xbf85ffeb, +- 0xbef40172, 0xbefa0080, +- 0xbefe00c1, 0xbeff00c1, +- 0xbef600ff, 0x01000000, +- 0xe0724000, 0x7a1d0000, +- 0xe0724100, 0x7a1d0100, +- 0xe0724200, 0x7a1d0200, +- 0xe0724300, 0x7a1d0300, +- 0xbefe00c1, 0xbeff00c1, +- 0xb8f14306, 0x8671c171, +- 0xbf84002c, 0xbf8a0000, +- 0x8676ff6f, 0x04000000, +- 0xbf840028, 0x8e718671, +- 0x8e718271, 0xbef60071, +- 0xb8fa2a05, 0x807a817a, +- 0x8e7a8a7a, 0xb8f61605, +- 0x80768176, 0x8e768676, +- 0x807a767a, 0x807aff7a, +- 0x00000080, 0xbef600ff, +- 0x01000000, 0xbefc0080, +- 0xd28c0002, 0x000100c1, +- 0xd28d0003, 0x000204c1, +- 0xd1060002, 0x00011103, +- 0x7e0602ff, 0x00000200, +- 0xbefc00ff, 0x00010000, +- 0xbe800077, 0x8677ff77, +- 0xff7fffff, 0x8777ff77, +- 0x00058000, 0xd8ec0000, +- 0x00000002, 0xbf8cc07f, +- 0xe0765000, 0x7a1d0002, +- 0x68040702, 0xd0c9006a, +- 0x0000e302, 0xbf87fff7, +- 0xbef70000, 0xbefa00ff, +- 0x00000400, 0xbefe00c1, +- 0xbeff00c1, 0xb8f12a05, +- 0x80718171, 0x8e718271, +- 0x8e768871, 0xbef600ff, +- 0x01000000, 0xbefc0084, +- 0xbf0a717c, 0xbf840015, +- 0xbf11017c, 0x8071ff71, +- 
0x00001000, 0x7e000300, +- 0x7e020301, 0x7e040302, +- 0x7e060303, 0xe0724000, +- 0x7a1d0000, 0xe0724100, +- 0x7a1d0100, 0xe0724200, +- 0x7a1d0200, 0xe0724300, +- 0x7a1d0300, 0x807c847c, +- 0x807aff7a, 0x00000400, +- 0xbf0a717c, 0xbf85ffef, +- 0xbf9c0000, 0xbf8200c5, +- 0xbef4007e, 0x8675ff7f, +- 0x0000ffff, 0x8775ff75, +- 0x00040000, 0xbef60080, +- 0xbef700ff, 0x00807fac, +- 0x8672ff7f, 0x08000000, +- 0x8f728372, 0x87777277, +- 0x8672ff7f, 0x70000000, +- 0x8f728172, 0x87777277, +- 0x8672ff7f, 0x04000000, +- 0xbf84001e, 0xbefe00c1, +- 0xbeff00c1, 0xb8ef4306, +- 0x866fc16f, 0xbf840019, +- 0x8e6f866f, 0x8e6f826f, +- 0xbef6006f, 0xb8f82a05, +- 0x80788178, 0x8e788a78, +- 0xb8f21605, 0x80728172, +- 0x8e728672, 0x80787278, +- 0x8078ff78, 0x00000080, +- 0xbef600ff, 0x01000000, +- 0xbefc0080, 0xe0510000, +- 0x781d0000, 0xe0510100, +- 0x781d0000, 0x807cff7c, +- 0x00000200, 0x8078ff78, +- 0x00000200, 0xbf0a6f7c, +- 0xbf85fff6, 0xbef80080, +- 0xbefe00c1, 0xbeff00c1, +- 0xb8ef2a05, 0x806f816f, +- 0x8e6f826f, 0x8e76886f, +- 0xbef600ff, 0x01000000, +- 0xbef20078, 0x8078ff78, +- 0x00000400, 0xbefc0084, +- 0xbf11087c, 0x806fff6f, +- 0x00008000, 0xe0524000, +- 0x781d0000, 0xe0524100, +- 0x781d0100, 0xe0524200, +- 0x781d0200, 0xe0524300, +- 0x781d0300, 0xbf8c0f70, +- 0x7e000300, 0x7e020301, +- 0x7e040302, 0x7e060303, +- 0x807c847c, 0x8078ff78, +- 0x00000400, 0xbf0a6f7c, +- 0xbf85ffee, 0xbf9c0000, +- 0xe0524000, 0x721d0000, +- 0xe0524100, 0x721d0100, +- 0xe0524200, 0x721d0200, +- 0xe0524300, 0x721d0300, +- 0xb8f82a05, 0x80788178, +- 0x8e788a78, 0xb8f21605, +- 0x80728172, 0x8e728672, +- 0x80787278, 0x80f8c078, +- 0xb8ef1605, 0x806f816f, +- 0x8e6f846f, 0x8e76826f, +- 0xbef600ff, 0x01000000, +- 0xbefc006f, 0xc031003a, +- 0x00000078, 0x80f8c078, +- 0xbf8cc07f, 0x80fc907c, +- 0xbf800000, 0xbe802d00, +- 0xbe822d02, 0xbe842d04, +- 0xbe862d06, 0xbe882d08, +- 0xbe8a2d0a, 0xbe8c2d0c, +- 0xbe8e2d0e, 0xbf06807c, +- 0xbf84fff0, 0xb8f82a05, +- 0x80788178, 0x8e788a78, +- 0xb8f21605, 0x80728172, +- 0x8e728672, 0x80787278, +- 0xbef60084, 0xbef600ff, +- 0x01000000, 0xc0211bfa, +- 0x00000078, 0x80788478, +- 0xc0211b3a, 0x00000078, +- 0x80788478, 0xc0211b7a, +- 0x00000078, 0x80788478, +- 0xc0211eba, 0x00000078, +- 0x80788478, 0xc0211efa, +- 0x00000078, 0x80788478, +- 0xc0211c3a, 0x00000078, +- 0x80788478, 0xc0211c7a, +- 0x00000078, 0x80788478, +- 0xc0211a3a, 0x00000078, +- 0x80788478, 0xc0211a7a, +- 0x00000078, 0x80788478, +- 0xc0211cfa, 0x00000078, +- 0x80788478, 0xbf8cc07f, +- 0x866dff6d, 0x0000ffff, +- 0xbefc006f, 0xbefe007a, +- 0xbeff007b, 0x866f71ff, +- 0x000003ff, 0xb96f4803, +- 0x866f71ff, 0xfffff800, +- 0x8f6f8b6f, 0xb96fa2c3, +- 0xb973f801, 0x866fff6d, +- 0xf0000000, 0x8f6f9c6f, +- 0x8e6f906f, 0xbef20080, +- 0x87726f72, 0x866fff6d, +- 0x08000000, 0x8f6f9b6f, +- 0x8e6f8f6f, 0x87726f72, +- 0x866fff70, 0x00800000, +- 0x8f6f976f, 0xb972f807, +- 0x86fe7e7e, 0x86ea6a6a, +- 0xb970f802, 0xbf8a0000, +- 0x95806f6c, 0xbf810000, +-}; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +index 207a05e..6316aad 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +@@ -25,7 +25,6 @@ + #include <linux/err.h> + #include <linux/fs.h> + #include <linux/sched.h> +-#include <linux/sched/mm.h> + #include <linux/slab.h> + #include <linux/uaccess.h> + #include <linux/compat.h> +@@ -34,17 +33,13 @@ + #include <linux/mm.h> + #include <linux/mman.h> + #include <asm/processor.h> +-#include <linux/ptrace.h> +- + #include "kfd_priv.h" + #include 
"kfd_device_queue_manager.h" + #include "kfd_dbgmgr.h" +-#include "kfd_ipc.h" + + static long kfd_ioctl(struct file *, unsigned int, unsigned long); + static int kfd_open(struct inode *, struct file *); + static int kfd_mmap(struct file *, struct vm_area_struct *); +-static bool kfd_is_large_bar(struct kfd_dev *dev); + + static const char kfd_dev_name[] = "kfd"; + +@@ -60,14 +55,6 @@ static int kfd_char_dev_major = -1; + static struct class *kfd_class; + struct device *kfd_device; + +-static char *kfd_devnode(struct device *dev, umode_t *mode) +-{ +- if (mode && dev->devt == MKDEV(kfd_char_dev_major, 0)) +- *mode = 0666; +- +- return NULL; +-} +- + int kfd_chardev_init(void) + { + int err = 0; +@@ -82,8 +69,6 @@ int kfd_chardev_init(void) + if (IS_ERR(kfd_class)) + goto err_class_create; + +- kfd_class->devnode = kfd_devnode; +- + kfd_device = device_create(kfd_class, NULL, + MKDEV(kfd_char_dev_major, 0), + NULL, kfd_dev_name); +@@ -132,7 +117,7 @@ static int kfd_open(struct inode *inode, struct file *filep) + return -EPERM; + } + +- process = kfd_create_process(filep); ++ process = kfd_create_process(current); + if (IS_ERR(process)) + return PTR_ERR(process); + +@@ -157,12 +142,12 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, + struct kfd_ioctl_create_queue_args *args) + { + if (args->queue_percentage > KFD_MAX_QUEUE_PERCENTAGE) { +- pr_err("Queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n"); ++ pr_err("kfd: queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n"); + return -EINVAL; + } + + if (args->queue_priority > KFD_MAX_QUEUE_PRIORITY) { +- pr_err("Queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n"); ++ pr_err("kfd: queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n"); + return -EINVAL; + } + +@@ -170,26 +155,26 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, + (!access_ok(VERIFY_WRITE, + (const void __user *) args->ring_base_address, + sizeof(uint64_t)))) { +- pr_err("Can't access ring base address\n"); ++ pr_err("kfd: can't access ring base address\n"); + return -EFAULT; + } + + if (!is_power_of_2(args->ring_size) && (args->ring_size != 0)) { +- pr_err("Ring size must be a power of 2 or 0\n"); ++ pr_err("kfd: ring size must be a power of 2 or 0\n"); + return -EINVAL; + } + + if (!access_ok(VERIFY_WRITE, + (const void __user *) args->read_pointer_address, + sizeof(uint32_t))) { +- pr_err("Can't access read pointer\n"); ++ pr_err("kfd: can't access read pointer\n"); + return -EFAULT; + } + + if (!access_ok(VERIFY_WRITE, + (const void __user *) args->write_pointer_address, + sizeof(uint32_t))) { +- pr_err("Can't access write pointer\n"); ++ pr_err("kfd: can't access write pointer\n"); + return -EFAULT; + } + +@@ -197,7 +182,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, + !access_ok(VERIFY_WRITE, + (const void __user *) args->eop_buffer_address, + sizeof(uint32_t))) { +- pr_debug("Can't access eop buffer"); ++ pr_debug("kfd: can't access eop buffer"); + return -EFAULT; + } + +@@ -205,7 +190,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, + !access_ok(VERIFY_WRITE, + (const void __user *) args->ctx_save_restore_address, + sizeof(uint32_t))) { +- pr_debug("Can't access ctx save restore buffer"); ++ pr_debug("kfd: can't access ctx save restore buffer"); + return -EFAULT; + } + +@@ -221,7 +206,6 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, + 
q_properties->ctx_save_restore_area_address = + args->ctx_save_restore_address; + q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size; +- q_properties->ctl_stack_size = args->ctl_stack_size; + if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE || + args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) + q_properties->type = KFD_QUEUE_TYPE_COMPUTE; +@@ -235,27 +219,27 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, + else + q_properties->format = KFD_QUEUE_FORMAT_PM4; + +- pr_debug("Queue Percentage: %d, %d\n", ++ pr_debug("Queue Percentage (%d, %d)\n", + q_properties->queue_percent, args->queue_percentage); + +- pr_debug("Queue Priority: %d, %d\n", ++ pr_debug("Queue Priority (%d, %d)\n", + q_properties->priority, args->queue_priority); + +- pr_debug("Queue Address: 0x%llX, 0x%llX\n", ++ pr_debug("Queue Address (0x%llX, 0x%llX)\n", + q_properties->queue_address, args->ring_base_address); + +- pr_debug("Queue Size: 0x%llX, %u\n", ++ pr_debug("Queue Size (0x%llX, %u)\n", + q_properties->queue_size, args->ring_size); + +- pr_debug("Queue r/w Pointers: %p, %p\n", +- q_properties->read_ptr, +- q_properties->write_ptr); ++ pr_debug("Queue r/w Pointers (0x%llX, 0x%llX)\n", ++ (uint64_t) q_properties->read_ptr, ++ (uint64_t) q_properties->write_ptr); + +- pr_debug("Queue Format: %d\n", q_properties->format); ++ pr_debug("Queue Format (%d)\n", q_properties->format); + +- pr_debug("Queue EOP: 0x%llX\n", q_properties->eop_ring_buffer_address); ++ pr_debug("Queue EOP (0x%llX)\n", q_properties->eop_ring_buffer_address); + +- pr_debug("Queue CTX save area: 0x%llX\n", ++ pr_debug("Queue CTX save arex (0x%llX)\n", + q_properties->ctx_save_restore_area_address); + + return 0; +@@ -273,16 +257,16 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, + + memset(&q_properties, 0, sizeof(struct queue_properties)); + +- pr_debug("Creating queue ioctl\n"); ++ pr_debug("kfd: creating queue ioctl\n"); + + err = set_queue_properties_from_user(&q_properties, args); + if (err) + return err; + +- pr_debug("Looking for gpu id 0x%x\n", args->gpu_id); ++ pr_debug("kfd: looking for gpu id 0x%x\n", args->gpu_id); + dev = kfd_device_by_id(args->gpu_id); +- if (!dev) { +- pr_debug("Could not find gpu id 0x%x\n", args->gpu_id); ++ if (dev == NULL) { ++ pr_debug("kfd: gpu id 0x%x was not found\n", args->gpu_id); + return -EINVAL; + } + +@@ -294,11 +278,12 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, + goto err_bind_process; + } + +- pr_debug("Creating queue for PASID %d on gpu 0x%x\n", ++ pr_debug("kfd: creating queue for PASID %d on GPU 0x%x\n", + p->pasid, + dev->id); + +- err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id); ++ err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, ++ 0, q_properties.type, &queue_id); + if (err != 0) + goto err_create_queue; + +@@ -306,28 +291,20 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, + + + /* Return gpu_id as doorbell offset for mmap usage */ +- args->doorbell_offset = KFD_MMAP_TYPE_DOORBELL; +- args->doorbell_offset |= KFD_MMAP_GPU_ID(args->gpu_id); ++ args->doorbell_offset = (KFD_MMAP_DOORBELL_MASK | args->gpu_id); + args->doorbell_offset <<= PAGE_SHIFT; +- if (KFD_IS_SOC15(dev->device_info->asic_family)) +- /* On SOC15 ASICs, doorbell allocation must be +- * per-device, and independent from the per-process +- * queue_id. Return the doorbell offset within the +- * doorbell aperture to user mode. 
+- */ +- args->doorbell_offset |= q_properties.doorbell_off; + + mutex_unlock(&p->mutex); + +- pr_debug("Queue id %d was created successfully\n", args->queue_id); ++ pr_debug("kfd: queue id %d was created successfully\n", args->queue_id); + +- pr_debug("Ring buffer address == 0x%016llX\n", ++ pr_debug("ring buffer address == 0x%016llX\n", + args->ring_base_address); + +- pr_debug("Read ptr address == 0x%016llX\n", ++ pr_debug("read ptr address == 0x%016llX\n", + args->read_pointer_address); + +- pr_debug("Write ptr address == 0x%016llX\n", ++ pr_debug("write ptr address == 0x%016llX\n", + args->write_pointer_address); + + return 0; +@@ -344,7 +321,7 @@ static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p, + int retval; + struct kfd_ioctl_destroy_queue_args *args = data; + +- pr_debug("Destroying queue id %d for pasid %d\n", ++ pr_debug("kfd: destroying queue id %d for PASID %d\n", + args->queue_id, + p->pasid); + +@@ -364,12 +341,12 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, + struct queue_properties properties; + + if (args->queue_percentage > KFD_MAX_QUEUE_PERCENTAGE) { +- pr_err("Queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n"); ++ pr_err("kfd: queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n"); + return -EINVAL; + } + + if (args->queue_priority > KFD_MAX_QUEUE_PRIORITY) { +- pr_err("Queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n"); ++ pr_err("kfd: queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n"); + return -EINVAL; + } + +@@ -377,12 +354,12 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, + (!access_ok(VERIFY_WRITE, + (const void __user *) args->ring_base_address, + sizeof(uint64_t)))) { +- pr_err("Can't access ring base address\n"); ++ pr_err("kfd: can't access ring base address\n"); + return -EFAULT; + } + + if (!is_power_of_2(args->ring_size) && (args->ring_size != 0)) { +- pr_err("Ring size must be a power of 2 or 0\n"); ++ pr_err("kfd: ring size must be a power of 2 or 0\n"); + return -EINVAL; + } + +@@ -391,7 +368,7 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, + properties.queue_percent = args->queue_percentage; + properties.priority = args->queue_priority; + +- pr_debug("Updating queue id %d for pasid %d\n", ++ pr_debug("kfd: updating queue id %d for PASID %d\n", + args->queue_id, p->pasid); + + mutex_lock(&p->mutex); +@@ -403,58 +380,6 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, + return retval; + } + +-static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p, +- void *data) +-{ +- int retval; +- const int max_num_cus = 1024; +- struct kfd_ioctl_set_cu_mask_args *args = data; +- struct queue_properties properties; +- uint32_t __user *cu_mask_ptr = (uint32_t __user *)args->cu_mask_ptr; +- size_t cu_mask_size = sizeof(uint32_t) * (args->num_cu_mask / 32); +- +- if ((args->num_cu_mask % 32) != 0) { +- pr_debug("num_cu_mask 0x%x must be a multiple of 32", +- args->num_cu_mask); +- return -EINVAL; +- } +- +- properties.cu_mask_count = args->num_cu_mask; +- if (properties.cu_mask_count == 0) { +- pr_debug("CU mask cannot be 0"); +- return -EINVAL; +- } +- +- /* To prevent an unreasonably large CU mask size, set an arbitrary +- * limit of max_num_cus bits. We can then just drop any CU mask bits +- * past max_num_cus bits and just use the first max_num_cus bits. 
+- */ +- if (properties.cu_mask_count > max_num_cus) { +- pr_debug("CU mask cannot be greater than 1024 bits"); +- properties.cu_mask_count = max_num_cus; +- cu_mask_size = sizeof(uint32_t) * (max_num_cus/32); +- } +- +- properties.cu_mask = kzalloc(cu_mask_size, GFP_KERNEL); +- if (!properties.cu_mask) +- return -ENOMEM; +- +- retval = copy_from_user(properties.cu_mask, cu_mask_ptr, cu_mask_size); +- if (retval) { +- pr_debug("Could not copy CU mask from userspace"); +- kfree(properties.cu_mask); +- return -EFAULT; +- } +- +- mutex_lock(&p->mutex); +- +- retval = pqm_set_cu_mask(&p->pqm, args->queue_id, &properties); +- +- mutex_unlock(&p->mutex); +- +- return retval; +-} +- + static int kfd_ioctl_set_memory_policy(struct file *filep, + struct kfd_process *p, void *data) + { +@@ -475,7 +400,7 @@ static int kfd_ioctl_set_memory_policy(struct file *filep, + } + + dev = kfd_device_by_id(args->gpu_id); +- if (!dev) ++ if (dev == NULL) + return -EINVAL; + + mutex_lock(&p->mutex); +@@ -507,38 +432,6 @@ static int kfd_ioctl_set_memory_policy(struct file *filep, + return err; + } + +-static int kfd_ioctl_set_trap_handler(struct file *filep, +- struct kfd_process *p, void *data) +-{ +- struct kfd_ioctl_set_trap_handler_args *args = data; +- struct kfd_dev *dev; +- int err = 0; +- struct kfd_process_device *pdd; +- +- dev = kfd_device_by_id(args->gpu_id); +- if (!dev) +- return -EINVAL; +- +- mutex_lock(&p->mutex); +- +- pdd = kfd_bind_process_to_device(dev, p); +- if (IS_ERR(pdd)) { +- err = -ESRCH; +- goto out; +- } +- +- if (dev->dqm->ops.set_trap_handler(dev->dqm, +- &pdd->qpd, +- args->tba_addr, +- args->tma_addr)) +- err = -EINVAL; +- +-out: +- mutex_unlock(&p->mutex); +- +- return err; +-} +- + static int kfd_ioctl_dbg_register(struct file *filep, + struct kfd_process *p, void *data) + { +@@ -550,11 +443,16 @@ static int kfd_ioctl_dbg_register(struct file *filep, + long status = 0; + + dev = kfd_device_by_id(args->gpu_id); +- if (!dev) ++ if (dev == NULL) + return -EINVAL; + +- mutex_lock(&p->mutex); ++ if (dev->device_info->asic_family == CHIP_CARRIZO) { ++ pr_debug("kfd_ioctl_dbg_register not supported on CZ\n"); ++ return -EINVAL; ++ } ++ + mutex_lock(kfd_get_dbgmgr_mutex()); ++ mutex_lock(&p->mutex); + + /* + * make sure that we have pdd, if this the first queue created for +@@ -562,11 +460,12 @@ static int kfd_ioctl_dbg_register(struct file *filep, + */ + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { +- status = PTR_ERR(pdd); +- goto out; ++ mutex_unlock(&p->mutex); ++ mutex_unlock(kfd_get_dbgmgr_mutex()); ++ return PTR_ERR(pdd); + } + +- if (!dev->dbgmgr) { ++ if (dev->dbgmgr == NULL) { + /* In case of a legal call, we have no dbgmgr yet */ + create_ok = kfd_dbgmgr_create(&dbgmgr_ptr, dev); + if (create_ok) { +@@ -581,9 +480,8 @@ static int kfd_ioctl_dbg_register(struct file *filep, + status = -EINVAL; + } + +-out: +- mutex_unlock(kfd_get_dbgmgr_mutex()); + mutex_unlock(&p->mutex); ++ mutex_unlock(kfd_get_dbgmgr_mutex()); + + return status; + } +@@ -596,7 +494,7 @@ static int kfd_ioctl_dbg_unregister(struct file *filep, + long status; + + dev = kfd_device_by_id(args->gpu_id); +- if (!dev) ++ if (dev == NULL) + return -EINVAL; + + if (dev->device_info->asic_family == CHIP_CARRIZO) { +@@ -607,7 +505,7 @@ static int kfd_ioctl_dbg_unregister(struct file *filep, + mutex_lock(kfd_get_dbgmgr_mutex()); + + status = kfd_dbgmgr_unregister(dev->dbgmgr, p); +- if (!status) { ++ if (status == 0) { + kfd_dbgmgr_destroy(dev->dbgmgr); + dev->dbgmgr = NULL; + } +@@ -641,13 +539,21 @@ 
static int kfd_ioctl_dbg_address_watch(struct file *filep, + memset((void *) &aw_info, 0, sizeof(struct dbg_address_watch_info)); + + dev = kfd_device_by_id(args->gpu_id); +- if (!dev) ++ if (dev == NULL) ++ return -EINVAL; ++ ++ if (dev->device_info->asic_family == CHIP_CARRIZO) { ++ pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); + return -EINVAL; ++ } + + cmd_from_user = (void __user *) args->content_ptr; + +- if (args->buf_size_in_bytes > MAX_ALLOWED_AW_BUFF_SIZE || +- (args->buf_size_in_bytes <= sizeof(*args))) ++ /* Validate arguments */ ++ ++ if ((args->buf_size_in_bytes > MAX_ALLOWED_AW_BUFF_SIZE) || ++ (args->buf_size_in_bytes <= sizeof(*args) + sizeof(int) * 2) || ++ (cmd_from_user == NULL)) + return -EINVAL; + + /* this is the actual buffer to work with */ +@@ -673,9 +579,9 @@ static int kfd_ioctl_dbg_address_watch(struct file *filep, + /* skip over the addresses buffer */ + args_idx += sizeof(aw_info.watch_address) * aw_info.num_watch_points; + +- if (args_idx >= args->buf_size_in_bytes) { +- status = -EINVAL; +- goto out; ++ if (args_idx >= args->buf_size_in_bytes - sizeof(*args)) { ++ kfree(args_buff); ++ return -EINVAL; + } + + watch_mask_value = (uint64_t) args_buff[args_idx]; +@@ -697,9 +603,9 @@ static int kfd_ioctl_dbg_address_watch(struct file *filep, + args_idx += sizeof(aw_info.watch_mask); + } + +- if (args_idx > args->buf_size_in_bytes) { +- status = -EINVAL; +- goto out; ++ if (args_idx >= args->buf_size_in_bytes - sizeof(args)) { ++ kfree(args_buff); ++ return -EINVAL; + } + + /* Currently HSA Event is not supported for DBG */ +@@ -711,7 +617,6 @@ static int kfd_ioctl_dbg_address_watch(struct file *filep, + + mutex_unlock(kfd_get_dbgmgr_mutex()); + +-out: + kfree(args_buff); + + return status; +@@ -741,9 +646,14 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep, + sizeof(wac_info.trapId); + + dev = kfd_device_by_id(args->gpu_id); +- if (!dev) ++ if (dev == NULL) + return -EINVAL; + ++ if (dev->device_info->asic_family == CHIP_CARRIZO) { ++ pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); ++ return -EINVAL; ++ } ++ + /* input size must match the computed "compact" size */ + if (args->buf_size_in_bytes != computed_buff_size) { + pr_debug("size mismatch, computed : actual %u : %u\n", +@@ -802,37 +712,22 @@ static int kfd_ioctl_get_clock_counters(struct file *filep, + { + struct kfd_ioctl_get_clock_counters_args *args = data; + struct kfd_dev *dev; +-#if (defined OS_NAME_RHEL) && (OS_VERSION_MAJOR == 6) \ +- || (defined OS_NAME_RHEL_7_2) +- struct timespec time; +-#else + struct timespec64 time; +-#endif + + dev = kfd_device_by_id(args->gpu_id); +- if (dev) +- /* Reading GPU clock counter from KGD */ +- args->gpu_clock_counter = +- dev->kfd2kgd->get_gpu_clock_counter(dev->kgd); +- else +- /* Node without GPU resource */ +- args->gpu_clock_counter = 0; ++ if (dev == NULL) ++ return -EINVAL; ++ ++ /* Reading GPU clock counter from KGD */ ++ args->gpu_clock_counter = ++ dev->kfd2kgd->get_gpu_clock_counter(dev->kgd); + + /* No access to rdtsc. 
Using raw monotonic time */ +-#if (defined OS_NAME_RHEL) && (OS_VERSION_MAJOR == 6) \ +- || (defined OS_NAME_RHEL_7_2) +- getrawmonotonic(&time); +- args->cpu_clock_counter = (uint64_t)timespec_to_ns(&time); +- +- get_monotonic_boottime(&time); +- args->system_clock_counter = (uint64_t)timespec_to_ns(&time); +-#else + getrawmonotonic64(&time); + args->cpu_clock_counter = (uint64_t)timespec64_to_ns(&time); + + get_monotonic_boottime64(&time); + args->system_clock_counter = (uint64_t)timespec64_to_ns(&time); +-#endif + + /* Since the counter is in nano-seconds we use 1GHz frequency */ + args->system_clock_freq = 1000000000; +@@ -887,104 +782,12 @@ static int kfd_ioctl_get_process_apertures(struct file *filp, + "scratch_limit %llX\n", pdd->scratch_limit); + + args->num_of_nodes++; +- +- pdd = kfd_get_next_process_device_data(p, pdd); +- } while (pdd && (args->num_of_nodes < NUM_OF_SUPPORTED_GPUS)); +- } +- +- mutex_unlock(&p->mutex); +- +- return 0; +-} +- +-static int kfd_ioctl_get_process_apertures_new(struct file *filp, +- struct kfd_process *p, void *data) +-{ +- struct kfd_ioctl_get_process_apertures_new_args *args = data; +- struct kfd_process_device_apertures *pa; +- struct kfd_process_device *pdd; +- uint32_t nodes = 0; +- int ret; +- +- dev_dbg(kfd_device, "get apertures for PASID %d", p->pasid); +- +- if (args->num_of_nodes == 0) { +- /* Return number of nodes, so that user space can alloacate +- * sufficient memory +- */ +- mutex_lock(&p->mutex); +- +- if (!kfd_has_process_device_data(p)) +- goto out_upwrite; +- +- /* Run over all pdd of the process */ +- pdd = kfd_get_first_process_device_data(p); +- do { +- args->num_of_nodes++; +- pdd = kfd_get_next_process_device_data(p, pdd); +- } while (pdd); +- +- goto out_upwrite; +- } +- +- /* Fill in process-aperture information for all available +- * nodes, but not more than args->num_of_nodes as that is +- * the amount of memory allocated by user +- */ +- pa = kzalloc((sizeof(struct kfd_process_device_apertures) * +- args->num_of_nodes), GFP_KERNEL); +- if (!pa) +- return -ENOMEM; +- +- mutex_lock(&p->mutex); +- +- if (!kfd_has_process_device_data(p)) { +- args->num_of_nodes = 0; +- kfree(pa); +- goto out_upwrite; ++ } while ((pdd = kfd_get_next_process_device_data(p, pdd)) != NULL && ++ (args->num_of_nodes < NUM_OF_SUPPORTED_GPUS)); + } + +- /* Run over all pdd of the process */ +- pdd = kfd_get_first_process_device_data(p); +- do { +- pa[nodes].gpu_id = pdd->dev->id; +- pa[nodes].lds_base = pdd->lds_base; +- pa[nodes].lds_limit = pdd->lds_limit; +- pa[nodes].gpuvm_base = pdd->gpuvm_base; +- pa[nodes].gpuvm_limit = pdd->gpuvm_limit; +- pa[nodes].scratch_base = pdd->scratch_base; +- pa[nodes].scratch_limit = pdd->scratch_limit; +- +- dev_dbg(kfd_device, +- "gpu id %u\n", pdd->dev->id); +- dev_dbg(kfd_device, +- "lds_base %llX\n", pdd->lds_base); +- dev_dbg(kfd_device, +- "lds_limit %llX\n", pdd->lds_limit); +- dev_dbg(kfd_device, +- "gpuvm_base %llX\n", pdd->gpuvm_base); +- dev_dbg(kfd_device, +- "gpuvm_limit %llX\n", pdd->gpuvm_limit); +- dev_dbg(kfd_device, +- "scratch_base %llX\n", pdd->scratch_base); +- dev_dbg(kfd_device, +- "scratch_limit %llX\n", pdd->scratch_limit); +- nodes++; +- +- pdd = kfd_get_next_process_device_data(p, pdd); +- } while (pdd && (nodes < args->num_of_nodes)); + mutex_unlock(&p->mutex); + +- args->num_of_nodes = nodes; +- ret = copy_to_user( +- (void __user *)args->kfd_process_device_apertures_ptr, +- pa, +- (nodes * sizeof(struct kfd_process_device_apertures))); +- kfree(pa); +- return ret ? 
-EFAULT : 0; +- +-out_upwrite: +- mutex_unlock(&p->mutex); + return 0; + } + +@@ -992,57 +795,15 @@ static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p, + void *data) + { + struct kfd_ioctl_create_event_args *args = data; +- struct kfd_dev *kfd; +- struct kfd_process_device *pdd; +- int err = -EINVAL; +- void *mem, *kern_addr = NULL; +- +- pr_debug("Event page offset 0x%llx\n", args->event_page_offset); +- +- if (args->event_page_offset) { +- kfd = kfd_device_by_id(GET_GPU_ID(args->event_page_offset)); +- if (!kfd) { +- pr_err("Getting device by id failed in %s\n", __func__); +- return -EFAULT; +- } +- if (!kfd->device_info->is_need_iommu_device) { +- mutex_lock(&p->mutex); +- pdd = kfd_bind_process_to_device(kfd, p); +- if (IS_ERR(pdd)) { +- err = PTR_ERR(pdd); +- goto out_upwrite; +- } +- mem = kfd_process_device_translate_handle(pdd, +- GET_IDR_HANDLE(args->event_page_offset)); +- if (!mem) { +- pr_err("Can't find BO, offset is 0x%llx\n", +- args->event_page_offset); +- err = -EFAULT; +- goto out_upwrite; +- } +- mutex_unlock(&p->mutex); +- +- /* Map dGPU gtt BO to kernel */ +- kfd->kfd2kgd->map_gtt_bo_to_kernel(kfd->kgd, +- mem, &kern_addr); +- } +- } ++ int err; + +- err = kfd_event_create(filp, p, +- args->event_type, +- args->auto_reset != 0, +- args->node_id, +- &args->event_id, +- &args->event_trigger_data, +- &args->event_page_offset, +- &args->event_slot_index, +- kern_addr); ++ err = kfd_event_create(filp, p, args->event_type, ++ args->auto_reset != 0, args->node_id, ++ &args->event_id, &args->event_trigger_data, ++ &args->event_page_offset, ++ &args->event_slot_index); + + return err; +- +-out_upwrite: +- mutex_unlock(&p->mutex); +- return err; + } + + static int kfd_ioctl_destroy_event(struct file *filp, struct kfd_process *p, +@@ -1085,870 +846,9 @@ static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, + + return err; + } +-static int kfd_ioctl_alloc_scratch_memory(struct file *filep, +- struct kfd_process *p, void *data) +-{ +- struct kfd_ioctl_alloc_memory_of_scratch_args *args = data; +- struct kfd_process_device *pdd; +- struct kfd_dev *dev; +- long err; +- +- if (args->size == 0) +- return -EINVAL; +- +- dev = kfd_device_by_id(args->gpu_id); +- if (!dev) +- return -EINVAL; +- +- mutex_lock(&p->mutex); +- +- pdd = kfd_bind_process_to_device(dev, p); +- if (IS_ERR(pdd)) { +- err = PTR_ERR(pdd); +- goto bind_process_to_device_fail; +- } +- +- pdd->sh_hidden_private_base_vmid = args->va_addr; +- pdd->qpd.sh_hidden_private_base = args->va_addr; +- +- mutex_unlock(&p->mutex); +- +- if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS && +- pdd->qpd.vmid != 0) { +- err = dev->kfd2kgd->alloc_memory_of_scratch( +- dev->kgd, args->va_addr, pdd->qpd.vmid); +- if (err != 0) +- goto alloc_memory_of_scratch_failed; +- } +- +- return 0; +- +-bind_process_to_device_fail: +- mutex_unlock(&p->mutex); +-alloc_memory_of_scratch_failed: +- return -EFAULT; +-} +- +-bool kfd_is_large_bar(struct kfd_dev *dev) +-{ +- struct kfd_local_mem_info mem_info; +- +- if (debug_largebar) { +- pr_debug("Simulate large-bar allocation on non large-bar machine\n"); +- return true; +- } +- +- if (dev->device_info->is_need_iommu_device) +- return false; +- +- dev->kfd2kgd->get_local_mem_info(dev->kgd, &mem_info); +- if (mem_info.local_mem_size_private == 0 && +- mem_info.local_mem_size_public > 0) +- return true; +- return false; +-} +- +-static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, +- struct kfd_process *p, void *data) +-{ +- struct 
kfd_ioctl_alloc_memory_of_gpu_args *args = data; +- struct kfd_process_device *pdd; +- void *mem; +- struct kfd_dev *dev; +- int idr_handle; +- long err; +- uint64_t offset = args->mmap_offset; +- uint32_t flags = args->flags; +- struct vm_area_struct *vma; +- +- if (args->size == 0) +- return -EINVAL; +- +- dev = kfd_device_by_id(args->gpu_id); +- if (!dev) +- return -EINVAL; +- +- if (flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) { +- /* Check if the userptr corresponds to another (or third-party) +- * device local memory. If so treat is as a doorbell. User +- * space will be oblivious of this and will use this doorbell +- * BO as a regular userptr BO +- */ +- vma = find_vma(current->mm, args->mmap_offset); +- if (vma && (vma->vm_flags & VM_IO)) { +- unsigned long pfn; +- +- follow_pfn(vma, args->mmap_offset, &pfn); +- flags |= KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL; +- flags &= ~KFD_IOC_ALLOC_MEM_FLAGS_USERPTR; +- offset = (pfn << PAGE_SHIFT); +- } +- } else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) { +- if (args->size != kfd_doorbell_process_slice(dev)) +- return -EINVAL; +- offset = kfd_get_process_doorbells(dev, p); +- } +- +- mutex_lock(&p->mutex); +- +- pdd = kfd_bind_process_to_device(dev, p); +- if (IS_ERR(pdd)) { +- err = PTR_ERR(pdd); +- goto err_unlock; +- } +- +- err = dev->kfd2kgd->alloc_memory_of_gpu( +- dev->kgd, args->va_addr, args->size, +- pdd->vm, (struct kgd_mem **) &mem, &offset, +- flags); +- +- if (err) +- goto err_unlock; +- +- idr_handle = kfd_process_device_create_obj_handle(pdd, mem, +- args->va_addr, args->size, NULL); +- if (idr_handle < 0) { +- err = -EFAULT; +- goto err_free; +- } +- +- mutex_unlock(&p->mutex); +- +- args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); +- if ((args->flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) != 0 && +- !kfd_is_large_bar(dev)) { +- args->mmap_offset = 0; +- } else { +- args->mmap_offset = KFD_MMAP_TYPE_MAP_BO; +- args->mmap_offset |= KFD_MMAP_GPU_ID(args->gpu_id); +- args->mmap_offset <<= PAGE_SHIFT; +- args->mmap_offset |= offset; +- } +- +- return 0; +- +-err_free: +- dev->kfd2kgd->free_memory_of_gpu(dev->kgd, +- (struct kgd_mem *) mem, +- pdd->vm); +-err_unlock: +- mutex_unlock(&p->mutex); +- return err; +-} +- +-static int kfd_ioctl_free_memory_of_gpu(struct file *filep, +- struct kfd_process *p, void *data) +-{ +- struct kfd_ioctl_free_memory_of_gpu_args *args = data; +- struct kfd_process_device *pdd; +- struct kfd_bo *buf_obj; +- struct kfd_dev *dev; +- int ret; +- +- dev = kfd_device_by_id(GET_GPU_ID(args->handle)); +- if (!dev) +- return -EINVAL; +- +- mutex_lock(&p->mutex); +- +- pdd = kfd_get_process_device_data(dev, p); +- if (!pdd) { +- pr_err("Process device data doesn't exist\n"); +- ret = -EINVAL; +- goto err_unlock; +- } +- +- buf_obj = kfd_process_device_find_bo(pdd, +- GET_IDR_HANDLE(args->handle)); +- if (!buf_obj) { +- ret = -EINVAL; +- goto err_unlock; +- } +- run_rdma_free_callback(buf_obj); +- +- ret = dev->kfd2kgd->free_memory_of_gpu(dev->kgd, buf_obj->mem, +- pdd->vm); +- +- /* If freeing the buffer failed, leave the handle in place for +- * clean-up during process tear-down. 
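Note on the handle plumbing in the alloc/free paths above: MAKE_HANDLE(), GET_GPU_ID() and GET_IDR_HANDLE() are defined in kfd_priv.h, which is not part of this hunk. The sketch below models the usual layout, gpu_id in the upper 32 bits and the IDR handle in the lower 32; that split is an assumption here, not something this patch states.

/* Illustrative model of the 64-bit buffer handle packing (assumed layout). */
#include <assert.h>
#include <stdint.h>

static uint64_t make_handle(uint32_t gpu_id, uint32_t idr)
{
	return ((uint64_t)gpu_id << 32) | idr;
}

int main(void)
{
	uint64_t h = make_handle(0x1002, 7);

	assert((uint32_t)(h >> 32) == 0x1002);		/* GET_GPU_ID()     */
	assert((uint32_t)(h & 0xffffffff) == 7);	/* GET_IDR_HANDLE() */
	return 0;
}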
+- */ +- if (ret == 0) +- kfd_process_device_remove_obj_handle( +- pdd, GET_IDR_HANDLE(args->handle)); +- +-err_unlock: +- mutex_unlock(&p->mutex); +- return ret; +-} +- +-static int kfd_ioctl_map_memory_to_gpu(struct file *filep, +- struct kfd_process *p, void *data) +-{ +- struct kfd_ioctl_map_memory_to_gpu_args *args = data; +- struct kfd_process_device *pdd, *peer_pdd; +- void *mem; +- struct kfd_dev *dev, *peer; +- long err = 0; +- int i, num_dev = 0; +- uint32_t *devices_arr = NULL; +- +- dev = kfd_device_by_id(GET_GPU_ID(args->handle)); +- if (!dev) +- return -EINVAL; +- +- if (args->device_ids_array_size > 0 && +- (args->device_ids_array_size < sizeof(uint32_t))) { +- pr_err("Node IDs array size %u\n", +- args->device_ids_array_size); +- return -EFAULT; +- } +- +- if (args->device_ids_array_size > 0) { +- devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL); +- if (!devices_arr) +- return -ENOMEM; +- +- err = copy_from_user(devices_arr, +- (void __user *)args->device_ids_array_ptr, +- args->device_ids_array_size); +- if (err != 0) { +- err = -EFAULT; +- goto copy_from_user_failed; +- } +- } +- +- mutex_lock(&p->mutex); +- +- pdd = kfd_bind_process_to_device(dev, p); +- if (IS_ERR(pdd)) { +- err = PTR_ERR(pdd); +- goto bind_process_to_device_failed; +- } +- +- mem = kfd_process_device_translate_handle(pdd, +- GET_IDR_HANDLE(args->handle)); +- if (!mem) { +- err = PTR_ERR(mem); +- goto get_mem_obj_from_handle_failed; +- } +- +- if (args->device_ids_array_size > 0) { +- num_dev = args->device_ids_array_size / sizeof(uint32_t); +- for (i = 0 ; i < num_dev; i++) { +- peer = kfd_device_by_id(devices_arr[i]); +- if (!peer) { +- pr_err("Getting device by id failed for 0x%x\n", +- devices_arr[i]); +- err = -EFAULT; +- goto get_mem_obj_from_handle_failed; +- } +- +- peer_pdd = kfd_bind_process_to_device(peer, p); +- if (!peer_pdd) { +- err = -EFAULT; +- goto get_mem_obj_from_handle_failed; +- } +- err = peer->kfd2kgd->map_memory_to_gpu( +- peer->kgd, (struct kgd_mem *)mem, peer_pdd->vm); +- if (err != 0) +- pr_err("Failed to map\n"); +- } +- } else { +- err = dev->kfd2kgd->map_memory_to_gpu( +- dev->kgd, (struct kgd_mem *)mem, pdd->vm); +- if (err != 0) +- pr_err("Failed to map\n"); +- } +- +- mutex_unlock(&p->mutex); +- +- err = dev->kfd2kgd->sync_memory(dev->kgd, (struct kgd_mem *) mem, true); +- if (err) { +- pr_debug("Sync memory failed, wait interrupted by user signal\n"); +- goto sync_memory_failed; +- } +- +- /* Flush TLBs after waiting for the page table updates to complete */ +- if (args->device_ids_array_size > 0) { +- for (i = 0; i < num_dev; i++) { +- peer = kfd_device_by_id(devices_arr[i]); +- if (WARN_ON_ONCE(!peer)) +- continue; +- peer_pdd = kfd_get_process_device_data(dev, p); +- if (WARN_ON_ONCE(!peer_pdd)) +- continue; +- kfd_flush_tlb(peer, p->pasid); +- } +- } else { +- kfd_flush_tlb(dev, p->pasid); +- } +- +- if (args->device_ids_array_size > 0 && devices_arr) +- kfree(devices_arr); +- +- return err; +- +-bind_process_to_device_failed: +-get_mem_obj_from_handle_failed: +- mutex_unlock(&p->mutex); +-copy_from_user_failed: +-sync_memory_failed: +- kfree(devices_arr); +- return err; +-} +- +-int kfd_unmap_memory_from_gpu(void *mem, struct kfd_process_device *pdd) +-{ +- int err; +- struct kfd_dev *dev = pdd->dev; +- +- err = dev->kfd2kgd->unmap_memory_to_gpu( +- dev->kgd, (struct kgd_mem *) mem, pdd->vm); +- +- if (err != 0) +- return err; +- +- kfd_flush_tlb(dev, pdd->process->pasid); +- +- return 0; +-} +- +-static int kfd_ioctl_unmap_memory_from_gpu(struct 
file *filep, +- struct kfd_process *p, void *data) +-{ +- struct kfd_ioctl_unmap_memory_from_gpu_args *args = data; +- struct kfd_process_device *pdd, *peer_pdd; +- void *mem; +- struct kfd_dev *dev, *peer; +- long err = 0; +- uint32_t *devices_arr = NULL, num_dev, i; +- +- dev = kfd_device_by_id(GET_GPU_ID(args->handle)); +- if (!dev) +- return -EINVAL; +- +- if (args->device_ids_array_size > 0 && +- (args->device_ids_array_size < sizeof(uint32_t))) { +- pr_err("Node IDs array size %u\n", +- args->device_ids_array_size); +- return -EFAULT; +- } +- +- if (args->device_ids_array_size > 0) { +- devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL); +- if (!devices_arr) +- return -ENOMEM; +- +- err = copy_from_user(devices_arr, +- (void __user *)args->device_ids_array_ptr, +- args->device_ids_array_size); +- if (err != 0) { +- err = -EFAULT; +- goto copy_from_user_failed; +- } +- } +- +- mutex_lock(&p->mutex); +- +- pdd = kfd_get_process_device_data(dev, p); +- if (!pdd) { +- pr_err("Process device data doesn't exist\n"); +- err = PTR_ERR(pdd); +- goto bind_process_to_device_failed; +- } +- +- mem = kfd_process_device_translate_handle(pdd, +- GET_IDR_HANDLE(args->handle)); +- if (!mem) { +- err = PTR_ERR(mem); +- goto get_mem_obj_from_handle_failed; +- } +- +- if (args->device_ids_array_size > 0) { +- num_dev = args->device_ids_array_size / sizeof(uint32_t); +- for (i = 0 ; i < num_dev; i++) { +- peer = kfd_device_by_id(devices_arr[i]); +- if (!peer) { +- err = -EFAULT; +- goto get_mem_obj_from_handle_failed; +- } +- +- peer_pdd = kfd_get_process_device_data(peer, p); +- if (!peer_pdd) { +- err = -EFAULT; +- goto get_mem_obj_from_handle_failed; +- } +- kfd_unmap_memory_from_gpu(mem, peer_pdd); +- } +- kfree(devices_arr); +- } else +- kfd_unmap_memory_from_gpu(mem, pdd); +- +- mutex_unlock(&p->mutex); +- +- return 0; +- +-bind_process_to_device_failed: +-get_mem_obj_from_handle_failed: +- mutex_unlock(&p->mutex); +-copy_from_user_failed: +- kfree(devices_arr); +- return err; +-} +- +-static int kfd_ioctl_set_process_dgpu_aperture(struct file *filep, +- struct kfd_process *p, void *data) +-{ +- struct kfd_ioctl_set_process_dgpu_aperture_args *args = data; +- struct kfd_dev *dev; +- struct kfd_process_device *pdd; +- long err; +- +- dev = kfd_device_by_id(args->gpu_id); +- if (!dev) +- return -EINVAL; +- +- mutex_lock(&p->mutex); +- +- pdd = kfd_bind_process_to_device(dev, p); +- if (IS_ERR(pdd)) { +- err = PTR_ERR(pdd); +- goto exit; +- } +- +- err = kfd_set_process_dgpu_aperture(pdd, args->dgpu_base, +- args->dgpu_limit); +- +-exit: +- mutex_unlock(&p->mutex); +- return err; +-} +- +-static int kfd_ioctl_get_dmabuf_info(struct file *filep, +- struct kfd_process *p, void *data) +-{ +- struct kfd_ioctl_get_dmabuf_info_args *args = data; +- struct kfd_dev *dev = NULL; +- struct kgd_dev *dma_buf_kgd; +- void *metadata_buffer = NULL; +- uint32_t flags; +- unsigned int i; +- int r; +- +- /* Find a KFD GPU device that supports the get_dmabuf_info query */ +- for (i = 0; kfd_topology_enum_kfd_devices(i, &dev) == 0; i++) +- if (dev && dev->kfd2kgd->get_dmabuf_info) +- break; +- if (!dev) +- return -EINVAL; +- +- if (args->metadata_ptr) { +- metadata_buffer = kzalloc(args->metadata_size, GFP_KERNEL); +- if (!metadata_buffer) +- return -ENOMEM; +- } +- +- /* Get dmabuf info from KGD */ +- r = dev->kfd2kgd->get_dmabuf_info(dev->kgd, args->dmabuf_fd, +- &dma_buf_kgd, &args->size, +- metadata_buffer, args->metadata_size, +- &args->metadata_size, &flags); +- if (r) +- goto exit; +- +- /* 
Reverse-lookup gpu_id from kgd pointer */ +- dev = kfd_device_by_kgd(dma_buf_kgd); +- if (!dev) { +- r = -EINVAL; +- goto exit; +- } +- args->gpu_id = dev->id; +- args->flags = flags; +- +- /* Copy metadata buffer to user mode */ +- if (metadata_buffer) { +- r = copy_to_user((void __user *)args->metadata_ptr, +- metadata_buffer, args->metadata_size); +- if (r != 0) +- r = -EFAULT; +- } +- +-exit: +- kfree(metadata_buffer); +- +- return r; +-} +- +-static int kfd_ioctl_import_dmabuf(struct file *filep, +- struct kfd_process *p, void *data) +-{ +- struct kfd_ioctl_import_dmabuf_args *args = data; +- struct kfd_dev *dev; +- int r; +- +- dev = kfd_device_by_id(args->gpu_id); +- if (!dev) +- return -EINVAL; +- +- r = kfd_ipc_import_dmabuf(dev, p, args->gpu_id, args->dmabuf_fd, +- args->va_addr, &args->handle, NULL); +- if (r) +- pr_err("Failed to import dmabuf\n"); +- +- return r; +-} +- +-static int kfd_ioctl_ipc_export_handle(struct file *filep, +- struct kfd_process *p, +- void *data) +-{ +- struct kfd_ioctl_ipc_export_handle_args *args = data; +- struct kfd_dev *dev; +- int r; +- +- dev = kfd_device_by_id(args->gpu_id); +- if (!dev) +- return -EINVAL; +- +- r = kfd_ipc_export_as_handle(dev, p, args->handle, args->share_handle); +- if (r) +- pr_err("Failed to export IPC handle\n"); +- +- return r; +-} +- +-static int kfd_ioctl_ipc_import_handle(struct file *filep, +- struct kfd_process *p, +- void *data) +-{ +- struct kfd_ioctl_ipc_import_handle_args *args = data; +- struct kfd_dev *dev = NULL; +- int r; +- +- dev = kfd_device_by_id(args->gpu_id); +- if (!dev) +- return -EINVAL; +- +- r = kfd_ipc_import_handle(dev, p, args->gpu_id, args->share_handle, +- args->va_addr, &args->handle, +- &args->mmap_offset); +- if (r) +- pr_err("Failed to import IPC handle\n"); +- +- return r; +-} +- +-static int kfd_ioctl_get_tile_config(struct file *filep, +- struct kfd_process *p, void *data) +-{ +- struct kfd_ioctl_get_tile_config_args *args = data; +- struct kfd_dev *dev; +- struct tile_config config; +- int err = 0; +- +- dev = kfd_device_by_id(args->gpu_id); +- +- dev->kfd2kgd->get_tile_config(dev->kgd, &config); +- +- args->gb_addr_config = config.gb_addr_config; +- args->num_banks = config.num_banks; +- args->num_ranks = config.num_ranks; +- +- if (args->num_tile_configs > config.num_tile_configs) +- args->num_tile_configs = config.num_tile_configs; +- err = copy_to_user((void __user *)args->tile_config_ptr, +- config.tile_config_ptr, +- args->num_tile_configs * sizeof(uint32_t)); +- if (err) { +- args->num_tile_configs = 0; +- return -EFAULT; +- } +- +- if (args->num_macro_tile_configs > config.num_macro_tile_configs) +- args->num_macro_tile_configs = +- config.num_macro_tile_configs; +- err = copy_to_user((void __user *)args->macro_tile_config_ptr, +- config.macro_tile_config_ptr, +- args->num_macro_tile_configs * sizeof(uint32_t)); +- if (err) { +- args->num_macro_tile_configs = 0; +- return -EFAULT; +- } +- +- return 0; +-} +- +-#if defined(BUILD_AS_DKMS) +-static int kfd_ioctl_cross_memory_copy(struct file *filep, +- struct kfd_process *local_p, void *data) +-{ +- return 0; +-} +-#else +-static int kfd_ioctl_cross_memory_copy(struct file *filep, +- struct kfd_process *local_p, void *data) +-{ +- struct kfd_ioctl_cross_memory_copy_args *args = data; +- struct kfd_memory_range *src_array, *dst_array; +- struct kfd_bo *src_bo, *dst_bo; +- struct kfd_process *remote_p, *src_p, *dst_p; +- struct task_struct *remote_task; +- struct mm_struct *remote_mm; +- struct pid *remote_pid; +- struct dma_fence 
*fence = NULL, *lfence = NULL; +- uint64_t dst_va_addr; +- uint64_t copied, total_copied = 0; +- uint64_t src_offset, dst_offset; +- int i, j = 0, err = 0; +- +- /* Check parameters */ +- if (args->src_mem_range_array == 0 || args->dst_mem_range_array == 0 || +- args->src_mem_array_size == 0 || args->dst_mem_array_size == 0) +- return -EINVAL; +- args->bytes_copied = 0; +- +- /* Allocate space for source and destination arrays */ +- src_array = kmalloc_array((args->src_mem_array_size + +- args->dst_mem_array_size), +- sizeof(struct kfd_memory_range), +- GFP_KERNEL); +- if (!src_array) +- return -ENOMEM; +- dst_array = &src_array[args->src_mem_array_size]; +- +- if (copy_from_user(src_array, (void __user *)args->src_mem_range_array, +- args->src_mem_array_size * +- sizeof(struct kfd_memory_range))) { +- err = -EFAULT; +- goto copy_from_user_fail; +- } +- if (copy_from_user(dst_array, (void __user *)args->dst_mem_range_array, +- args->dst_mem_array_size * +- sizeof(struct kfd_memory_range))) { +- err = -EFAULT; +- goto copy_from_user_fail; +- } +- +- /* Get remote process */ +- remote_pid = find_get_pid(args->pid); +- if (!remote_pid) { +- pr_err("Cross mem copy failed. Invalid PID %d\n", args->pid); +- err = -ESRCH; +- goto copy_from_user_fail; +- } +- +- remote_task = get_pid_task(remote_pid, PIDTYPE_PID); +- if (!remote_pid) { +- pr_err("Cross mem copy failed. Invalid PID or task died %d\n", +- args->pid); +- err = -ESRCH; +- goto get_pid_task_fail; +- } +- +- /* Check access permission */ +- remote_mm = mm_access(remote_task, PTRACE_MODE_ATTACH_REALCREDS); +- if (!remote_mm || IS_ERR(remote_mm)) { +- err = IS_ERR(remote_mm) ? PTR_ERR(remote_mm) : -ESRCH; +- if (err == -EACCES) { +- pr_err("Cross mem copy failed. Permission error\n"); +- err = -EPERM; +- } else +- pr_err("Cross mem copy failed. Invalid task %d\n", +- err); +- goto mm_access_fail; +- } +- +- remote_p = kfd_get_process(remote_task); +- if (!remote_p) { +- pr_err("Cross mem copy failed. Invalid kfd process %d\n", +- args->pid); +- err = -EINVAL; +- goto kfd_process_fail; +- } +- +- if (KFD_IS_CROSS_MEMORY_WRITE(args->flags)) { +- src_p = local_p; +- dst_p = remote_p; +- pr_debug("CMA WRITE: local -> remote\n"); +- } else { +- src_p = remote_p; +- dst_p = local_p; +- pr_debug("CMA READ: remote -> local\n"); +- } +- +- +- /* For each source kfd_range: +- * - Find the BO. Each range has to be within the same BO. +- * - Copy this range to single or multiple destination BOs. +- * - dst_va_addr - will point to next va address into which data will +- * be copied. +- * - dst_bo & src_bo - the current destination and source BOs +- * - src_offset & dst_offset - offset into the respective BOs from +- * data will be sourced or copied +- */ +- dst_va_addr = dst_array[0].va_addr; +- mutex_lock(&dst_p->mutex); +- dst_bo = kfd_process_find_bo_from_interval(dst_p, +- dst_va_addr, +- dst_va_addr + dst_array[0].size - 1); +- mutex_unlock(&dst_p->mutex); +- if (!dst_bo) { +- err = -EFAULT; +- goto kfd_process_fail; +- } +- dst_offset = dst_va_addr - dst_bo->it.start; +- +- for (i = 0; i < args->src_mem_array_size; i++) { +- uint64_t src_va_addr_end = src_array[i].va_addr + +- src_array[i].size - 1; +- uint64_t src_size_to_copy = src_array[i].size; +- +- mutex_lock(&src_p->mutex); +- src_bo = kfd_process_find_bo_from_interval(src_p, +- src_array[i].va_addr, +- src_va_addr_end); +- mutex_unlock(&src_p->mutex); +- if (!src_bo || src_va_addr_end > src_bo->it.last) { +- pr_err("Cross mem copy failed. 
Invalid range\n"); +- err = -EFAULT; +- break; +- } +- +- src_offset = src_array[i].va_addr - src_bo->it.start; +- +- /* Copy src_bo to one or multiple dst_bo(s) based on size and +- * and current copy location. +- */ +- while (j < args->dst_mem_array_size) { +- uint64_t copy_size; +- int64_t space_left; +- +- /* Find the current copy_size. This will be smaller of +- * the following +- * - space left in the current dest memory range +- * - data left to copy from source range +- */ +- space_left = (dst_array[j].va_addr + dst_array[j].size) +- - dst_va_addr; +- copy_size = (src_size_to_copy < space_left) ? +- src_size_to_copy : space_left; +- +- /* Check both BOs belong to same device */ +- if (src_bo->dev->kgd != dst_bo->dev->kgd) { +- pr_err("Cross Memory failed. Not same device\n"); +- err = -EINVAL; +- break; +- } +- +- /* Store prev fence. Release it when a later fence is +- * created +- */ +- lfence = fence; +- fence = NULL; +- +- err = dst_bo->dev->kfd2kgd->copy_mem_to_mem( +- src_bo->dev->kgd, +- src_bo->mem, src_offset, +- dst_bo->mem, dst_offset, +- copy_size, +- &fence, &copied); +- +- if (err) { +- pr_err("GPU Cross mem copy failed\n"); +- err = -EFAULT; +- break; +- } +- +- /* Later fence available. Release old fence */ +- if (fence && lfence) { +- dma_fence_put(lfence); +- lfence = NULL; +- } +- +- total_copied += copied; +- src_size_to_copy -= copied; +- space_left -= copied; +- dst_va_addr += copied; +- dst_offset += copied; +- src_offset += copied; +- if (dst_va_addr > dst_bo->it.last + 1) { +- pr_err("Cross mem copy failed. Memory overflow\n"); +- err = -EFAULT; +- break; +- } +- +- /* If the cur dest range is full move to next one */ +- if (space_left <= 0) { +- if (++j >= args->dst_mem_array_size) +- break; +- +- dst_va_addr = dst_array[j].va_addr; +- dst_bo = kfd_process_find_bo_from_interval( +- dst_p, +- dst_va_addr, +- dst_va_addr + +- dst_array[j].size - 1); +- dst_offset = dst_va_addr - dst_bo->it.start; +- } +- +- /* If the cur src range is done, move to next one */ +- if (src_size_to_copy <= 0) +- break; +- } +- if (err) +- break; +- } +- +- /* Wait for the last fence irrespective of error condition */ +- if (fence) { +- if (dma_fence_wait_timeout(fence, false, msecs_to_jiffies(1000)) +- < 0) +- pr_err("Cross mem copy failed. BO timed out\n"); +- dma_fence_put(fence); +- } else if (lfence) { +- pr_debug("GPU copy fail. But wait for prev DMA to finish\n"); +- dma_fence_wait_timeout(lfence, true, msecs_to_jiffies(1000)); +- dma_fence_put(lfence); +- } +- +-kfd_process_fail: +- mmput(remote_mm); +-mm_access_fail: +- put_task_struct(remote_task); +-get_pid_task_fail: +- put_pid(remote_pid); +-copy_from_user_fail: +- kfree(src_array); +- +- /* An error could happen after partial copy. 
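Note on the copy loop above: it advances through the source and destination range lists in lockstep, and each step copies min(bytes left in the current source range, space left in the current destination range), carrying the destination position across source ranges. The same walk in isolation, as a plain userspace model with no BOs or fences (names and sizes are made up for the example):

/* Chunking model of the cross-memory copy loop; illustrative only. */
#include <stdint.h>
#include <stdio.h>

struct range { uint64_t addr, size; };

static void plan_copies(struct range *src, int ns, struct range *dst, int nd)
{
	uint64_t dst_off = 0;	/* carries over between source ranges */
	int j = 0;

	for (int i = 0; i < ns && j < nd; i++) {
		uint64_t src_off = 0;

		while (src_off < src[i].size && j < nd) {
			uint64_t space = dst[j].size - dst_off;
			uint64_t left = src[i].size - src_off;
			uint64_t n = left < space ? left : space;

			printf("copy %llu bytes: src[%d]+%llu -> dst[%d]+%llu\n",
			       (unsigned long long)n, i,
			       (unsigned long long)src_off, j,
			       (unsigned long long)dst_off);
			src_off += n;
			dst_off += n;
			if (dst_off == dst[j].size) {	/* dest range full: next one */
				j++;
				dst_off = 0;
			}
		}
	}
}

int main(void)
{
	struct range src[] = { { 0x1000, 24 } };
	struct range dst[] = { { 0x9000, 16 }, { 0xa000, 16 } };

	plan_copies(src, 1, dst, 2);	/* 16 bytes into dst[0], 8 into dst[1] */
	return 0;
}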
In that case this will +- * reflect partial amount of bytes copied +- */ +- args->bytes_copied = total_copied; +- return err; +-} +-#endif +- +-static int kfd_ioctl_get_queue_wave_state(struct file *filep, +- struct kfd_process *p, void *data) +-{ +- struct kfd_ioctl_get_queue_wave_state_args *args = data; +- int r; +- +- mutex_lock(&p->mutex); +- +- r = pqm_get_wave_state(&p->pqm, args->queue_id, +- (void __user *)args->ctl_stack_address, +- &args->ctl_stack_used_size, +- &args->save_area_used_size); +- +- mutex_unlock(&p->mutex); +- +- return r; +-} + + #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ +- [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \ +- .cmd_drv = 0, .name = #ioctl} ++ [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, .cmd_drv = 0, .name = #ioctl} + + /** Ioctl table */ + static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { +@@ -1999,55 +899,6 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_WAVE_CONTROL, + kfd_ioctl_dbg_wave_control, 0), +- +- AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, +- kfd_ioctl_alloc_memory_of_gpu, 0), +- +- AMDKFD_IOCTL_DEF(AMDKFD_IOC_FREE_MEMORY_OF_GPU, +- kfd_ioctl_free_memory_of_gpu, 0), +- +- AMDKFD_IOCTL_DEF(AMDKFD_IOC_MAP_MEMORY_TO_GPU, +- kfd_ioctl_map_memory_to_gpu, 0), +- +- AMDKFD_IOCTL_DEF(AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, +- kfd_ioctl_unmap_memory_from_gpu, 0), +- +- AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH, +- kfd_ioctl_alloc_scratch_memory, 0), +- +- AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_CU_MASK, +- kfd_ioctl_set_cu_mask, 0), +- +- AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE, +- kfd_ioctl_set_process_dgpu_aperture, 0), +- +- AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_TRAP_HANDLER, +- kfd_ioctl_set_trap_handler, 0), +- +- AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_PROCESS_APERTURES_NEW, +- kfd_ioctl_get_process_apertures_new, 0), +- +- AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_DMABUF_INFO, +- kfd_ioctl_get_dmabuf_info, 0), +- +- AMDKFD_IOCTL_DEF(AMDKFD_IOC_IMPORT_DMABUF, +- kfd_ioctl_import_dmabuf, 0), +- +- AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_TILE_CONFIG, +- kfd_ioctl_get_tile_config, 0), +- +- AMDKFD_IOCTL_DEF(AMDKFD_IOC_IPC_IMPORT_HANDLE, +- kfd_ioctl_ipc_import_handle, 0), +- +- AMDKFD_IOCTL_DEF(AMDKFD_IOC_IPC_EXPORT_HANDLE, +- kfd_ioctl_ipc_export_handle, 0), +- +- AMDKFD_IOCTL_DEF(AMDKFD_IOC_CROSS_MEMORY_COPY, +- kfd_ioctl_cross_memory_copy, 0), +- +- AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_QUEUE_WAVE_STATE, +- kfd_ioctl_get_queue_wave_state, 0) +- + }; + + #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) +@@ -2143,37 +994,19 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) + static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) + { + struct kfd_process *process; +- struct kfd_dev *kfd; +- unsigned long vm_pgoff; +- int retval; + + process = kfd_get_process(current); + if (IS_ERR(process)) + return PTR_ERR(process); + +- vm_pgoff = vma->vm_pgoff; +- vma->vm_pgoff = KFD_MMAP_OFFSET_VALUE_GET(vma->vm_pgoff); +- +- switch (vm_pgoff & KFD_MMAP_TYPE_MASK) { +- case KFD_MMAP_TYPE_DOORBELL: +- kfd = kfd_device_by_id(KFD_MMAP_GPU_ID_GET(vm_pgoff)); +- if (!kfd) +- return -EFAULT; +- return kfd_doorbell_mmap(kfd, process, vma); +- +- case KFD_MMAP_TYPE_EVENTS: ++ if ((vma->vm_pgoff & KFD_MMAP_DOORBELL_MASK) == ++ KFD_MMAP_DOORBELL_MASK) { ++ vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_DOORBELL_MASK; ++ return kfd_doorbell_mmap(process, vma); ++ } else if ((vma->vm_pgoff & KFD_MMAP_EVENTS_MASK) == ++ KFD_MMAP_EVENTS_MASK) 
{ ++ vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_EVENTS_MASK; + return kfd_event_mmap(process, vma); +- +- case KFD_MMAP_TYPE_MAP_BO: +- kfd = kfd_device_by_id(KFD_MMAP_GPU_ID_GET(vm_pgoff)); +- if (!kfd) +- return -EFAULT; +- retval = kfd->kfd2kgd->mmap_bo(kfd->kgd, vma); +- return retval; +- +- case KFD_MMAP_TYPE_RESERVED_MEM: +- return kfd_reserved_mem_mmap(process, vma); +- + } + + return -EFAULT; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +deleted file mode 100644 +index 4e94081..0000000 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c ++++ /dev/null +@@ -1,1304 +0,0 @@ +-#include <linux/kernel.h> +-#include <linux/acpi.h> +-#include <linux/mm.h> +-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) +-#include <linux/amd-iommu.h> +-#endif +-#include <linux/pci.h> +-#include "kfd_crat.h" +-#include "kfd_priv.h" +-#include "kfd_topology.h" +- +-/* GPU Processor ID base for dGPUs for which VCRAT needs to be created. +- * GPU processor ID are expressed with Bit[31]=1. +- * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs +- * used in the CRAT. +- */ +-static uint32_t gpu_processor_id_low = 0x80001000; +- +-/* Return the next available gpu_processor_id and increment it for next GPU +- * @total_cu_count - Total CUs present in the GPU including ones +- * masked off +- */ +-static inline unsigned int get_and_inc_gpu_processor_id( +- unsigned int total_cu_count) +-{ +- int current_id = gpu_processor_id_low; +- +- gpu_processor_id_low += total_cu_count; +- return current_id; +-} +- +-/* Static table to describe GPU Cache information */ +-struct kfd_gpu_cache_info { +- uint32_t cache_size; +- uint32_t cache_level; +- uint32_t flags; +- /* Indicates how many Compute Units share this cache +- * Value = 1 indicates the cache is not shared +- */ +- uint32_t num_cu_shared; +-}; +- +-static struct kfd_gpu_cache_info kaveri_cache_info[] = { +- { +- /* TCP L1 Cache per CU */ +- .cache_size = 16, +- .cache_level = 1, +- .flags = (CRAT_CACHE_FLAGS_ENABLED | +- CRAT_CACHE_FLAGS_DATA_CACHE | +- CRAT_CACHE_FLAGS_SIMD_CACHE), +- .num_cu_shared = 1, +- +- }, +- { +- /* Scalar L1 Instruction Cache (in SQC module) per bank */ +- .cache_size = 16, +- .cache_level = 1, +- .flags = (CRAT_CACHE_FLAGS_ENABLED | +- CRAT_CACHE_FLAGS_INST_CACHE | +- CRAT_CACHE_FLAGS_SIMD_CACHE), +- .num_cu_shared = 2, +- }, +- { +- /* Scalar L1 Data Cache (in SQC module) per bank */ +- .cache_size = 8, +- .cache_level = 1, +- .flags = (CRAT_CACHE_FLAGS_ENABLED | +- CRAT_CACHE_FLAGS_DATA_CACHE | +- CRAT_CACHE_FLAGS_SIMD_CACHE), +- .num_cu_shared = 2, +- }, +- +- /* TODO: Add L2 Cache information */ +-}; +- +- +-static struct kfd_gpu_cache_info carrizo_cache_info[] = { +- { +- /* TCP L1 Cache per CU */ +- .cache_size = 16, +- .cache_level = 1, +- .flags = (CRAT_CACHE_FLAGS_ENABLED | +- CRAT_CACHE_FLAGS_DATA_CACHE | +- CRAT_CACHE_FLAGS_SIMD_CACHE), +- .num_cu_shared = 1, +- }, +- { +- /* Scalar L1 Instruction Cache (in SQC module) per bank */ +- .cache_size = 8, +- .cache_level = 1, +- .flags = (CRAT_CACHE_FLAGS_ENABLED | +- CRAT_CACHE_FLAGS_INST_CACHE | +- CRAT_CACHE_FLAGS_SIMD_CACHE), +- .num_cu_shared = 4, +- }, +- { +- /* Scalar L1 Data Cache (in SQC module) per bank. 
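Note on the reverted kfd_mmap() above: it tags mmap offsets by type, tests with (pgoff & MASK) == MASK, and strips the tag with XOR before dispatching to the doorbell or event handler. A toy round trip of that pattern; the mask value below is a placeholder, the real KFD_MMAP_DOORBELL_MASK and KFD_MMAP_EVENTS_MASK definitions live in kfd_priv.h and are not shown in this hunk.

/* Tag/test/strip model of the vm_pgoff dispatch (placeholder mask value). */
#include <assert.h>
#include <stdint.h>

#define DOORBELL_MASK 0x1000000ull	/* assumption, not the real value */

int main(void)
{
	uint64_t pgoff = 0x42;	/* untagged offsets must never set mask bits */
	uint64_t tagged = pgoff | DOORBELL_MASK;

	assert((tagged & DOORBELL_MASK) == DOORBELL_MASK);	/* type test   */
	assert((tagged ^ DOORBELL_MASK) == pgoff);		/* XOR strips  */
	return 0;
}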
*/ +- .cache_size = 4, +- .cache_level = 1, +- .flags = (CRAT_CACHE_FLAGS_ENABLED | +- CRAT_CACHE_FLAGS_DATA_CACHE | +- CRAT_CACHE_FLAGS_SIMD_CACHE), +- .num_cu_shared = 4, +- }, +- +- /* TODO: Add L2 Cache information */ +-}; +- +-/* NOTE: In future if more information is added to struct kfd_gpu_cache_info +- * the following ASICs may need a separate table. +- */ +-#define hawaii_cache_info kaveri_cache_info +-#define tonga_cache_info carrizo_cache_info +-#define fiji_cache_info carrizo_cache_info +-#define polaris10_cache_info carrizo_cache_info +-#define polaris11_cache_info carrizo_cache_info +-/* TODO - check & update Vega10 cache details */ +-#define vega10_cache_info carrizo_cache_info +-#define raven_cache_info carrizo_cache_info +- +-static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, +- struct crat_subtype_computeunit *cu) +-{ +- dev->node_props.cpu_cores_count = cu->num_cpu_cores; +- dev->node_props.cpu_core_id_base = cu->processor_id_low; +-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) +- if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) +- dev->node_props.capability |= HSA_CAP_ATS_PRESENT; +-#endif +- +- pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, +- cu->processor_id_low); +-} +- +-static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, +- struct crat_subtype_computeunit *cu) +-{ +- dev->node_props.simd_id_base = cu->processor_id_low; +- dev->node_props.simd_count = cu->num_simd_cores; +- dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; +- dev->node_props.max_waves_per_simd = cu->max_waves_simd; +- dev->node_props.wave_front_size = cu->wave_front_size; +- dev->node_props.array_count = cu->array_count; +- dev->node_props.cu_per_simd_array = cu->num_cu_per_array; +- dev->node_props.simd_per_cu = cu->num_simd_per_cu; +- dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; +- if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) +- dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; +- pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low); +-} +- +-/* kfd_parse_subtype_cu - parse compute unit subtypes and attach it to correct +- * topology device present in the device_list +- */ +-static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu, +- struct list_head *device_list) +-{ +- struct kfd_topology_device *dev; +- +- pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", +- cu->proximity_domain, cu->hsa_capability); +- list_for_each_entry(dev, device_list, list) { +- if (cu->proximity_domain == dev->proximity_domain) { +- if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) +- kfd_populated_cu_info_cpu(dev, cu); +- +- if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) +- kfd_populated_cu_info_gpu(dev, cu); +- break; +- } +- } +- +- return 0; +-} +- +-/* kfd_parse_subtype_mem - parse memory subtypes and attach it to correct +- * topology device present in the device_list +- */ +-static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem, +- struct list_head *device_list) +-{ +- struct kfd_mem_properties *props; +- struct kfd_topology_device *dev; +- +- pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n", +- mem->proximity_domain); +- list_for_each_entry(dev, device_list, list) { +- if (mem->proximity_domain == dev->proximity_domain) { +- props = kfd_alloc_struct(props); +- if (!props) +- return -ENOMEM; +- +- /* We're on GPU node */ +- if (dev->node_props.cpu_cores_count == 0) { +- /* APU */ +- if (mem->visibility_type == 0) +- props->heap_type = 
+- HSA_MEM_HEAP_TYPE_FB_PRIVATE; +- /* dGPU */ +- else +- props->heap_type = mem->visibility_type; +- } else +- props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; +- +- if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) +- props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; +- if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) +- props->flags |= HSA_MEM_FLAGS_NON_VOLATILE; +- +- props->size_in_bytes = +- ((uint64_t)mem->length_high << 32) + +- mem->length_low; +- props->width = mem->width; +- +- dev->node_props.mem_banks_count++; +- list_add_tail(&props->list, &dev->mem_props); +- +- break; +- } +- } +- +- return 0; +-} +- +-/* kfd_parse_subtype_cache - parse cache subtypes and attach it to correct +- * topology device present in the device_list +- */ +-static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache, +- struct list_head *device_list) +-{ +- struct kfd_cache_properties *props; +- struct kfd_topology_device *dev; +- uint32_t id; +- uint32_t total_num_of_cu; +- +- id = cache->processor_id_low; +- +- list_for_each_entry(dev, device_list, list) { +- total_num_of_cu = (dev->node_props.array_count * +- dev->node_props.cu_per_simd_array); +- +- /* Cache infomration in CRAT doesn't have proximity_domain +- * information as it is associated with a CPU core or GPU +- * Compute Unit. So map the cache using CPU core Id or SIMD +- * (GPU) ID. +- * TODO: This works because currently we can safely assume that +- * Compute Units are parsed before caches are parsed. In +- * future, remove this dependency +- */ +- if ((id >= dev->node_props.cpu_core_id_base && +- id <= dev->node_props.cpu_core_id_base + +- dev->node_props.cpu_cores_count) || +- (id >= dev->node_props.simd_id_base && +- id < dev->node_props.simd_id_base + +- total_num_of_cu)) { +- props = kfd_alloc_struct(props); +- if (!props) +- return -ENOMEM; +- +- props->processor_id_low = id; +- props->cache_level = cache->cache_level; +- props->cache_size = cache->cache_size; +- props->cacheline_size = cache->cache_line_size; +- props->cachelines_per_tag = cache->lines_per_tag; +- props->cache_assoc = cache->associativity; +- props->cache_latency = cache->cache_latency; +- memcpy(props->sibling_map, cache->sibling_map, +- sizeof(props->sibling_map)); +- +- if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) +- props->cache_type |= HSA_CACHE_TYPE_DATA; +- if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) +- props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; +- if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) +- props->cache_type |= HSA_CACHE_TYPE_CPU; +- if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) +- props->cache_type |= HSA_CACHE_TYPE_HSACU; +- +- dev->cache_count++; +- dev->node_props.caches_count++; +- list_add_tail(&props->list, &dev->cache_props); +- +- break; +- } +- } +- +- return 0; +-} +- +-/* kfd_parse_subtype_iolink - parse iolink subtypes and attach it to correct +- * topology device present in the device_list +- */ +-static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink, +- struct list_head *device_list) +-{ +- struct kfd_iolink_properties *props = NULL, *props2; +- struct kfd_topology_device *dev, *cpu_dev; +- uint32_t id_from; +- uint32_t id_to; +- +- id_from = iolink->proximity_domain_from; +- id_to = iolink->proximity_domain_to; +- +- pr_debug("Found IO link entry in CRAT table with id_from=%d\n", +- id_from); +- list_for_each_entry(dev, device_list, list) { +- if (id_from == dev->proximity_domain) { +- props = kfd_alloc_struct(props); +- if (!props) +- return -ENOMEM; +- +- props->node_from = id_from; +- props->node_to = 
id_to; +- props->ver_maj = iolink->version_major; +- props->ver_min = iolink->version_minor; +- props->iolink_type = iolink->io_interface_type; +- +- if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) +- props->weight = 20; +- else +- props->weight = node_distance(id_from, id_to); +- +- props->min_latency = iolink->minimum_latency; +- props->max_latency = iolink->maximum_latency; +- props->min_bandwidth = iolink->minimum_bandwidth_mbs; +- props->max_bandwidth = iolink->maximum_bandwidth_mbs; +- props->rec_transfer_size = +- iolink->recommended_transfer_size; +- +- dev->io_link_count++; +- dev->node_props.io_links_count++; +- list_add_tail(&props->list, &dev->io_link_props); +- break; +- } +- } +- +- /* CPU topology is created before GPUs are detected, so CPU->GPU +- * links are not built at that time. If a PCIe type is discovered, it +- * means a GPU is detected and we are adding GPU->CPU to the topology. +- * At this time, also add the corresponded CPU->GPU link. +- */ +- if (props && props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) { +- cpu_dev = kfd_topology_device_by_proximity_domain(id_to); +- if (!cpu_dev) +- return -ENODEV; +- /* same everything but the other direction */ +- props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL); +- props2->node_from = id_to; +- props2->node_to = id_from; +- props2->kobj = NULL; +- cpu_dev->io_link_count++; +- cpu_dev->node_props.io_links_count++; +- list_add_tail(&props2->list, &cpu_dev->io_link_props); +- } +- +- return 0; +-} +- +-/* kfd_parse_subtype - parse subtypes and attach it to correct topology device +- * present in the device_list +- * @sub_type_hdr - subtype section of crat_image +- * @device_list - list of topology devices present in this crat_image +- */ +-static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr, +- struct list_head *device_list) +-{ +- struct crat_subtype_computeunit *cu; +- struct crat_subtype_memory *mem; +- struct crat_subtype_cache *cache; +- struct crat_subtype_iolink *iolink; +- int ret = 0; +- +- switch (sub_type_hdr->type) { +- case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: +- cu = (struct crat_subtype_computeunit *)sub_type_hdr; +- ret = kfd_parse_subtype_cu(cu, device_list); +- break; +- case CRAT_SUBTYPE_MEMORY_AFFINITY: +- mem = (struct crat_subtype_memory *)sub_type_hdr; +- ret = kfd_parse_subtype_mem(mem, device_list); +- break; +- case CRAT_SUBTYPE_CACHE_AFFINITY: +- cache = (struct crat_subtype_cache *)sub_type_hdr; +- ret = kfd_parse_subtype_cache(cache, device_list); +- break; +- case CRAT_SUBTYPE_TLB_AFFINITY: +- /* For now, nothing to do here */ +- pr_debug("Found TLB entry in CRAT table (not processing)\n"); +- break; +- case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: +- /* For now, nothing to do here */ +- pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n"); +- break; +- case CRAT_SUBTYPE_IOLINK_AFFINITY: +- iolink = (struct crat_subtype_iolink *)sub_type_hdr; +- ret = kfd_parse_subtype_iolink(iolink, device_list); +- break; +- default: +- pr_warn("Unknown subtype %d in CRAT\n", +- sub_type_hdr->type); +- } +- +- return ret; +-} +- +-/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT +- * create a kfd_topology_device and add in to device_list. 
Also parse +- * CRAT subtypes and attach it to appropriate kfd_topology_device +- * @crat_image - input image containing CRAT +- * @device_list - [OUT] list of kfd_topology_device generated after +- * parsing crat_image +- * @proximity_domain - Proximity domain of the first device in the table +- * +- * Return - 0 if successful else -ve value +- */ +-int kfd_parse_crat_table(void *crat_image, +- struct list_head *device_list, +- uint32_t proximity_domain) +-{ +- struct kfd_topology_device *top_dev = NULL; +- struct crat_subtype_generic *sub_type_hdr; +- uint16_t node_id; +- int ret = 0; +- struct crat_header *crat_table = (struct crat_header *)crat_image; +- uint16_t num_nodes; +- uint32_t image_len; +- uint32_t last_header_type, last_header_length; +- +- if (!crat_image) +- return -EINVAL; +- +- if (!list_empty(device_list)) { +- pr_warn("Error device list should be empty\n"); +- return -EINVAL; +- } +- +- num_nodes = crat_table->num_domains; +- image_len = crat_table->length; +- +- pr_info("Parsing CRAT table with %d nodes\n", num_nodes); +- +- for (node_id = 0; node_id < num_nodes; node_id++) { +- top_dev = kfd_create_topology_device(device_list); +- if (!top_dev) +- break; +- top_dev->proximity_domain = proximity_domain++; +- } +- +- if (!top_dev) { +- ret = -ENOMEM; +- goto err; +- } +- +- memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH); +- memcpy(top_dev->oem_table_id, crat_table->oem_table_id, +- CRAT_OEMTABLEID_LENGTH); +- top_dev->oem_revision = crat_table->oem_revision; +- +- last_header_type = last_header_length = 0; +- sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); +- while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) < +- ((char *)crat_image) + image_len) { +- pr_debug("Parsing CRAT subtype header %p enabled: %s type: 0x%x length %d\n", +- sub_type_hdr, +- (sub_type_hdr->flags & +- CRAT_SUBTYPE_FLAGS_ENABLED) +- ? "true" : "false", +- sub_type_hdr->type, +- sub_type_hdr->length); +- +- if (sub_type_hdr->length == 0) { +- pr_err("Parsing wrong CRAT's subtype header last header type: %d last header len %d\n", +- last_header_type, last_header_type); +- pr_err("Current header type %d length %d\n", +- sub_type_hdr->type, sub_type_hdr->length); +- break; +- } +- +- if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) { +- ret = kfd_parse_subtype(sub_type_hdr, device_list); +- if (ret != 0) +- break; +- } +- +- last_header_type = sub_type_hdr->type; +- last_header_length = sub_type_hdr->length; +- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + +- sub_type_hdr->length); +- } +- +-err: +- if (ret) +- kfd_release_topology_device_list(device_list); +- +- return ret; +-} +- +-/* Helper function. See kfd_fill_gpu_cache_info for parameter description */ +-static int fill_in_pcache(struct crat_subtype_cache *pcache, +- struct kfd_gpu_cache_info *pcache_info, +- struct kfd_cu_info *cu_info, +- int mem_available, +- int cu_bitmask, +- int cache_type, unsigned int cu_processor_id, +- int cu_block) +-{ +- unsigned int cu_sibling_map_mask; +- int first_active_cu; +- +- /* First check if enough memory is available */ +- if (mem_available - sizeof(struct crat_subtype_cache) < 0) +- return -ENOMEM; +- +- cu_sibling_map_mask = cu_bitmask; +- cu_sibling_map_mask >>= cu_block; +- cu_sibling_map_mask &= +- ((1 << pcache_info[cache_type].num_cu_shared) - 1); +- first_active_cu = ffs(cu_sibling_map_mask); +- +- /* CU could be inactive. In case of shared cache find the first active +- * CU. 
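Note on kfd_parse_crat_table() above: it walks a packed sequence of variable-length subtype records, where each record declares its own length, and it bails out on a zero length so a malformed table cannot spin forever. The same walk over a simplified record layout (the real struct crat_subtype_generic carries more fields; this two-byte header is a reduction for the example):

/* Walker for length-prefixed records, modelled on the CRAT subtype loop. */
#include <stdint.h>
#include <stdio.h>

struct hdr { uint8_t type; uint8_t length; };	/* simplified generic header */

static void walk(const uint8_t *img, uint32_t image_len)
{
	const uint8_t *p = img, *end = img + image_len;

	while (p + sizeof(struct hdr) <= end) {
		const struct hdr *h = (const struct hdr *)p;

		if (h->length == 0)	/* malformed: would loop otherwise */
			break;
		printf("subtype %u, %u bytes\n", h->type, h->length);
		p += h->length;		/* each record declares its own size */
	}
}

int main(void)
{
	uint8_t img[] = { 0, 4, 0, 0,  2, 6, 0, 0, 0, 0 };	/* two records */

	walk(img, sizeof(img));
	return 0;
}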
and incase of non-shared cache check if the CU is inactive. If +- * inactive active skip it +- */ +- if (first_active_cu) { +- memset(pcache, 0, sizeof(struct crat_subtype_cache)); +- pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY; +- pcache->length = sizeof(struct crat_subtype_cache); +- pcache->flags = pcache_info[cache_type].flags; +- pcache->processor_id_low = cu_processor_id +- + (first_active_cu - 1); +- pcache->cache_level = pcache_info[cache_type].cache_level; +- pcache->cache_size = pcache_info[cache_type].cache_size; +- +- /* Sibling map is w.r.t processor_id_low, so shift out +- * inactive CU +- */ +- cu_sibling_map_mask = +- cu_sibling_map_mask >> (first_active_cu - 1); +- +- pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF); +- pcache->sibling_map[1] = +- (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF); +- pcache->sibling_map[2] = +- (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF); +- pcache->sibling_map[3] = +- (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF); +- return 0; +- } +- return 1; +-} +- +-/* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info +- * tables +- * +- * @kdev - [IN] GPU device +- * @gpu_processor_id - [IN] GPU processor ID to which these caches +- * associate +- * @available_size - [IN] Amount of memory available in pcache +- * @cu_info - [IN] Compute Unit info obtained from KGD +- * @pcache - [OUT] memory into which cache data is to be filled in. +- * @size_filled - [OUT] amount of data used up in pcache. +- * @num_of_entries - [OUT] number of caches added +- */ +-static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, +- int gpu_processor_id, +- int available_size, +- struct kfd_cu_info *cu_info, +- struct crat_subtype_cache *pcache, +- int *size_filled, +- int *num_of_entries) +-{ +- struct kfd_gpu_cache_info *pcache_info; +- int num_of_cache_types = 0; +- int i, j, k; +- int ct = 0; +- int mem_available = available_size; +- unsigned int cu_processor_id; +- int ret; +- +- switch (kdev->device_info->asic_family) { +- case CHIP_KAVERI: +- pcache_info = kaveri_cache_info; +- num_of_cache_types = ARRAY_SIZE(kaveri_cache_info); +- break; +- case CHIP_HAWAII: +- pcache_info = hawaii_cache_info; +- num_of_cache_types = ARRAY_SIZE(hawaii_cache_info); +- break; +- case CHIP_CARRIZO: +- pcache_info = carrizo_cache_info; +- num_of_cache_types = ARRAY_SIZE(carrizo_cache_info); +- break; +- case CHIP_TONGA: +- pcache_info = tonga_cache_info; +- num_of_cache_types = ARRAY_SIZE(tonga_cache_info); +- break; +- case CHIP_FIJI: +- pcache_info = fiji_cache_info; +- num_of_cache_types = ARRAY_SIZE(fiji_cache_info); +- break; +- case CHIP_POLARIS10: +- pcache_info = polaris10_cache_info; +- num_of_cache_types = ARRAY_SIZE(polaris10_cache_info); +- break; +- case CHIP_POLARIS11: +- pcache_info = polaris11_cache_info; +- num_of_cache_types = ARRAY_SIZE(polaris11_cache_info); +- break; +- case CHIP_VEGA10: +- pcache_info = vega10_cache_info; +- num_of_cache_types = ARRAY_SIZE(vega10_cache_info); +- break; +- case CHIP_RAVEN: +- pcache_info = raven_cache_info; +- num_of_cache_types = ARRAY_SIZE(raven_cache_info); +- break; +- default: +- return -EINVAL; +- } +- +- *size_filled = 0; +- *num_of_entries = 0; +- +- /* For each type of cache listed in the kfd_gpu_cache_info table, +- * go through all available Compute Units. 
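Note on the mask arithmetic in fill_in_pcache() above, reduced to a standalone example: shift the CU bitmap down to the block being considered, keep num_cu_shared bits, then use ffs() to find the first active CU so the sibling map can be made relative to it. The bitmap value below is an arbitrary example.

/* Sibling-map math from fill_in_pcache(), in isolation. */
#include <stdint.h>
#include <stdio.h>
#include <strings.h>	/* ffs() */

int main(void)
{
	uint32_t cu_bitmask = 0xE6;	/* example: CU 4 inactive in block 4..7 */
	int cu_block = 4, num_cu_shared = 4;
	uint32_t mask = (cu_bitmask >> cu_block) &
			((1u << num_cu_shared) - 1);
	int first = ffs(mask);		/* 1-based index of first active CU */

	if (first)	/* shift out inactive CUs, as the kernel code does */
		printf("first active CU %d, sibling map 0x%x\n",
		       first - 1, (unsigned)(mask >> (first - 1)));
	return 0;
}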
+- * The [i,j,k] loop will +- * if kfd_gpu_cache_info.num_cu_shared = 1 +- * will parse through all available CU +- * If (kfd_gpu_cache_info.num_cu_shared != 1) +- * then it will consider only one CU from +- * the shared unit +- */ +- +- for (ct = 0; ct < num_of_cache_types; ct++) { +- cu_processor_id = gpu_processor_id; +- for (i = 0; i < cu_info->num_shader_engines; i++) { +- for (j = 0; j < cu_info->num_shader_arrays_per_engine; +- j++) { +- for (k = 0; k < cu_info->num_cu_per_sh; +- k += pcache_info[ct].num_cu_shared) { +- +- ret = fill_in_pcache(pcache, +- pcache_info, +- cu_info, +- mem_available, +- cu_info->cu_bitmap[i][j], +- ct, +- cu_processor_id, +- k); +- +- if (ret < 0) +- break; +- +- if (!ret) { +- pcache++; +- (*num_of_entries)++; +- mem_available -= +- sizeof(*pcache); +- (*size_filled) += +- sizeof(*pcache); +- } +- +- /* Move to next CU block */ +- cu_processor_id += +- pcache_info[ct].num_cu_shared; +- } +- } +- } +- } +- +- pr_debug("Added [%d] GPU cache entries\n", *num_of_entries); +- +- return 0; +-} +- +-/* +- * kfd_create_crat_image_acpi - Allocates memory for CRAT image and +- * copies CRAT from ACPI (if available). +- * NOTE: Call kfd_destroy_crat_image to free CRAT image memory +- * +- * @crat_image: CRAT read from ACPI. If no CRAT in ACPI then +- * crat_image will be NULL +- * @size: [OUT] size of crat_image +- * +- * Return 0 if successful else return -ve value +- */ +-#ifdef CONFIG_ACPI +-int kfd_create_crat_image_acpi(void **crat_image, size_t *size) +-{ +- struct acpi_table_header *crat_table; +- acpi_status status; +- void *pcrat_image; +- +- if (!crat_image) +- return -EINVAL; +- +- *crat_image = NULL; +- +- /* Fetch the CRAT table from ACPI */ +- status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); +- if (status == AE_NOT_FOUND) { +- pr_warn("CRAT table not found\n"); +- return -ENODATA; +- } else if (ACPI_FAILURE(status)) { +- const char *err = acpi_format_exception(status); +- +- pr_err("CRAT table error: %s\n", err); +- return -EINVAL; +- } +- +- if (ignore_crat) { +- pr_info("CRAT table disabled by module option\n"); +- return -ENODATA; +- } +- +- pcrat_image = kmalloc(crat_table->length, GFP_KERNEL); +- if (!pcrat_image) { +- pr_err("No memory for allocating CRAT image\n"); +- return -ENOMEM; +- } +- +- memcpy(pcrat_image, crat_table, crat_table->length); +- +- *crat_image = pcrat_image; +- *size = crat_table->length; +- +- return 0; +-} +-#endif +- +-/* Memory required to create Virtual CRAT. +- * Since there is no easy way to predict the amount of memory required, the +- * following amount are allocated for CPU and GPU Virtual CRAT. This is +- * expected to cover all known conditions. But to be safe additional check +- * is put in the code to ensure we don't overwrite. 
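Note on the [i,j,k] walk described above, in isolation: the loop visits one candidate cache entry per group of num_cu_shared CUs, and the processor id advances by the group size whether or not an entry was actually emitted. The counts below are invented for the example; only the base value mirrors gpu_processor_id_low from this file.

/* Shape of the per-cache-type CU iteration; example counts are made up. */
#include <stdio.h>

int main(void)
{
	int engines = 4, arrays = 1, cu_per_sh = 16, shared = 4;
	int cu_processor_id = 0x80001000;	/* gpu_processor_id_low base */

	for (int i = 0; i < engines; i++)
		for (int j = 0; j < arrays; j++)
			for (int k = 0; k < cu_per_sh; k += shared) {
				printf("cache entry at processor id 0x%x\n",
				       cu_processor_id);
				cu_processor_id += shared;	/* next CU block */
			}
	return 0;
}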
+- */ +-#define VCRAT_SIZE_FOR_CPU (2 * PAGE_SIZE) +-#define VCRAT_SIZE_FOR_GPU (3 * PAGE_SIZE) +- +-/* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node +- * +- * @numa_node_id: CPU NUMA node id +- * @avail_size: Available size in the memory +- * @sub_type_hdr: Memory into which compute info will be filled in +- * +- * Return 0 if successful else return -ve value +- */ +-static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size, +- int proximity_domain, +- struct crat_subtype_computeunit *sub_type_hdr) +-{ +- const struct cpumask *cpumask; +- +- *avail_size -= sizeof(struct crat_subtype_computeunit); +- if (*avail_size < 0) +- return -ENOMEM; +- +- memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); +- +- /* Fill in subtype header data */ +- sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; +- sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); +- sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; +- +- cpumask = cpumask_of_node(numa_node_id); +- +- /* Fill in CU data */ +- sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT; +- sub_type_hdr->proximity_domain = proximity_domain; +- sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id); +- if (sub_type_hdr->processor_id_low == -1) +- return -EINVAL; +- +- sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask); +- +- return 0; +-} +- +-/* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node +- * +- * @numa_node_id: CPU NUMA node id +- * @avail_size: Available size in the memory +- * @sub_type_hdr: Memory into which compute info will be filled in +- * +- * Return 0 if successful else return -ve value +- */ +-static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size, +- int proximity_domain, +- struct crat_subtype_memory *sub_type_hdr) +-{ +- uint64_t mem_in_bytes = 0; +- pg_data_t *pgdat; +- int zone_type; +- +- *avail_size -= sizeof(struct crat_subtype_memory); +- if (*avail_size < 0) +- return -ENOMEM; +- +- memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); +- +- /* Fill in subtype header data */ +- sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; +- sub_type_hdr->length = sizeof(struct crat_subtype_memory); +- sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; +- +- /* Fill in Memory Subunit data */ +- +- /* Unlike si_meminfo, si_meminfo_node is not exported. 
So +- * the following lines are duplicated from si_meminfo_node +- * function +- */ +- pgdat = NODE_DATA(numa_node_id); +- for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) +- mem_in_bytes += pgdat->node_zones[zone_type].managed_pages; +- mem_in_bytes <<= PAGE_SHIFT; +- +- sub_type_hdr->length_low = lower_32_bits(mem_in_bytes); +- sub_type_hdr->length_high = upper_32_bits(mem_in_bytes); +- sub_type_hdr->proximity_domain = proximity_domain; +- +- return 0; +-} +- +-#ifdef CONFIG_X86_64 +-static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size, +- uint32_t *num_entries, +- struct crat_subtype_iolink *sub_type_hdr) +-{ +- int nid; +- struct cpuinfo_x86 *c = &cpu_data(0); +- uint8_t link_type; +- +- if (c->x86_vendor == X86_VENDOR_AMD) +- link_type = CRAT_IOLINK_TYPE_HYPERTRANSPORT; +- else +- link_type = CRAT_IOLINK_TYPE_QPI_1_1; +- +- *num_entries = 0; +- +- /* Create IO links from this node to other CPU nodes */ +- for_each_online_node(nid) { +- if (nid == numa_node_id) /* node itself */ +- continue; +- +- *avail_size -= sizeof(struct crat_subtype_iolink); +- if (*avail_size < 0) +- return -ENOMEM; +- +- memset(sub_type_hdr, 0, sizeof(struct crat_subtype_iolink)); +- +- /* Fill in subtype header data */ +- sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY; +- sub_type_hdr->length = sizeof(struct crat_subtype_iolink); +- sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; +- +- /* Fill in IO link data */ +- sub_type_hdr->proximity_domain_from = numa_node_id; +- sub_type_hdr->proximity_domain_to = nid; +- sub_type_hdr->io_interface_type = link_type; +- +- (*num_entries)++; +- sub_type_hdr++; +- } +- +- return 0; +-} +-#endif +- +-/* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU +- * +- * @pcrat_image: Fill in VCRAT for CPU +- * @size: [IN] allocated size of crat_image. +- * [OUT] actual size of data filled in crat_image +- */ +-static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size) +-{ +- struct crat_header *crat_table = (struct crat_header *)pcrat_image; +- struct crat_subtype_generic *sub_type_hdr; +- int avail_size = *size; +- int numa_node_id; +- int ret = 0; +-#ifdef CONFIG_ACPI +- struct acpi_table_header *acpi_table; +- acpi_status status; +-#endif +-#ifdef CONFIG_X86_64 +- uint32_t entries = 0; +-#endif +- +- if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_CPU) +- return -EINVAL; +- +- /* Fill in CRAT Header. +- * Modify length and total_entries as subunits are added. 
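Note on length_low/length_high above: a single 64-bit byte count is split across two 32-bit CRAT fields on the writing side, and the parser earlier in this file reassembles it as (length_high << 32) + length_low. A round-trip check of that split:

/* 32-bit split and reassembly used for CRAT memory sizes. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t mem_in_bytes = (uint64_t)123456 << 12;	/* pages << PAGE_SHIFT */
	uint32_t lo = (uint32_t)mem_in_bytes;		/* lower_32_bits() */
	uint32_t hi = (uint32_t)(mem_in_bytes >> 32);	/* upper_32_bits() */

	printf("low 0x%x high 0x%x\n", lo, hi);
	printf("reassembled %llu\n",
	       (unsigned long long)(((uint64_t)hi << 32) + lo));
	return 0;
}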
+- */ +- avail_size -= sizeof(struct crat_header); +- if (avail_size < 0) +- return -ENOMEM; +- +- memset(crat_table, 0, sizeof(struct crat_header)); +- memcpy(&crat_table->signature, CRAT_SIGNATURE, +- sizeof(crat_table->signature)); +- crat_table->length = sizeof(struct crat_header); +- +-#ifdef CONFIG_ACPI +- status = acpi_get_table("DSDT", 0, &acpi_table); +- if (status == AE_NOT_FOUND) +- pr_warn("DSDT table not found for OEM information\n"); +- else { +- crat_table->oem_revision = acpi_table->revision; +- memcpy(crat_table->oem_id, acpi_table->oem_id, +- CRAT_OEMID_LENGTH); +- memcpy(crat_table->oem_table_id, acpi_table->oem_table_id, +- CRAT_OEMTABLEID_LENGTH); +- } +-#else +- crat_table->oem_revision = 0; +- memcpy(crat_table->oem_id, "INV", CRAT_OEMID_LENGTH); +- memcpy(crat_table->oem_table_id, "UNAVAIL", CRAT_OEMTABLEID_LENGTH); +-#endif +- crat_table->total_entries = 0; +- crat_table->num_domains = 0; +- +- sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); +- +- for_each_online_node(numa_node_id) { +- if (kfd_numa_node_to_apic_id(numa_node_id) == -1) +- continue; +- +- /* Fill in Subtype: Compute Unit */ +- ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size, +- crat_table->num_domains, +- (struct crat_subtype_computeunit *)sub_type_hdr); +- if (ret < 0) +- return ret; +- crat_table->length += sub_type_hdr->length; +- crat_table->total_entries++; +- +- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + +- sub_type_hdr->length); +- +- /* Fill in Subtype: Memory */ +- ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size, +- crat_table->num_domains, +- (struct crat_subtype_memory *)sub_type_hdr); +- if (ret < 0) +- return ret; +- crat_table->length += sub_type_hdr->length; +- crat_table->total_entries++; +- +- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + +- sub_type_hdr->length); +- +- /* Fill in Subtype: IO Link */ +-#ifdef CONFIG_X86_64 +- ret = kfd_fill_iolink_info_for_cpu(numa_node_id, &avail_size, +- &entries, +- (struct crat_subtype_iolink *)sub_type_hdr); +- if (ret < 0) +- return ret; +- crat_table->length += (sub_type_hdr->length * entries); +- crat_table->total_entries += entries; +- +- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + +- sub_type_hdr->length * entries); +-#else +- pr_info("IO link not available for non x86 platforms\n"); +-#endif +- +- crat_table->num_domains++; +- } +- +- /* TODO: Add cache Subtype for CPU. +- * Currently, CPU cache information is available in function +- * detect_cache_attributes(cpu) defined in the file +- * ./arch/x86/kernel/cpu/intel_cacheinfo.c. This function is not +- * exported and to get the same information the code needs to be +- * duplicated. 
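Note on how kfd_create_vcrat_image_cpu() above grows the table: reserve space against avail_size, fill the record in place, then advance by the record's self-declared length while bumping the table length and entry count. The same append pattern on a simplified record; the record layout and type codes below are placeholders, not the CRAT definitions.

/* Append pattern for building a packed table of self-sized records. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct rec { uint8_t type, length; uint8_t payload[6]; };	/* placeholder */

static int append(uint8_t *buf, int *used, int cap, uint8_t type)
{
	struct rec *r = (struct rec *)(buf + *used);

	if (cap - *used < (int)sizeof(*r))
		return -1;	/* the kernel code returns -ENOMEM here */
	memset(r, 0, sizeof(*r));
	r->type = type;
	r->length = sizeof(*r);
	*used += r->length;	/* advance by the record's own length */
	return 0;
}

int main(void)
{
	uint8_t image[64];
	int used = 0;

	append(image, &used, sizeof(image), 0);	/* e.g. CU subtype     */
	append(image, &used, sizeof(image), 1);	/* e.g. memory subtype */
	printf("table length %d\n", used);
	return 0;
}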
+- */
+-
+- *size = crat_table->length;
+- pr_info("Virtual CRAT table created for CPU\n");
+-
+- return 0;
+-}
+-
+-static int kfd_fill_gpu_memory_affinity(int *avail_size,
+- struct kfd_dev *kdev, uint8_t type, uint64_t size,
+- struct crat_subtype_memory *sub_type_hdr,
+- uint32_t proximity_domain,
+- const struct kfd_local_mem_info *local_mem_info)
+-{
+- *avail_size -= sizeof(struct crat_subtype_memory);
+- if (*avail_size < 0)
+- return -ENOMEM;
+-
+- memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory));
+- sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;
+- sub_type_hdr->length = sizeof(struct crat_subtype_memory);
+- sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
+-
+- sub_type_hdr->proximity_domain = proximity_domain;
+-
+- pr_debug("Fill gpu memory affinity - type 0x%x size 0x%llx\n",
+- type, size);
+-
+- sub_type_hdr->length_low = lower_32_bits(size);
+- sub_type_hdr->length_high = upper_32_bits(size);
+-
+- sub_type_hdr->width = local_mem_info->vram_width;
+- sub_type_hdr->visibility_type = type;
+-
+- return 0;
+-}
+-
+-/* kfd_fill_gpu_direct_io_link - Fill in direct io link from GPU
+- * to its NUMA node
+- * @avail_size: Available size in the memory
+- * @kdev - [IN] GPU device
+- * @sub_type_hdr: Memory into which io link info will be filled in
+- * @proximity_domain - proximity domain of the GPU node
+- *
+- * Return 0 if successful else return -ve value
+- */
+-static int kfd_fill_gpu_direct_io_link(int *avail_size,
+- struct kfd_dev *kdev,
+- struct crat_subtype_iolink *sub_type_hdr,
+- uint32_t proximity_domain)
+-{
+- *avail_size -= sizeof(struct crat_subtype_iolink);
+- if (*avail_size < 0)
+- return -ENOMEM;
+-
+- memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));
+-
+- /* Fill in subtype header data */
+- sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
+- sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
+- sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
+-
+- /* Fill in IOLINK subtype.
+- * TODO: Fill-in other fields of iolink subtype
+- */
+- sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;
+- sub_type_hdr->proximity_domain_from = proximity_domain;
+-#ifdef CONFIG_NUMA
+- if (kdev->pdev->dev.numa_node == NUMA_NO_NODE)
+- sub_type_hdr->proximity_domain_to = 0;
+- else
+- sub_type_hdr->proximity_domain_to = kdev->pdev->dev.numa_node;
+-#else
+- sub_type_hdr->proximity_domain_to = 0;
+-#endif
+- return 0;
+-}
+-
+-/* kfd_create_vcrat_image_gpu - Create Virtual CRAT for GPU
+- *
+- * @pcrat_image: Fill in VCRAT for GPU
+- * @size: [IN] allocated size of crat_image.
+- * [OUT] actual size of data filled in crat_image
+- */
+-static int kfd_create_vcrat_image_gpu(void *pcrat_image,
+- size_t *size, struct kfd_dev *kdev,
+- uint32_t proximity_domain)
+-{
+- struct crat_header *crat_table = (struct crat_header *)pcrat_image;
+- struct crat_subtype_generic *sub_type_hdr;
+- struct crat_subtype_computeunit *cu;
+- struct kfd_cu_info cu_info;
+- int avail_size = *size;
+- uint32_t total_num_of_cu;
+- int num_of_cache_entries = 0;
+- int cache_mem_filled = 0;
+- int ret = 0;
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+- struct amd_iommu_device_info iommu_info;
+- const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP |
+- AMD_IOMMU_DEVICE_FLAG_PRI_SUP |
+- AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
+-#endif
+- struct kfd_local_mem_info local_mem_info;
+-
+- if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU)
+- return -EINVAL;
+-
+- /* Fill the CRAT Header.
+- * Modify length and total_entries as subunits are added. +- */ +- avail_size -= sizeof(struct crat_header); +- if (avail_size < 0) +- return -ENOMEM; +- +- memset(crat_table, 0, sizeof(struct crat_header)); +- +- memcpy(&crat_table->signature, CRAT_SIGNATURE, +- sizeof(crat_table->signature)); +- /* Change length as we add more subtypes*/ +- crat_table->length = sizeof(struct crat_header); +- crat_table->num_domains = 1; +- crat_table->total_entries = 0; +- +- /* Fill in Subtype: Compute Unit +- * First fill in the sub type header and then sub type data +- */ +- avail_size -= sizeof(struct crat_subtype_computeunit); +- if (avail_size < 0) +- return -ENOMEM; +- +- sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1); +- memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); +- +- sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; +- sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); +- sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; +- +- /* Fill CU subtype data */ +- cu = (struct crat_subtype_computeunit *)sub_type_hdr; +- cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT; +- cu->proximity_domain = proximity_domain; +- +- kdev->kfd2kgd->get_cu_info(kdev->kgd, &cu_info); +- cu->num_simd_per_cu = cu_info.simd_per_cu; +- cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number; +- cu->max_waves_simd = cu_info.max_waves_per_simd; +- +- cu->wave_front_size = cu_info.wave_front_size; +- cu->array_count = cu_info.num_shader_arrays_per_engine * +- cu_info.num_shader_engines; +- total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh); +- cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu); +- cu->num_cu_per_array = cu_info.num_cu_per_sh; +- cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu; +- cu->num_banks = cu_info.num_shader_engines; +- cu->lds_size_in_kb = cu_info.lds_size; +- +- cu->hsa_capability = 0; +- +- /* Check if this node supports IOMMU. During parsing this flag will +- * translate to HSA_CAP_ATS_PRESENT +- */ +-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) +- iommu_info.flags = 0; +- if (amd_iommu_device_info(kdev->pdev, &iommu_info) == 0) { +- if ((iommu_info.flags & required_iommu_flags) == +- required_iommu_flags) +- cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT; +- } +-#endif +- +- crat_table->length += sub_type_hdr->length; +- crat_table->total_entries++; +- +- /* Fill in Subtype: Memory. Only on systems with large BAR (no +- * private FB), report memory as public. On other systems +- * report the total FB size (public+private) as a single +- * private heap. 
+- */
+- kdev->kfd2kgd->get_local_mem_info(kdev->kgd, &local_mem_info);
+- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+- sub_type_hdr->length);
+-
+- if (debug_largebar)
+- local_mem_info.local_mem_size_private = 0;
+-
+- if (local_mem_info.local_mem_size_private == 0)
+- ret = kfd_fill_gpu_memory_affinity(&avail_size,
+- kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC,
+- local_mem_info.local_mem_size_public,
+- (struct crat_subtype_memory *)sub_type_hdr,
+- proximity_domain,
+- &local_mem_info);
+- else
+- ret = kfd_fill_gpu_memory_affinity(&avail_size,
+- kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE,
+- local_mem_info.local_mem_size_public +
+- local_mem_info.local_mem_size_private,
+- (struct crat_subtype_memory *)sub_type_hdr,
+- proximity_domain,
+- &local_mem_info);
+- if (ret < 0)
+- return ret;
+-
+- crat_table->length += sizeof(struct crat_subtype_memory);
+- crat_table->total_entries++;
+-
+- /* TODO: Fill in cache information. This information is NOT readily
+- * available in KGD
+- */
+- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+- sub_type_hdr->length);
+- ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low,
+- avail_size,
+- &cu_info,
+- (struct crat_subtype_cache *)sub_type_hdr,
+- &cache_mem_filled,
+- &num_of_cache_entries);
+-
+- if (ret < 0)
+- return ret;
+-
+- crat_table->length += cache_mem_filled;
+- crat_table->total_entries += num_of_cache_entries;
+- avail_size -= cache_mem_filled;
+-
+- /* Fill in Subtype: IO_LINKS
+- * Only direct links are added here which is Link from GPU
+- * to its NUMA node. Indirect links are added by userspace.
+- */
+- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+- cache_mem_filled);
+- ret = kfd_fill_gpu_direct_io_link(&avail_size, kdev,
+- (struct crat_subtype_iolink *)sub_type_hdr, proximity_domain);
+-
+- if (ret < 0)
+- return ret;
+-
+- crat_table->length += sub_type_hdr->length;
+- crat_table->total_entries++;
+-
+- *size = crat_table->length;
+- pr_info("Virtual CRAT table created for GPU\n");
+-
+- return ret;
+-}
+-
+-/* kfd_create_crat_image_virtual - Allocates memory for CRAT image and
+- * creates a Virtual CRAT (VCRAT) image
+- *
+- * NOTE: Call kfd_destroy_crat_image to free CRAT image memory
+- *
+- * @crat_image: VCRAT image created because ACPI does not have a
+- * CRAT for this device
+- * @size: [OUT] size of virtual crat_image
+- * @flags: COMPUTE_UNIT_CPU - Create VCRAT for CPU device
+- * COMPUTE_UNIT_GPU - Create VCRAT for GPU
+- * (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU
+- * -- this option is not currently implemented.
+- * The assumption is that all AMD APUs will have CRAT
+- * @kdev: Valid kfd_device required if flags contain COMPUTE_UNIT_GPU
+- *
+- * Return 0 if successful else return -ve value
+- */
+-int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
+- int flags, struct kfd_dev *kdev, uint32_t proximity_domain)
+-{
+- void *pcrat_image = NULL;
+- int ret = 0;
+-
+- if (!crat_image)
+- return -EINVAL;
+-
+- *crat_image = NULL;
+-
+- /* Allocate one VCRAT_SIZE_FOR_CPU for CPU virtual CRAT image and
+- * VCRAT_SIZE_FOR_GPU for GPU virtual CRAT image. This should cover
+- * all the current conditions.
A check is put not to overwrite beyond +- * allocated size +- */ +- switch (flags) { +- case COMPUTE_UNIT_CPU: +- pcrat_image = kmalloc(VCRAT_SIZE_FOR_CPU, GFP_KERNEL); +- if (!pcrat_image) +- return -ENOMEM; +- *size = VCRAT_SIZE_FOR_CPU; +- ret = kfd_create_vcrat_image_cpu(pcrat_image, size); +- break; +- case COMPUTE_UNIT_GPU: +- if (!kdev) +- return -EINVAL; +- pcrat_image = kmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL); +- if (!pcrat_image) +- return -ENOMEM; +- *size = VCRAT_SIZE_FOR_GPU; +- ret = kfd_create_vcrat_image_gpu(pcrat_image, size, +- kdev, proximity_domain); +- break; +- case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU): +- /* TODO: */ +- ret = -EINVAL; +- pr_err("VCRAT not implemented for APU\n"); +- break; +- default: +- ret = -EINVAL; +- } +- +- if (!ret) +- *crat_image = pcrat_image; +- else +- kfree(pcrat_image); +- +- return ret; +-} +- +- +-/* kfd_destroy_crat_image +- * +- * @crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..) +- * +- */ +-void kfd_destroy_crat_image(void *crat_image) +-{ +- kfree(crat_image); +-} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h +index 00de41f..a374fa3 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h +@@ -24,7 +24,6 @@ + #define KFD_CRAT_H_INCLUDED + + #include <linux/types.h> +-#include "kfd_priv.h" + + #pragma pack(1) + +@@ -45,10 +44,6 @@ + + #define CRAT_OEMID_64BIT_MASK ((1ULL << (CRAT_OEMID_LENGTH * 8)) - 1) + +-/* Compute Unit flags */ +-#define COMPUTE_UNIT_CPU (1 << 0) /* Create Virtual CRAT for CPU */ +-#define COMPUTE_UNIT_GPU (1 << 1) /* Create Virtual CRAT for GPU */ +- + struct crat_header { + uint32_t signature; + uint32_t length; +@@ -110,7 +105,7 @@ struct crat_subtype_computeunit { + uint8_t wave_front_size; + uint8_t num_banks; + uint16_t micro_engine_id; +- uint8_t array_count; ++ uint8_t num_arrays; + uint8_t num_cu_per_array; + uint8_t num_simd_per_cu; + uint8_t max_slots_scatch_cu; +@@ -132,14 +127,13 @@ struct crat_subtype_memory { + uint8_t length; + uint16_t reserved; + uint32_t flags; +- uint32_t proximity_domain; ++ uint32_t promixity_domain; + uint32_t base_addr_low; + uint32_t base_addr_high; + uint32_t length_low; + uint32_t length_high; + uint32_t width; +- uint8_t visibility_type; /* for virtual (dGPU) CRAT */ +- uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH - 1]; ++ uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH]; + }; + + /* +@@ -228,12 +222,9 @@ struct crat_subtype_ccompute { + /* + * HSA IO Link Affinity structure and definitions + */ +-#define CRAT_IOLINK_FLAGS_ENABLED (1 << 0) +-#define CRAT_IOLINK_FLAGS_NON_COHERENT (1 << 1) +-#define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT (1 << 2) +-#define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT (1 << 3) +-#define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA (1 << 4) +-#define CRAT_IOLINK_FLAGS_RESERVED_MASK 0xffffffe0 ++#define CRAT_IOLINK_FLAGS_ENABLED 0x00000001 ++#define CRAT_IOLINK_FLAGS_COHERENCY 0x00000002 ++#define CRAT_IOLINK_FLAGS_RESERVED 0xfffffffc + + /* + * IO interface types +@@ -241,16 +232,8 @@ struct crat_subtype_ccompute { + #define CRAT_IOLINK_TYPE_UNDEFINED 0 + #define CRAT_IOLINK_TYPE_HYPERTRANSPORT 1 + #define CRAT_IOLINK_TYPE_PCIEXPRESS 2 +-#define CRAT_IOLINK_TYPE_AMBA 3 +-#define CRAT_IOLINK_TYPE_MIPI 4 +-#define CRAT_IOLINK_TYPE_QPI_1_1 5 +-#define CRAT_IOLINK_TYPE_RESERVED1 6 +-#define CRAT_IOLINK_TYPE_RESERVED2 7 +-#define CRAT_IOLINK_TYPE_RAPID_IO 8 +-#define CRAT_IOLINK_TYPE_INFINIBAND 9 +-#define CRAT_IOLINK_TYPE_RESERVED3 10 +-#define 
CRAT_IOLINK_TYPE_OTHER 11 +-#define CRAT_IOLINK_TYPE_MAX 255 ++#define CRAT_IOLINK_TYPE_OTHER 3 ++#define CRAT_IOLINK_TYPE_MAX 255 + + #define CRAT_IOLINK_RESERVED_LENGTH 24 + +@@ -308,13 +291,4 @@ struct cdit_header { + + #pragma pack() + +-#ifdef CONFIG_ACPI +-int kfd_create_crat_image_acpi(void **crat_image, size_t *size); +-#endif +-void kfd_destroy_crat_image(void *crat_image); +-int kfd_parse_crat_table(void *crat_image, +- struct list_head *device_list, +- uint32_t proximity_domain); +-int kfd_create_crat_image_virtual(void **crat_image, size_t *size, +- int flags, struct kfd_dev *kdev, uint32_t proximity_domain); + #endif /* KFD_CRAT_H_INCLUDED */ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c +index af6d736..d5e19b5 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c +@@ -29,7 +29,7 @@ + #include <linux/mutex.h> + #include <linux/device.h> + +-#include "kfd_pm4_headers_vi.h" ++#include "kfd_pm4_headers.h" + #include "kfd_pm4_headers_diq.h" + #include "kfd_kernel_queue.h" + #include "kfd_priv.h" +@@ -42,15 +42,16 @@ + + static void dbgdev_address_watch_disable_nodiq(struct kfd_dev *dev) + { ++ BUG_ON(!dev || !dev->kfd2kgd); ++ + dev->kfd2kgd->address_watch_disable(dev->kgd); + } + + static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, + unsigned int pasid, uint64_t vmid0_address, +- uint32_t *packet_buff, size_t size_in_bytes, +- bool sync) ++ uint32_t *packet_buff, size_t size_in_bytes) + { +- struct pm4_mec_release_mem *rm_packet; ++ struct pm4__release_mem *rm_packet; + struct pm4__indirect_buffer_pasid *ib_packet; + struct kfd_mem_obj *mem_obj; + size_t pq_packets_size_in_bytes; +@@ -61,14 +62,12 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, + unsigned int *ib_packet_buff; + int status; + +- if (WARN_ON(!size_in_bytes)) +- return -EINVAL; ++ BUG_ON(!dbgdev || !dbgdev->kq || !packet_buff || !size_in_bytes); + + kq = dbgdev->kq; + +- pq_packets_size_in_bytes = sizeof(struct pm4__indirect_buffer_pasid); +- if (sync) +- pq_packets_size_in_bytes += sizeof(struct pm4_mec_release_mem); ++ pq_packets_size_in_bytes = sizeof(struct pm4__release_mem) + ++ sizeof(struct pm4__indirect_buffer_pasid); + + /* + * We acquire a buffer from DIQ +@@ -78,8 +77,8 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, + status = kq->ops.acquire_packet_buffer(kq, + pq_packets_size_in_bytes / sizeof(uint32_t), + &ib_packet_buff); +- if (status) { +- pr_err("acquire_packet_buffer failed\n"); ++ if (status != 0) { ++ pr_err("amdkfd: acquire_packet_buffer failed\n"); + return status; + } + +@@ -101,11 +100,6 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, + + ib_packet->bitfields5.pasid = pasid; + +- if (!sync) { +- kq->ops.submit_packet(kq); +- return status; +- } +- + /* + * for now we use release mem for GPU-CPU synchronization + * Consider WaitRegMem + WriteData as a better alternative +@@ -114,15 +108,15 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, + * (a) Sync with HW + * (b) Sync var is written by CP to mem. 
+ */ +- rm_packet = (struct pm4_mec_release_mem *) (ib_packet_buff + ++ rm_packet = (struct pm4__release_mem *) (ib_packet_buff + + (sizeof(struct pm4__indirect_buffer_pasid) / + sizeof(unsigned int))); + + status = kfd_gtt_sa_allocate(dbgdev->dev, sizeof(uint64_t), + &mem_obj); + +- if (status) { +- pr_err("Failed to allocate GART memory\n"); ++ if (status != 0) { ++ pr_err("amdkfd: Failed to allocate GART memory\n"); + kq->ops.rollback_packet(kq); + return status; + } +@@ -133,7 +127,7 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, + + rm_packet->header.opcode = IT_RELEASE_MEM; + rm_packet->header.type = PM4_TYPE_3; +- rm_packet->header.count = sizeof(struct pm4_mec_release_mem) / ++ rm_packet->header.count = sizeof(struct pm4__release_mem) / + sizeof(unsigned int) - 2; + + rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; +@@ -174,6 +168,8 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, + + static int dbgdev_register_nodiq(struct kfd_dbgdev *dbgdev) + { ++ BUG_ON(!dbgdev); ++ + /* + * no action is needed in this case, + * just make sure diq will not be used +@@ -191,12 +187,14 @@ static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev) + struct kernel_queue *kq = NULL; + int status; + +- properties.type = KFD_QUEUE_TYPE_DIQ; ++ BUG_ON(!dbgdev || !dbgdev->pqm || !dbgdev->dev); ++ + status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL, +- &properties, &qid); ++ &properties, 0, KFD_QUEUE_TYPE_DIQ, ++ &qid); + + if (status) { +- pr_err("Failed to create DIQ\n"); ++ pr_err("amdkfd: Failed to create DIQ\n"); + return status; + } + +@@ -204,8 +202,8 @@ static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev) + + kq = pqm_get_kernel_queue(dbgdev->pqm, qid); + +- if (!kq) { +- pr_err("Error getting DIQ\n"); ++ if (kq == NULL) { ++ pr_err("amdkfd: Error getting DIQ\n"); + pqm_destroy_queue(dbgdev->pqm, qid); + return -EFAULT; + } +@@ -217,6 +215,8 @@ static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev) + + static int dbgdev_unregister_nodiq(struct kfd_dbgdev *dbgdev) + { ++ BUG_ON(!dbgdev || !dbgdev->dev); ++ + /* disable watch address */ + dbgdev_address_watch_disable_nodiq(dbgdev->dev); + return 0; +@@ -227,6 +227,8 @@ static int dbgdev_unregister_diq(struct kfd_dbgdev *dbgdev) + /* todo - disable address watch */ + int status; + ++ BUG_ON(!dbgdev || !dbgdev->pqm || !dbgdev->kq); ++ + status = pqm_destroy_queue(dbgdev->pqm, + dbgdev->kq->queue->properties.queue_id); + dbgdev->kq = NULL; +@@ -239,17 +241,18 @@ static void dbgdev_address_watch_set_registers( + union TCP_WATCH_ADDR_H_BITS *addrHi, + union TCP_WATCH_ADDR_L_BITS *addrLo, + union TCP_WATCH_CNTL_BITS *cntl, +- unsigned int index, unsigned int vmid, +- bool is_apu) ++ unsigned int index, unsigned int vmid) + { + union ULARGE_INTEGER addr; + ++ BUG_ON(!adw_info || !addrHi || !addrLo || !cntl); ++ + addr.quad_part = 0; + addrHi->u32All = 0; + addrLo->u32All = 0; + cntl->u32All = 0; + +- if (adw_info->watch_mask) ++ if (adw_info->watch_mask != NULL) + cntl->bitfields.mask = + (uint32_t) (adw_info->watch_mask[index] & + ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK); +@@ -265,9 +268,9 @@ static void dbgdev_address_watch_set_registers( + + cntl->bitfields.mode = adw_info->watch_mode[index]; + cntl->bitfields.vmid = (uint32_t) vmid; +- /* for APU assume it is an ATC address */ +- if (is_apu) +- cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT; ++ /* for now assume it is an ATC address */ ++ cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT; ++ + pr_debug("\t\t%20s %08x\n", "set reg mask :", 
cntl->bitfields.mask); + pr_debug("\t\t%20s %08x\n", "set reg add high :", + addrHi->bitfields.addr); +@@ -276,7 +279,7 @@ static void dbgdev_address_watch_set_registers( + } + + static int dbgdev_address_watch_nodiq(struct kfd_dbgdev *dbgdev, +- struct dbg_address_watch_info *adw_info) ++ struct dbg_address_watch_info *adw_info) + { + union TCP_WATCH_ADDR_H_BITS addrHi; + union TCP_WATCH_ADDR_L_BITS addrLo; +@@ -284,11 +287,13 @@ static int dbgdev_address_watch_nodiq(struct kfd_dbgdev *dbgdev, + struct kfd_process_device *pdd; + unsigned int i; + ++ BUG_ON(!dbgdev || !dbgdev->dev || !adw_info); ++ + /* taking the vmid for that process on the safe way using pdd */ + pdd = kfd_get_process_device_data(dbgdev->dev, + adw_info->process); + if (!pdd) { +- pr_err("Failed to get pdd for wave control no DIQ\n"); ++ pr_err("amdkfd: Failed to get pdd for wave control no DIQ\n"); + return -EFAULT; + } + +@@ -298,19 +303,19 @@ static int dbgdev_address_watch_nodiq(struct kfd_dbgdev *dbgdev, + + if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) || + (adw_info->num_watch_points == 0)) { +- pr_err("num_watch_points is invalid\n"); ++ pr_err("amdkfd: num_watch_points is invalid\n"); + return -EINVAL; + } + +- if (!adw_info->watch_mode || !adw_info->watch_address) { +- pr_err("adw_info fields are not valid\n"); ++ if ((adw_info->watch_mode == NULL) || ++ (adw_info->watch_address == NULL)) { ++ pr_err("amdkfd: adw_info fields are not valid\n"); + return -EINVAL; + } + +- for (i = 0; i < adw_info->num_watch_points; i++) { ++ for (i = 0 ; i < adw_info->num_watch_points ; i++) { + dbgdev_address_watch_set_registers(adw_info, &addrHi, &addrLo, +- &cntl, i, pdd->qpd.vmid, +- dbgdev->dev->device_info->is_need_iommu_device); ++ &cntl, i, pdd->qpd.vmid); + + pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); + pr_debug("\t\t%20s %08x\n", "register index :", i); +@@ -343,43 +348,48 @@ static int dbgdev_address_watch_nodiq(struct kfd_dbgdev *dbgdev, + } + + static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, +- struct dbg_address_watch_info *adw_info) ++ struct dbg_address_watch_info *adw_info) + { + struct pm4__set_config_reg *packets_vec; + union TCP_WATCH_ADDR_H_BITS addrHi; + union TCP_WATCH_ADDR_L_BITS addrLo; + union TCP_WATCH_CNTL_BITS cntl; ++ struct kfd_mem_obj *mem_obj; + unsigned int aw_reg_add_dword; + uint32_t *packet_buff_uint; +- uint64_t packet_buff_gpu_addr; + unsigned int i; + int status; + size_t ib_size = sizeof(struct pm4__set_config_reg) * 4; + /* we do not control the vmid in DIQ mode, just a place holder */ + unsigned int vmid = 0; + ++ BUG_ON(!dbgdev || !dbgdev->dev || !adw_info); ++ + addrHi.u32All = 0; + addrLo.u32All = 0; + cntl.u32All = 0; + + if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) || + (adw_info->num_watch_points == 0)) { +- pr_err("num_watch_points is invalid\n"); ++ pr_err("amdkfd: num_watch_points is invalid\n"); + return -EINVAL; + } + +- if (!adw_info->watch_mode || !adw_info->watch_address) { +- pr_err("adw_info fields are not valid\n"); ++ if ((NULL == adw_info->watch_mode) || ++ (NULL == adw_info->watch_address)) { ++ pr_err("amdkfd: adw_info fields are not valid\n"); + return -EINVAL; + } + +- status = dbgdev->kq->ops.acquire_inline_ib(dbgdev->kq, +- ib_size/sizeof(uint32_t), +- &packet_buff_uint, &packet_buff_gpu_addr); +- if (status) { +- pr_err("Failed to allocate IB from DIQ ring\n"); ++ status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); ++ ++ if (status != 0) { ++ pr_err("amdkfd: Failed to allocate GART memory\n"); + 
return status; + } ++ ++ packet_buff_uint = mem_obj->cpu_ptr; ++ + memset(packet_buff_uint, 0, ib_size); + + packets_vec = (struct pm4__set_config_reg *) (packet_buff_uint); +@@ -398,9 +408,12 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, + packets_vec[3].bitfields2.insert_vmid = 1; + + for (i = 0; i < adw_info->num_watch_points; i++) { +- dbgdev_address_watch_set_registers(adw_info, &addrHi, &addrLo, +- &cntl, i, vmid, +- dbgdev->dev->device_info->is_need_iommu_device); ++ dbgdev_address_watch_set_registers(adw_info, ++ &addrHi, ++ &addrLo, ++ &cntl, ++ i, ++ vmid); + + pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); + pr_debug("\t\t%20s %08x\n", "register index :", i); +@@ -429,6 +442,8 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, + i, + ADDRESS_WATCH_REG_CNTL); + ++ aw_reg_add_dword /= sizeof(uint32_t); ++ + packets_vec[0].bitfields2.reg_offset = + aw_reg_add_dword - AMD_CONFIG_REG_BASE; + +@@ -440,6 +455,8 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, + i, + ADDRESS_WATCH_REG_ADDR_HI); + ++ aw_reg_add_dword /= sizeof(uint32_t); ++ + packets_vec[1].bitfields2.reg_offset = + aw_reg_add_dword - AMD_CONFIG_REG_BASE; + packets_vec[1].reg_data[0] = addrHi.u32All; +@@ -450,6 +467,8 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, + i, + ADDRESS_WATCH_REG_ADDR_LO); + ++ aw_reg_add_dword /= sizeof(uint32_t); ++ + packets_vec[2].bitfields2.reg_offset = + aw_reg_add_dword - AMD_CONFIG_REG_BASE; + packets_vec[2].reg_data[0] = addrLo.u32All; +@@ -466,6 +485,8 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, + i, + ADDRESS_WATCH_REG_CNTL); + ++ aw_reg_add_dword /= sizeof(uint32_t); ++ + packets_vec[3].bitfields2.reg_offset = + aw_reg_add_dword - AMD_CONFIG_REG_BASE; + packets_vec[3].reg_data[0] = cntl.u32All; +@@ -473,30 +494,32 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, + status = dbgdev_diq_submit_ib( + dbgdev, + adw_info->process->pasid, +- packet_buff_gpu_addr, ++ mem_obj->gpu_addr, + packet_buff_uint, +- ib_size, true); ++ ib_size); + +- if (status) { +- pr_err("Failed to submit IB to DIQ\n"); +- return status; ++ if (status != 0) { ++ pr_err("amdkfd: Failed to submit IB to DIQ\n"); ++ break; + } + } + ++ kfd_gtt_sa_free(dbgdev->dev, mem_obj); + return status; + } + + static int dbgdev_wave_control_set_registers( + struct dbg_wave_control_info *wac_info, + union SQ_CMD_BITS *in_reg_sq_cmd, +- union GRBM_GFX_INDEX_BITS *in_reg_gfx_index, +- unsigned int asic_family) ++ union GRBM_GFX_INDEX_BITS *in_reg_gfx_index) + { + int status = 0; + union SQ_CMD_BITS reg_sq_cmd; + union GRBM_GFX_INDEX_BITS reg_gfx_index; + struct HsaDbgWaveMsgAMDGen2 *pMsg; + ++ BUG_ON(!wac_info || !in_reg_sq_cmd || !in_reg_gfx_index); ++ + reg_sq_cmd.u32All = 0; + reg_gfx_index.u32All = 0; + pMsg = &wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2; +@@ -548,25 +571,11 @@ static int dbgdev_wave_control_set_registers( + + switch (wac_info->operand) { + case HSA_DBG_WAVEOP_HALT: +- if (asic_family == CHIP_KAVERI) { +- reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT; +- pr_debug("Halting KV\n"); +- } else { +- reg_sq_cmd.bits_sethalt.cmd = SQ_IND_CMD_NEW_SETHALT; +- reg_sq_cmd.bits_sethalt.data = SQ_IND_CMD_DATA_HALT; +- pr_debug("Halting CZ\n"); +- } ++ reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT; + break; + + case HSA_DBG_WAVEOP_RESUME: +- if (asic_family == CHIP_KAVERI) { +- reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME; +- pr_debug("Resuming KV\n"); +- } else { +- reg_sq_cmd.bits_sethalt.cmd = 
SQ_IND_CMD_NEW_SETHALT; +- reg_sq_cmd.bits_sethalt.data = SQ_IND_CMD_DATA_RESUME; +- pr_debug("Resuming CZ\n"); +- } ++ reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME; + break; + + case HSA_DBG_WAVEOP_KILL: +@@ -606,21 +615,23 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, + int status; + union SQ_CMD_BITS reg_sq_cmd; + union GRBM_GFX_INDEX_BITS reg_gfx_index; ++ struct kfd_mem_obj *mem_obj; + uint32_t *packet_buff_uint; +- uint64_t packet_buff_gpu_addr; + struct pm4__set_config_reg *packets_vec; + size_t ib_size = sizeof(struct pm4__set_config_reg) * 3; + ++ BUG_ON(!dbgdev || !wac_info); ++ + reg_sq_cmd.u32All = 0; + + status = dbgdev_wave_control_set_registers(wac_info, ®_sq_cmd, +- ®_gfx_index, dbgdev->dev->device_info->asic_family); ++ ®_gfx_index); + if (status) { +- pr_err("Failed to set wave control registers\n"); ++ pr_err("amdkfd: Failed to set wave control registers\n"); + return status; + } + +- /* we do not control the VMID in DIQ, so reset it to a known value */ ++ /* we do not control the VMID in DIQ,so reset it to a known value */ + reg_sq_cmd.bits.vm_id = 0; + + pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); +@@ -653,13 +664,15 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, + + pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); + +- status = dbgdev->kq->ops.acquire_inline_ib(dbgdev->kq, +- ib_size / sizeof(uint32_t), +- &packet_buff_uint, &packet_buff_gpu_addr); +- if (status) { +- pr_err("Failed to allocate IB from DIQ ring\n"); ++ status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); ++ ++ if (status != 0) { ++ pr_err("amdkfd: Failed to allocate GART memory\n"); + return status; + } ++ ++ packet_buff_uint = mem_obj->cpu_ptr; ++ + memset(packet_buff_uint, 0, ib_size); + + packets_vec = (struct pm4__set_config_reg *) packet_buff_uint; +@@ -702,12 +715,14 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, + status = dbgdev_diq_submit_ib( + dbgdev, + wac_info->process->pasid, +- packet_buff_gpu_addr, ++ mem_obj->gpu_addr, + packet_buff_uint, +- ib_size, false); ++ ib_size); + +- if (status) +- pr_err("Failed to submit IB to DIQ\n"); ++ if (status != 0) ++ pr_err("amdkfd: Failed to submit IB to DIQ\n"); ++ ++ kfd_gtt_sa_free(dbgdev->dev, mem_obj); + + return status; + } +@@ -720,19 +735,21 @@ static int dbgdev_wave_control_nodiq(struct kfd_dbgdev *dbgdev, + union GRBM_GFX_INDEX_BITS reg_gfx_index; + struct kfd_process_device *pdd; + ++ BUG_ON(!dbgdev || !dbgdev->dev || !wac_info); ++ + reg_sq_cmd.u32All = 0; + + /* taking the VMID for that process on the safe way using PDD */ + pdd = kfd_get_process_device_data(dbgdev->dev, wac_info->process); + + if (!pdd) { +- pr_err("Failed to get pdd for wave control no DIQ\n"); ++ pr_err("amdkfd: Failed to get pdd for wave control no DIQ\n"); + return -EFAULT; + } + status = dbgdev_wave_control_set_registers(wac_info, ®_sq_cmd, +- ®_gfx_index, dbgdev->dev->device_info->asic_family); ++ ®_gfx_index); + if (status) { +- pr_err("Failed to set wave control registers\n"); ++ pr_err("amdkfd: Failed to set wave control registers\n"); + return status; + } + +@@ -783,8 +800,13 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) + union GRBM_GFX_INDEX_BITS reg_gfx_index; + struct kfd_process_device *pdd; + struct dbg_wave_control_info wac_info; +- int first_vmid_to_scan = dev->vm_info.first_vmid_kfd; +- int last_vmid_to_scan = dev->vm_info.last_vmid_kfd; ++ int temp; ++ int first_vmid_to_scan = 8; ++ int last_vmid_to_scan = 15; ++ ++ 
first_vmid_to_scan = ffs(dev->shared_resources.compute_vmid_bitmap) - 1;
++ temp = dev->shared_resources.compute_vmid_bitmap >> first_vmid_to_scan;
++ last_vmid_to_scan = first_vmid_to_scan + ffz(temp);
+
+ reg_sq_cmd.u32All = 0;
+ status = 0;
+@@ -796,13 +818,12 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p)
+
+ /* Scan all registers in the range ATC_VMID8_PASID_MAPPING ..
+ * ATC_VMID15_PASID_MAPPING
+- * to check which VMID the current process is mapped to.
+- */
++ * to check which VMID the current process is mapped to. */
+
+ for (vmid = first_vmid_to_scan; vmid <= last_vmid_to_scan; vmid++) {
+ if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_valid
+ (dev->kgd, vmid)) {
+- if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_pasid
++ if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_pasid
+ (dev->kgd, vmid) == p->pasid) {
+ pr_debug("Killing wave fronts of vmid %d and pasid %d\n",
+ vmid, p->pasid);
+@@ -812,7 +833,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p)
+ }
+
+ if (vmid > last_vmid_to_scan) {
+- pr_err("Didn't find vmid for pasid %d\n", p->pasid);
++ pr_err("amdkfd: didn't found vmid for pasid (%d)\n", p->pasid);
+ return -EFAULT;
+ }
+
+@@ -822,7 +843,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p)
+ return -EFAULT;
+
+ status = dbgdev_wave_control_set_registers(&wac_info, &reg_sq_cmd,
+- &reg_gfx_index, dev->device_info->asic_family);
++ &reg_gfx_index);
+ if (status != 0)
+ return -EINVAL;
+
+@@ -839,6 +860,8 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p)
+ void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev,
+ enum DBGDEV_TYPE type)
+ {
++ BUG_ON(!pdbgdev || !pdev);
++
+ pdbgdev->dev = pdev;
+ pdbgdev->kq = NULL;
+ pdbgdev->type = type;
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h
+index 583aaa9..03424c2 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h
+@@ -60,24 +60,6 @@ enum {
+ SH_REG_SIZE = SH_REG_END - SH_REG_BASE
+ };
+
+-/* SQ_CMD definitions */
+-
+-enum {
+- SQ_IND_CMD_DATA_RESUME = 0,
+- SQ_IND_CMD_DATA_HALT = 1
+-};
+-
+-enum SQ_IND_CMD_NEW {
+- SQ_IND_CMD_NEW_NULL = 0x00000000,
+- SQ_IND_CMD_NEW_SETHALT = 0x00000001,
+- SQ_IND_CMD_NEW_SAVECTX = 0x00000002,
+- SQ_IND_CMD_NEW_KILL = 0x00000003,
+- SQ_IND_CMD_NEW_DEBUG = 0x00000004,
+- SQ_IND_CMD_NEW_TRAP = 0x00000005,
+- SQ_IND_CMD_NEW_SET_PRIO = 0x00000006
+-
+-};
+-
+ enum SQ_IND_CMD_CMD {
+ SQ_IND_CMD_CMD_NULL = 0x00000000,
+ SQ_IND_CMD_CMD_HALT = 0x00000001,
+@@ -136,20 +118,6 @@ union SQ_CMD_BITS {
+ uint32_t:1;
+ uint32_t vm_id:4;
+ } bitfields, bits;
+- struct {
+- uint32_t cmd:3;
+- uint32_t:1;
+- uint32_t mode:3;
+- uint32_t check_vmid:1;
+- uint32_t data:3;
+- uint32_t:5;
+- uint32_t wave_id:4;
+- uint32_t simd_id:2;
+- uint32_t:2;
+- uint32_t queue_id:3;
+- uint32_t:1;
+- uint32_t vm_id:4;
+- } bitfields_sethalt, bits_sethalt;
+ uint32_t u32All;
+ signed int i32All;
+ float f32All;
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c
+index 9d4af96..56d6763 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c
+@@ -33,7 +33,6 @@
+ #include "kfd_pm4_headers_diq.h"
+ #include "kfd_dbgmgr.h"
+ #include "kfd_dbgdev.h"
+-#include "kfd_device_queue_manager.h"
+
+ static DEFINE_MUTEX(kfd_dbgmgr_mutex);
+
+@@ -45,6 +44,8 @@ struct mutex *kfd_get_dbgmgr_mutex(void)
+
+ static void
kfd_dbgmgr_uninitialize(struct kfd_dbgmgr *pmgr) + { ++ BUG_ON(!pmgr); ++ + kfree(pmgr->dbgdev); + + pmgr->dbgdev = NULL; +@@ -54,7 +55,7 @@ static void kfd_dbgmgr_uninitialize(struct kfd_dbgmgr *pmgr) + + void kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr) + { +- if (pmgr) { ++ if (pmgr != NULL) { + kfd_dbgmgr_uninitialize(pmgr); + kfree(pmgr); + } +@@ -65,12 +66,12 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) + enum DBGDEV_TYPE type = DBGDEV_TYPE_DIQ; + struct kfd_dbgmgr *new_buff; + +- if (WARN_ON(!pdev->init_complete)) +- return false; ++ BUG_ON(pdev == NULL); ++ BUG_ON(!pdev->init_complete); + + new_buff = kfd_alloc_struct(new_buff); + if (!new_buff) { +- pr_err("Failed to allocate dbgmgr instance\n"); ++ pr_err("amdkfd: Failed to allocate dbgmgr instance\n"); + return false; + } + +@@ -78,13 +79,13 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) + new_buff->dev = pdev; + new_buff->dbgdev = kfd_alloc_struct(new_buff->dbgdev); + if (!new_buff->dbgdev) { +- pr_err("Failed to allocate dbgdev instance\n"); ++ pr_err("amdkfd: Failed to allocate dbgdev instance\n"); + kfree(new_buff); + return false; + } + + /* get actual type of DBGDevice cpsch or not */ +- if (pdev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) ++ if (sched_policy == KFD_SCHED_POLICY_NO_HWS) + type = DBGDEV_TYPE_NODIQ; + + kfd_dbgdev_init(new_buff->dbgdev, pdev, type); +@@ -95,6 +96,8 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) + + long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p) + { ++ BUG_ON(!p || !pmgr || !pmgr->dbgdev); ++ + if (pmgr->pasid != 0) { + pr_debug("H/W debugger is already active using pasid %d\n", + pmgr->pasid); +@@ -115,6 +118,8 @@ long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p) + + long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p) + { ++ BUG_ON(!p || !pmgr || !pmgr->dbgdev); ++ + /* Is the requests coming from the already registered process? */ + if (pmgr->pasid != p->pasid) { + pr_debug("H/W debugger is not registered by calling pasid %d\n", +@@ -132,6 +137,8 @@ long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p) + long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, + struct dbg_wave_control_info *wac_info) + { ++ BUG_ON(!pmgr || !pmgr->dbgdev || !wac_info); ++ + /* Is the requests coming from the already registered process? */ + if (pmgr->pasid != wac_info->process->pasid) { + pr_debug("H/W debugger support was not registered for requester pasid %d\n", +@@ -145,6 +152,9 @@ long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, + long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, + struct dbg_address_watch_info *adw_info) + { ++ BUG_ON(!pmgr || !pmgr->dbgdev || !adw_info); ++ ++ + /* Is the requests coming from the already registered process? 
*/ + if (pmgr->pasid != adw_info->process->pasid) { + pr_debug("H/W debugger support was not registered for requester pasid %d\n", +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h +index a04a1fe..257a745 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h +@@ -30,11 +30,13 @@ + #pragma pack(push, 4) + + enum HSA_DBG_WAVEOP { +- HSA_DBG_WAVEOP_HALT = 1, /* Halts a wavefront */ +- HSA_DBG_WAVEOP_RESUME = 2, /* Resumes a wavefront */ +- HSA_DBG_WAVEOP_KILL = 3, /* Kills a wavefront */ +- HSA_DBG_WAVEOP_DEBUG = 4, /* Causes wavefront to enter dbg mode */ +- HSA_DBG_WAVEOP_TRAP = 5, /* Causes wavefront to take a trap */ ++ HSA_DBG_WAVEOP_HALT = 1, /* Halts a wavefront */ ++ HSA_DBG_WAVEOP_RESUME = 2, /* Resumes a wavefront */ ++ HSA_DBG_WAVEOP_KILL = 3, /* Kills a wavefront */ ++ HSA_DBG_WAVEOP_DEBUG = 4, /* Causes wavefront to enter ++ debug mode */ ++ HSA_DBG_WAVEOP_TRAP = 5, /* Causes wavefront to take ++ a trap */ + HSA_DBG_NUM_WAVEOP = 5, + HSA_DBG_MAX_WAVEOP = 0xFFFFFFFF + }; +@@ -79,13 +81,15 @@ struct HsaDbgWaveMsgAMDGen2 { + uint32_t UserData:8; /* user data */ + uint32_t ShaderArray:1; /* Shader array */ + uint32_t Priv:1; /* Privileged */ +- uint32_t Reserved0:4; /* Reserved, should be 0 */ ++ uint32_t Reserved0:4; /* This field is reserved, ++ should be 0 */ + uint32_t WaveId:4; /* wave id */ + uint32_t SIMD:2; /* SIMD id */ + uint32_t HSACU:4; /* Compute unit */ + uint32_t ShaderEngine:2;/* Shader engine */ + uint32_t MessageType:2; /* see HSA_DBG_WAVEMSG_TYPE */ +- uint32_t Reserved1:4; /* Reserved, should be 0 */ ++ uint32_t Reserved1:4; /* This field is reserved, ++ should be 0 */ + } ui32; + uint32_t Value; + }; +@@ -117,23 +121,20 @@ struct HsaDbgWaveMessage { + * in the user mode instruction stream. The OS scheduler event is typically + * associated and signaled by an interrupt issued by the GPU, but other HSA + * system interrupt conditions from other HW (e.g. IOMMUv2) may be surfaced +- * by the KFD by this mechanism, too. +- */ ++ * by the KFD by this mechanism, too. */ + + /* these are the new definitions for events */ + enum HSA_EVENTTYPE { + HSA_EVENTTYPE_SIGNAL = 0, /* user-mode generated GPU signal */ + HSA_EVENTTYPE_NODECHANGE = 1, /* HSA node change (attach/detach) */ + HSA_EVENTTYPE_DEVICESTATECHANGE = 2, /* HSA device state change +- * (start/stop) +- */ ++ (start/stop) */ + HSA_EVENTTYPE_HW_EXCEPTION = 3, /* GPU shader exception event */ + HSA_EVENTTYPE_SYSTEM_EVENT = 4, /* GPU SYSCALL with parameter info */ + HSA_EVENTTYPE_DEBUG_EVENT = 5, /* GPU signal for debugging */ + HSA_EVENTTYPE_PROFILE_EVENT = 6,/* GPU signal for profiling */ + HSA_EVENTTYPE_QUEUE_EVENT = 7, /* GPU signal queue idle state +- * (EOP pm4) +- */ ++ (EOP pm4) */ + /* ... */ + HSA_EVENTTYPE_MAXID, + HSA_EVENTTYPE_TYPE_SIZE = 0xFFFFFFFF +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c +deleted file mode 100644 +index 232e28f..0000000 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c ++++ /dev/null +@@ -1,75 +0,0 @@ +-/* +- * Copyright 2014 Advanced Micro Devices, Inc. 
+- * +- * Permission is hereby granted, free of charge, to any person obtaining a +- * copy of this software and associated documentation files (the "Software"), +- * to deal in the Software without restriction, including without limitation +- * the rights to use, copy, modify, merge, publish, distribute, sublicense, +- * and/or sell copies of the Software, and to permit persons to whom the +- * Software is furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in +- * all copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +- * OTHER DEALINGS IN THE SOFTWARE. +- */ +- +-#include <linux/debugfs.h> +-#include "kfd_priv.h" +- +-static struct dentry *debugfs_root; +- +-static int kfd_debugfs_open(struct inode *inode, struct file *file) +-{ +- int (*show)(struct seq_file *, void *) = inode->i_private; +- +- return single_open(file, show, NULL); +-} +- +-static const struct file_operations kfd_debugfs_fops = { +- .owner = THIS_MODULE, +- .open = kfd_debugfs_open, +- .read = seq_read, +- .llseek = seq_lseek, +- .release = single_release, +-}; +- +-void kfd_debugfs_init(void) +-{ +- struct dentry *ent; +- +- debugfs_root = debugfs_create_dir("kfd", NULL); +- if (!debugfs_root || debugfs_root == ERR_PTR(-ENODEV)) { +- pr_warn("Failed to create kfd debugfs dir\n"); +- return; +- } +- +- ent = debugfs_create_file("mqds", S_IFREG | 0444, debugfs_root, +- kfd_debugfs_mqds_by_process, +- &kfd_debugfs_fops); +- if (!ent) +- pr_warn("Failed to create mqds in kfd debugfs\n"); +- +- ent = debugfs_create_file("hqds", S_IFREG | 0444, debugfs_root, +- kfd_debugfs_hqds_by_device, +- &kfd_debugfs_fops); +- if (!ent) +- pr_warn("Failed to create hqds in kfd debugfs\n"); +- +- ent = debugfs_create_file("rls", S_IFREG | 0444, debugfs_root, +- kfd_debugfs_rls_by_device, +- &kfd_debugfs_fops); +- if (!ent) +- pr_warn("Failed to create rls in kfd debugfs\n"); +-} +- +-void kfd_debugfs_fini(void) +-{ +- debugfs_remove_recursive(debugfs_root); +-} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c +index 6b3a1fa..3f95f7c 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c +@@ -20,209 +20,36 @@ + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) + #include <linux/amd-iommu.h> +-#endif ++#include <linux/bsearch.h> + #include <linux/pci.h> + #include <linux/slab.h> +-#include <linux/highmem.h> + #include "kfd_priv.h" + #include "kfd_device_queue_manager.h" +-#include "kfd_pm4_headers_vi.h" +-#include "cwsr_trap_handler_carrizo.h" +-#include "cwsr_trap_handler_gfx9.asm" ++#include "kfd_pm4_headers.h" + + #define MQD_SIZE_ALIGNED 768 +-static atomic_t kfd_device_suspended = ATOMIC_INIT(0); + +-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) + static const struct kfd_device_info kaveri_device_info = { + .asic_family = CHIP_KAVERI, + .max_pasid_bits = 16, + /* max num of queues for KV.TODO should be a dynamic value */ + .max_no_of_hqd = 24, +- .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, +- .mqd_size_aligned = MQD_SIZE_ALIGNED, +- .is_need_iommu_device = true, +- .supports_cwsr = false, +- .needs_pci_atomics = false, ++ .mqd_size_aligned = MQD_SIZE_ALIGNED + }; +-#endif + +-static const struct kfd_device_info hawaii_device_info = { +- .asic_family = CHIP_HAWAII, +- .max_pasid_bits = 16, +- /* max num of queues for KV.TODO should be a dynamic value */ +- .max_no_of_hqd = 24, +- .doorbell_size = 4, +- .ih_ring_entry_size = 4 * sizeof(uint32_t), +- .event_interrupt_class = &event_interrupt_class_cik, +- .num_of_watch_points = 4, +- .mqd_size_aligned = MQD_SIZE_ALIGNED, +- .is_need_iommu_device = false, +- .supports_cwsr = false, +- .needs_pci_atomics = false, +-}; +- +-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) + static const struct kfd_device_info carrizo_device_info = { + .asic_family = CHIP_CARRIZO, + .max_pasid_bits = 16, + /* max num of queues for CZ.TODO should be a dynamic value */ + .max_no_of_hqd = 24, +- .doorbell_size = 4, +- .ih_ring_entry_size = 4 * sizeof(uint32_t), +- .event_interrupt_class = &event_interrupt_class_cik, +- .num_of_watch_points = 4, +- .mqd_size_aligned = MQD_SIZE_ALIGNED, +- .is_need_iommu_device = true, +- .supports_cwsr = true, +- .needs_pci_atomics = false, +-}; +-#endif +- +-static const struct kfd_device_info tonga_device_info = { +- .asic_family = CHIP_TONGA, +- .max_pasid_bits = 16, +- .max_no_of_hqd = 24, +- .doorbell_size = 4, +- .ih_ring_entry_size = 4 * sizeof(uint32_t), +- .event_interrupt_class = &event_interrupt_class_cik, +- .num_of_watch_points = 4, +- .mqd_size_aligned = MQD_SIZE_ALIGNED, +- .is_need_iommu_device = false, +- .supports_cwsr = false, +- .needs_pci_atomics = true, +-}; +- +-static const struct kfd_device_info tonga_vf_device_info = { +- .asic_family = CHIP_TONGA, +- .max_pasid_bits = 16, +- .max_no_of_hqd = 24, +- .doorbell_size = 4, +- .ih_ring_entry_size = 4 * sizeof(uint32_t), +- .event_interrupt_class = &event_interrupt_class_cik, +- .num_of_watch_points = 4, +- .mqd_size_aligned = MQD_SIZE_ALIGNED, +- .is_need_iommu_device = false, +- .supports_cwsr = false, +- .needs_pci_atomics = false, +-}; +- +-static const struct kfd_device_info fiji_device_info = { +- .asic_family = CHIP_FIJI, +- .max_pasid_bits = 16, +- .max_no_of_hqd = 24, +- .doorbell_size = 4, +- .ih_ring_entry_size = 4 * sizeof(uint32_t), +- .event_interrupt_class = &event_interrupt_class_cik, +- .num_of_watch_points = 4, +- .mqd_size_aligned = MQD_SIZE_ALIGNED, +- .is_need_iommu_device = false, +- .supports_cwsr = true, +- .needs_pci_atomics = true, +-}; +- +-static const 
struct kfd_device_info fiji_vf_device_info = { +- .asic_family = CHIP_FIJI, +- .max_pasid_bits = 16, +- .max_no_of_hqd = 24, +- .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, +- .mqd_size_aligned = MQD_SIZE_ALIGNED, +- .is_need_iommu_device = false, +- .supports_cwsr = true, +- .needs_pci_atomics = false, +-}; +- +- +-static const struct kfd_device_info polaris10_device_info = { +- .asic_family = CHIP_POLARIS10, +- .max_pasid_bits = 16, +- .max_no_of_hqd = 24, +- .doorbell_size = 4, +- .ih_ring_entry_size = 4 * sizeof(uint32_t), +- .event_interrupt_class = &event_interrupt_class_cik, +- .num_of_watch_points = 4, +- .mqd_size_aligned = MQD_SIZE_ALIGNED, +- .is_need_iommu_device = false, +- .supports_cwsr = true, +- .needs_pci_atomics = true, +-}; +- +-static const struct kfd_device_info polaris10_vf_device_info = { +- .asic_family = CHIP_POLARIS10, +- .max_pasid_bits = 16, +- .max_no_of_hqd = 24, +- .doorbell_size = 4, +- .ih_ring_entry_size = 4 * sizeof(uint32_t), +- .event_interrupt_class = &event_interrupt_class_cik, +- .num_of_watch_points = 4, +- .mqd_size_aligned = MQD_SIZE_ALIGNED, +- .is_need_iommu_device = false, +- .supports_cwsr = true, +- .needs_pci_atomics = false, +-}; +- +-static const struct kfd_device_info polaris11_device_info = { +- .asic_family = CHIP_POLARIS11, +- .max_pasid_bits = 16, +- .max_no_of_hqd = 24, +- .doorbell_size = 4, +- .ih_ring_entry_size = 4 * sizeof(uint32_t), +- .event_interrupt_class = &event_interrupt_class_cik, +- .num_of_watch_points = 4, +- .mqd_size_aligned = MQD_SIZE_ALIGNED, +- .is_need_iommu_device = false, +- .supports_cwsr = true, +- .needs_pci_atomics = true, +-}; +- +-static const struct kfd_device_info vega10_device_info = { +- .asic_family = CHIP_VEGA10, +- .max_pasid_bits = 16, +- .max_no_of_hqd = 24, +- .doorbell_size = 8, +- .ih_ring_entry_size = 8 * sizeof(uint32_t), +- .event_interrupt_class = &event_interrupt_class_v9, +- .num_of_watch_points = 4, +- .mqd_size_aligned = MQD_SIZE_ALIGNED, +- .is_need_iommu_device = false, +- .supports_cwsr = true, +- .needs_pci_atomics = true, +-}; +- +-static const struct kfd_device_info vega10_vf_device_info = { +- .asic_family = CHIP_VEGA10, +- .max_pasid_bits = 16, +- .max_no_of_hqd = 24, +- .doorbell_size = 8, +- .ih_ring_entry_size = 8 * sizeof(uint32_t), +- .event_interrupt_class = &event_interrupt_class_v9, +- .num_of_watch_points = 4, +- .mqd_size_aligned = MQD_SIZE_ALIGNED, +- .is_need_iommu_device = false, +- .supports_cwsr = true, +- .needs_pci_atomics = false, +-}; +- +-static const struct kfd_device_info raven_device_info = { +- .asic_family = CHIP_RAVEN, +- .max_pasid_bits = 16, +- .max_no_of_hqd = 24, +- .doorbell_size = 8, +- .ih_ring_entry_size = 8 * sizeof(uint32_t), +- .event_interrupt_class = &event_interrupt_class_v9, +- .num_of_watch_points = 4, +- .mqd_size_aligned = MQD_SIZE_ALIGNED, +- .is_need_iommu_device = true, +- .supports_cwsr = true, +- .needs_pci_atomics = true, ++ .mqd_size_aligned = MQD_SIZE_ALIGNED + }; + + struct kfd_deviceid { +@@ -232,7 +59,6 @@ struct kfd_deviceid { + + /* Please keep this sorted by increasing device id. 
*/ + static const struct kfd_deviceid supported_devices[] = { +-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) + { 0x1304, &kaveri_device_info }, /* Kaveri */ + { 0x1305, &kaveri_device_info }, /* Kaveri */ + { 0x1306, &kaveri_device_info }, /* Kaveri */ +@@ -255,90 +81,28 @@ static const struct kfd_deviceid supported_devices[] = { + { 0x131B, &kaveri_device_info }, /* Kaveri */ + { 0x131C, &kaveri_device_info }, /* Kaveri */ + { 0x131D, &kaveri_device_info }, /* Kaveri */ +-#endif +- { 0x67A0, &hawaii_device_info }, /* Hawaii */ +- { 0x67A1, &hawaii_device_info }, /* Hawaii */ +- { 0x67A2, &hawaii_device_info }, /* Hawaii */ +- { 0x67A8, &hawaii_device_info }, /* Hawaii */ +- { 0x67A9, &hawaii_device_info }, /* Hawaii */ +- { 0x67AA, &hawaii_device_info }, /* Hawaii */ +- { 0x67B0, &hawaii_device_info }, /* Hawaii */ +- { 0x67B1, &hawaii_device_info }, /* Hawaii */ +- { 0x67B8, &hawaii_device_info }, /* Hawaii */ +- { 0x67B9, &hawaii_device_info }, /* Hawaii */ +- { 0x67BA, &hawaii_device_info }, /* Hawaii */ +- { 0x67BE, &hawaii_device_info }, /* Hawaii */ +-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) + { 0x9870, &carrizo_device_info }, /* Carrizo */ + { 0x9874, &carrizo_device_info }, /* Carrizo */ + { 0x9875, &carrizo_device_info }, /* Carrizo */ + { 0x9876, &carrizo_device_info }, /* Carrizo */ +- { 0x9877, &carrizo_device_info }, /* Carrizo */ +-#endif +- { 0x6920, &tonga_device_info }, /* Tonga */ +- { 0x6921, &tonga_device_info }, /* Tonga */ +- { 0x6928, &tonga_device_info }, /* Tonga */ +- { 0x6929, &tonga_device_info }, /* Tonga */ +- { 0x692B, &tonga_device_info }, /* Tonga */ +- { 0x692F, &tonga_vf_device_info }, /* Tonga vf */ +- { 0x6938, &tonga_device_info }, /* Tonga */ +- { 0x6939, &tonga_device_info }, /* Tonga */ +- { 0x7300, &fiji_device_info }, /* Fiji */ +- { 0x730F, &fiji_vf_device_info }, /* Fiji vf*/ +- { 0x67C0, &polaris10_device_info }, /* Polaris10 */ +- { 0x67C1, &polaris10_device_info }, /* Polaris10 */ +- { 0x67C2, &polaris10_device_info }, /* Polaris10 */ +- { 0x67C4, &polaris10_device_info }, /* Polaris10 */ +- { 0x67C7, &polaris10_device_info }, /* Polaris10 */ +- { 0x67C8, &polaris10_device_info }, /* Polaris10 */ +- { 0x67C9, &polaris10_device_info }, /* Polaris10 */ +- { 0x67CA, &polaris10_device_info }, /* Polaris10 */ +- { 0x67CC, &polaris10_device_info }, /* Polaris10 */ +- { 0x67CF, &polaris10_device_info }, /* Polaris10 */ +- { 0x67D0, &polaris10_vf_device_info }, /* Polaris10 vf*/ +- { 0x67DF, &polaris10_device_info }, /* Polaris10 */ +- { 0x67E0, &polaris11_device_info }, /* Polaris11 */ +- { 0x67E1, &polaris11_device_info }, /* Polaris11 */ +- { 0x67E3, &polaris11_device_info }, /* Polaris11 */ +- { 0x67E7, &polaris11_device_info }, /* Polaris11 */ +- { 0x67E8, &polaris11_device_info }, /* Polaris11 */ +- { 0x67E9, &polaris11_device_info }, /* Polaris11 */ +- { 0x67EB, &polaris11_device_info }, /* Polaris11 */ +- { 0x67EF, &polaris11_device_info }, /* Polaris11 */ +- { 0x67FF, &polaris11_device_info }, /* Polaris11 */ +- { 0x6860, &vega10_device_info }, /* Vega10 */ +- { 0x6861, &vega10_device_info }, /* Vega10 */ +- { 0x6862, &vega10_device_info }, /* Vega10 */ +- { 0x6863, &vega10_device_info }, /* Vega10 */ +- { 0x6864, &vega10_device_info }, /* Vega10 */ +- { 0x6867, &vega10_device_info }, /* Vega10 */ +- { 0x6868, &vega10_device_info }, /* Vega10 */ +- { 0x686C, &vega10_vf_device_info }, /* Vega10 vf*/ +- { 0x687F, &vega10_device_info }, /* Vega10 */ +-#if 
defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) +- { 0x15DD, &raven_device_info } /* Raven */ +-#endif ++ { 0x9877, &carrizo_device_info } /* Carrizo */ + }; + + static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, + unsigned int chunk_size); + static void kfd_gtt_sa_fini(struct kfd_dev *kfd); + +-static int kfd_resume(struct kfd_dev *kfd); +- + static const struct kfd_device_info *lookup_device_info(unsigned short did) + { + size_t i; + + for (i = 0; i < ARRAY_SIZE(supported_devices); i++) { + if (supported_devices[i].did == did) { +- WARN_ON(!supported_devices[i].device_info); ++ BUG_ON(supported_devices[i].device_info == NULL); + return supported_devices[i].device_info; + } + } + +- WARN(1, "device is not added to supported_devices\n"); +- + return NULL; + } + +@@ -350,21 +114,8 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, + const struct kfd_device_info *device_info = + lookup_device_info(pdev->device); + +- if (!device_info) { +- dev_err(kfd_device, "kgd2kfd_probe failed\n"); ++ if (!device_info) + return NULL; +- } +- +- if (device_info->needs_pci_atomics) { +- /* Allow BIF to recode atomics to PCIe 3.0 AtomicOps. +- */ +- if (pci_enable_atomic_ops_to_root(pdev) < 0) { +- dev_info(kfd_device, +- "skipped device %x:%x, PCI rejects atomics", +- pdev->vendor, pdev->device); +- return NULL; +- } +- } + + kfd = kzalloc(sizeof(*kfd), GFP_KERNEL); + if (!kfd) +@@ -383,7 +134,6 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, + return kfd; + } + +-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) + static bool device_iommu_pasid_init(struct kfd_dev *kfd) + { + const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP | +@@ -402,16 +152,15 @@ static bool device_iommu_pasid_init(struct kfd_dev *kfd) + } + + if ((iommu_info.flags & required_iommu_flags) != required_iommu_flags) { +- dev_err(kfd_device, "error required iommu flags ats %i, pri %i, pasid %i\n", ++ dev_err(kfd_device, "error required iommu flags ats(%i), pri(%i), pasid(%i)\n", + (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP) != 0, + (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) != 0, +- (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) +- != 0); ++ (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) != 0); + return false; + } + + pasid_limit = min_t(unsigned int, +- (unsigned int)(1 << kfd->device_info->max_pasid_bits), ++ (unsigned int)1 << kfd->device_info->max_pasid_bits, + iommu_info.max_pasids); + /* + * last pasid is used for kernel queues doorbells +@@ -421,8 +170,15 @@ static bool device_iommu_pasid_init(struct kfd_dev *kfd) + pasid_limit, + kfd->doorbell_process_limit - 1); + ++ err = amd_iommu_init_device(kfd->pdev, pasid_limit); ++ if (err < 0) { ++ dev_err(kfd_device, "error initializing iommu device\n"); ++ return false; ++ } ++ + if (!kfd_set_pasid_limit(pasid_limit)) { + dev_err(kfd_device, "error setting pasid limit\n"); ++ amd_iommu_free_device(kfd->pdev); + return false; + } + +@@ -434,7 +190,7 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid) + struct kfd_dev *dev = kfd_device_by_pci_dev(pdev); + + if (dev) +- kfd_process_iommu_unbind_callback(dev, pasid); ++ kfd_unbind_process_from_device(dev, pasid); + } + + /* +@@ -455,108 +211,21 @@ static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid, + flags); + + dev = kfd_device_by_pci_dev(pdev); +- if (!WARN_ON(!dev)) +- kfd_signal_iommu_event(dev, pasid, address, ++ BUG_ON(dev == NULL); ++ ++ kfd_signal_iommu_event(dev, pasid, address, + 
flags & PPR_FAULT_WRITE, flags & PPR_FAULT_EXEC); + + return AMD_IOMMU_INV_PRI_RSP_INVALID; + } +-#endif /* CONFIG_AMD_IOMMU_V2 */ +- +-static int kfd_cwsr_init(struct kfd_dev *kfd) +-{ +- /* +- * Initialize the CWSR required memory for TBA and TMA +- */ +- if (cwsr_enable && kfd->device_info->supports_cwsr) { +- const uint32_t *cwsr_hex; +- void *cwsr_addr = NULL; +- unsigned int size; +- +- if (kfd->device_info->asic_family < CHIP_VEGA10) { +- cwsr_hex = cwsr_trap_carrizo_hex; +- size = sizeof(cwsr_trap_carrizo_hex); +- } else { +- cwsr_hex = cwsr_trap_gfx9_hex; +- size = sizeof(cwsr_trap_gfx9_hex); +- } +- +- if (size > PAGE_SIZE) { +- pr_err("Wrong CWSR ISA size.\n"); +- return -EINVAL; +- } +- kfd->cwsr_size = +- ALIGN(size, PAGE_SIZE) + PAGE_SIZE; +- kfd->cwsr_pages = alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, +- get_order(kfd->cwsr_size)); +- if (!kfd->cwsr_pages) { +- pr_err("Failed to allocate CWSR isa memory.\n"); +- return -ENOMEM; +- } +- /*Only first page used for cwsr ISA code */ +- cwsr_addr = kmap(kfd->cwsr_pages); +- memset(cwsr_addr, 0, PAGE_SIZE); +- memcpy(cwsr_addr, cwsr_hex, size); +- kunmap(kfd->cwsr_pages); +- kfd->tma_offset = ALIGN(size, PAGE_SIZE); +- kfd->cwsr_enabled = true; +- dev_info(kfd_device, +- "Reserved %d pages for cwsr.\n", +- (kfd->cwsr_size >> PAGE_SHIFT)); +- } +- +- return 0; +-} +- +-static void kfd_cwsr_fini(struct kfd_dev *kfd) +-{ +- if (kfd->cwsr_pages) +- __free_pages(kfd->cwsr_pages, get_order(kfd->cwsr_size)); +-} +- +-static void kfd_ib_mem_init(struct kfd_dev *kdev) +-{ +- /* In certain cases we need to send IB from kernel using the GPU address +- * space created by user applications. +- * For example, on GFX v7, we need to flush TC associated to the VMID +- * before tearing down the VMID. In order to do so, we need an address +- * valid to the VMID to place the IB while this space was created on +- * the user's side, not the kernel. +- * Since kfd_set_process_dgpu_aperture reserves "cwsr_base + cwsr_size" +- * but CWSR only uses pages above cwsr_base, we'll use one page memory +- * under cwsr_base for IB submissions +- */ +- kdev->ib_size = PAGE_SIZE; +-} + + bool kgd2kfd_device_init(struct kfd_dev *kfd, + const struct kgd2kfd_shared_resources *gpu_resources) + { + unsigned int size; +- unsigned int vmid_bitmap_kfd, vmid_num_kfd; +- +- kfd->mec_fw_version = kfd->kfd2kgd->get_fw_version(kfd->kgd, +- KGD_ENGINE_MEC1); + + kfd->shared_resources = *gpu_resources; + +- vmid_bitmap_kfd = kfd->shared_resources.compute_vmid_bitmap; +- kfd->vm_info.first_vmid_kfd = ffs(vmid_bitmap_kfd) - 1; +- kfd->vm_info.last_vmid_kfd = fls(vmid_bitmap_kfd) - 1; +- vmid_num_kfd = kfd->vm_info.last_vmid_kfd +- - kfd->vm_info.first_vmid_kfd + 1; +- kfd->vm_info.vmid_num_kfd = vmid_num_kfd; +- +- /* Verify module parameters regarding mapped process number*/ +- if ((hws_max_conc_proc < 0) +- || (hws_max_conc_proc > vmid_num_kfd)) { +- dev_err(kfd_device, +- "hws_max_conc_proc %d must be between 0 and %d, use %d instead\n", +- hws_max_conc_proc, vmid_num_kfd, vmid_num_kfd); +- kfd->max_proc_per_quantum = vmid_num_kfd; +- } else +- kfd->max_proc_per_quantum = hws_max_conc_proc; +- + /* calculate max size of mqds needed for queues */ + size = max_num_of_queues_per_device * + kfd->device_info->mqd_size_aligned; +@@ -565,9 +234,9 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, + * calculate max size of runlist packet. 
+ * There can be only 2 packets at once + */ +- size += (KFD_MAX_NUM_OF_PROCESSES * sizeof(struct pm4_mes_map_process) + +- max_num_of_queues_per_device * sizeof(struct pm4_mes_map_queues) +- + sizeof(struct pm4_mes_runlist)) * 2; ++ size += (KFD_MAX_NUM_OF_PROCESSES * sizeof(struct pm4_map_process) + ++ max_num_of_queues_per_device * ++ sizeof(struct pm4_map_queues) + sizeof(struct pm4_runlist)) * 2; + + /* Add size of HIQ & DIQ */ + size += KFD_KERNEL_QUEUE_SIZE * 2; +@@ -578,88 +247,89 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, + if (kfd->kfd2kgd->init_gtt_mem_allocation( + kfd->kgd, size, &kfd->gtt_mem, + &kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr)){ +- dev_err(kfd_device, "Could not allocate %d bytes\n", size); ++ dev_err(kfd_device, ++ "Could not allocate %d bytes for device (%x:%x)\n", ++ size, kfd->pdev->vendor, kfd->pdev->device); + goto out; + } + +- dev_info(kfd_device, "Allocated %d bytes on gart\n", size); ++ dev_info(kfd_device, ++ "Allocated %d bytes on gart for device(%x:%x)\n", ++ size, kfd->pdev->vendor, kfd->pdev->device); + + /* Initialize GTT sa with 512 byte chunk size */ + if (kfd_gtt_sa_init(kfd, size, 512) != 0) { +- dev_err(kfd_device, "Error initializing gtt sub-allocator\n"); ++ dev_err(kfd_device, ++ "Error initializing gtt sub-allocator\n"); + goto kfd_gtt_sa_init_error; + } + +- if (kfd_doorbell_init(kfd)) { +- dev_err(kfd_device, +- "Error initializing doorbell aperture\n"); +- goto kfd_doorbell_error; +- } ++ kfd_doorbell_init(kfd); + +- if (kfd_topology_add_device(kfd)) { +- dev_err(kfd_device, "Error adding device to topology\n"); ++ if (kfd_topology_add_device(kfd) != 0) { ++ dev_err(kfd_device, ++ "Error adding device (%x:%x) to topology\n", ++ kfd->pdev->vendor, kfd->pdev->device); + goto kfd_topology_add_device_error; + } + + if (kfd_interrupt_init(kfd)) { +- dev_err(kfd_device, "Error initializing interrupts\n"); ++ dev_err(kfd_device, ++ "Error initializing interrupts for device (%x:%x)\n", ++ kfd->pdev->vendor, kfd->pdev->device); + goto kfd_interrupt_error; + } + ++ if (!device_iommu_pasid_init(kfd)) { ++ dev_err(kfd_device, ++ "Error initializing iommuv2 for device (%x:%x)\n", ++ kfd->pdev->vendor, kfd->pdev->device); ++ goto device_iommu_pasid_error; ++ } ++ amd_iommu_set_invalidate_ctx_cb(kfd->pdev, ++ iommu_pasid_shutdown_callback); ++ amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb); ++ + kfd->dqm = device_queue_manager_init(kfd); + if (!kfd->dqm) { +- dev_err(kfd_device, "Error initializing queue manager\n"); ++ dev_err(kfd_device, ++ "Error initializing queue manager for device (%x:%x)\n", ++ kfd->pdev->vendor, kfd->pdev->device); + goto device_queue_manager_error; + } + +-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) +- if (kfd->device_info->is_need_iommu_device) { +- if (!device_iommu_pasid_init(kfd)) { +- dev_err(kfd_device, "Error initializing iommuv2\n"); +- goto device_iommu_pasid_error; +- } +- } +-#endif +- +- if (kfd_cwsr_init(kfd)) { +- dev_err(kfd_device, "Error initializing cwsr\n"); +- goto device_iommu_pasid_error; +- } +- +- kfd_ib_mem_init(kfd); +- +- if (kfd_resume(kfd)) { +- dev_err(kfd_device, "Error resuming kfd\n"); +- goto kfd_resume_error; ++ if (kfd->dqm->ops.start(kfd->dqm) != 0) { ++ dev_err(kfd_device, ++ "Error starting queuen manager for device (%x:%x)\n", ++ kfd->pdev->vendor, kfd->pdev->device); ++ goto dqm_start_error; + } + + kfd->dbgmgr = NULL; + + kfd->init_complete = true; +- dev_info(kfd_device, "added device %x:%x\n", kfd->pdev->vendor, ++ 
dev_info(kfd_device, "added device (%x:%x)\n", kfd->pdev->vendor, + kfd->pdev->device); + +- pr_debug("Starting kfd with the following scheduling policy %d\n", +- kfd->dqm->sched_policy); ++ pr_debug("kfd: Starting kfd with the following scheduling policy %d\n", ++ sched_policy); + + goto out; + +-kfd_resume_error: +- kfd_cwsr_fini(kfd); +-device_iommu_pasid_error: ++dqm_start_error: + device_queue_manager_uninit(kfd->dqm); + device_queue_manager_error: ++ amd_iommu_free_device(kfd->pdev); ++device_iommu_pasid_error: + kfd_interrupt_exit(kfd); + kfd_interrupt_error: + kfd_topology_remove_device(kfd); + kfd_topology_add_device_error: +- kfd_doorbell_fini(kfd); +-kfd_doorbell_error: + kfd_gtt_sa_fini(kfd); + kfd_gtt_sa_init_error: + kfd->kfd2kgd->free_gtt_mem(kfd->kgd, kfd->gtt_mem); + dev_err(kfd_device, +- "device %x:%x NOT added due to errors\n", ++ "device (%x:%x) NOT added due to errors\n", + kfd->pdev->vendor, kfd->pdev->device); + out: + return kfd->init_complete; +@@ -668,12 +338,10 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, + void kgd2kfd_device_exit(struct kfd_dev *kfd) + { + if (kfd->init_complete) { +- kgd2kfd_suspend(kfd); +- kfd_cwsr_fini(kfd); + device_queue_manager_uninit(kfd->dqm); ++ amd_iommu_free_device(kfd->pdev); + kfd_interrupt_exit(kfd); + kfd_topology_remove_device(kfd); +- kfd_doorbell_fini(kfd); + kfd_gtt_sa_fini(kfd); + kfd->kfd2kgd->free_gtt_mem(kfd->kgd, kfd->gtt_mem); + } +@@ -683,419 +351,77 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd) + + void kgd2kfd_suspend(struct kfd_dev *kfd) + { +- if (!kfd->init_complete) +- return; +- +- /* For first KFD device suspend all the KFD processes */ +- if (atomic_inc_return(&kfd_device_suspended) == 1) +- kfd_suspend_all_processes(); +- +- kfd->dqm->ops.stop(kfd->dqm); +- +-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) +- if (!kfd->device_info->is_need_iommu_device) +- return; ++ BUG_ON(kfd == NULL); + +- kfd_unbind_processes_from_device(kfd); +- +- amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); +- amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); +- amd_iommu_free_device(kfd->pdev); +-#endif ++ if (kfd->init_complete) { ++ kfd->dqm->ops.stop(kfd->dqm); ++ amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); ++ amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); ++ amd_iommu_free_device(kfd->pdev); ++ } + } + + int kgd2kfd_resume(struct kfd_dev *kfd) + { +- int ret; +- +- if (!kfd->init_complete) +- return 0; +- +- ret = kfd_resume(kfd); +- if (ret) +- return ret; +- +- if (atomic_dec_return(&kfd_device_suspended) == 0) +- ret = kfd_resume_all_processes(); +- WARN(atomic_read(&kfd_device_suspended) < 0, +- "KFD suspend / resume ref. 
error\n"); +- return ret; +-} ++ unsigned int pasid_limit; ++ int err; + +-static int kfd_resume(struct kfd_dev *kfd) +-{ +- int err = 0; ++ BUG_ON(kfd == NULL); + +-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) +- if (kfd->device_info->is_need_iommu_device) { +- unsigned int pasid_limit = kfd_get_pasid_limit(); ++ pasid_limit = kfd_get_pasid_limit(); + ++ if (kfd->init_complete) { + err = amd_iommu_init_device(kfd->pdev, pasid_limit); +- if (err) { +- dev_err(kfd_device, "failed to initialize iommu\n"); ++ if (err < 0) + return -ENXIO; +- } +- + amd_iommu_set_invalidate_ctx_cb(kfd->pdev, +- iommu_pasid_shutdown_callback); +- amd_iommu_set_invalid_ppr_cb(kfd->pdev, +- iommu_invalid_ppr_cb); +- +- err = kfd_bind_processes_to_device(kfd); +- if (err) { +- dev_err(kfd_device, +- "failed to bind process to device\n"); +- return -ENXIO; +- } +- } +-#endif +- +- err = kfd->dqm->ops.start(kfd->dqm); +- if (err) { +- dev_err(kfd_device, +- "Error starting queue manager for device %x:%x\n", +- kfd->pdev->vendor, kfd->pdev->device); +- goto dqm_start_error; ++ iommu_pasid_shutdown_callback); ++ amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb); ++ kfd->dqm->ops.start(kfd->dqm); + } + +- kfd->kfd2kgd->write_config_static_mem(kfd->kgd, true, 1, 3, 0); +- +- return err; +- +-dqm_start_error: +-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) +- if (kfd->device_info->is_need_iommu_device) +- amd_iommu_free_device(kfd->pdev); +-#endif +- +- return err; ++ return 0; + } + + /* This is called directly from KGD at ISR. */ + void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) + { +- uint32_t patched_ihre[DIV_ROUND_UP( +- kfd->device_info->ih_ring_entry_size, +- sizeof(uint32_t))]; +- bool is_patched = false; +- + if (!kfd->init_complete) + return; + + spin_lock(&kfd->interrupt_lock); + +- if (kfd->interrupts_active && interrupt_is_wanted(kfd, ih_ring_entry, +- patched_ihre, &is_patched) +- && enqueue_ih_ring_entry(kfd, +- is_patched ? patched_ihre : ih_ring_entry)) +- queue_work(kfd->ih_wq, &kfd->interrupt_work); ++ if (kfd->interrupts_active ++ && interrupt_is_wanted(kfd, ih_ring_entry) ++ && enqueue_ih_ring_entry(kfd, ih_ring_entry)) ++ schedule_work(&kfd->interrupt_work); + + spin_unlock(&kfd->interrupt_lock); + } + +-/* quiesce_process_mm - +- * Quiesce all user queues that belongs to given process p +- */ +-int quiesce_process_mm(struct kfd_process *p) +-{ +- struct kfd_process_device *pdd; +- int r = 0; +- unsigned int n_evicted = 0; +- +- list_for_each_entry(pdd, &p->per_device_data, per_device_list) { +- r = process_evict_queues(pdd->dev->dqm, &pdd->qpd); +- if (r != 0) { +- pr_err("Failed to evict process queues\n"); +- goto fail; +- } +- n_evicted++; +- } +- +- return r; +- +-fail: +- /* To keep state consistent, roll back partial eviction by +- * restoring queues +- */ +- list_for_each_entry(pdd, &p->per_device_data, per_device_list) { +- if (n_evicted == 0) +- break; +- if (process_restore_queues(pdd->dev->dqm, &pdd->qpd)) +- pr_err("Failed to restore queues\n"); +- +- n_evicted--; +- } +- +- return r; +-} +- +-/* resume_process_mm - +- * Resume all user queues that belongs to given process p. The caller must +- * ensure that process p context is valid. 
+- */ +-static int resume_process_mm(struct kfd_process *p) +-{ +- struct kfd_process_device *pdd; +- struct mm_struct *mm = (struct mm_struct *)p->mm; +- int r, ret = 0; +- +- list_for_each_entry(pdd, &p->per_device_data, per_device_list) { +- if (pdd->dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) +- down_read(&mm->mmap_sem); +- +- r = process_restore_queues(pdd->dev->dqm, &pdd->qpd); +- if (r != 0) { +- pr_err("Failed to restore process queues\n"); +- if (ret == 0) +- ret = r; +- } +- +- if (pdd->dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) +- up_read(&mm->mmap_sem); +- } +- +- return ret; +-} +- +-int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm) +-{ +- struct kfd_process *p; +- struct kfd_process_device *pdd; +- int r; +- +- /* Because we are called from arbitrary context (workqueue) as opposed +- * to process context, kfd_process could attempt to exit while we are +- * running so the lookup function increments the process ref count. +- */ +- p = kfd_lookup_process_by_mm(mm); +- if (!p) +- return -ENODEV; +- +- if (kfd) { +- r = -ENODEV; +- pdd = kfd_get_process_device_data(kfd, p); +- if (pdd) +- r = process_evict_queues(kfd->dqm, &pdd->qpd); +- } else { +- r = quiesce_process_mm(p); +- } +- +- kfd_unref_process(p); +- return r; +-} +- +-int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm) +-{ +- struct kfd_process *p; +- struct kfd_process_device *pdd; +- int r; +- +- /* Because we are called from arbitrary context (workqueue) as opposed +- * to process context, kfd_process could attempt to exit while we are +- * running so the lookup function increments the process ref count. +- */ +- p = kfd_lookup_process_by_mm(mm); +- if (!p) +- return -ENODEV; +- +- if (kfd) { +- r = -ENODEV; +- pdd = kfd_get_process_device_data(kfd, p); +- if (pdd) +- r = process_restore_queues(kfd->dqm, &pdd->qpd); +- } else { +- r = resume_process_mm(p); +- } +- +- kfd_unref_process(p); +- return r; +-} +- +- +-void kfd_restore_bo_worker(struct work_struct *work) +-{ +- struct delayed_work *dwork; +- struct kfd_process *p; +- struct kfd_process_device *pdd; +- int ret = 0; +- +- dwork = to_delayed_work(work); +- +- /* Process termination destroys this worker thread. So during the +- * lifetime of this thread, kfd_process p will be valid +- */ +- p = container_of(dwork, struct kfd_process, restore_work); +- +- /* Call restore_process_bos on the first KGD device. This function +- * takes care of restoring the whole process including other devices. +- * Restore can fail if enough memory is not available. If so, +- * reschedule again. +- */ +- pdd = list_first_entry(&p->per_device_data, +- struct kfd_process_device, +- per_device_list); +- +- pr_info("Started restoring process of pasid %d\n", p->pasid); +- +- /* Setting last_restore_timestamp before successful restoration. +- * Otherwise this would have to be set by KGD (restore_process_bos) +- * before KFD BOs are unreserved. If not, the process can be evicted +- * again before the timestamp is set. +- * If restore fails, the timestamp will be set again in the next +- * attempt. 
This would mean that the minimum GPU quanta would be +- * PROCESS_ACTIVE_TIME_MS - (time to execute the following two +- * functions) +- */ +- +- p->last_restore_timestamp = get_jiffies_64(); +- ret = pdd->dev->kfd2kgd->restore_process_bos(p->process_info, &p->ef); +- if (ret) { +- pr_info("Restore failed, try again after %d ms\n", +- PROCESS_BACK_OFF_TIME_MS); +- ret = schedule_delayed_work(&p->restore_work, +- msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS)); +- WARN(!ret, "reschedule restore work failed\n"); +- return; +- } +- +- ret = resume_process_mm(p); +- if (ret) +- pr_err("Failed to resume user queues\n"); +- +- pr_info("Finished restoring process of pasid %d\n", p->pasid); +-} +- +-/** kgd2kfd_schedule_evict_and_restore_process - Schedules work queue that will +- * prepare for safe eviction of KFD BOs that belong to the specified +- * process. +- * +- * @mm: mm_struct that identifies the specified KFD process +- * @fence: eviction fence attached to KFD process BOs +- * +- */ +-int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, +- struct dma_fence *fence) +-{ +- struct kfd_process *p; +- unsigned long active_time; +- unsigned long delay_jiffies = msecs_to_jiffies(PROCESS_ACTIVE_TIME_MS); +- +- if (!fence) +- return -EINVAL; +- +- if (dma_fence_is_signaled(fence)) +- return 0; +- +- p = kfd_lookup_process_by_mm(mm); +- if (!p) +- return -ENODEV; +- +- if (delayed_work_pending(&p->eviction_work.dwork)) { +- /* It is possible has TTM has lined up couple of BOs of the same +- * process to be evicted. Check if the fence is same which +- * indicates that previous work item scheduled is not completed +- */ +- if (p->eviction_work.quiesce_fence == fence) +- goto out; +- else { +- WARN(1, "Starting new evict with previous evict is not completed\n"); +- if (cancel_delayed_work_sync(&p->eviction_work.dwork)) +- dma_fence_put(p->eviction_work.quiesce_fence); +- } +- } +- +- p->eviction_work.quiesce_fence = dma_fence_get(fence); +- +- /* Avoid KFD process starvation. Wait for at least +- * PROCESS_ACTIVE_TIME_MS before evicting the process again +- */ +- active_time = get_jiffies_64() - p->last_restore_timestamp; +- if (delay_jiffies > active_time) +- delay_jiffies -= active_time; +- else +- delay_jiffies = 0; +- +- /* During process initialization eviction_work.dwork is initialized +- * to kfd_evict_bo_worker +- */ +- schedule_delayed_work(&p->eviction_work.dwork, delay_jiffies); +-out: +- kfd_unref_process(p); +- return 0; +-} +- +-void kfd_evict_bo_worker(struct work_struct *work) +-{ +- int ret; +- struct kfd_process *p; +- struct kfd_eviction_work *eviction_work; +- struct delayed_work *dwork; +- +- dwork = to_delayed_work(work); +- eviction_work = container_of(dwork, struct kfd_eviction_work, +- dwork); +- +- /* Process termination destroys this worker thread. So during the +- * lifetime of this thread, kfd_process p will be valid +- */ +- p = container_of(eviction_work, struct kfd_process, eviction_work); +- +- /* Narrow window of overlap between restore and evict work item is +- * possible. Once amdgpu_amdkfd_gpuvm_restore_process_bos unreserves +- * KFD BOs, it is possible to evicted again. But restore has few more +- * steps of finish. 
So lets wait for the restore work to complete +- */ +- if (delayed_work_pending(&p->restore_work)) +- flush_delayed_work(&p->restore_work); +- +- pr_info("Started evicting process of pasid %d\n", p->pasid); +- ret = quiesce_process_mm(p); +- if (!ret) { +- dma_fence_signal(eviction_work->quiesce_fence); +- WARN_ONCE(eviction_work->quiesce_fence != p->ef, +- "Eviction fence mismatch\n"); +- dma_fence_put(p->ef); +- /* TODO: quiesce_fence is same as kfd_process->ef. But +- * quiesce_fence is also used to avoid starting multiple +- * eviction work items. This might not be necessary and +- * one of the variables could be removed +- */ +- p->ef = NULL; +- schedule_delayed_work(&p->restore_work, +- msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)); +- } else +- pr_err("Failed to quiesce user queues. Cannot evict BOs\n"); +- +- dma_fence_put(eviction_work->quiesce_fence); +- +- pr_info("Finished evicting process of pasid %d\n", p->pasid); +- +-} +- + static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, + unsigned int chunk_size) + { +- unsigned int num_of_longs; ++ unsigned int num_of_bits; + +- if (WARN_ON(buf_size < chunk_size)) +- return -EINVAL; +- if (WARN_ON(buf_size == 0)) +- return -EINVAL; +- if (WARN_ON(chunk_size == 0)) +- return -EINVAL; ++ BUG_ON(!kfd); ++ BUG_ON(!kfd->gtt_mem); ++ BUG_ON(buf_size < chunk_size); ++ BUG_ON(buf_size == 0); ++ BUG_ON(chunk_size == 0); + + kfd->gtt_sa_chunk_size = chunk_size; + kfd->gtt_sa_num_of_chunks = buf_size / chunk_size; + +- num_of_longs = (kfd->gtt_sa_num_of_chunks + BITS_PER_LONG - 1) / +- BITS_PER_LONG; ++ num_of_bits = kfd->gtt_sa_num_of_chunks / BITS_PER_BYTE; ++ BUG_ON(num_of_bits == 0); + +- kfd->gtt_sa_bitmap = kcalloc(num_of_longs, sizeof(long), GFP_KERNEL); ++ kfd->gtt_sa_bitmap = kzalloc(num_of_bits, GFP_KERNEL); + + if (!kfd->gtt_sa_bitmap) + return -ENOMEM; + +- pr_debug("gtt_sa_num_of_chunks = %d, gtt_sa_bitmap = %p\n", ++ pr_debug("kfd: gtt_sa_num_of_chunks = %d, gtt_sa_bitmap = %p\n", + kfd->gtt_sa_num_of_chunks, kfd->gtt_sa_bitmap); + + mutex_init(&kfd->gtt_sa_lock); +@@ -1129,17 +455,19 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, + { + unsigned int found, start_search, cur_size; + ++ BUG_ON(!kfd); ++ + if (size == 0) + return -EINVAL; + + if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size) + return -ENOMEM; + +- *mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO); +- if (!(*mem_obj)) ++ *mem_obj = kmalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); ++ if ((*mem_obj) == NULL) + return -ENOMEM; + +- pr_debug("Allocated mem_obj = %p for size = %d\n", *mem_obj, size); ++ pr_debug("kfd: allocated mem_obj = %p for size = %d\n", *mem_obj, size); + + start_search = 0; + +@@ -1151,7 +479,7 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, + kfd->gtt_sa_num_of_chunks, + start_search); + +- pr_debug("Found = %d\n", found); ++ pr_debug("kfd: found = %d\n", found); + + /* If there wasn't any free chunk, bail out */ + if (found == kfd->gtt_sa_num_of_chunks) +@@ -1169,12 +497,12 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, + found, + kfd->gtt_sa_chunk_size); + +- pr_debug("gpu_addr = %p, cpu_addr = %p\n", ++ pr_debug("kfd: gpu_addr = %p, cpu_addr = %p\n", + (uint64_t *) (*mem_obj)->gpu_addr, (*mem_obj)->cpu_ptr); + + /* If we need only one chunk, mark it as allocated and get out */ + if (size <= kfd->gtt_sa_chunk_size) { +- pr_debug("Single bit\n"); ++ pr_debug("kfd: single bit\n"); + set_bit(found, kfd->gtt_sa_bitmap); + goto kfd_gtt_out; + } +@@ -1209,7 
+537,7 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, + + } while (cur_size > 0); + +- pr_debug("range_start = %d, range_end = %d\n", ++ pr_debug("kfd: range_start = %d, range_end = %d\n", + (*mem_obj)->range_start, (*mem_obj)->range_end); + + /* Mark the chunks as allocated */ +@@ -1223,7 +551,7 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, + return 0; + + kfd_gtt_no_free_chunk: +- pr_debug("Allocation failed with mem_obj = %p\n", mem_obj); ++ pr_debug("kfd: allocation failed with mem_obj = %p\n", mem_obj); + mutex_unlock(&kfd->gtt_sa_lock); + kfree(mem_obj); + return -ENOMEM; +@@ -1233,11 +561,13 @@ int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj) + { + unsigned int bit; + ++ BUG_ON(!kfd); ++ + /* Act like kfree when trying to free a NULL object */ + if (!mem_obj) + return 0; + +- pr_debug("Free mem_obj = %p, range_start = %d, range_end = %d\n", ++ pr_debug("kfd: free mem_obj = %p, range_start = %d, range_end = %d\n", + mem_obj, mem_obj->range_start, mem_obj->range_end); + + mutex_lock(&kfd->gtt_sa_lock); +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +index 1abbaa0..42de22b 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +@@ -44,13 +44,9 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd); + +-static int execute_queues_cpsch(struct device_queue_manager *dqm, +- bool static_queues_included); +-static int unmap_queues_cpsch(struct device_queue_manager *dqm, +- enum kfd_unmap_queues_filter filter, +- uint32_t filter_param); +- +-static int map_queues_cpsch(struct device_queue_manager *dqm); ++static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock); ++static int destroy_queues_cpsch(struct device_queue_manager *dqm, ++ bool preempt_static_queues, bool lock); + + static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, +@@ -83,17 +79,20 @@ static bool is_pipe_enabled(struct device_queue_manager *dqm, int mec, int pipe) + + unsigned int get_queues_num(struct device_queue_manager *dqm) + { ++ BUG_ON(!dqm || !dqm->dev); + return bitmap_weight(dqm->dev->shared_resources.queue_bitmap, + KGD_MAX_QUEUES); + } + + unsigned int get_queues_per_pipe(struct device_queue_manager *dqm) + { ++ BUG_ON(!dqm || !dqm->dev); + return dqm->dev->shared_resources.num_queue_per_pipe; + } + + unsigned int get_pipes_per_mec(struct device_queue_manager *dqm) + { ++ BUG_ON(!dqm || !dqm->dev); + return dqm->dev->shared_resources.num_pipe_per_mec; + } + +@@ -108,57 +107,6 @@ void program_sh_mem_settings(struct device_queue_manager *dqm, + qpd->sh_mem_bases); + } + +-static int allocate_doorbell(struct qcm_process_device *qpd, struct queue *q) +-{ +- struct kfd_dev *dev = qpd->dqm->dev; +- +- if (!KFD_IS_SOC15(dev->device_info->asic_family)) { +- /* On pre-SOC15 chips we need to use the queue ID to +- * preserve the user mode ABI. +- */ +- q->doorbell_id = q->properties.queue_id; +- } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { +- /* For SDMA queues on SOC15, use static doorbell +- * assignments based on the engine and queue. 
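
kfd_gtt_sa_allocate(), in the hunk above, is a first-fit scan over the chunk bitmap: find a free chunk with find_next_zero_bit(), verify that enough consecutive free chunks follow to cover the request, and otherwise restart the search just past the bit that broke the run. The core search extracted as a sketch (find_chunk_run is a hypothetical helper):

/* First-fit search for a run of free chunks, as in kfd_gtt_sa_allocate(). */
static int find_chunk_run(unsigned long *bitmap, unsigned int num_chunks,
			  unsigned int chunks_needed)
{
	unsigned int start = 0;

	for (;;) {
		unsigned int found, cur;

		found = find_next_zero_bit(bitmap, num_chunks, start);
		if (found == num_chunks)
			return -1;	/* no free chunk left at all */

		/* need chunks_needed - 1 more free bits right after it */
		for (cur = 1; cur < chunks_needed; cur++)
			if (found + cur == num_chunks ||
			    test_bit(found + cur, bitmap))
				break;

		if (cur == chunks_needed)
			return found;	/* run is long enough */

		start = found + cur;	/* retry past the conflict */
	}
}
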
+- */ +- q->doorbell_id = dev->shared_resources.sdma_doorbell +- [q->properties.sdma_engine_id] +- [q->properties.sdma_queue_id]; +- } else { +- /* For CP queues on SOC15 reserve a free doorbell ID */ +- unsigned int found; +- +- found = find_first_zero_bit(qpd->doorbell_bitmap, +- KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); +- if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) { +- pr_debug("No doorbells available"); +- return -EBUSY; +- } +- set_bit(found, qpd->doorbell_bitmap); +- q->doorbell_id = found; +- } +- +- q->properties.doorbell_off = +- kfd_doorbell_id_to_offset(dev, q->process, +- q->doorbell_id); +- +- return 0; +-} +- +-static void deallocate_doorbell(struct qcm_process_device *qpd, +- struct queue *q) +-{ +- unsigned int old; +- struct kfd_dev *dev = qpd->dqm->dev; +- +- if (!KFD_IS_SOC15(dev->device_info->asic_family) || +- q->properties.type == KFD_QUEUE_TYPE_SDMA) +- return; +- +- old = test_and_clear_bit(q->doorbell_id, qpd->doorbell_bitmap); +- WARN_ON(!old); +-} +- + static int allocate_vmid(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) +@@ -168,59 +116,31 @@ static int allocate_vmid(struct device_queue_manager *dqm, + if (dqm->vmid_bitmap == 0) + return -ENOMEM; + +- bit = ffs(dqm->vmid_bitmap) - 1; +- dqm->vmid_bitmap &= ~(1 << bit); ++ bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap, CIK_VMID_NUM); ++ clear_bit(bit, (unsigned long *)&dqm->vmid_bitmap); + +- allocated_vmid = bit + dqm->dev->vm_info.first_vmid_kfd; +- pr_debug("vmid allocation %d\n", allocated_vmid); ++ /* Kaveri kfd vmid's starts from vmid 8 */ ++ allocated_vmid = bit + KFD_VMID_START_OFFSET; ++ pr_debug("kfd: vmid allocation %d\n", allocated_vmid); + qpd->vmid = allocated_vmid; + q->properties.vmid = allocated_vmid; + + set_pasid_vmid_mapping(dqm, q->process->pasid, q->properties.vmid); + program_sh_mem_settings(dqm, qpd); + +- /* qpd->page_table_base is set earlier when register_process() +- * is called, i.e. when the first queue is created. 
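
Both sides of the allocate_vmid() hunk above implement the same word-sized bitmap allocator; the revert only trades ffs() plus mask arithmetic for find_first_bit()/clear_bit() on a casted pointer. The shared idea, in either spelling (alloc_id/free_id are hypothetical names):

/* Lowest-free-bit allocator over a single-word bitmap. */
static int alloc_id(unsigned int *bitmap)
{
	int bit;

	if (*bitmap == 0)
		return -ENOMEM;		/* all ids in use */

	bit = ffs(*bitmap) - 1;		/* lowest set bit = lowest free id */
	*bitmap &= ~(1u << bit);
	return bit;
}

static void free_id(unsigned int *bitmap, int bit)
{
	*bitmap |= 1u << bit;
}

The allocated bit is then offset by the first VMID reserved for KFD (KFD_VMID_START_OFFSET in the reverted code, dqm->dev->vm_info.first_vmid_kfd in the newer code) to produce the hardware VMID.
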
+- */ +- dqm->dev->kfd2kgd->set_vm_context_page_table_base(dqm->dev->kgd, +- qpd->vmid, +- qpd->page_table_base); +- /*invalidate the VM context after pasid and vmid mapping is set up*/ +- kfd_flush_tlb(dqm->dev, qpd->pqm->process->pasid); +- + return 0; + } + +-static int flush_texture_cache_nocpsch(struct kfd_dev *kdev, +- struct qcm_process_device *qpd) +-{ +- uint32_t len; +- +- if (!qpd->ib_kaddr) +- return -ENOMEM; +- +- len = qpd->dqm->packets.pmf->release_mem(qpd->ib_base, +- (uint32_t *)qpd->ib_kaddr); +- +- return kdev->kfd2kgd->submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid, +- qpd->ib_base, (uint32_t *)qpd->ib_kaddr, len); +-} +- + static void deallocate_vmid(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) + { +- int bit = qpd->vmid - dqm->dev->vm_info.first_vmid_kfd; +- +- /* On GFX v7, CP doesn't flush TC at dequeue */ +- if (q->device->device_info->asic_family == CHIP_HAWAII) +- if (flush_texture_cache_nocpsch(q->device, qpd)) +- pr_err("Failed to flush TC\n"); ++ int bit = qpd->vmid - KFD_VMID_START_OFFSET; + + /* Release the vmid mapping */ + set_pasid_vmid_mapping(dqm, 0, qpd->vmid); + +- dqm->vmid_bitmap |= (1 << bit); ++ set_bit(bit, (unsigned long *)&dqm->vmid_bitmap); + qpd->vmid = 0; + q->properties.vmid = 0; + } +@@ -230,53 +150,47 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + int *allocated_vmid) + { +- int retval = 0; ++ int retval; ++ ++ BUG_ON(!dqm || !q || !qpd || !allocated_vmid); + ++ pr_debug("kfd: In func %s\n", __func__); + print_queue(q); + + mutex_lock(&dqm->lock); + + if (dqm->total_queue_count >= max_num_of_queues_per_device) { +- pr_warn("Can't create new usermode queue because %d queues were already created\n", ++ pr_warn("amdkfd: Can't create new usermode queue because %d queues were already created\n", + dqm->total_queue_count); +- retval = -EPERM; +- goto out_unlock; ++ mutex_unlock(&dqm->lock); ++ return -EPERM; + } + + if (list_empty(&qpd->queues_list)) { + retval = allocate_vmid(dqm, qpd, q); +- if (retval) +- goto out_unlock; ++ if (retval != 0) { ++ mutex_unlock(&dqm->lock); ++ return retval; ++ } + } + *allocated_vmid = qpd->vmid; + q->properties.vmid = qpd->vmid; +- /* +- * Eviction state logic: we only mark active queues as evicted +- * to avoid the overhead of restoring inactive queues later +- */ +- if (qpd->evicted) +- q->properties.is_evicted = (q->properties.queue_size > 0 && +- q->properties.queue_percent > 0 && +- q->properties.queue_address != 0); +- +- q->properties.tba_addr = qpd->tba_addr; +- q->properties.tma_addr = qpd->tma_addr; + + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) + retval = create_compute_queue_nocpsch(dqm, q, qpd); +- else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) ++ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + retval = create_sdma_queue_nocpsch(dqm, q, qpd); + +- if (retval) { ++ if (retval != 0) { + if (list_empty(&qpd->queues_list)) { + deallocate_vmid(dqm, qpd, q); + *allocated_vmid = 0; + } +- goto out_unlock; ++ mutex_unlock(&dqm->lock); ++ return retval; + } + + list_add(&q->list, &qpd->queues_list); +- qpd->queue_count++; + if (q->properties.is_active) + dqm->queue_count++; + +@@ -291,9 +205,8 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, + pr_debug("Total of %d queues are accountable so far\n", + dqm->total_queue_count); + +-out_unlock: + mutex_unlock(&dqm->lock); +- return retval; ++ return 0; + } + + static int allocate_hqd(struct device_queue_manager *dqm, struct 
queue *q) +@@ -303,16 +216,19 @@ static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q) + + set = false; + +- for (pipe = dqm->next_pipe_to_allocate, i = 0; +- i < get_pipes_per_mec(dqm); ++ for (pipe = dqm->next_pipe_to_allocate, i = 0; i < get_pipes_per_mec(dqm); + pipe = ((pipe + 1) % get_pipes_per_mec(dqm)), ++i) { + + if (!is_pipe_enabled(dqm, 0, pipe)) + continue; + + if (dqm->allocated_queues[pipe] != 0) { +- bit = ffs(dqm->allocated_queues[pipe]) - 1; +- dqm->allocated_queues[pipe] &= ~(1 << bit); ++ bit = find_first_bit( ++ (unsigned long *)&dqm->allocated_queues[pipe], ++ get_queues_per_pipe(dqm)); ++ ++ clear_bit(bit, ++ (unsigned long *)&dqm->allocated_queues[pipe]); + q->pipe = pipe; + q->queue = bit; + set = true; +@@ -323,7 +239,8 @@ static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q) + if (!set) + return -EBUSY; + +- pr_debug("hqd slot - pipe %d, queue %d\n", q->pipe, q->queue); ++ pr_debug("kfd: DQM %s hqd slot - pipe (%d) queue(%d)\n", ++ __func__, q->pipe, q->queue); + /* horizontal hqd allocation */ + dqm->next_pipe_to_allocate = (pipe + 1) % get_pipes_per_mec(dqm); + +@@ -333,7 +250,7 @@ static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q) + static inline void deallocate_hqd(struct device_queue_manager *dqm, + struct queue *q) + { +- dqm->allocated_queues[q->pipe] |= (1 << q->queue); ++ set_bit(q->queue, (unsigned long *)&dqm->allocated_queues[q->pipe]); + } + + static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, +@@ -343,203 +260,138 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, + int retval; + struct mqd_manager *mqd; + ++ BUG_ON(!dqm || !q || !qpd); ++ + mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); +- if (!mqd) ++ if (mqd == NULL) + return -ENOMEM; + + retval = allocate_hqd(dqm, q); +- if (retval) ++ if (retval != 0) + return retval; + +- retval = allocate_doorbell(qpd, q); +- if (retval) +- goto out_deallocate_hqd; +- + retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); +- if (retval) +- goto out_deallocate_doorbell; +- +- pr_debug("Loading mqd to hqd on pipe %d, queue %d\n", +- q->pipe, q->queue); +- +- dqm->dev->kfd2kgd->alloc_memory_of_scratch( +- dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid); ++ if (retval != 0) { ++ deallocate_hqd(dqm, q); ++ return retval; ++ } + +- if (!q->properties.is_active) +- return 0; ++ pr_debug("kfd: loading mqd to hqd on pipe (%d) queue (%d)\n", ++ q->pipe, ++ q->queue); + +- retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue, &q->properties, +- q->process->mm); +- if (retval) +- goto out_uninit_mqd; ++ retval = mqd->load_mqd(mqd, q->mqd, q->pipe, ++ q->queue, (uint32_t __user *) q->properties.write_ptr); ++ if (retval != 0) { ++ deallocate_hqd(dqm, q); ++ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); ++ return retval; ++ } + + return 0; +- +-out_uninit_mqd: +- mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); +-out_deallocate_doorbell: +- deallocate_doorbell(qpd, q); +-out_deallocate_hqd: +- deallocate_hqd(dqm, q); +- +- return retval; + } + +-/* Access to DQM has to be locked before calling destroy_queue_nocpsch_locked +- * to avoid asynchronized access +- */ +-static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, ++static int destroy_queue_nocpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) + { + int retval; + struct mqd_manager *mqd; + +- mqd = dqm->ops.get_mqd_manager(dqm, +- 
get_mqd_type_from_queue_type(q->properties.type)); +- if (!mqd) +- return -ENOMEM; ++ BUG_ON(!dqm || !q || !q->mqd || !qpd); ++ ++ retval = 0; + +- deallocate_doorbell(qpd, q); ++ pr_debug("kfd: In Func %s\n", __func__); + +- if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) ++ mutex_lock(&dqm->lock); ++ ++ if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) { ++ mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); ++ if (mqd == NULL) { ++ retval = -ENOMEM; ++ goto out; ++ } + deallocate_hqd(dqm, q); +- else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { ++ } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { ++ mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_SDMA); ++ if (mqd == NULL) { ++ retval = -ENOMEM; ++ goto out; ++ } + dqm->sdma_queue_count--; + deallocate_sdma_queue(dqm, q->sdma_id); + } else { +- pr_debug("q->properties.type %d is invalid\n", ++ pr_debug("q->properties.type is invalid (%d)\n", + q->properties.type); + retval = -EINVAL; ++ goto out; + } +- dqm->total_queue_count--; + + retval = mqd->destroy_mqd(mqd, q->mqd, + KFD_PREEMPT_TYPE_WAVEFRONT_RESET, +- KFD_HIQ_TIMEOUT, ++ QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS, + q->pipe, q->queue); +- if (retval == -ETIME) +- qpd->reset_wavefronts = true; ++ ++ if (retval != 0) ++ goto out; + + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); + + list_del(&q->list); +- if (list_empty(&qpd->queues_list)) { +- if (qpd->reset_wavefronts) { +- pr_warn("Resetting wave fronts (nocpsch) on dev %p\n", +- dqm->dev); +- /* dbgdev_wave_reset_wavefronts has to be called before +- * deallocate_vmid(), i.e. when vmid is still in use. +- */ +- dbgdev_wave_reset_wavefronts(dqm->dev, +- qpd->pqm->process); +- qpd->reset_wavefronts = false; +- } +- ++ if (list_empty(&qpd->queues_list)) + deallocate_vmid(dqm, qpd, q); +- } +- qpd->queue_count--; + if (q->properties.is_active) + dqm->queue_count--; + +- return retval; +-} +- +-static int destroy_queue_nocpsch(struct device_queue_manager *dqm, +- struct qcm_process_device *qpd, +- struct queue *q) +-{ +- int retval; ++ /* ++ * Unconditionally decrement this counter, regardless of the queue's ++ * type ++ */ ++ dqm->total_queue_count--; ++ pr_debug("Total of %d queues are accountable so far\n", ++ dqm->total_queue_count); + +- mutex_lock(&dqm->lock); +- retval = destroy_queue_nocpsch_locked(dqm, qpd, q); ++out: + mutex_unlock(&dqm->lock); +- + return retval; + } + +-static bool is_queue_nocpsch(struct device_queue_manager *dqm, struct queue *q) +-{ +- return (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS && +- (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || +- q->properties.type == KFD_QUEUE_TYPE_SDMA)); +-} +- + static int update_queue(struct device_queue_manager *dqm, struct queue *q) + { + int retval; + struct mqd_manager *mqd; +- struct kfd_process_device *pdd; +- + bool prev_active = false; + +- mutex_lock(&dqm->lock); ++ BUG_ON(!dqm || !q || !q->mqd); + +- pdd = kfd_get_process_device_data(q->device, q->process); +- if (!pdd) { +- retval = -ENODEV; +- goto out_unlock; +- } ++ mutex_lock(&dqm->lock); + mqd = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); +- if (!mqd) { +- retval = -ENOMEM; +- goto out_unlock; +- } +- /* +- * Eviction state logic: we only mark active queues as evicted +- * to avoid the overhead of restoring inactive queues later +- */ +- if (pdd->qpd.evicted > 0) +- q->properties.is_evicted = (q->properties.queue_size > 0 && +- q->properties.queue_percent > 0 && +- q->properties.queue_address != 0); +- +- /* save previous activity state for counters 
*/ +- prev_active = q->properties.is_active; +- +- /* HWS mode, unmap first to own mqd */ +- if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) { +- retval = unmap_queues_cpsch(dqm, +- KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); +- if (retval) { +- pr_err("unmap queue failed"); +- goto out_unlock; +- } +- } else if (is_queue_nocpsch(dqm, q) && prev_active) { +- retval = mqd->destroy_mqd(mqd, q->mqd, +- KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN, +- KFD_UNMAP_LATENCY_MS, q->pipe, q->queue); +- if (retval) { +- pr_err("destroy mqd failed"); +- goto out_unlock; +- } ++ if (mqd == NULL) { ++ mutex_unlock(&dqm->lock); ++ return -ENOMEM; + } + +- retval = mqd->update_mqd(mqd, q->mqd, &q->properties); ++ if (q->properties.is_active) ++ prev_active = true; + +- if (is_queue_nocpsch(dqm, q)) { +- if (q->properties.is_active) +- retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue, +- &q->properties, q->process->mm); +- } + /* + * + * check active state vs. the previous state + * and modify counter accordingly + */ +- if (q->properties.is_active && !prev_active) ++ retval = mqd->update_mqd(mqd, q->mqd, &q->properties); ++ if ((q->properties.is_active) && (!prev_active)) + dqm->queue_count++; +- else if (!q->properties.is_active && prev_active) ++ else if ((!q->properties.is_active) && (prev_active)) + dqm->queue_count--; + +- if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) +- retval = map_queues_cpsch(dqm); ++ if (sched_policy != KFD_SCHED_POLICY_NO_HWS) ++ retval = execute_queues_cpsch(dqm, false); + +-out_unlock: + mutex_unlock(&dqm->lock); +- + return retval; + } + +@@ -548,169 +400,41 @@ static struct mqd_manager *get_mqd_manager_nocpsch( + { + struct mqd_manager *mqd; + +- if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) +- return NULL; ++ BUG_ON(!dqm || type >= KFD_MQD_TYPE_MAX); + +- pr_debug("mqd type %d\n", type); ++ pr_debug("kfd: In func %s mqd type %d\n", __func__, type); + + mqd = dqm->mqds[type]; + if (!mqd) { + mqd = mqd_manager_init(type, dqm->dev); +- if (!mqd) +- pr_err("mqd manager is NULL"); ++ if (mqd == NULL) ++ pr_err("kfd: mqd manager is NULL"); + dqm->mqds[type] = mqd; + } + + return mqd; + } + +-int process_evict_queues(struct device_queue_manager *dqm, +- struct qcm_process_device *qpd) +-{ +- struct queue *q, *next; +- struct mqd_manager *mqd; +- struct kfd_process_device *pdd; +- int retval = 0; +- +- mutex_lock(&dqm->lock); +- if (qpd->evicted++ > 0) /* already evicted, do nothing */ +- goto out; +- +- pdd = qpd_to_pdd(qpd); +- pr_info_ratelimited("Evicting PASID %u queues\n", +- pdd->process->pasid); +- +- /* unactivate all active queues on the qpd */ +- list_for_each_entry_safe(q, next, &qpd->queues_list, list) { +- mqd = dqm->ops.get_mqd_manager(dqm, +- get_mqd_type_from_queue_type(q->properties.type)); +- if (!mqd) { /* should not be here */ +- pr_err("Cannot evict queue, mqd is NULL\n"); +- retval = -ENOMEM; +- goto out; +- } +- /* if the queue is not active anyway, it is not evicted */ +- if (q->properties.is_active) { +- q->properties.is_evicted = true; +- q->properties.is_active = false; +- } +- +- if (is_queue_nocpsch(dqm, q) && +- q->properties.is_evicted) +- retval = mqd->destroy_mqd(mqd, q->mqd, +- KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN, +- KFD_UNMAP_LATENCY_MS, q->pipe, q->queue); +- if (q->properties.is_evicted) +- dqm->queue_count--; +- } +- if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) +- retval = execute_queues_cpsch(dqm, qpd->is_debug); +- +-out: +- mutex_unlock(&dqm->lock); +- return retval; +- +-} +- +-int process_restore_queues(struct device_queue_manager *dqm, +- 
struct qcm_process_device *qpd) +-{ +- struct queue *q, *next; +- struct mqd_manager *mqd; +- int retval = 0; +- struct kfd_process_device *pdd; +- uint32_t pd_base; +- +- pdd = qpd_to_pdd(qpd); +- /* Retrieve PD base */ +- pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm); +- +- mutex_lock(&dqm->lock); +- if (qpd->evicted == 0) /* already restored, do nothing */ +- goto out_unlock; +- +- if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */ +- qpd->evicted--; +- goto out_unlock; +- } +- +- pr_info_ratelimited("Restoring PASID %u queues\n", +- pdd->process->pasid); +- +- /* Update PD Base in QPD */ +- qpd->page_table_base = pd_base; +- pr_debug("Updated PD address to 0x%08x\n", pd_base); +- +- if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS && +- !list_empty(&qpd->queues_list)) { +- dqm->dev->kfd2kgd->set_vm_context_page_table_base( +- dqm->dev->kgd, +- qpd->vmid, +- qpd->page_table_base); +- +- kfd_flush_tlb(dqm->dev, pdd->process->pasid); +- } +- +- /* activate all active queues on the qpd */ +- list_for_each_entry_safe(q, next, &qpd->queues_list, list) { +- mqd = dqm->ops.get_mqd_manager(dqm, +- get_mqd_type_from_queue_type(q->properties.type)); +- if (!mqd) { /* should not be here */ +- pr_err("Cannot restore queue, mqd is NULL\n"); +- retval = -ENOMEM; +- goto out_unlock; +- } +- if (q->properties.is_evicted) { +- q->properties.is_evicted = false; +- q->properties.is_active = true; +- +- if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS && +- (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || +- q->properties.type == KFD_QUEUE_TYPE_SDMA)) +- retval = mqd->load_mqd(mqd, q->mqd, q->pipe, +- q->queue, &q->properties, +- q->process->mm); +- dqm->queue_count++; +- } +- } +- if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) +- retval = execute_queues_cpsch(dqm, false); +- +- if (retval == 0) +- qpd->evicted = 0; +- +-out_unlock: +- mutex_unlock(&dqm->lock); +- +- return retval; +-} +- +-static int register_process(struct device_queue_manager *dqm, ++static int register_process_nocpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) + { + struct device_process_node *n; + int retval; +- struct kfd_process_device *pdd; +- uint32_t pd_base; + +- n = kzalloc(sizeof(*n), GFP_KERNEL); ++ BUG_ON(!dqm || !qpd); ++ ++ pr_debug("kfd: In func %s\n", __func__); ++ ++ n = kzalloc(sizeof(struct device_process_node), GFP_KERNEL); + if (!n) + return -ENOMEM; + + n->qpd = qpd; + +- pdd = qpd_to_pdd(qpd); +- /* Retrieve PD base */ +- pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm); +- + mutex_lock(&dqm->lock); + list_add(&n->list, &dqm->queues); + +- /* Update PD Base in QPD */ +- qpd->page_table_base = pd_base; +- pr_debug("Updated PD address to 0x%08x\n", pd_base); +- +- retval = dqm->asic_ops.update_qpd(dqm, qpd); ++ retval = dqm->ops_asic_specific.register_process(dqm, qpd); + + dqm->processes_count++; + +@@ -719,12 +443,16 @@ static int register_process(struct device_queue_manager *dqm, + return retval; + } + +-static int unregister_process(struct device_queue_manager *dqm, ++static int unregister_process_nocpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) + { + int retval; + struct device_process_node *cur, *next; + ++ BUG_ON(!dqm || !qpd); ++ ++ pr_debug("In func %s\n", __func__); ++ + pr_debug("qpd->queues_list is %s\n", + list_empty(&qpd->queues_list) ? 
"empty" : "not empty"); + +@@ -765,41 +493,48 @@ static void init_interrupts(struct device_queue_manager *dqm) + { + unsigned int i; + ++ BUG_ON(dqm == NULL); ++ + for (i = 0 ; i < get_pipes_per_mec(dqm) ; i++) + if (is_pipe_enabled(dqm, 0, i)) + dqm->dev->kfd2kgd->init_interrupts(dqm->dev->kgd, i); + } ++ + static int init_scheduler(struct device_queue_manager *dqm) + { +- return 0; ++ int retval = 0; ++ ++ BUG_ON(!dqm); ++ ++ pr_debug("kfd: In %s\n", __func__); ++ ++ return retval; + } + + static int initialize_nocpsch(struct device_queue_manager *dqm) + { +- int pipe, queue; ++ int i; + +- pr_debug("num of pipes: %d\n", get_pipes_per_mec(dqm)); ++ BUG_ON(!dqm); + +- dqm->allocated_queues = kcalloc(get_pipes_per_mec(dqm), +- sizeof(unsigned int), GFP_KERNEL); +- if (!dqm->allocated_queues) +- return -ENOMEM; ++ pr_debug("kfd: In func %s num of pipes: %d\n", ++ __func__, get_pipes_per_mec(dqm)); + + mutex_init(&dqm->lock); + INIT_LIST_HEAD(&dqm->queues); + dqm->queue_count = dqm->next_pipe_to_allocate = 0; + dqm->sdma_queue_count = 0; +- +- for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) { +- int pipe_offset = pipe * get_queues_per_pipe(dqm); +- +- for (queue = 0; queue < get_queues_per_pipe(dqm); queue++) +- if (test_bit(pipe_offset + queue, +- dqm->dev->shared_resources.queue_bitmap)) +- dqm->allocated_queues[pipe] |= 1 << queue; ++ dqm->allocated_queues = kcalloc(get_pipes_per_mec(dqm), ++ sizeof(unsigned int), GFP_KERNEL); ++ if (!dqm->allocated_queues) { ++ mutex_destroy(&dqm->lock); ++ return -ENOMEM; + } + +- dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1; ++ for (i = 0; i < get_pipes_per_mec(dqm); i++) ++ dqm->allocated_queues[i] = (1 << get_queues_per_pipe(dqm)) - 1; ++ ++ dqm->vmid_bitmap = (1 << VMID_PER_DEVICE) - 1; + dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1; + + init_scheduler(dqm); +@@ -810,7 +545,9 @@ static void uninitialize_nocpsch(struct device_queue_manager *dqm) + { + int i; + +- WARN_ON(dqm->queue_count > 0 || dqm->processes_count > 0); ++ BUG_ON(!dqm); ++ ++ BUG_ON(dqm->queue_count > 0 || dqm->processes_count > 0); + + kfree(dqm->allocated_queues); + for (i = 0 ; i < KFD_MQD_TYPE_MAX ; i++) +@@ -822,12 +559,11 @@ static void uninitialize_nocpsch(struct device_queue_manager *dqm) + static int start_nocpsch(struct device_queue_manager *dqm) + { + init_interrupts(dqm); +- return pm_init(&dqm->packets, dqm, dqm->dev->mec_fw_version); ++ return 0; + } + + static int stop_nocpsch(struct device_queue_manager *dqm) + { +- pm_uninit(&dqm->packets); + return 0; + } + +@@ -839,8 +575,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, + if (dqm->sdma_bitmap == 0) + return -ENOMEM; + +- bit = ffs(dqm->sdma_bitmap) - 1; +- dqm->sdma_bitmap &= ~(1 << bit); ++ bit = find_first_bit((unsigned long *)&dqm->sdma_bitmap, ++ CIK_SDMA_QUEUES); ++ ++ clear_bit(bit, (unsigned long *)&dqm->sdma_bitmap); + *sdma_queue_id = bit; + + return 0; +@@ -851,7 +589,7 @@ static void deallocate_sdma_queue(struct device_queue_manager *dqm, + { + if (sdma_queue_id >= CIK_SDMA_QUEUES) + return; +- dqm->sdma_bitmap |= (1 << sdma_queue_id); ++ set_bit(sdma_queue_id, (unsigned long *)&dqm->sdma_bitmap); + } + + static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, +@@ -866,40 +604,33 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, + return -ENOMEM; + + retval = allocate_sdma_queue(dqm, &q->sdma_id); +- if (retval) ++ if (retval != 0) + return retval; + +- q->properties.sdma_queue_id = q->sdma_id / 
CIK_SDMA_QUEUES_PER_ENGINE; +- q->properties.sdma_engine_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; +- +- retval = allocate_doorbell(qpd, q); +- if (retval) +- goto out_deallocate_sdma_queue; ++ q->properties.sdma_queue_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; ++ q->properties.sdma_engine_id = q->sdma_id / CIK_SDMA_ENGINE_NUM; + +- pr_debug("SDMA id is: %d\n", q->sdma_id); +- pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id); +- pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id); ++ pr_debug("kfd: sdma id is: %d\n", q->sdma_id); ++ pr_debug(" sdma queue id: %d\n", q->properties.sdma_queue_id); ++ pr_debug(" sdma engine id: %d\n", q->properties.sdma_engine_id); + +- dqm->asic_ops.init_sdma_vm(dqm, q, qpd); ++ dqm->ops_asic_specific.init_sdma_vm(dqm, q, qpd); + retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); +- if (retval) +- goto out_deallocate_doorbell; ++ if (retval != 0) { ++ deallocate_sdma_queue(dqm, q->sdma_id); ++ return retval; ++ } + +- retval = mqd->load_mqd(mqd, q->mqd, 0, 0, &q->properties, NULL); +- if (retval) +- goto out_uninit_mqd; ++ retval = mqd->load_mqd(mqd, q->mqd, 0, ++ 0, NULL); ++ if (retval != 0) { ++ deallocate_sdma_queue(dqm, q->sdma_id); ++ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); ++ return retval; ++ } + + return 0; +- +-out_uninit_mqd: +- mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); +-out_deallocate_doorbell: +- deallocate_doorbell(qpd, q); +-out_deallocate_sdma_queue: +- deallocate_sdma_queue(dqm, q->sdma_id); +- +- return retval; + } + + /* +@@ -911,7 +642,12 @@ static int set_sched_resources(struct device_queue_manager *dqm) + int i, mec; + struct scheduling_resources res; + +- res.vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap; ++ BUG_ON(!dqm); ++ ++ pr_debug("kfd: In func %s\n", __func__); ++ ++ res.vmid_mask = (1 << VMID_PER_DEVICE) - 1; ++ res.vmid_mask <<= KFD_VMID_START_OFFSET; + + res.queue_mask = 0; + for (i = 0; i < KGD_MAX_QUEUES; ++i) { +@@ -927,8 +663,7 @@ static int set_sched_resources(struct device_queue_manager *dqm) + + /* This situation may be hit in the future if a new HW + * generation exposes more than 64 queues. 
If so, the +- * definition of res.queue_mask needs updating +- */ ++ * definition of res.queue_mask needs updating */ + if (WARN_ON(i >= (sizeof(res.queue_mask)*8))) { + pr_err("Invalid queue enabled by amdgpu: %d\n", i); + break; +@@ -939,9 +674,9 @@ static int set_sched_resources(struct device_queue_manager *dqm) + res.gws_mask = res.oac_mask = res.gds_heap_base = + res.gds_heap_size = 0; + +- pr_debug("Scheduling resources:\n" +- "vmid mask: 0x%8X\n" +- "queue mask: 0x%8llX\n", ++ pr_debug("kfd: scheduling resources:\n" ++ " vmid mask: 0x%8X\n" ++ " queue mask: 0x%8llX\n", + res.vmid_mask, res.queue_mask); + + return pm_send_set_resources(&dqm->packets, &res); +@@ -951,42 +686,51 @@ static int initialize_cpsch(struct device_queue_manager *dqm) + { + int retval; + +- pr_debug("num of pipes: %d\n", get_pipes_per_mec(dqm)); ++ BUG_ON(!dqm); ++ ++ pr_debug("kfd: In func %s num of pipes: %d\n", ++ __func__, get_pipes_per_mec(dqm)); + + mutex_init(&dqm->lock); + INIT_LIST_HEAD(&dqm->queues); + dqm->queue_count = dqm->processes_count = 0; + dqm->sdma_queue_count = 0; + dqm->active_runlist = false; +- dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1; +- retval = dqm->asic_ops.init_cpsch(dqm); +- if (retval) +- mutex_destroy(&dqm->lock); ++ retval = dqm->ops_asic_specific.initialize(dqm); ++ if (retval != 0) ++ goto fail_init_pipelines; ++ ++ return 0; + ++fail_init_pipelines: ++ mutex_destroy(&dqm->lock); + return retval; + } + + static int start_cpsch(struct device_queue_manager *dqm) + { ++ struct device_process_node *node; + int retval; + ++ BUG_ON(!dqm); ++ + retval = 0; + +- retval = pm_init(&dqm->packets, dqm, dqm->dev->mec_fw_version); +- if (retval) ++ retval = pm_init(&dqm->packets, dqm); ++ if (retval != 0) + goto fail_packet_manager_init; + + retval = set_sched_resources(dqm); +- if (retval) ++ if (retval != 0) + goto fail_set_sched_resources; + +- pr_debug("Allocating fence memory\n"); ++ pr_debug("kfd: allocating fence memory\n"); + + /* allocate fence memory on the gart */ + retval = kfd_gtt_sa_allocate(dqm->dev, sizeof(*dqm->fence_addr), + &dqm->fence_mem); + +- if (retval) ++ if (retval != 0) + goto fail_allocate_vidmem; + + dqm->fence_addr = dqm->fence_mem->cpu_ptr; +@@ -994,9 +738,12 @@ static int start_cpsch(struct device_queue_manager *dqm) + + init_interrupts(dqm); + +- mutex_lock(&dqm->lock); +- execute_queues_cpsch(dqm, false); +- mutex_unlock(&dqm->lock); ++ list_for_each_entry(node, &dqm->queues, list) ++ if (node->qpd->pqm->process && dqm->dev) ++ kfd_bind_process_to_device(dqm->dev, ++ node->qpd->pqm->process); ++ ++ execute_queues_cpsch(dqm, true); + + return 0; + fail_allocate_vidmem: +@@ -1008,12 +755,17 @@ static int start_cpsch(struct device_queue_manager *dqm) + + static int stop_cpsch(struct device_queue_manager *dqm) + { +- mutex_lock(&dqm->lock); ++ struct device_process_node *node; ++ struct kfd_process_device *pdd; + +- unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); ++ BUG_ON(!dqm); + +- mutex_unlock(&dqm->lock); ++ destroy_queues_cpsch(dqm, true, true); + ++ list_for_each_entry(node, &dqm->queues, list) { ++ pdd = qpd_to_pdd(node->qpd); ++ pdd->bound = false; ++ } + kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); + pm_uninit(&dqm->packets); + +@@ -1024,9 +776,13 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm, + struct kernel_queue *kq, + struct qcm_process_device *qpd) + { ++ BUG_ON(!dqm || !kq || !qpd); ++ ++ pr_debug("kfd: In func %s\n", __func__); ++ + mutex_lock(&dqm->lock); + if (dqm->total_queue_count >= 
max_num_of_queues_per_device) { +- pr_warn("Can't create new kernel queue because %d queues were already created\n", ++ pr_warn("amdkfd: Can't create new kernel queue because %d queues were already created\n", + dqm->total_queue_count); + mutex_unlock(&dqm->lock); + return -EPERM; +@@ -1053,12 +809,17 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm, + struct kernel_queue *kq, + struct qcm_process_device *qpd) + { ++ BUG_ON(!dqm || !kq); ++ ++ pr_debug("kfd: In %s\n", __func__); ++ + mutex_lock(&dqm->lock); + /* here we actually preempt the DIQ */ ++ destroy_queues_cpsch(dqm, true, false); + list_del(&kq->list); + dqm->queue_count--; + qpd->is_debug = false; +- execute_queues_cpsch(dqm, true); ++ execute_queues_cpsch(dqm, false); + /* + * Unconditionally decrement this counter, regardless of the queue's + * type. +@@ -1069,12 +830,22 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm, + mutex_unlock(&dqm->lock); + } + ++static void select_sdma_engine_id(struct queue *q) ++{ ++ static int sdma_id; ++ ++ q->sdma_id = sdma_id; ++ sdma_id = (sdma_id + 1) % 2; ++} ++ + static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd, int *allocate_vmid) + { + int retval; + struct mqd_manager *mqd; + ++ BUG_ON(!dqm || !q || !qpd); ++ + retval = 0; + + if (allocate_vmid) +@@ -1083,60 +854,37 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, + mutex_lock(&dqm->lock); + + if (dqm->total_queue_count >= max_num_of_queues_per_device) { +- pr_warn("Can't create new usermode queue because %d queues were already created\n", ++ pr_warn("amdkfd: Can't create new usermode queue because %d queues were already created\n", + dqm->total_queue_count); + retval = -EPERM; +- goto out_unlock; +- } +- +- if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { +- retval = allocate_sdma_queue(dqm, &q->sdma_id); +- if (retval) +- goto out_unlock; +- q->properties.sdma_queue_id = +- q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; +- q->properties.sdma_engine_id = +- q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; ++ goto out; + } + +- retval = allocate_doorbell(qpd, q); +- if (retval) +- goto out_deallocate_sdma_queue; ++ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) ++ select_sdma_engine_id(q); + + mqd = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + +- if (!mqd) { +- retval = -ENOMEM; +- goto out_deallocate_doorbell; ++ if (mqd == NULL) { ++ mutex_unlock(&dqm->lock); ++ return -ENOMEM; + } +- /* +- * Eviction state logic: we only mark active queues as evicted +- * to avoid the overhead of restoring inactive queues later +- */ +- if (qpd->evicted) +- q->properties.is_evicted = (q->properties.queue_size > 0 && +- q->properties.queue_percent > 0 && +- q->properties.queue_address != 0); +- +- dqm->asic_ops.init_sdma_vm(dqm, q, qpd); + +- q->properties.tba_addr = qpd->tba_addr; +- q->properties.tma_addr = qpd->tma_addr; ++ dqm->ops_asic_specific.init_sdma_vm(dqm, q, qpd); + retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); +- if (retval) +- goto out_deallocate_doorbell; ++ if (retval != 0) ++ goto out; + + list_add(&q->list, &qpd->queues_list); +- qpd->queue_count++; + if (q->properties.is_active) { + dqm->queue_count++; + retval = execute_queues_cpsch(dqm, false); + } + + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) +- dqm->sdma_queue_count++; ++ dqm->sdma_queue_count++; + /* + * Unconditionally increment this counter, regardless 
of the queue's + * type or whether the queue is active. +@@ -1146,31 +894,21 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, + pr_debug("Total of %d queues are accountable so far\n", + dqm->total_queue_count); + ++out: + mutex_unlock(&dqm->lock); + return retval; +- +-out_deallocate_doorbell: +- deallocate_doorbell(qpd, q); +-out_deallocate_sdma_queue: +- if (q->properties.type == KFD_QUEUE_TYPE_SDMA) +- deallocate_sdma_queue(dqm, q->sdma_id); +-out_unlock: +- mutex_unlock(&dqm->lock); +- +- return retval; + } + + int amdkfd_fence_wait_timeout(unsigned int *fence_addr, + unsigned int fence_value, +- unsigned long timeout_ms) ++ unsigned long timeout) + { +- unsigned long end_jiffies; +- +- end_jiffies = (timeout_ms * HZ / 1000) + jiffies; ++ BUG_ON(!fence_addr); ++ timeout += jiffies; + + while (*fence_addr != fence_value) { +- if (time_after(jiffies, end_jiffies)) { +- pr_err("qcm fence wait loop timeout expired\n"); ++ if (time_after(jiffies, timeout)) { ++ pr_err("kfd: qcm fence wait loop timeout expired\n"); + return -ETIME; + } + schedule(); +@@ -1179,63 +917,46 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr, + return 0; + } + +-static int unmap_sdma_queues(struct device_queue_manager *dqm, +- unsigned int sdma_engine) ++static int destroy_sdma_queues(struct device_queue_manager *dqm, ++ unsigned int sdma_engine) + { + return pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA, +- KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false, ++ KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES, 0, false, + sdma_engine); + } + +-/* dqm->lock mutex has to be locked before calling this function */ +-static int map_queues_cpsch(struct device_queue_manager *dqm) ++static int destroy_queues_cpsch(struct device_queue_manager *dqm, ++ bool preempt_static_queues, bool lock) + { + int retval; ++ enum kfd_preempt_type_filter preempt_type; ++ struct kfd_process_device *pdd; + +- if (dqm->queue_count <= 0 || dqm->processes_count <= 0) { +- retval = 0; +- return retval; +- } +- +- if (dqm->active_runlist) { +- retval = 0; +- return retval; +- } +- +- retval = pm_send_runlist(&dqm->packets, &dqm->queues); +- if (retval) { +- pr_err("failed to execute runlist\n"); +- return retval; +- } +- dqm->active_runlist = true; +- +- return retval; +-} +- +-/* dqm->lock mutex has to be locked before calling this function */ +-static int unmap_queues_cpsch(struct device_queue_manager *dqm, +- enum kfd_unmap_queues_filter filter, +- uint32_t filter_param) +-{ +- int retval; ++ BUG_ON(!dqm); + + retval = 0; + ++ if (lock) ++ mutex_lock(&dqm->lock); + if (!dqm->active_runlist) +- return retval; ++ goto out; + +- pr_debug("Before destroying queues, sdma queue count is : %u\n", ++ pr_debug("kfd: Before destroying queues, sdma queue count is : %u\n", + dqm->sdma_queue_count); + + if (dqm->sdma_queue_count > 0) { +- unmap_sdma_queues(dqm, 0); +- unmap_sdma_queues(dqm, 1); ++ destroy_sdma_queues(dqm, 0); ++ destroy_sdma_queues(dqm, 1); + } + ++ preempt_type = preempt_static_queues ? 
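
amdkfd_fence_wait_timeout(), in the hunk above, is a cooperative polling wait: pm_send_query_status() asks the scheduler to write KFD_FENCE_COMPLETED into a GTT word once preemption finishes, and the driver spins on that word, yielding with schedule(), until the value lands or the deadline passes. Note that the newer (removed) form converts its millisecond argument to jiffies explicitly, while the reverted form adds the raw argument to jiffies. In outline (poll_fence is a hypothetical name):

/* Polling fence wait with a deadline, as in amdkfd_fence_wait_timeout(). */
static int poll_fence(unsigned int *fence_addr, unsigned int fence_value,
		      unsigned long timeout_ms)
{
	unsigned long end_jiffies = jiffies + msecs_to_jiffies(timeout_ms);

	while (*fence_addr != fence_value) {
		if (time_after(jiffies, end_jiffies))
			return -ETIME;	/* preemption never completed */
		schedule();		/* yield while the CP works */
	}
	return 0;
}
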
++ KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES : ++ KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES; ++ + retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE, +- filter, filter_param, false, 0); +- if (retval) +- return retval; ++ preempt_type, 0, false, 0); ++ if (retval != 0) ++ goto out; + + *dqm->fence_addr = KFD_FENCE_INIT; + pm_send_query_status(&dqm->packets, dqm->fence_gpu_addr, +@@ -1243,36 +964,56 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, + /* should be timed out */ + retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED, + QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS); +- if (retval) { +- pr_err("Unmapping queues failed.\n"); +- return retval; ++ if (retval != 0) { ++ pdd = kfd_get_process_device_data(dqm->dev, ++ kfd_get_process(current)); ++ pdd->reset_wavefronts = true; ++ goto out; + } +- + pm_release_ib(&dqm->packets); + dqm->active_runlist = false; + ++out: ++ if (lock) ++ mutex_unlock(&dqm->lock); + return retval; + } + +-/* dqm->lock mutex has to be locked before calling this function */ +-static int execute_queues_cpsch(struct device_queue_manager *dqm, +- bool static_queues_included) ++static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock) + { + int retval; +- enum kfd_unmap_queues_filter filter; + +- filter = static_queues_included ? +- KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES : +- KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES; ++ BUG_ON(!dqm); + +- retval = unmap_queues_cpsch(dqm, filter, 0); +- if (retval) { +- pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n"); +- return retval; ++ if (lock) ++ mutex_lock(&dqm->lock); ++ ++ retval = destroy_queues_cpsch(dqm, false, false); ++ if (retval != 0) { ++ pr_err("kfd: the cp might be in an unrecoverable state due to an unsuccessful queues preemption"); ++ goto out; ++ } ++ ++ if (dqm->queue_count <= 0 || dqm->processes_count <= 0) { ++ retval = 0; ++ goto out; + } + +- retval = map_queues_cpsch(dqm); ++ if (dqm->active_runlist) { ++ retval = 0; ++ goto out; ++ } + ++ retval = pm_send_runlist(&dqm->packets, &dqm->queues); ++ if (retval != 0) { ++ pr_err("kfd: failed to execute runlist"); ++ goto out; ++ } ++ dqm->active_runlist = true; ++ ++out: ++ if (lock) ++ mutex_unlock(&dqm->lock); + return retval; + } + +@@ -1284,6 +1025,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, + struct mqd_manager *mqd; + bool preempt_all_queues; + ++ BUG_ON(!dqm || !qpd || !q); ++ + preempt_all_queues = false; + + retval = 0; +@@ -1308,21 +1051,14 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, + goto failed; + } + +- deallocate_doorbell(qpd, q); +- +- if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { ++ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + dqm->sdma_queue_count--; +- deallocate_sdma_queue(dqm, q->sdma_id); +- } + + list_del(&q->list); +- qpd->queue_count--; + if (q->properties.is_active) + dqm->queue_count--; + +- retval = execute_queues_cpsch(dqm, false); +- if (retval == -ETIME) +- qpd->reset_wavefronts = true; ++ execute_queues_cpsch(dqm, false); + + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); + +@@ -1336,7 +1072,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, + + mutex_unlock(&dqm->lock); + +- return retval; ++ return 0; + + failed: + failed_try_destroy_debugged_queue: +@@ -1360,10 +1096,9 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size) + { +- bool retval = true; ++ bool retval; + +- 
if (!dqm->asic_ops.set_cache_memory_policy) +- return retval; ++ pr_debug("kfd: In func %s\n", __func__); + + mutex_lock(&dqm->lock); + +@@ -1385,17 +1120,20 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm, + uint64_t base = (uintptr_t)alternate_aperture_base; + uint64_t limit = base + alternate_aperture_size - 1; + +- if (limit <= base || (base & APE1_FIXED_BITS_MASK) != 0 || +- (limit & APE1_FIXED_BITS_MASK) != APE1_LIMIT_ALIGNMENT) { +- retval = false; ++ if (limit <= base) ++ goto out; ++ ++ if ((base & APE1_FIXED_BITS_MASK) != 0) ++ goto out; ++ ++ if ((limit & APE1_FIXED_BITS_MASK) != APE1_LIMIT_ALIGNMENT) + goto out; +- } + + qpd->sh_mem_ape1_base = base >> 16; + qpd->sh_mem_ape1_limit = limit >> 16; + } + +- retval = dqm->asic_ops.set_cache_memory_policy( ++ retval = dqm->ops_asic_specific.set_cache_memory_policy( + dqm, + qpd, + default_policy, +@@ -1403,199 +1141,35 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm, + alternate_aperture_base, + alternate_aperture_size); + +- if ((dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) && (qpd->vmid != 0)) ++ if ((sched_policy == KFD_SCHED_POLICY_NO_HWS) && (qpd->vmid != 0)) + program_sh_mem_settings(dqm, qpd); + +- pr_debug("sh_mem_config: 0x%x, ape1_base: 0x%x, ape1_limit: 0x%x\n", ++ pr_debug("kfd: sh_mem_config: 0x%x, ape1_base: 0x%x, ape1_limit: 0x%x\n", + qpd->sh_mem_config, qpd->sh_mem_ape1_base, + qpd->sh_mem_ape1_limit); + +-out: + mutex_unlock(&dqm->lock); + return retval; +-} +- +-static int set_trap_handler(struct device_queue_manager *dqm, +- struct qcm_process_device *qpd, +- uint64_t tba_addr, +- uint64_t tma_addr) +-{ +- uint64_t *tma; +- +- if (dqm->dev->cwsr_enabled) { +- /* Jump from CWSR trap handler to user trap */ +- tma = (uint64_t *)(qpd->cwsr_kaddr + dqm->dev->tma_offset); +- tma[0] = tba_addr; +- tma[1] = tma_addr; +- } else { +- qpd->tba_addr = tba_addr; +- qpd->tma_addr = tma_addr; +- } +- +- return 0; +-} +- +-static int process_termination_nocpsch(struct device_queue_manager *dqm, +- struct qcm_process_device *qpd) +-{ +- struct queue *q, *next; +- struct device_process_node *cur, *next_dpn; +- int retval = 0; +- +- mutex_lock(&dqm->lock); +- +- /* Clear all user mode queues */ +- list_for_each_entry_safe(q, next, &qpd->queues_list, list) { +- int ret; +- +- ret = destroy_queue_nocpsch_locked(dqm, qpd, q); +- if (ret) +- retval = ret; +- } +- +- /* Unregister process */ +- list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) { +- if (qpd == cur->qpd) { +- list_del(&cur->list); +- kfree(cur); +- dqm->processes_count--; +- break; +- } +- } +- +- mutex_unlock(&dqm->lock); +- return retval; +-} +- +-static int get_wave_state(struct device_queue_manager *dqm, +- struct queue *q, +- void __user *ctl_stack, +- u32 *ctl_stack_used_size, +- u32 *save_area_used_size) +-{ +- struct mqd_manager *mqd; +- int r; +- +- mutex_lock(&dqm->lock); +- +- if (q->properties.type != KFD_QUEUE_TYPE_COMPUTE || +- q->properties.is_active || !q->device->cwsr_enabled) { +- r = -EINVAL; +- goto dqm_unlock; +- } +- +- mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); +- if (!mqd) { +- r = -ENOMEM; +- goto dqm_unlock; +- } +- +- if (!mqd->get_wave_state) { +- r = -EINVAL; +- goto dqm_unlock; +- } +- +- r = mqd->get_wave_state(mqd, q->mqd, ctl_stack, ctl_stack_used_size, +- save_area_used_size); +- +-dqm_unlock: +- mutex_unlock(&dqm->lock); +- return r; +-} +- +-static int process_termination_cpsch(struct device_queue_manager *dqm, +- struct qcm_process_device *qpd) +-{ +- int 
retval; +- struct queue *q, *next; +- struct kernel_queue *kq, *kq_next; +- struct mqd_manager *mqd; +- struct device_process_node *cur, *next_dpn; +- bool unmap_static_queues = false; +- +- retval = 0; +- +- mutex_lock(&dqm->lock); +- +- /* Clean all kernel queues */ +- list_for_each_entry_safe(kq, kq_next, &qpd->priv_queue_list, list) { +- list_del(&kq->list); +- dqm->queue_count--; +- qpd->is_debug = false; +- dqm->total_queue_count--; +- unmap_static_queues = true; +- } +- +- /* Clear all user mode queues */ +- list_for_each_entry(q, &qpd->queues_list, list) { +- if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { +- dqm->sdma_queue_count--; +- deallocate_sdma_queue(dqm, q->sdma_id); +- } +- +- if (q->properties.is_active) +- dqm->queue_count--; +- +- dqm->total_queue_count--; +- } +- +- /* Unregister process */ +- list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) { +- if (qpd == cur->qpd) { +- list_del(&cur->list); +- kfree(cur); +- dqm->processes_count--; +- break; +- } +- } +- +- retval = execute_queues_cpsch(dqm, unmap_static_queues); +- if (retval || qpd->reset_wavefronts) { +- pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev); +- dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process); +- qpd->reset_wavefronts = false; +- } +- +- /* lastly, free mqd resources */ +- list_for_each_entry_safe(q, next, &qpd->queues_list, list) { +- mqd = dqm->ops.get_mqd_manager(dqm, +- get_mqd_type_from_queue_type(q->properties.type)); +- if (!mqd) { +- retval = -ENOMEM; +- goto out; +- } +- list_del(&q->list); +- qpd->queue_count--; +- mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); +- } + + out: + mutex_unlock(&dqm->lock); +- return retval; ++ return false; + } + + struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) + { + struct device_queue_manager *dqm; + +- pr_debug("Loading device queue manager\n"); ++ BUG_ON(!dev); ++ ++ pr_debug("kfd: loading device queue manager\n"); + +- dqm = kzalloc(sizeof(*dqm), GFP_KERNEL); ++ dqm = kzalloc(sizeof(struct device_queue_manager), GFP_KERNEL); + if (!dqm) + return NULL; + +- switch (dev->device_info->asic_family) { +- case CHIP_HAWAII: +- case CHIP_TONGA: +- dqm->sched_policy = KFD_SCHED_POLICY_NO_HWS; +- break; +- default: +- dqm->sched_policy = sched_policy; +- break; +- } +- + dqm->dev = dev; +- switch (dqm->sched_policy) { ++ switch (sched_policy) { + case KFD_SCHED_POLICY_HWS: + case KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION: + /* initialize dqm for cp scheduling */ +@@ -1606,15 +1180,12 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) + dqm->ops.destroy_queue = destroy_queue_cpsch; + dqm->ops.update_queue = update_queue; + dqm->ops.get_mqd_manager = get_mqd_manager_nocpsch; +- dqm->ops.register_process = register_process; +- dqm->ops.unregister_process = unregister_process; ++ dqm->ops.register_process = register_process_nocpsch; ++ dqm->ops.unregister_process = unregister_process_nocpsch; + dqm->ops.uninitialize = uninitialize_nocpsch; + dqm->ops.create_kernel_queue = create_kernel_queue_cpsch; + dqm->ops.destroy_kernel_queue = destroy_kernel_queue_cpsch; + dqm->ops.set_cache_memory_policy = set_cache_memory_policy; +- dqm->ops.set_trap_handler = set_trap_handler; +- dqm->ops.process_termination = process_termination_cpsch; +- dqm->ops.get_wave_state = get_wave_state; + break; + case KFD_SCHED_POLICY_NO_HWS: + /* initialize dqm for no cp scheduling */ +@@ -1624,142 +1195,39 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) + dqm->ops.destroy_queue = 
destroy_queue_nocpsch; + dqm->ops.update_queue = update_queue; + dqm->ops.get_mqd_manager = get_mqd_manager_nocpsch; +- dqm->ops.register_process = register_process; +- dqm->ops.unregister_process = unregister_process; ++ dqm->ops.register_process = register_process_nocpsch; ++ dqm->ops.unregister_process = unregister_process_nocpsch; + dqm->ops.initialize = initialize_nocpsch; + dqm->ops.uninitialize = uninitialize_nocpsch; + dqm->ops.set_cache_memory_policy = set_cache_memory_policy; +- dqm->ops.set_trap_handler = set_trap_handler; +- dqm->ops.process_termination = process_termination_nocpsch; +- dqm->ops.get_wave_state = get_wave_state; + break; + default: +- WARN(1, "Invalid scheduling policy %d", dqm->sched_policy); +- goto out_free; ++ BUG(); ++ break; + } + + switch (dev->device_info->asic_family) { + case CHIP_CARRIZO: +- device_queue_manager_init_vi(&dqm->asic_ops); ++ device_queue_manager_init_vi(&dqm->ops_asic_specific); + break; + + case CHIP_KAVERI: +- device_queue_manager_init_cik(&dqm->asic_ops); +- break; +- +- case CHIP_HAWAII: +- device_queue_manager_init_cik_hawaii(&dqm->asic_ops); +- break; +- +- case CHIP_TONGA: +- case CHIP_FIJI: +- case CHIP_POLARIS10: +- case CHIP_POLARIS11: +- device_queue_manager_init_vi_tonga(&dqm->asic_ops); ++ device_queue_manager_init_cik(&dqm->ops_asic_specific); + break; +- +- case CHIP_VEGA10: +- case CHIP_RAVEN: +- device_queue_manager_init_v9_vega10(&dqm->asic_ops); +- break; +- default: +- BUG(); + } + +- if (!dqm->ops.initialize(dqm)) +- return dqm; ++ if (dqm->ops.initialize(dqm) != 0) { ++ kfree(dqm); ++ return NULL; ++ } + +-out_free: +- kfree(dqm); +- return NULL; ++ return dqm; + } + + void device_queue_manager_uninit(struct device_queue_manager *dqm) + { ++ BUG_ON(!dqm); ++ + dqm->ops.uninitialize(dqm); + kfree(dqm); + } +- +-int kfd_process_vm_fault(struct device_queue_manager *dqm, +- unsigned int pasid) +-{ +- struct kfd_process_device *pdd; +- struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); +- int ret = 0; +- +- if (!p) +- return -EINVAL; +- pdd = kfd_get_process_device_data(dqm->dev, p); +- if (pdd) +- ret = process_evict_queues(dqm, &pdd->qpd); +- kfd_unref_process(p); +- +- return ret; +-} +- +-static void seq_reg_dump(struct seq_file *m, +- uint32_t (*dump)[2], uint32_t n_regs) +-{ +- uint32_t i, count; +- +- for (i = 0, count = 0; i < n_regs; i++) { +- if (count == 0 || +- dump[i-1][0] + sizeof(uint32_t) != dump[i][0]) { +- seq_printf(m, "%s %08x: %08x", +- i ? 
"\n" : "", +- dump[i][0], dump[i][1]); +- count = 7; +- } else { +- seq_printf(m, " %08x", dump[i][1]); +- count--; +- } +- } +- +- seq_puts(m, "\n"); +-} +- +-int device_queue_manager_debugfs_hqds(struct seq_file *m, void *data) +-{ +- struct device_queue_manager *dqm = data; +- uint32_t (*dump)[2], n_regs; +- int pipe, queue; +- int r = 0; +- +- for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) { +- int pipe_offset = pipe * get_queues_per_pipe(dqm); +- +- for (queue = 0; queue < get_queues_per_pipe(dqm); queue++) { +- if (!test_bit(pipe_offset + queue, +- dqm->dev->shared_resources.queue_bitmap)) +- continue; +- +- r = dqm->dev->kfd2kgd->hqd_dump( +- dqm->dev->kgd, pipe, queue, &dump, &n_regs); +- if (r) +- break; +- +- seq_printf(m, " CP Pipe %d, Queue %d\n", +- pipe, queue); +- seq_reg_dump(m, dump, n_regs); +- +- kfree(dump); +- } +- } +- +- for (pipe = 0; pipe < CIK_SDMA_ENGINE_NUM; pipe++) { +- for (queue = 0; queue < CIK_SDMA_QUEUES_PER_ENGINE; queue++) { +- r = dqm->dev->kfd2kgd->hqd_sdma_dump( +- dqm->dev->kgd, pipe, queue, &dump, &n_regs); +- if (r) +- break; +- +- seq_printf(m, " SDMA Engine %d, RLC %d\n", +- pipe, queue); +- seq_reg_dump(m, dump, n_regs); +- +- kfree(dump); +- } +- } +- +- return r; +-} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +index 841283a..faf820a 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +@@ -29,10 +29,10 @@ + #include "kfd_priv.h" + #include "kfd_mqd_manager.h" + +-#define KFD_HIQ_TIMEOUT (500) +-#define KFD_UNMAP_LATENCY_MS (4000) +-#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000) +- ++#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (500) ++#define CIK_VMID_NUM (8) ++#define KFD_VMID_START_OFFSET (8) ++#define VMID_PER_DEVICE CIK_VMID_NUM + #define KFD_DQM_FIRST_PIPE (0) + #define CIK_SDMA_QUEUES (4) + #define CIK_SDMA_QUEUES_PER_ENGINE (2) +@@ -79,14 +79,6 @@ struct device_process_node { + * @set_cache_memory_policy: Sets memory policy (cached/ non cached) for the + * memory apertures. + * +- * @set_page_directory_base: Sets the PD base address (GPU local memory) +- * in all the queues of the relevant process running on the specified device. +- * It preempts the queues, updates the value and execute the runlist again. +- * +- * @process_termination: Clears all process queues belongs to that device. +- * +- * @get_wave_state: Retrieves context save state and optionally copies the +- * control stack, if kept in the MQD, to the given userspace address. 
+ */ + + struct device_queue_manager_ops { +@@ -130,26 +122,12 @@ struct device_queue_manager_ops { + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size); +- +- int (*set_trap_handler)(struct device_queue_manager *dqm, +- struct qcm_process_device *qpd, +- uint64_t tba_addr, +- uint64_t tma_addr); +- +- int (*process_termination)(struct device_queue_manager *dqm, +- struct qcm_process_device *qpd); +- +- int (*get_wave_state)(struct device_queue_manager *dqm, +- struct queue *q, +- void __user *ctl_stack, +- u32 *ctl_stack_used_size, +- u32 *save_area_used_size); + }; + + struct device_queue_manager_asic_ops { +- int (*update_qpd)(struct device_queue_manager *dqm, ++ int (*register_process)(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +- int (*init_cpsch)(struct device_queue_manager *dqm); ++ int (*initialize)(struct device_queue_manager *dqm); + bool (*set_cache_memory_policy)(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, +@@ -175,7 +153,7 @@ struct device_queue_manager_asic_ops { + + struct device_queue_manager { + struct device_queue_manager_ops ops; +- struct device_queue_manager_asic_ops asic_ops; ++ struct device_queue_manager_asic_ops ops_asic_specific; + + struct mqd_manager *mqds[KFD_MQD_TYPE_MAX]; + struct packet_manager packets; +@@ -196,37 +174,21 @@ struct device_queue_manager { + unsigned int *fence_addr; + struct kfd_mem_obj *fence_mem; + bool active_runlist; +- int sched_policy; + }; + +-void device_queue_manager_init_cik( +- struct device_queue_manager_asic_ops *asic_ops); +-void device_queue_manager_init_cik_hawaii( +- struct device_queue_manager_asic_ops *asic_ops); +-void device_queue_manager_init_vi( +- struct device_queue_manager_asic_ops *asic_ops); +-void device_queue_manager_init_vi_tonga( +- struct device_queue_manager_asic_ops *asic_ops); +-void device_queue_manager_init_v9_vega10( +- struct device_queue_manager_asic_ops *asic_ops); ++void device_queue_manager_init_cik(struct device_queue_manager_asic_ops *ops); ++void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops); + void program_sh_mem_settings(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); + unsigned int get_queues_num(struct device_queue_manager *dqm); + unsigned int get_queues_per_pipe(struct device_queue_manager *dqm); + unsigned int get_pipes_per_mec(struct device_queue_manager *dqm); + +-int process_evict_queues(struct device_queue_manager *dqm, +- struct qcm_process_device *qpd); +-int process_restore_queues(struct device_queue_manager *dqm, +- struct qcm_process_device *qpd); +- +- + static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) + { + return (pdd->lds_base >> 16) & 0xFF; + } + +-/* This function is only useful for GFXv7 and v8 */ + static inline unsigned int + get_sh_mem_bases_nybble_64(struct kfd_process_device *pdd) + { +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c +index 8e1eb24..48dc056 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c +@@ -24,7 +24,6 @@ + #include "kfd_device_queue_manager.h" + #include "cik_regs.h" + #include "oss/oss_2_4_sh_mask.h" +-#include "gca/gfx_7_2_sh_mask.h" + + static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, +@@ -32,33 
+31,18 @@ static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size); +-static int update_qpd_cik(struct device_queue_manager *dqm, +- struct qcm_process_device *qpd); +-static int update_qpd_cik_hawaii(struct device_queue_manager *dqm, ++static int register_process_cik(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); + static int initialize_cpsch_cik(struct device_queue_manager *dqm); + static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd); +-static void init_sdma_vm_hawaii(struct device_queue_manager *dqm, +- struct queue *q, +- struct qcm_process_device *qpd); +- +-void device_queue_manager_init_cik( +- struct device_queue_manager_asic_ops *asic_ops) +-{ +- asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik; +- asic_ops->update_qpd = update_qpd_cik; +- asic_ops->init_cpsch = initialize_cpsch_cik; +- asic_ops->init_sdma_vm = init_sdma_vm; +-} + +-void device_queue_manager_init_cik_hawaii( +- struct device_queue_manager_asic_ops *asic_ops) ++void device_queue_manager_init_cik(struct device_queue_manager_asic_ops *ops) + { +- asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik; +- asic_ops->update_qpd = update_qpd_cik_hawaii; +- asic_ops->init_cpsch = initialize_cpsch_cik; +- asic_ops->init_sdma_vm = init_sdma_vm_hawaii; ++ ops->set_cache_memory_policy = set_cache_memory_policy_cik; ++ ops->register_process = register_process_cik; ++ ops->initialize = initialize_cpsch_cik; ++ ops->init_sdma_vm = init_sdma_vm; + } + + static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) +@@ -81,7 +65,7 @@ static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) + * for LDS/Scratch and GPUVM. 
+ */ + +- WARN_ON((top_address_nybble & 1) || top_address_nybble > 0xE || ++ BUG_ON((top_address_nybble & 1) || top_address_nybble > 0xE || + top_address_nybble == 0); + + return PRIVATE_BASE(top_address_nybble << 12) | +@@ -114,12 +98,14 @@ static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, + return true; + } + +-static int update_qpd_cik(struct device_queue_manager *dqm, ++static int register_process_cik(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) + { + struct kfd_process_device *pdd; + unsigned int temp; + ++ BUG_ON(!dqm || !qpd); ++ + pdd = qpd_to_pdd(qpd); + + /* check if sh_mem_config register already configured */ +@@ -139,40 +125,9 @@ static int update_qpd_cik(struct device_queue_manager *dqm, + } else { + temp = get_sh_mem_bases_nybble_64(pdd); + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); +- qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__PRIVATE_ATC__SHIFT; + } + +- pr_debug("is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", +- qpd->pqm->process->is_32bit_user_mode, temp, qpd->sh_mem_bases); +- +- return 0; +-} +- +-static int update_qpd_cik_hawaii(struct device_queue_manager *dqm, +- struct qcm_process_device *qpd) +-{ +- struct kfd_process_device *pdd; +- unsigned int temp; +- +- pdd = qpd_to_pdd(qpd); +- +- /* check if sh_mem_config register already configured */ +- if (qpd->sh_mem_config == 0) { +- qpd->sh_mem_config = +- ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) | +- DEFAULT_MTYPE(MTYPE_NONCACHED) | +- APE1_MTYPE(MTYPE_NONCACHED); +- qpd->sh_mem_ape1_limit = 0; +- qpd->sh_mem_ape1_base = 0; +- } +- +- /* On dGPU we're always in GPUVM64 addressing mode with 64-bit +- * aperture addresses. +- */ +- temp = get_sh_mem_bases_nybble_64(pdd); +- qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); +- +- pr_debug("is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", ++ pr_debug("kfd: is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", + qpd->pqm->process->is_32bit_user_mode, temp, qpd->sh_mem_bases); + + return 0; +@@ -194,19 +149,6 @@ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + q->properties.sdma_vm_addr = value; + } + +-static void init_sdma_vm_hawaii(struct device_queue_manager *dqm, +- struct queue *q, +- struct qcm_process_device *qpd) +-{ +- /* On dGPU we're always in GPUVM64 addressing mode with 64-bit +- * aperture addresses. +- */ +- q->properties.sdma_vm_addr = +- ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) << +- SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) & +- SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK; +-} +- + static int initialize_cpsch_cik(struct device_queue_manager *dqm) + { + return 0; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c +deleted file mode 100644 +index dde5882..0000000 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c ++++ /dev/null +@@ -1,90 +0,0 @@ +-/* +- * Copyright 2016 Advanced Micro Devices, Inc. 
+- * +- * Permission is hereby granted, free of charge, to any person obtaining a +- * copy of this software and associated documentation files (the "Software"), +- * to deal in the Software without restriction, including without limitation +- * the rights to use, copy, modify, merge, publish, distribute, sublicense, +- * and/or sell copies of the Software, and to permit persons to whom the +- * Software is furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in +- * all copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +- * OTHER DEALINGS IN THE SOFTWARE. +- * +- */ +- +-#include "kfd_device_queue_manager.h" +-#include "vega10/vega10_enum.h" +-#include "vega10/GC/gc_9_0_offset.h" +-#include "vega10/GC/gc_9_0_sh_mask.h" +-#include "vega10/SDMA0/sdma0_4_0_sh_mask.h" +- +-static int update_qpd_v9(struct device_queue_manager *dqm, +- struct qcm_process_device *qpd); +-static int initialize_cpsch_v9(struct device_queue_manager *dqm); +-static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, +- struct qcm_process_device *qpd); +- +-void device_queue_manager_init_v9_vega10( +- struct device_queue_manager_asic_ops *asic_ops) +-{ +- asic_ops->update_qpd = update_qpd_v9; +- asic_ops->init_cpsch = initialize_cpsch_v9; +- asic_ops->init_sdma_vm = init_sdma_vm_v9; +-} +- +-static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) +-{ +- uint32_t shared_base = pdd->lds_base >> 48; +- uint32_t private_base = pdd->scratch_base >> 48; +- +- return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) | +- private_base; +-} +- +-static int update_qpd_v9(struct device_queue_manager *dqm, +- struct qcm_process_device *qpd) +-{ +- struct kfd_process_device *pdd; +- +- pdd = qpd_to_pdd(qpd); +- +- /* check if sh_mem_config register already configured */ +- if (qpd->sh_mem_config == 0) { +- qpd->sh_mem_config = +- SH_MEM_ALIGNMENT_MODE_UNALIGNED << +- SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT; +- if (vega10_noretry) +- qpd->sh_mem_config |= +- 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT; +- +- qpd->sh_mem_ape1_limit = 0; +- qpd->sh_mem_ape1_base = 0; +- } +- +- qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd); +- +- pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases); +- +- return 0; +-} +- +-static int initialize_cpsch_v9(struct device_queue_manager *dqm) +-{ +- return 0; +-} +- +-static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, +- struct qcm_process_device *qpd) +-{ +- /* Not needed on SDMAv4 any more */ +- q->properties.sdma_vm_addr = 0; +-} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c +index ac8d852..7e9cae9 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c +@@ -33,44 +33,18 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t 
alternate_aperture_size); +-static int update_qpd_vi(struct device_queue_manager *dqm, ++static int register_process_vi(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); + static int initialize_cpsch_vi(struct device_queue_manager *dqm); + static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd); + +-/* +- * Tonga device queue manager functions +- */ +-static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm, +- struct qcm_process_device *qpd, +- enum cache_policy default_policy, +- enum cache_policy alternate_policy, +- void __user *alternate_aperture_base, +- uint64_t alternate_aperture_size); +-static int update_qpd_vi_tonga(struct device_queue_manager *dqm, +- struct qcm_process_device *qpd); +-static void init_sdma_vm_tonga(struct device_queue_manager *dqm, +- struct queue *q, +- struct qcm_process_device *qpd); +- +-void device_queue_manager_init_vi_tonga( +- struct device_queue_manager_asic_ops *asic_ops) +-{ +- asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi_tonga; +- asic_ops->update_qpd = update_qpd_vi_tonga; +- asic_ops->init_cpsch = initialize_cpsch_vi; +- asic_ops->init_sdma_vm = init_sdma_vm_tonga; +-} +- +- +-void device_queue_manager_init_vi( +- struct device_queue_manager_asic_ops *asic_ops) ++void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops) + { +- asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi; +- asic_ops->update_qpd = update_qpd_vi; +- asic_ops->init_cpsch = initialize_cpsch_vi; +- asic_ops->init_sdma_vm = init_sdma_vm; ++ ops->set_cache_memory_policy = set_cache_memory_policy_vi; ++ ops->register_process = register_process_vi; ++ ops->initialize = initialize_cpsch_vi; ++ ops->init_sdma_vm = init_sdma_vm; + } + + static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) +@@ -93,7 +67,7 @@ static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) + * for LDS/Scratch and GPUVM. + */ + +- WARN_ON((top_address_nybble & 1) || top_address_nybble > 0xE || ++ BUG_ON((top_address_nybble & 1) || top_address_nybble > 0xE || + top_address_nybble == 0); + + return top_address_nybble << 12 | +@@ -130,39 +104,14 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, + return true; + } + +-static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm, +- struct qcm_process_device *qpd, +- enum cache_policy default_policy, +- enum cache_policy alternate_policy, +- void __user *alternate_aperture_base, +- uint64_t alternate_aperture_size) +-{ +- uint32_t default_mtype; +- uint32_t ape1_mtype; +- +- default_mtype = (default_policy == cache_policy_coherent) ? +- MTYPE_UC : +- MTYPE_NC; +- +- ape1_mtype = (alternate_policy == cache_policy_coherent) ? 
+- MTYPE_UC : +- MTYPE_NC; +- +- qpd->sh_mem_config = +- SH_MEM_ALIGNMENT_MODE_UNALIGNED << +- SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | +- default_mtype << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | +- ape1_mtype << SH_MEM_CONFIG__APE1_MTYPE__SHIFT; +- +- return true; +-} +- +-static int update_qpd_vi(struct device_queue_manager *dqm, ++static int register_process_vi(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) + { + struct kfd_process_device *pdd; + unsigned int temp; + ++ BUG_ON(!dqm || !qpd); ++ + pdd = qpd_to_pdd(qpd); + + /* check if sh_mem_config register already configured */ +@@ -188,50 +137,14 @@ static int update_qpd_vi(struct device_queue_manager *dqm, + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); + qpd->sh_mem_config |= SH_MEM_ADDRESS_MODE_HSA64 << + SH_MEM_CONFIG__ADDRESS_MODE__SHIFT; +- qpd->sh_mem_config |= 1 << +- SH_MEM_CONFIG__PRIVATE_ATC__SHIFT; + } + +- pr_debug("is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", ++ pr_debug("kfd: is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", + qpd->pqm->process->is_32bit_user_mode, temp, qpd->sh_mem_bases); + + return 0; + } + +-static int update_qpd_vi_tonga(struct device_queue_manager *dqm, +- struct qcm_process_device *qpd) +-{ +- struct kfd_process_device *pdd; +- unsigned int temp; +- +- pdd = qpd_to_pdd(qpd); +- +- /* check if sh_mem_config register already configured */ +- if (qpd->sh_mem_config == 0) { +- qpd->sh_mem_config = +- SH_MEM_ALIGNMENT_MODE_UNALIGNED << +- SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | +- MTYPE_UC << +- SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | +- MTYPE_UC << +- SH_MEM_CONFIG__APE1_MTYPE__SHIFT; +- +- qpd->sh_mem_ape1_limit = 0; +- qpd->sh_mem_ape1_base = 0; +- } +- +- /* On dGPU we're always in GPUVM64 addressing mode with 64-bit +- * aperture addresses. +- */ +- temp = get_sh_mem_bases_nybble_64(pdd); +- qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); +- +- pr_debug("sh_mem_bases nybble: 0x%X and register 0x%X\n", +- temp, qpd->sh_mem_bases); +- +- return 0; +-} +- + static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) + { +@@ -248,20 +161,6 @@ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + q->properties.sdma_vm_addr = value; + } + +-static void init_sdma_vm_tonga(struct device_queue_manager *dqm, +- struct queue *q, +- struct qcm_process_device *qpd) +-{ +- /* On dGPU we're always in GPUVM64 addressing mode with 64-bit +- * aperture addresses. +- */ +- q->properties.sdma_vm_addr = +- ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) << +- SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) & +- SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK; +-} +- +- + static int initialize_cpsch_vi(struct device_queue_manager *dqm) + { + return 0; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +index 008d258..453c5d6 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +@@ -34,6 +34,7 @@ + */ + + #define KERNEL_DOORBELL_PASID 1 ++#define KFD_SIZE_OF_DOORBELL_IN_BYTES 4 + + /* + * Each device exposes a doorbell aperture, a PCI MMIO aperture that +@@ -50,15 +51,15 @@ + */ + + /* # of doorbell bytes allocated for each process. 
*/ +-size_t kfd_doorbell_process_slice(struct kfd_dev *kfd) ++static inline size_t doorbell_process_allocation(void) + { +- return roundup(kfd->device_info->doorbell_size * ++ return roundup(KFD_SIZE_OF_DOORBELL_IN_BYTES * + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, + PAGE_SIZE); + } + + /* Doorbell calculations for device init. */ +-int kfd_doorbell_init(struct kfd_dev *kfd) ++void kfd_doorbell_init(struct kfd_dev *kfd) + { + size_t doorbell_start_offset; + size_t doorbell_aperture_size; +@@ -72,16 +73,16 @@ int kfd_doorbell_init(struct kfd_dev *kfd) + + doorbell_start_offset = + roundup(kfd->shared_resources.doorbell_start_offset, +- kfd_doorbell_process_slice(kfd)); ++ doorbell_process_allocation()); + + doorbell_aperture_size = + rounddown(kfd->shared_resources.doorbell_aperture_size, +- kfd_doorbell_process_slice(kfd)); ++ doorbell_process_allocation()); + + if (doorbell_aperture_size > doorbell_start_offset) + doorbell_process_limit = + (doorbell_aperture_size - doorbell_start_offset) / +- kfd_doorbell_process_slice(kfd); ++ doorbell_process_allocation(); + else + doorbell_process_limit = 0; + +@@ -92,49 +93,45 @@ int kfd_doorbell_init(struct kfd_dev *kfd) + kfd->doorbell_process_limit = doorbell_process_limit - 1; + + kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base, +- kfd_doorbell_process_slice(kfd)); ++ doorbell_process_allocation()); + +- if (!kfd->doorbell_kernel_ptr) +- return -ENOMEM; ++ BUG_ON(!kfd->doorbell_kernel_ptr); + +- pr_debug("Doorbell initialization:\n"); +- pr_debug("doorbell base == 0x%08lX\n", ++ pr_debug("kfd: doorbell initialization:\n"); ++ pr_debug("kfd: doorbell base == 0x%08lX\n", + (uintptr_t)kfd->doorbell_base); + +- pr_debug("doorbell_id_offset == 0x%08lX\n", ++ pr_debug("kfd: doorbell_id_offset == 0x%08lX\n", + kfd->doorbell_id_offset); + +- pr_debug("doorbell_process_limit == 0x%08lX\n", ++ pr_debug("kfd: doorbell_process_limit == 0x%08lX\n", + doorbell_process_limit); + +- pr_debug("doorbell_kernel_offset == 0x%08lX\n", ++ pr_debug("kfd: doorbell_kernel_offset == 0x%08lX\n", + (uintptr_t)kfd->doorbell_base); + +- pr_debug("doorbell aperture size == 0x%08lX\n", ++ pr_debug("kfd: doorbell aperture size == 0x%08lX\n", + kfd->shared_resources.doorbell_aperture_size); + +- pr_debug("doorbell kernel address == 0x%08lX\n", ++ pr_debug("kfd: doorbell kernel address == 0x%08lX\n", + (uintptr_t)kfd->doorbell_kernel_ptr); +- +- return 0; +-} +- +-void kfd_doorbell_fini(struct kfd_dev *kfd) +-{ +- if (kfd->doorbell_kernel_ptr) +- iounmap(kfd->doorbell_kernel_ptr); + } + +-int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, +- struct vm_area_struct *vma) ++int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma) + { + phys_addr_t address; ++ struct kfd_dev *dev; + + /* + * For simplicitly we only allow mapping of the entire doorbell + * allocation of a single device & process. 
+ */ +- if (vma->vm_end - vma->vm_start != kfd_doorbell_process_slice(dev)) ++ if (vma->vm_end - vma->vm_start != doorbell_process_allocation()) ++ return -EINVAL; ++ ++ /* Find kfd device according to gpu id */ ++ dev = kfd_device_by_id(vma->vm_pgoff); ++ if (dev == NULL) + return -EINVAL; + + /* Calculate physical address of doorbell */ +@@ -145,29 +142,32 @@ int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + +- pr_debug("Mapping doorbell page\n" ++ pr_debug("kfd: mapping doorbell page in %s\n" + " target user address == 0x%08llX\n" + " physical address == 0x%08llX\n" + " vm_flags == 0x%04lX\n" + " size == 0x%04lX\n", ++ __func__, + (unsigned long long) vma->vm_start, address, vma->vm_flags, +- kfd_doorbell_process_slice(dev)); ++ doorbell_process_allocation()); + + + return io_remap_pfn_range(vma, + vma->vm_start, + address >> PAGE_SHIFT, +- kfd_doorbell_process_slice(dev), ++ doorbell_process_allocation(), + vma->vm_page_prot); + } + + + /* get kernel iomem pointer for a doorbell */ +-void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, ++u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, + unsigned int *doorbell_off) + { + u32 inx; + ++ BUG_ON(!kfd || !doorbell_off); ++ + mutex_lock(&kfd->doorbell_mutex); + inx = find_first_zero_bit(kfd->doorbell_available_index, + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); +@@ -178,17 +178,14 @@ void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, + if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) + return NULL; + +- inx *= kfd->device_info->doorbell_size / sizeof(u32); +- + /* + * Calculating the kernel doorbell offset using "faked" kernel +- * pasid that allocated for kernel queues only. Offset is in +- * dword units regardless of the ASIC-dependent doorbell size. 
++ * pasid that allocated for kernel queues only + */ +- *doorbell_off = KERNEL_DOORBELL_PASID * +- (kfd_doorbell_process_slice(kfd) / sizeof(u32)) + inx; ++ *doorbell_off = KERNEL_DOORBELL_PASID * (doorbell_process_allocation() / ++ sizeof(u32)) + inx; + +- pr_debug("Get kernel queue doorbell\n" ++ pr_debug("kfd: get kernel queue doorbell\n" + " doorbell offset == 0x%08X\n" + " kernel address == 0x%08lX\n", + *doorbell_off, (uintptr_t)(kfd->doorbell_kernel_ptr + inx)); +@@ -200,6 +197,8 @@ void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr) + { + unsigned int inx; + ++ BUG_ON(!kfd || !db_addr); ++ + inx = (unsigned int)(db_addr - kfd->doorbell_kernel_ptr); + + mutex_lock(&kfd->doorbell_mutex); +@@ -207,21 +206,11 @@ void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr) + mutex_unlock(&kfd->doorbell_mutex); + } + +-void write_kernel_doorbell(void __iomem *db, u32 value) ++inline void write_kernel_doorbell(u32 __iomem *db, u32 value) + { + if (db) { + writel(value, db); +- pr_debug("Writing %d to doorbell address 0x%p\n", value, db); +- } +-} +- +-void write_kernel_doorbell64(void __iomem *db, u64 value) +-{ +- if (db) { +- WARN(((unsigned long)db & 7) != 0, +- "Unaligned 64-bit doorbell"); +- writeq(value, (u64 __iomem *)db); +- pr_debug("writing %llu to doorbell address 0x%p\n", value, db); ++ pr_debug("writing %d to doorbell address 0x%p\n", value, db); + } + } + +@@ -229,26 +218,25 @@ void write_kernel_doorbell64(void __iomem *db, u64 value) + * queue_ids are in the range [0,MAX_PROCESS_QUEUES) and are mapped 1:1 + * to doorbells with the process's doorbell page + */ +-unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd, ++unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, + struct kfd_process *process, +- unsigned int doorbell_id) ++ unsigned int queue_id) + { + /* + * doorbell_id_offset accounts for doorbells taken by KGD. +- * pasid * kfd_doorbell_process_slice/sizeof(u32) adjusts to +- * the process's doorbells. The offset returned is in dword +- * units regardless of the ASIC-dependent doorbell size. 
++ * pasid * doorbell_process_allocation/sizeof(u32) adjusts ++ * to the process's doorbells + */ + return kfd->doorbell_id_offset + +- process->pasid * (kfd_doorbell_process_slice(kfd)/sizeof(u32)) + +- doorbell_id * kfd->device_info->doorbell_size / sizeof(u32); ++ process->pasid * (doorbell_process_allocation()/sizeof(u32)) + ++ queue_id; + } + + uint64_t kfd_get_number_elems(struct kfd_dev *kfd) + { + uint64_t num_of_elems = (kfd->shared_resources.doorbell_aperture_size - + kfd->shared_resources.doorbell_start_offset) / +- kfd_doorbell_process_slice(kfd) + 1; ++ doorbell_process_allocation() + 1; + + return num_of_elems; + +@@ -258,5 +246,5 @@ phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, + struct kfd_process *process) + { + return dev->doorbell_base + +- process->pasid * kfd_doorbell_process_slice(dev); ++ process->pasid * doorbell_process_allocation(); + } +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c +index 7eacf42..d1ce83d 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c +@@ -23,9 +23,9 @@ + #include <linux/mm_types.h> + #include <linux/slab.h> + #include <linux/types.h> +-#include <linux/uaccess.h> +-#include <linux/sched/mm.h> + #include <linux/sched/signal.h> ++#include <linux/uaccess.h> ++#include <linux/mm.h> + #include <linux/mman.h> + #include <linux/memory.h> + #include "kfd_priv.h" +@@ -52,9 +52,6 @@ struct kfd_event_waiter { + uint32_t input_index; + }; + +-#define SLOTS_PER_PAGE KFD_SIGNAL_EVENT_LIMIT +-#define SLOT_BITMAP_LONGS BITS_TO_LONGS(SLOTS_PER_PAGE) +- + /* + * Over-complicated pooled allocator for event notification slots. + * +@@ -68,19 +65,24 @@ struct kfd_event_waiter { + struct signal_page { + struct list_head event_pages; /* kfd_process.signal_event_pages */ + uint64_t *kernel_address; +- uint64_t handle; + uint64_t __user *user_address; + uint32_t page_index; /* Index into the mmap aperture. */ + unsigned int free_slots; +- unsigned long used_slot_bitmap[SLOT_BITMAP_LONGS]; ++ unsigned long used_slot_bitmap[0]; + }; + ++#define SLOTS_PER_PAGE KFD_SIGNAL_EVENT_LIMIT ++#define SLOT_BITMAP_SIZE BITS_TO_LONGS(SLOTS_PER_PAGE) ++#define BITS_PER_PAGE (ilog2(SLOTS_PER_PAGE)+1) ++#define SIGNAL_PAGE_SIZE (sizeof(struct signal_page) + \ ++ SLOT_BITMAP_SIZE * sizeof(long)) ++ + /* + * For signal events, the event ID is used as the interrupt user data. + * For SQ s_sendmsg interrupts, this is limited to 8 bits. 
+ */ + +-#define INTERRUPT_DATA_BITS 12 ++#define INTERRUPT_DATA_BITS 8 + #define SIGNAL_EVENT_ID_SLOT_SHIFT 0 + + static uint64_t *page_slots(struct signal_page *page) +@@ -108,7 +110,7 @@ static bool allocate_free_slot(struct kfd_process *process, + *out_page = page; + *out_slot_index = slot; + +- pr_debug("Allocated event signal slot in page %p, slot %d\n", ++ pr_debug("allocated event signal slot in page %p, slot %d\n", + page, slot); + + return true; +@@ -129,7 +131,7 @@ static bool allocate_signal_page(struct file *devkfd, struct kfd_process *p) + void *backing_store; + struct signal_page *page; + +- page = kzalloc(sizeof(*page), GFP_KERNEL); ++ page = kzalloc(SIGNAL_PAGE_SIZE, GFP_KERNEL); + if (!page) + goto fail_alloc_signal_page; + +@@ -153,9 +155,9 @@ static bool allocate_signal_page(struct file *devkfd, struct kfd_process *p) + struct signal_page, + event_pages)->page_index + 1; + +- pr_debug("Allocated new event signal page at %p, for process %p\n", ++ pr_debug("allocated new event signal page at %p, for process %p\n", + page, p); +- pr_debug("Page index is %d\n", page->page_index); ++ pr_debug("page index is %d\n", page->page_index); + + list_add(&page->event_pages, &p->signal_event_pages); + +@@ -184,53 +186,6 @@ static bool allocate_event_notification_slot(struct file *devkfd, + return ret; + } + +-static bool allocate_signal_page_dgpu(struct kfd_process *p, +- uint64_t *kernel_address, uint64_t handle) +-{ +- struct signal_page *my_page; +- +- my_page = kzalloc(sizeof(*my_page), GFP_KERNEL); +- if (!my_page) +- return false; +- +- /* prevent user-mode info leaks */ +- memset(kernel_address, (uint8_t) UNSIGNALED_EVENT_SLOT, +- KFD_SIGNAL_EVENT_LIMIT * 8); +- +- my_page->kernel_address = kernel_address; +- my_page->handle = handle; +- my_page->user_address = NULL; +- my_page->free_slots = SLOTS_PER_PAGE; +- if (list_empty(&p->signal_event_pages)) +- my_page->page_index = 0; +- else +- my_page->page_index = list_tail_entry(&p->signal_event_pages, +- struct signal_page, +- event_pages)->page_index + 1; +- +- pr_debug("Allocated new event signal page at %p, for process %p\n", +- my_page, p); +- pr_debug("Page index is %d\n", my_page->page_index); +- +- list_add(&my_page->event_pages, &p->signal_event_pages); +- +- return true; +-} +- +-void kfd_free_signal_page_dgpu(struct kfd_process *p, uint64_t handle) +-{ +- struct signal_page *page, *tmp; +- +- list_for_each_entry_safe(page, tmp, &p->signal_event_pages, +- event_pages) { +- if (page->handle == handle) { +- list_del(&page->event_pages); +- kfree(page); +- break; +- } +- } +-} +- + /* Assumes that the process's event_mutex is locked. */ + static void release_event_notification_slot(struct signal_page *page, + size_t slot_index) +@@ -239,8 +194,7 @@ static void release_event_notification_slot(struct signal_page *page, + page->free_slots++; + + /* We don't free signal pages, they are retained by the process +- * and reused until it exits. +- */ ++ * and reused until it exits. 
*/ + } + + static struct signal_page *lookup_signal_page_by_index(struct kfd_process *p, +@@ -292,7 +246,7 @@ static u32 make_nonsignal_event_id(struct kfd_process *p) + + for (id = p->next_nonsignal_event_id; + id < KFD_LAST_NONSIGNAL_EVENT_ID && +- lookup_event_by_id(p, id); ++ lookup_event_by_id(p, id) != NULL; + id++) + ; + +@@ -311,7 +265,7 @@ static u32 make_nonsignal_event_id(struct kfd_process *p) + + for (id = KFD_FIRST_NONSIGNAL_EVENT_ID; + id < KFD_LAST_NONSIGNAL_EVENT_ID && +- lookup_event_by_id(p, id); ++ lookup_event_by_id(p, id) != NULL; + id++) + ; + +@@ -337,16 +291,13 @@ static int create_signal_event(struct file *devkfd, + struct kfd_event *ev) + { + if (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT) { +- if (!p->signal_event_limit_reached) { +- pr_warn("Signal event wasn't created because limit was reached\n"); +- p->signal_event_limit_reached = true; +- } ++ pr_warn("amdkfd: Signal event wasn't created because limit was reached\n"); + return -ENOMEM; + } + + if (!allocate_event_notification_slot(devkfd, p, &ev->signal_page, +- &ev->signal_slot_index)) { +- pr_warn("Signal event wasn't created because out of kernel memory\n"); ++ &ev->signal_slot_index)) { ++ pr_warn("amdkfd: Signal event wasn't created because out of kernel memory\n"); + return -ENOMEM; + } + +@@ -358,7 +309,11 @@ static int create_signal_event(struct file *devkfd, + ev->event_id = make_signal_event_id(ev->signal_page, + ev->signal_slot_index); + +- pr_debug("Signal event number %zu created with id %d, address %p\n", ++ pr_debug("signal event number %zu created with id %d, address %p\n", ++ p->signal_event_count, ev->event_id, ++ ev->user_signal_address); ++ ++ pr_debug("signal event number %zu created with id %d, address %p\n", + p->signal_event_count, ev->event_id, + ev->user_signal_address); + +@@ -390,7 +345,7 @@ void kfd_event_init_process(struct kfd_process *p) + + static void destroy_event(struct kfd_process *p, struct kfd_event *ev) + { +- if (ev->signal_page) { ++ if (ev->signal_page != NULL) { + release_event_notification_slot(ev->signal_page, + ev->signal_slot_index); + p->signal_event_count--; +@@ -426,9 +381,8 @@ static void shutdown_signal_pages(struct kfd_process *p) + + list_for_each_entry_safe(page, tmp, &p->signal_event_pages, + event_pages) { +- if (page->user_address) +- free_pages((unsigned long)page->kernel_address, +- get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); ++ free_pages((unsigned long)page->kernel_address, ++ get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); + kfree(page); + } + } +@@ -453,8 +407,7 @@ static bool event_can_be_cpu_signaled(const struct kfd_event *ev) + int kfd_event_create(struct file *devkfd, struct kfd_process *p, + uint32_t event_type, bool auto_reset, uint32_t node_id, + uint32_t *event_id, uint32_t *event_trigger_data, +- uint64_t *event_page_offset, uint32_t *event_slot_index, +- void *kern_addr) ++ uint64_t *event_page_offset, uint32_t *event_slot_index) + { + int ret = 0; + struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL); +@@ -468,20 +421,17 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, + + INIT_LIST_HEAD(&ev->waiters); + +- mutex_lock(&p->event_mutex); +- +- if (kern_addr && list_empty(&p->signal_event_pages)) +- allocate_signal_page_dgpu(p, kern_addr, *event_page_offset); +- + *event_page_offset = 0; + ++ mutex_lock(&p->event_mutex); ++ + switch (event_type) { + case KFD_EVENT_TYPE_SIGNAL: + case KFD_EVENT_TYPE_DEBUG: + ret = create_signal_event(devkfd, p, ev); + if (!ret) { + *event_page_offset = (ev->signal_page->page_index | +- 
KFD_MMAP_TYPE_EVENTS); ++ KFD_MMAP_EVENTS_MASK); + *event_page_offset <<= PAGE_SHIFT; + *event_slot_index = ev->signal_slot_index; + } +@@ -614,7 +564,7 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, + /* + * Because we are called from arbitrary context (workqueue) as opposed + * to process context, kfd_process could attempt to exit while we are +- * running so the lookup function increments the process ref count. ++ * running so the lookup function returns a locked process. + */ + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + +@@ -634,7 +584,7 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, + * search faster. + */ + struct signal_page *page; +- unsigned int i; ++ unsigned i; + + list_for_each_entry(page, &p->signal_event_pages, event_pages) + for (i = 0; i < SLOTS_PER_PAGE; i++) +@@ -646,7 +596,7 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, + } + + mutex_unlock(&p->event_mutex); +- kfd_unref_process(p); ++ mutex_unlock(&p->mutex); + } + + static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) +@@ -667,7 +617,7 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) + return event_waiters; + } + +-static int init_event_waiter_get_status(struct kfd_process *p, ++static int init_event_waiter(struct kfd_process *p, + struct kfd_event_waiter *waiter, + uint32_t event_id, + uint32_t input_index) +@@ -682,18 +632,9 @@ static int init_event_waiter_get_status(struct kfd_process *p, + waiter->activated = ev->signaled; + ev->signaled = ev->signaled && !ev->auto_reset; + +- return 0; +-} ++ list_add(&waiter->waiters, &ev->waiters); + +-static void init_event_waiter_add_to_waitlist(struct kfd_event_waiter *waiter) +-{ +- struct kfd_event *ev = waiter->event; +- +- /* Only add to the wait list if we actually need to +- * wait on this event. +- */ +- if (!waiter->activated) +- list_add(&waiter->waiters, &ev->waiters); ++ return 0; + } + + static bool test_event_condition(bool all, uint32_t num_events, +@@ -788,11 +729,6 @@ int kfd_wait_on_events(struct kfd_process *p, + + mutex_lock(&p->event_mutex); + +- /* Set to something unreasonable - this is really +- * just a bool for now. +- */ +- *wait_result = KFD_WAIT_TIMEOUT; +- + event_waiters = alloc_event_waiters(num_events); + if (!event_waiters) { + ret = -ENOMEM; +@@ -808,34 +744,14 @@ int kfd_wait_on_events(struct kfd_process *p, + goto fail; + } + +- ret = init_event_waiter_get_status(p, &event_waiters[i], ++ ret = init_event_waiter(p, &event_waiters[i], + event_data.event_id, i); + if (ret) + goto fail; + } + +- /* Check condition once. */ +- if (test_event_condition(all, num_events, event_waiters)) { +- if (copy_signaled_event_data(num_events, +- event_waiters, events)) +- *wait_result = KFD_WAIT_COMPLETE; +- else +- *wait_result = KFD_WAIT_ERROR; +- free_waiters(num_events, event_waiters); +- } else { +- /* Add to wait lists if we need to wait. */ +- for (i = 0; i < num_events; i++) +- init_event_waiter_add_to_waitlist(&event_waiters[i]); +- } +- + mutex_unlock(&p->event_mutex); + +- /* Return if all waits were already satisfied. */ +- if (*wait_result != KFD_WAIT_TIMEOUT) { +- __set_current_state(TASK_RUNNING); +- return ret; +- } +- + while (true) { + if (fatal_signal_pending(current)) { + ret = -EINTR; +@@ -855,17 +771,6 @@ int kfd_wait_on_events(struct kfd_process *p, + break; + } + +- /* Set task state to interruptible sleep before +- * checking wake-up conditions. 
A concurrent wake-up +- * will put the task back into runnable state. In that +- * case schedule_timeout will not put the task to +- * sleep and we'll get a chance to re-check the +- * updated conditions almost immediately. Otherwise, +- * this race condition would lead to a soft hang or a +- * very long sleep. +- */ +- set_current_state(TASK_INTERRUPTIBLE); +- + if (test_event_condition(all, num_events, event_waiters)) { + if (copy_signaled_event_data(num_events, + event_waiters, events)) +@@ -880,7 +785,7 @@ int kfd_wait_on_events(struct kfd_process *p, + break; + } + +- timeout = schedule_timeout(timeout); ++ timeout = schedule_timeout_interruptible(timeout); + } + __set_current_state(TASK_RUNNING); + +@@ -911,7 +816,7 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) + /* check required size is logical */ + if (get_order(KFD_SIGNAL_EVENT_LIMIT * 8) != + get_order(vma->vm_end - vma->vm_start)) { +- pr_err("Event page mmap requested illegal size\n"); ++ pr_err("amdkfd: event page mmap requested illegal size\n"); + return -EINVAL; + } + +@@ -920,7 +825,7 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) + page = lookup_signal_page_by_index(p, page_index); + if (!page) { + /* Probably KFD bug, but mmap is user-accessible. */ +- pr_debug("Signal page could not be found for page_index %u\n", ++ pr_debug("signal page could not be found for page_index %u\n", + page_index); + return -EINVAL; + } +@@ -931,7 +836,7 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) + vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE + | VM_DONTDUMP | VM_PFNMAP; + +- pr_debug("Mapping signal page\n"); ++ pr_debug("mapping signal page\n"); + pr_debug(" start user address == 0x%08lx\n", vma->vm_start); + pr_debug(" end user address == 0x%08lx\n", vma->vm_end); + pr_debug(" pfn == 0x%016lX\n", pfn); +@@ -971,13 +876,6 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, + ev->memory_exception_data = *ev_data; + } + +- if (type == KFD_EVENT_TYPE_MEMORY) { +- dev_warn(kfd_device, +- "Sending SIGSEGV to HSA Process with PID %d ", +- p->lead_thread->pid); +- send_sig(SIGSEGV, p->lead_thread, 0); +- } +- + /* Send SIGTERM no event of type "type" has been found*/ + if (send_signal) { + if (send_sigterm) { +@@ -993,7 +891,6 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, + } + } + +-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) + void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, + unsigned long address, bool is_write_requested, + bool is_execute_requested) +@@ -1004,27 +901,17 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, + /* + * Because we are called from arbitrary context (workqueue) as opposed + * to process context, kfd_process could attempt to exit while we are +- * running so the lookup function increments the process ref count. ++ * running so the lookup function returns a locked process. + */ + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); +- struct mm_struct *mm; + + if (!p) + return; /* Presumably process exited. */ + +- /* Take a safe reference to the mm_struct, which may otherwise +- * disappear even while the kfd_process is still referenced. 
+- */ +- mm = get_task_mm(p->lead_thread); +- if (!mm) { +- kfd_unref_process(p); +- return; /* Process is exiting */ +- } +- + memset(&memory_exception_data, 0, sizeof(memory_exception_data)); + +- down_read(&mm->mmap_sem); +- vma = find_vma(mm, address); ++ down_read(&p->mm->mmap_sem); ++ vma = find_vma(p->mm, address); + + memory_exception_data.gpu_id = dev->id; + memory_exception_data.va = address; +@@ -1050,8 +937,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, + } + } + +- up_read(&mm->mmap_sem); +- mmput(mm); ++ up_read(&p->mm->mmap_sem); + + mutex_lock(&p->event_mutex); + +@@ -1060,17 +946,15 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, + &memory_exception_data); + + mutex_unlock(&p->event_mutex); +- +- kfd_unref_process(p); ++ mutex_unlock(&p->mutex); + } +-#endif /* CONFIG_AMD_IOMMU_V2_MODULE */ + + void kfd_signal_hw_exception_event(unsigned int pasid) + { + /* + * Because we are called from arbitrary context (workqueue) as opposed + * to process context, kfd_process could attempt to exit while we are +- * running so the lookup function increments the process ref count. ++ * running so the lookup function returns a locked process. + */ + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + +@@ -1083,42 +967,5 @@ void kfd_signal_hw_exception_event(unsigned int pasid) + lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_HW_EXCEPTION, NULL); + + mutex_unlock(&p->event_mutex); +- kfd_unref_process(p); +-} +- +-void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, +- struct kfd_vm_fault_info *info) +-{ +- struct kfd_event *ev; +- int bkt; +- struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); +- struct kfd_hsa_memory_exception_data memory_exception_data; +- +- if (!p) +- return; /* Presumably process exited. */ +- memset(&memory_exception_data, 0, sizeof(memory_exception_data)); +- memory_exception_data.gpu_id = dev->id; +- memory_exception_data.failure.imprecise = true; +- /* Set failure reason */ +- if (info) { +- memory_exception_data.va = (info->page_addr) << PAGE_SHIFT; +- memory_exception_data.failure.NotPresent = +- info->prot_valid ? 1 : 0; +- memory_exception_data.failure.NoExecute = +- info->prot_exec ? 1 : 0; +- memory_exception_data.failure.ReadOnly = +- info->prot_write ? 1 : 0; +- memory_exception_data.failure.imprecise = 0; +- } +- mutex_lock(&p->event_mutex); +- +- hash_for_each(p->events, bkt, ev, events) { +- if (ev->type == KFD_EVENT_TYPE_MEMORY) { +- ev->memory_exception_data = memory_exception_data; +- set_event(ev); +- } +- } +- +- mutex_unlock(&p->event_mutex); +- kfd_unref_process(p); ++ mutex_unlock(&p->mutex); + } +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +index 09595a9..2b65510 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +@@ -275,80 +275,24 @@ + * for FLAT_* / S_LOAD operations. 
+ */ + +-#define MAKE_GPUVM_APP_BASE_VI(gpu_num) \ ++#define MAKE_GPUVM_APP_BASE(gpu_num) \ + (((uint64_t)(gpu_num) << 61) + 0x1000000000000L) + +-#define MAKE_GPUVM_APP_LIMIT(base, size) \ +- (((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1) ++#define MAKE_GPUVM_APP_LIMIT(base) \ ++ (((uint64_t)(base) & \ ++ 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL) + +-#define MAKE_SCRATCH_APP_BASE_VI() \ +- (((uint64_t)(0x1UL) << 61) + 0x100000000L) ++#define MAKE_SCRATCH_APP_BASE(gpu_num) \ ++ (((uint64_t)(gpu_num) << 61) + 0x100000000L) + + #define MAKE_SCRATCH_APP_LIMIT(base) \ + (((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) + +-#define MAKE_LDS_APP_BASE_VI() \ +- (((uint64_t)(0x1UL) << 61) + 0x0) +- ++#define MAKE_LDS_APP_BASE(gpu_num) \ ++ (((uint64_t)(gpu_num) << 61) + 0x0) + #define MAKE_LDS_APP_LIMIT(base) \ + (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) + +-/* On GFXv9 the LDS and scratch apertures are programmed independently +- * using the high 16 bits of the 64-bit virtual address. They must be +- * in the hole, which will be the case as long as the high 16 bits are +- * not 0. +- * +- * The aperture sizes are still 4GB implicitly. +- * +- * A GPUVM aperture is not applicable on GFXv9. +- */ +-#define MAKE_LDS_APP_BASE_V9() ((uint64_t)(0x1UL) << 48) +-#define MAKE_SCRATCH_APP_BASE_V9() ((uint64_t)(0x2UL) << 48) +- +-/* Some VM address space reserved for kernel use (CWSR trap handlers +- * and kernel IBs) +- */ +-#define DGPU_VM_BASE_DEFAULT 0x100000 +-#define DGPU_IB_BASE_DEFAULT (DGPU_VM_BASE_DEFAULT - PAGE_SIZE) +- +-int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd, +- uint64_t base, uint64_t limit) +-{ +- if (base < (pdd->qpd.cwsr_base + pdd->dev->cwsr_size)) { +- pr_err("Set dgpu vm base 0x%llx failed.\n", base); +- return -EINVAL; +- } +- pdd->dgpu_base = base; +- pdd->dgpu_limit = limit; +- return 0; +-} +- +-void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id) +-{ +- /* +- * node id couldn't be 0 - the three MSB bits of +- * aperture shoudn't be 0 +- */ +- pdd->lds_base = MAKE_LDS_APP_BASE_VI(); +- pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); +- +- pdd->gpuvm_base = MAKE_GPUVM_APP_BASE_VI(id + 1); +- pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT( +- pdd->gpuvm_base, pdd->dev->shared_resources.gpuvm_size); +- +- pdd->scratch_base = MAKE_SCRATCH_APP_BASE_VI(); +- pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); +-} +- +-void kfd_init_apertures_v9(struct kfd_process_device *pdd, uint8_t id) +-{ +- pdd->lds_base = MAKE_LDS_APP_BASE_V9(); +- pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); +- +- pdd->scratch_base = MAKE_SCRATCH_APP_BASE_V9(); +- pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); +-} +- + int kfd_init_apertures(struct kfd_process *process) + { + uint8_t id = 0; +@@ -356,14 +300,11 @@ int kfd_init_apertures(struct kfd_process *process) + struct kfd_process_device *pdd; + + /*Iterating over all devices*/ +- while (kfd_topology_enum_kfd_devices(id, &dev) == 0) { +- if (!dev) { +- id++; /* Skip non GPU devices */ +- continue; +- } ++ while ((dev = kfd_topology_enum_kfd_devices(id)) != NULL && ++ id < NUM_OF_SUPPORTED_GPUS) { + + pdd = kfd_create_process_device_data(dev, process); +- if (!pdd) { ++ if (pdd == NULL) { + pr_err("Failed to create process device data\n"); + return -1; + } +@@ -377,29 +318,23 @@ int kfd_init_apertures(struct kfd_process *process) + pdd->gpuvm_base = pdd->gpuvm_limit = 0; + pdd->scratch_base = pdd->scratch_limit = 0; + } else { +- switch 
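The aperture macros above carve fixed windows out of the 64-bit flat address space by steering the three most significant bits. A small userspace demonstration of the VI layout, mirroring MAKE_LDS_APP_BASE_VI(), MAKE_SCRATCH_APP_BASE_VI() and MAKE_GPUVM_APP_BASE_VI() with their limit macros (the gpu_num value is arbitrary):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t lds_base     = ((uint64_t)1 << 61) + 0x0;
	uint64_t lds_limit    = (lds_base & 0xFFFFFFFF00000000ULL) | 0xFFFFFFFF;
	uint64_t scratch_base = ((uint64_t)1 << 61) + 0x100000000ULL;
	uint64_t gpuvm_base   = ((uint64_t)2 << 61) + 0x1000000000000ULL; /* gpu_num == 2 */

	printf("LDS     base 0x%016llx limit 0x%016llx\n",
	       (unsigned long long)lds_base, (unsigned long long)lds_limit);
	printf("scratch base 0x%016llx\n", (unsigned long long)scratch_base);
	printf("GPUVM   base 0x%016llx\n", (unsigned long long)gpuvm_base);
	return 0;
}

The LDS and scratch apertures are implicitly 4 GB wide because the limit keeps the base's high dword and saturates the low dword; the helpers pass id + 1 so the three MSBs are never zero, as the in-tree comment insists.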
(dev->device_info->asic_family) { +- case CHIP_KAVERI: +- case CHIP_HAWAII: +- case CHIP_CARRIZO: +- case CHIP_TONGA: +- case CHIP_FIJI: +- case CHIP_POLARIS10: +- case CHIP_POLARIS11: +- kfd_init_apertures_vi(pdd, id); +- break; +- case CHIP_VEGA10: +- case CHIP_RAVEN: +- kfd_init_apertures_v9(pdd, id); +- break; +- default: +- pr_err("Unknown chip in kfd_init_apertures\n"); +- return -1; +- } ++ /* ++ * node id couldn't be 0 - the three MSB bits of ++ * aperture shoudn't be 0 ++ */ ++ pdd->lds_base = MAKE_LDS_APP_BASE(id + 1); ++ ++ pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); ++ ++ pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1); ++ ++ pdd->gpuvm_limit = ++ MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base); + +- if (!dev->device_info->is_need_iommu_device) { +- pdd->qpd.cwsr_base = DGPU_VM_BASE_DEFAULT; +- pdd->qpd.ib_base = DGPU_IB_BASE_DEFAULT; +- } ++ pdd->scratch_base = MAKE_SCRATCH_APP_BASE(id + 1); ++ ++ pdd->scratch_limit = ++ MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); + } + + dev_dbg(kfd_device, "node id %u\n", id); +@@ -417,9 +352,4 @@ int kfd_init_apertures(struct kfd_process *process) + return 0; + } + +-void kfd_flush_tlb(struct kfd_dev *dev, uint32_t pasid) +-{ +- const struct kfd2kgd_calls *f2g = dev->kfd2kgd; + +- f2g->invalidate_tlbs(dev->kgd, pasid); +-} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +deleted file mode 100644 +index b2c6b52..0000000 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c ++++ /dev/null +@@ -1,133 +0,0 @@ +-/* +- * Copyright 2016 Advanced Micro Devices, Inc. +- * +- * Permission is hereby granted, free of charge, to any person obtaining a +- * copy of this software and associated documentation files (the "Software"), +- * to deal in the Software without restriction, including without limitation +- * the rights to use, copy, modify, merge, publish, distribute, sublicense, +- * and/or sell copies of the Software, and to permit persons to whom the +- * Software is furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in +- * all copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +- * OTHER DEALINGS IN THE SOFTWARE. 
+- */ +- +-#include "kfd_priv.h" +-#include "kfd_events.h" +-#include "soc15_int.h" +- +- +-static uint32_t kfd_get_pasid_from_vmid(struct kfd_dev *dev, uint8_t vmid) +-{ +- uint32_t pasid = 0; +- const struct kfd2kgd_calls *f2g = dev->kfd2kgd; +- +- if (f2g->get_atc_vmid_pasid_mapping_valid(dev->kgd, vmid)) +- pasid = f2g->get_atc_vmid_pasid_mapping_pasid(dev->kgd, vmid); +- +- return pasid; +-} +- +-static bool event_interrupt_isr_v9(struct kfd_dev *dev, +- const uint32_t *ih_ring_entry, +- uint32_t *patched_ihre, +- bool *patched_flag) +-{ +- uint16_t source_id, client_id, pasid, vmid; +- bool result = false; +- +- source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); +- client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); +- pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); +- vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); +- +- if (pasid) { +- const uint32_t *data = ih_ring_entry; +- +- pr_debug("client id 0x%x, source id %d, pasid 0x%x. raw data:\n", +- client_id, source_id, pasid); +- pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", +- data[0], data[1], data[2], data[3], +- data[4], data[5], data[6], data[7]); +- } +- +- if ((vmid >= dev->vm_info.first_vmid_kfd && +- vmid <= dev->vm_info.last_vmid_kfd) && +- (source_id == SOC15_INTSRC_CP_END_OF_PIPE || +- source_id == SOC15_INTSRC_SDMA_TRAP || +- source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG || +- source_id == SOC15_INTSRC_CP_BAD_OPCODE || +- client_id == SOC15_IH_CLIENTID_VMC || +- client_id == SOC15_IH_CLIENTID_UTCL2)) { +- +- /* +- * KFD want to handle this INT, but MEC firmware did +- * not send pasid. Try to get it from vmid mapping +- * and patch the ih entry. It's a temp workaround. +- */ +- WARN_ONCE((!pasid), "Fix me.\n"); +- if (!pasid) { +- uint32_t temp = le32_to_cpu(ih_ring_entry[3]); +- +- pasid = kfd_get_pasid_from_vmid(dev, vmid); +- memcpy(patched_ihre, ih_ring_entry, +- dev->device_info->ih_ring_entry_size); +- patched_ihre[3] = cpu_to_le32(temp | pasid); +- *patched_flag = true; +- } +- result = pasid ? true : false; +- } +- +- /* Do not process in ISR, just request it to be forwarded to WQ. 
*/ +- return result; +- +-} +- +-static void event_interrupt_wq_v9(struct kfd_dev *dev, +- const uint32_t *ih_ring_entry) +-{ +- uint16_t source_id, client_id, pasid, vmid; +- +- source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); +- client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); +- pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); +- vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); +- +- if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) +- kfd_signal_event_interrupt(pasid, 0, 0); +- else if (source_id == SOC15_INTSRC_SDMA_TRAP) +- kfd_signal_event_interrupt(pasid, 0, 0); +- else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) +- kfd_signal_event_interrupt(pasid, 0, 0); /*todo */ +- else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) +- kfd_signal_hw_exception_event(pasid); +- else if (client_id == SOC15_IH_CLIENTID_VMC || +- client_id == SOC15_IH_CLIENTID_UTCL2) { +- struct kfd_vm_fault_info info = {0}; +- uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); +- +- info.vmid = vmid; +- info.mc_id = client_id; +- info.page_addr = ih_ring_entry[4] | +- (uint64_t)(ih_ring_entry[5] & 0xf) << 32; +- info.prot_valid = ring_id & 0x08; +- info.prot_read = ring_id & 0x10; +- info.prot_write = ring_id & 0x20; +- +- kfd_process_vm_fault(dev->dqm, pasid); +- kfd_signal_vm_fault_event(dev, pasid, &info); +- } +-} +- +-const struct kfd_event_interrupt_class event_interrupt_class_v9 = { +- .interrupt_isr = event_interrupt_isr_v9, +- .interrupt_wq = event_interrupt_wq_v9, +-}; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +index 47dcf4a..7f134aa 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +@@ -44,24 +44,24 @@ + #include <linux/device.h> + #include "kfd_priv.h" + +-#define KFD_IH_NUM_ENTRIES 8192 ++#define KFD_INTERRUPT_RING_SIZE 1024 + + static void interrupt_wq(struct work_struct *); + + int kfd_interrupt_init(struct kfd_dev *kfd) + { +- int r; +- +- r = kfifo_alloc(&kfd->ih_fifo, +- KFD_IH_NUM_ENTRIES * +- kfd->device_info->ih_ring_entry_size, +- GFP_KERNEL); +- if (r) { +- dev_err(kfd_chardev(), "Failed to allocate IH fifo\n"); +- return r; +- } ++ void *interrupt_ring = kmalloc_array(KFD_INTERRUPT_RING_SIZE, ++ kfd->device_info->ih_ring_entry_size, ++ GFP_KERNEL); ++ if (!interrupt_ring) ++ return -ENOMEM; ++ ++ kfd->interrupt_ring = interrupt_ring; ++ kfd->interrupt_ring_size = ++ KFD_INTERRUPT_RING_SIZE * kfd->device_info->ih_ring_entry_size; ++ atomic_set(&kfd->interrupt_ring_wptr, 0); ++ atomic_set(&kfd->interrupt_ring_rptr, 0); + +- kfd->ih_wq = alloc_workqueue("KFD IH", WQ_HIGHPRI, 1); + spin_lock_init(&kfd->interrupt_lock); + + INIT_WORK(&kfd->interrupt_work, interrupt_wq); +@@ -92,47 +92,74 @@ void kfd_interrupt_exit(struct kfd_dev *kfd) + spin_unlock_irqrestore(&kfd->interrupt_lock, flags); + + /* +- * flush_work ensures that there are no outstanding ++ * Flush_scheduled_work ensures that there are no outstanding + * work-queue items that will access interrupt_ring. New work items + * can't be created because we stopped interrupt handling above. + */ +- flush_workqueue(kfd->ih_wq); ++ flush_scheduled_work(); + +- kfifo_free(&kfd->ih_fifo); ++ kfree(kfd->interrupt_ring); + } + + /* +- * Assumption: single reader/writer. This function is not re-entrant ++ * This assumes that it can't be called concurrently with itself ++ * but only with dequeue_ih_ring_entry. 
+ */ + bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry) + { +- int count; ++ unsigned int rptr = atomic_read(&kfd->interrupt_ring_rptr); ++ unsigned int wptr = atomic_read(&kfd->interrupt_ring_wptr); + +- count = kfifo_in(&kfd->ih_fifo, ih_ring_entry, +- kfd->device_info->ih_ring_entry_size); +- if (count != kfd->device_info->ih_ring_entry_size) { ++ if ((rptr - wptr) % kfd->interrupt_ring_size == ++ kfd->device_info->ih_ring_entry_size) { ++ /* This is very bad, the system is likely to hang. */ + dev_err_ratelimited(kfd_chardev(), +- "Interrupt ring overflow, dropping interrupt %d\n", +- count); ++ "Interrupt ring overflow, dropping interrupt.\n"); + return false; + } + ++ memcpy(kfd->interrupt_ring + wptr, ih_ring_entry, ++ kfd->device_info->ih_ring_entry_size); ++ ++ wptr = (wptr + kfd->device_info->ih_ring_entry_size) % ++ kfd->interrupt_ring_size; ++ smp_wmb(); /* Ensure memcpy'd data is visible before wptr update. */ ++ atomic_set(&kfd->interrupt_ring_wptr, wptr); ++ + return true; + } + + /* +- * Assumption: single reader/writer. This function is not re-entrant ++ * This assumes that it can't be called concurrently with itself ++ * but only with enqueue_ih_ring_entry. + */ + static bool dequeue_ih_ring_entry(struct kfd_dev *kfd, void *ih_ring_entry) + { +- int count; ++ /* ++ * Assume that wait queues have an implicit barrier, i.e. anything that ++ * happened in the ISR before it queued work is visible. ++ */ ++ ++ unsigned int wptr = atomic_read(&kfd->interrupt_ring_wptr); ++ unsigned int rptr = atomic_read(&kfd->interrupt_ring_rptr); + +- count = kfifo_out(&kfd->ih_fifo, ih_ring_entry, +- kfd->device_info->ih_ring_entry_size); ++ if (rptr == wptr) ++ return false; + +- WARN_ON(count && count != kfd->device_info->ih_ring_entry_size); ++ memcpy(ih_ring_entry, kfd->interrupt_ring + rptr, ++ kfd->device_info->ih_ring_entry_size); + +- return count == kfd->device_info->ih_ring_entry_size; ++ rptr = (rptr + kfd->device_info->ih_ring_entry_size) % ++ kfd->interrupt_ring_size; ++ ++ /* ++ * Ensure the rptr write update is not visible until ++ * memcpy has finished reading. ++ */ ++ smp_mb(); ++ atomic_set(&kfd->interrupt_ring_rptr, rptr); ++ ++ return true; + } + + static void interrupt_wq(struct work_struct *work) +@@ -149,15 +176,13 @@ static void interrupt_wq(struct work_struct *work) + ih_ring_entry); + } + +-bool interrupt_is_wanted(struct kfd_dev *dev, +- const uint32_t *ih_ring_entry, +- uint32_t *patched_ihre, bool *flag) ++bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry) + { + /* integer and bitwise OR so there is no boolean short-circuiting */ +- unsigned int wanted = 0; ++ unsigned wanted = 0; + + wanted |= dev->device_info->event_interrupt_class->interrupt_isr(dev, +- ih_ring_entry, patched_ihre, flag); ++ ih_ring_entry); + + return wanted != 0; + } +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c +deleted file mode 100644 +index e67eb9f..0000000 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c ++++ /dev/null +@@ -1,275 +0,0 @@ +-/* +- * Copyright 2014 Advanced Micro Devices, Inc. 
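This hunk swaps the kfifo-based interrupt ring back to a hand-rolled single-producer/single-consumer ring, with smp_wmb()/smp_mb() ordering the payload copies against the atomic index updates. A compact userspace analogue of the same scheme, using C11 release/acquire in place of the kernel barriers; the sizes are illustrative, and the ring size must stay a power of two so unsigned wraparound and the modulo agree:

#include <stdatomic.h>
#include <stdbool.h>
#include <string.h>

#define SLOT_SIZE 32u			/* bytes per entry */
#define RING_SIZE (64u * SLOT_SIZE)	/* power of two */

static unsigned char ring[RING_SIZE];
static _Atomic unsigned int rptr, wptr;	/* byte offsets */

bool ring_enqueue(const void *entry)	/* ISR side */
{
	unsigned int w = atomic_load_explicit(&wptr, memory_order_relaxed);
	unsigned int r = atomic_load_explicit(&rptr, memory_order_acquire);

	if ((r - w) % RING_SIZE == SLOT_SIZE)
		return false;		/* full: one slot is kept free */

	memcpy(&ring[w], entry, SLOT_SIZE);
	/* release: the copied entry is visible before the new wptr */
	atomic_store_explicit(&wptr, (w + SLOT_SIZE) % RING_SIZE,
			      memory_order_release);
	return true;
}

bool ring_dequeue(void *entry)		/* workqueue side */
{
	unsigned int r = atomic_load_explicit(&rptr, memory_order_relaxed);
	unsigned int w = atomic_load_explicit(&wptr, memory_order_acquire);

	if (r == w)
		return false;		/* empty */

	memcpy(entry, &ring[r], SLOT_SIZE);
	/* release: the copy finishes before the slot is handed back */
	atomic_store_explicit(&rptr, (r + SLOT_SIZE) % RING_SIZE,
			      memory_order_release);
	return true;
}

Keeping one slot free is what lets rptr == wptr unambiguously mean "empty", matching the overflow test in enqueue_ih_ring_entry() above.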
+- * +- * Permission is hereby granted, free of charge, to any person obtaining a +- * copy of this software and associated documentation files (the "Software"), +- * to deal in the Software without restriction, including without limitation +- * the rights to use, copy, modify, merge, publish, distribute, sublicense, +- * and/or sell copies of the Software, and to permit persons to whom the +- * Software is furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in +- * all copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +- * OTHER DEALINGS IN THE SOFTWARE. +- */ +- +-#include <linux/dma-buf.h> +-#include <linux/slab.h> +-#include <linux/random.h> +- +-#include "kfd_ipc.h" +-#include "kfd_priv.h" +- +-#define KFD_IPC_HASH_TABLE_SIZE_SHIFT 4 +-#define KFD_IPC_HASH_TABLE_SIZE_MASK ((1 << KFD_IPC_HASH_TABLE_SIZE_SHIFT) - 1) +- +-static struct kfd_ipc_handles { +- DECLARE_HASHTABLE(handles, KFD_IPC_HASH_TABLE_SIZE_SHIFT); +- struct mutex lock; +-} kfd_ipc_handles; +- +-/* Since, handles are random numbers, it can be used directly as hashing key. +- * The least 4 bits of the handle are used as key. However, during import all +- * 128 bits of the handle are checked to prevent handle snooping. +- */ +-#define HANDLE_TO_KEY(sh) ((*(uint64_t *)sh) & KFD_IPC_HASH_TABLE_SIZE_MASK) +- +-static int ipc_store_insert(void *val, void *sh, struct kfd_ipc_obj **ipc_obj) +-{ +- struct kfd_ipc_obj *obj; +- +- obj = kmalloc(sizeof(*obj), GFP_KERNEL); +- if (!obj) +- return -ENOMEM; +- +- /* The initial ref belongs to the allocator process. +- * The IPC object store itself does not hold a ref since +- * there is no specific moment in time where that ref should +- * be dropped, except "when there are no more userspace processes +- * holding a ref to the object". Therefore the removal from IPC +- * storage happens at ipc_obj release time. 
+- */ +- kref_init(&obj->ref); +- obj->data = val; +- get_random_bytes(obj->share_handle, sizeof(obj->share_handle)); +- +- memcpy(sh, obj->share_handle, sizeof(obj->share_handle)); +- +- mutex_lock(&kfd_ipc_handles.lock); +- hlist_add_head(&obj->node, +- &kfd_ipc_handles.handles[HANDLE_TO_KEY(obj->share_handle)]); +- mutex_unlock(&kfd_ipc_handles.lock); +- +- if (ipc_obj) +- *ipc_obj = obj; +- +- return 0; +-} +- +-static void ipc_obj_release(struct kref *r) +-{ +- struct kfd_ipc_obj *obj; +- +- obj = container_of(r, struct kfd_ipc_obj, ref); +- +- mutex_lock(&kfd_ipc_handles.lock); +- hash_del(&obj->node); +- mutex_unlock(&kfd_ipc_handles.lock); +- +- dma_buf_put(obj->data); +- kfree(obj); +-} +- +-void ipc_obj_get(struct kfd_ipc_obj *obj) +-{ +- kref_get(&obj->ref); +-} +- +-void ipc_obj_put(struct kfd_ipc_obj **obj) +-{ +- kref_put(&(*obj)->ref, ipc_obj_release); +- *obj = NULL; +-} +- +-int kfd_ipc_init(void) +-{ +- mutex_init(&kfd_ipc_handles.lock); +- hash_init(kfd_ipc_handles.handles); +- return 0; +-} +- +-static int kfd_import_dmabuf_create_kfd_bo(struct kfd_dev *dev, +- struct kfd_process *p, +- uint32_t gpu_id, struct dma_buf *dmabuf, +- uint64_t va_addr, uint64_t *handle, +- uint64_t *mmap_offset, +- struct kfd_ipc_obj *ipc_obj) +-{ +- int r; +- void *mem; +- uint64_t size; +- int idr_handle; +- struct kfd_process_device *pdd = NULL; +- uint64_t kfd_mmap_flags = KFD_MMAP_TYPE_MAP_BO | +- KFD_MMAP_GPU_ID(gpu_id); +- +- if (!handle) +- return -EINVAL; +- +- if (!dev || !dev->kfd2kgd->import_dmabuf) +- return -EINVAL; +- +- mutex_lock(&p->mutex); +- +- pdd = kfd_bind_process_to_device(dev, p); +- if (IS_ERR(pdd)) { +- r = PTR_ERR(pdd); +- goto err_unlock; +- } +- +- r = dev->kfd2kgd->import_dmabuf(dev->kgd, dmabuf, +- va_addr, pdd->vm, +- (struct kgd_mem **)&mem, &size, +- mmap_offset); +- if (r) +- goto err_unlock; +- +- idr_handle = kfd_process_device_create_obj_handle(pdd, mem, +- va_addr, size, +- ipc_obj); +- if (idr_handle < 0) { +- r = -EFAULT; +- goto err_free; +- } +- +- mutex_unlock(&p->mutex); +- +- *handle = MAKE_HANDLE(gpu_id, idr_handle); +- if (mmap_offset) +- *mmap_offset = (kfd_mmap_flags << PAGE_SHIFT) | *mmap_offset; +- +- return 0; +- +-err_free: +- dev->kfd2kgd->free_memory_of_gpu(dev->kgd, +- (struct kgd_mem *)mem, +- pdd->vm); +-err_unlock: +- mutex_unlock(&p->mutex); +- return r; +-} +- +-int kfd_ipc_import_dmabuf(struct kfd_dev *dev, +- struct kfd_process *p, +- uint32_t gpu_id, int dmabuf_fd, +- uint64_t va_addr, uint64_t *handle, +- uint64_t *mmap_offset) +-{ +- int r; +- struct dma_buf *dmabuf = dma_buf_get(dmabuf_fd); +- +- if (!dmabuf) +- return -EINVAL; +- +- r = kfd_import_dmabuf_create_kfd_bo(dev, p, gpu_id, dmabuf, +- va_addr, handle, mmap_offset, +- NULL); +- dma_buf_put(dmabuf); +- return r; +-} +- +-int kfd_ipc_import_handle(struct kfd_dev *dev, struct kfd_process *p, +- uint32_t gpu_id, uint32_t *share_handle, +- uint64_t va_addr, uint64_t *handle, +- uint64_t *mmap_offset) +-{ +- int r; +- struct kfd_ipc_obj *entry, *found = NULL; +- +- mutex_lock(&kfd_ipc_handles.lock); +- /* Convert the user provided handle to hash key and search only in that +- * bucket +- */ +- hlist_for_each_entry(entry, +- &kfd_ipc_handles.handles[HANDLE_TO_KEY(share_handle)], node) { +- if (!memcmp(entry->share_handle, share_handle, +- sizeof(entry->share_handle))) { +- found = entry; +- break; +- } +- } +- mutex_unlock(&kfd_ipc_handles.lock); +- +- if (!found) +- return -EINVAL; +- ipc_obj_get(found); +- +- pr_debug("Found ipc_dma_buf: %p\n", found->data); +- +- r = 
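The IPC store being deleted here combines three standard kernel pieces: a kref for lifetime, a small hashtable keyed by the low bits of the handle, and get_random_bytes() to make the 128-bit handle unguessable (importers still compare all 128 bits, per the HANDLE_TO_KEY comment). A condensed sketch under those assumptions, with illustrative names:

#include <linux/hashtable.h>
#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/mutex.h>
#include <linux/random.h>
#include <linux/slab.h>

struct ipc_obj {
	struct hlist_node node;
	struct kref ref;
	u32 handle[4];			/* 128 random bits */
};

static DEFINE_MUTEX(store_lock);
static DEFINE_HASHTABLE(store, 4);	/* 2^4 buckets */

static struct ipc_obj *ipc_obj_new(void)
{
	struct ipc_obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

	if (!o)
		return NULL;
	kref_init(&o->ref);		/* initial ref owned by the creator */
	get_random_bytes(o->handle, sizeof(o->handle));

	mutex_lock(&store_lock);
	hash_add(store, &o->node, o->handle[0]);  /* low bits pick the bucket */
	mutex_unlock(&store_lock);
	return o;
}

static void ipc_obj_free(struct kref *r)
{
	struct ipc_obj *o = container_of(r, struct ipc_obj, ref);

	mutex_lock(&store_lock);
	hash_del(&o->node);
	mutex_unlock(&store_lock);
	kfree(o);
}
/* drop a reference with: kref_put(&o->ref, ipc_obj_free); */

As the deleted comment notes, the store itself holds no reference; the object unhashes itself when the last user drops theirs.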
kfd_import_dmabuf_create_kfd_bo(dev, p, gpu_id, found->data, +- va_addr, handle, mmap_offset, +- found); +- if (r) +- goto error_unref; +- +- return r; +- +-error_unref: +- ipc_obj_put(&found); +- return r; +-} +- +-int kfd_ipc_export_as_handle(struct kfd_dev *dev, struct kfd_process *p, +- uint64_t handle, uint32_t *ipc_handle) +-{ +- struct kfd_process_device *pdd = NULL; +- struct kfd_ipc_obj *obj; +- struct kfd_bo *kfd_bo = NULL; +- struct dma_buf *dmabuf; +- int r; +- +- if (!dev || !ipc_handle) +- return -EINVAL; +- +- mutex_lock(&p->mutex); +- pdd = kfd_bind_process_to_device(dev, p); +- if (IS_ERR(pdd)) { +- mutex_unlock(&p->mutex); +- pr_err("Failed to get pdd\n"); +- return PTR_ERR(pdd); +- } +- +- kfd_bo = kfd_process_device_find_bo(pdd, GET_IDR_HANDLE(handle)); +- mutex_unlock(&p->mutex); +- +- if (!kfd_bo) { +- pr_err("Failed to get bo"); +- return -EINVAL; +- } +- if (kfd_bo->kfd_ipc_obj) { +- memcpy(ipc_handle, kfd_bo->kfd_ipc_obj->share_handle, +- sizeof(kfd_bo->kfd_ipc_obj->share_handle)); +- return 0; +- } +- +- r = dev->kfd2kgd->export_dmabuf(dev->kgd, pdd->vm, +- (struct kgd_mem *)kfd_bo->mem, +- &dmabuf); +- if (r) +- return r; +- +- r = ipc_store_insert(dmabuf, ipc_handle, &obj); +- if (r) +- return r; +- +- kfd_bo->kfd_ipc_obj = obj; +- +- return r; +-} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.h b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.h +deleted file mode 100644 +index 9ee8627..0000000 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.h ++++ /dev/null +@@ -1,51 +0,0 @@ +-/* +- * Copyright 2014 Advanced Micro Devices, Inc. +- * +- * Permission is hereby granted, free of charge, to any person obtaining a +- * copy of this software and associated documentation files (the "Software"), +- * to deal in the Software without restriction, including without limitation +- * the rights to use, copy, modify, merge, publish, distribute, sublicense, +- * and/or sell copies of the Software, and to permit persons to whom the +- * Software is furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in +- * all copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +- * OTHER DEALINGS IN THE SOFTWARE. 
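kfd_ipc_import_dmabuf() above turns a file descriptor into a dma_buf reference with dma_buf_get() and drops it again with dma_buf_put() once the driver holds its own reference. One caveat worth noting: dma_buf_get() reports failure with an ERR_PTR(), never NULL, so IS_ERR() is the robust check (the code above tests for NULL instead). A minimal sketch of the handoff, with the driver-specific BO creation elided:

#include <linux/dma-buf.h>
#include <linux/err.h>

static int import_from_fd(int dmabuf_fd)
{
	struct dma_buf *dmabuf = dma_buf_get(dmabuf_fd);	/* takes a ref */

	if (IS_ERR(dmabuf))
		return PTR_ERR(dmabuf);

	/* ... create a driver BO that takes its own dma_buf reference ... */

	dma_buf_put(dmabuf);		/* drop the fd-lookup reference */
	return 0;
}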
+- * +- */ +- +-#ifndef KFD_IPC_H_ +-#define KFD_IPC_H_ +- +-#include <linux/types.h> +-#include "kfd_priv.h" +- +-struct kfd_ipc_obj { +- struct hlist_node node; +- struct kref ref; +- void *data; +- uint32_t share_handle[4]; +-}; +- +-int kfd_ipc_import_handle(struct kfd_dev *dev, struct kfd_process *p, +- uint32_t gpu_id, uint32_t *share_handle, +- uint64_t va_addr, uint64_t *handle, +- uint64_t *mmap_offset); +-int kfd_ipc_import_dmabuf(struct kfd_dev *kfd, struct kfd_process *p, +- uint32_t gpu_id, int dmabuf_fd, +- uint64_t va_addr, uint64_t *handle, +- uint64_t *mmap_offset); +-int kfd_ipc_export_as_handle(struct kfd_dev *dev, struct kfd_process *p, +- uint64_t handle, uint32_t *ipc_handle); +- +-void ipc_obj_get(struct kfd_ipc_obj *obj); +-void ipc_obj_put(struct kfd_ipc_obj **obj); +- +-#endif /* KFD_IPC_H_ */ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +index 903ef25..d135cd0 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +@@ -41,8 +41,11 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, + int retval; + union PM4_MES_TYPE_3_HEADER nop; + +- pr_debug("Initializing queue type %d size %d\n", KFD_QUEUE_TYPE_HIQ, +- queue_size); ++ BUG_ON(!kq || !dev); ++ BUG_ON(type != KFD_QUEUE_TYPE_DIQ && type != KFD_QUEUE_TYPE_HIQ); ++ ++ pr_debug("amdkfd: In func %s initializing queue type %d size %d\n", ++ __func__, KFD_QUEUE_TYPE_HIQ, queue_size); + + memset(&prop, 0, sizeof(prop)); + memset(&nop, 0, sizeof(nop)); +@@ -60,23 +63,23 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, + KFD_MQD_TYPE_HIQ); + break; + default: +- pr_err("Invalid queue type %d\n", type); +- return false; ++ BUG(); ++ break; + } + +- if (!kq->mqd) ++ if (kq->mqd == NULL) + return false; + + prop.doorbell_ptr = kfd_get_kernel_doorbell(dev, &prop.doorbell_off); + +- if (!prop.doorbell_ptr) { +- pr_err("Failed to initialize doorbell"); ++ if (prop.doorbell_ptr == NULL) { ++ pr_err("amdkfd: error init doorbell"); + goto err_get_kernel_doorbell; + } + + retval = kfd_gtt_sa_allocate(dev, queue_size, &kq->pq); + if (retval != 0) { +- pr_err("Failed to init pq queues size %d\n", queue_size); ++ pr_err("amdkfd: error init pq queues size (%d)\n", queue_size); + goto err_pq_allocate_vidmem; + } + +@@ -84,7 +87,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, + kq->pq_gpu_addr = kq->pq->gpu_addr; + + retval = kq->ops_asic_specific.initialize(kq, dev, type, queue_size); +- if (!retval) ++ if (retval == false) + goto err_eop_allocate_vidmem; + + retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->rptr_kernel), +@@ -96,7 +99,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, + kq->rptr_kernel = kq->rptr_mem->cpu_ptr; + kq->rptr_gpu_addr = kq->rptr_mem->gpu_addr; + +- retval = kfd_gtt_sa_allocate(dev, dev->device_info->doorbell_size, ++ retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->wptr_kernel), + &kq->wptr_mem); + + if (retval != 0) +@@ -120,7 +123,6 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, + prop.write_ptr = (uint32_t *) kq->wptr_gpu_addr; + prop.eop_ring_buffer_address = kq->eop_gpu_addr; + prop.eop_ring_buffer_size = PAGE_SIZE; +- prop.cu_mask = NULL; + + if (init_queue(&kq->queue, &prop) != 0) + goto err_init_queue; +@@ -137,12 +139,11 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, + + /* assign HIQ to HQD */ + if (type == KFD_QUEUE_TYPE_HIQ) { +- 
pr_debug("Assigning hiq to hqd\n"); ++ pr_debug("assigning hiq to hqd\n"); + kq->queue->pipe = KFD_CIK_HIQ_PIPE; + kq->queue->queue = KFD_CIK_HIQ_QUEUE; + kq->mqd->load_mqd(kq->mqd, kq->queue->mqd, kq->queue->pipe, +- kq->queue->queue, &kq->queue->properties, +- NULL); ++ kq->queue->queue, NULL); + } else { + /* allocate fence for DIQ */ + +@@ -179,10 +180,12 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, + + static void uninitialize(struct kernel_queue *kq) + { ++ BUG_ON(!kq); ++ + if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ) + kq->mqd->destroy_mqd(kq->mqd, +- kq->queue->mqd, +- KFD_PREEMPT_TYPE_WAVEFRONT_RESET, ++ NULL, ++ false, + QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS, + kq->queue->pipe, + kq->queue->queue); +@@ -206,17 +209,12 @@ static int acquire_packet_buffer(struct kernel_queue *kq, + size_t available_size; + size_t queue_size_dwords; + uint32_t wptr, rptr; +- uint64_t wptr64; + unsigned int *queue_address; + +- /* When rptr == wptr, the buffer is empty. +- * When rptr == wptr + 1, the buffer is full. +- * It is always rptr that advances to the position of wptr, rather than +- * the opposite. So we can only use up to queue_size_dwords - 1 dwords. +- */ ++ BUG_ON(!kq || !buffer_ptr); ++ + rptr = *kq->rptr_kernel; +- wptr = kq->pending_wptr; +- wptr64 = kq->pending_wptr64; ++ wptr = *kq->wptr_kernel; + queue_address = (unsigned int *)kq->pq_kernel_addr; + queue_size_dwords = kq->queue->properties.queue_size / sizeof(uint32_t); + +@@ -224,72 +222,28 @@ static int acquire_packet_buffer(struct kernel_queue *kq, + pr_debug("wptr: %d\n", wptr); + pr_debug("queue_address 0x%p\n", queue_address); + +- available_size = (rptr + queue_size_dwords - 1 - wptr) % ++ available_size = (rptr - 1 - wptr + queue_size_dwords) % + queue_size_dwords; + +- if (packet_size_in_dwords > available_size) { ++ if (packet_size_in_dwords >= queue_size_dwords || ++ packet_size_in_dwords >= available_size) { + /* + * make sure calling functions know + * acquire_packet_buffer() failed + */ +- goto err_no_space; ++ *buffer_ptr = NULL; ++ return -ENOMEM; + } + + if (wptr + packet_size_in_dwords >= queue_size_dwords) { +- /* make sure after rolling back to position 0, there is +- * still enough space. +- */ +- if (packet_size_in_dwords >= rptr) +- goto err_no_space; +- +- /* fill nops, roll back and start at position 0 */ + while (wptr > 0) { + queue_address[wptr] = kq->nop_packet; + wptr = (wptr + 1) % queue_size_dwords; +- wptr64++; + } + } + + *buffer_ptr = &queue_address[wptr]; + kq->pending_wptr = wptr + packet_size_in_dwords; +- kq->pending_wptr64 = wptr64 + packet_size_in_dwords; +- +- return 0; +- +-err_no_space: +- *buffer_ptr = NULL; +- return -ENOMEM; +-} +- +-static int acquire_inline_ib(struct kernel_queue *kq, +- size_t size_in_dwords, +- unsigned int **buffer_ptr, +- uint64_t *gpu_addr) +-{ +- int ret; +- unsigned int *buf; +- union PM4_MES_TYPE_3_HEADER nop; +- +- if (size_in_dwords >= (1 << 14)) +- return -EINVAL; +- +- /* Allocate size_in_dwords on the ring, plus an extra dword +- * for a NOP packet header +- */ +- ret = acquire_packet_buffer(kq, size_in_dwords + 1, &buf); +- if (ret) +- return ret; +- +- /* Build a NOP packet that contains the IB as "payload". 
*/ +- nop.u32all = 0; +- nop.opcode = IT_NOP; +- nop.count = size_in_dwords - 1; +- nop.type = PM4_TYPE_3; +- +- *buf = nop.u32all; +- *buffer_ptr = buf + 1; +- *gpu_addr = kq->pq_gpu_addr + ((unsigned long)*buffer_ptr - +- (unsigned long)kq->pq_kernel_addr); + + return 0; + } +@@ -298,7 +252,11 @@ static void submit_packet(struct kernel_queue *kq) + { + #ifdef DEBUG + int i; ++#endif ++ ++ BUG_ON(!kq); + ++#ifdef DEBUG + for (i = *kq->wptr_kernel; i < kq->pending_wptr; i++) { + pr_debug("0x%2X ", kq->pq_kernel_addr[i]); + if (i % 15 == 0) +@@ -307,11 +265,14 @@ static void submit_packet(struct kernel_queue *kq) + pr_debug("\n"); + #endif + +- kq->ops_asic_specific.submit_packet(kq); ++ *kq->wptr_kernel = kq->pending_wptr; ++ write_kernel_doorbell(kq->queue->properties.doorbell_ptr, ++ kq->pending_wptr); + } + + static void rollback_packet(struct kernel_queue *kq) + { ++ BUG_ON(!kq); + kq->pending_wptr = *kq->queue->properties.write_ptr; + } + +@@ -320,41 +281,30 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, + { + struct kernel_queue *kq; + +- kq = kzalloc(sizeof(*kq), GFP_KERNEL); ++ BUG_ON(!dev); ++ ++ kq = kzalloc(sizeof(struct kernel_queue), GFP_KERNEL); + if (!kq) + return NULL; + + kq->ops.initialize = initialize; + kq->ops.uninitialize = uninitialize; + kq->ops.acquire_packet_buffer = acquire_packet_buffer; +- kq->ops.acquire_inline_ib = acquire_inline_ib; + kq->ops.submit_packet = submit_packet; + kq->ops.rollback_packet = rollback_packet; + + switch (dev->device_info->asic_family) { + case CHIP_CARRIZO: +- case CHIP_TONGA: +- case CHIP_FIJI: +- case CHIP_POLARIS10: +- case CHIP_POLARIS11: + kernel_queue_init_vi(&kq->ops_asic_specific); + break; + + case CHIP_KAVERI: +- case CHIP_HAWAII: + kernel_queue_init_cik(&kq->ops_asic_specific); + break; +- +- case CHIP_VEGA10: +- case CHIP_RAVEN: +- kernel_queue_init_v9(&kq->ops_asic_specific); +- break; +- default: +- BUG(); + } + + if (!kq->ops.initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE)) { +- pr_err("Failed to init kernel queue\n"); ++ pr_err("amdkfd: failed to init kernel queue\n"); + kfree(kq); + return NULL; + } +@@ -363,37 +313,32 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, + + void kernel_queue_uninit(struct kernel_queue *kq) + { ++ BUG_ON(!kq); ++ + kq->ops.uninitialize(kq); + kfree(kq); + } + +-/* FIXME: Can this test be removed? 
*/ + static __attribute__((unused)) void test_kq(struct kfd_dev *dev) + { + struct kernel_queue *kq; + uint32_t *buffer, i; + int retval; + +- pr_err("Starting kernel queue test\n"); ++ BUG_ON(!dev); ++ ++ pr_err("amdkfd: starting kernel queue test\n"); + + kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_HIQ); +- if (unlikely(!kq)) { +- pr_err(" Failed to initialize HIQ\n"); +- pr_err("Kernel queue test failed\n"); +- return; +- } ++ BUG_ON(!kq); + + retval = kq->ops.acquire_packet_buffer(kq, 5, &buffer); +- if (unlikely(retval != 0)) { +- pr_err(" Failed to acquire packet buffer\n"); +- pr_err("Kernel queue test failed\n"); +- return; +- } ++ BUG_ON(retval != 0); + for (i = 0; i < 5; i++) + buffer[i] = kq->nop_packet; + kq->ops.submit_packet(kq); + +- pr_err("Ending kernel queue test\n"); ++ pr_err("amdkfd: ending kernel queue test\n"); + } + + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h +index 82c94a6..5940531 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h +@@ -42,12 +42,6 @@ + * pending write pointer to that location so subsequent calls to + * acquire_packet_buffer will get a correct write pointer + * +- * @acquire_inline_ib: Returns a pointer to the location in the kernel +- * queue ring buffer where the calling function can write an inline IB. It is +- * Guaranteed that there is enough space for that IB. It also updates the +- * pending write pointer to that location so subsequent calls to +- * acquire_packet_buffer will get a correct write pointer +- * + * @submit_packet: Update the write pointer and doorbell of a kernel queue. + * + * @sync_with_hw: Wait until the write pointer and the read pointer of a kernel +@@ -65,10 +59,6 @@ struct kernel_queue_ops { + int (*acquire_packet_buffer)(struct kernel_queue *kq, + size_t packet_size_in_dwords, + unsigned int **buffer_ptr); +- int (*acquire_inline_ib)(struct kernel_queue *kq, +- size_t packet_size_in_dwords, +- unsigned int **buffer_ptr, +- uint64_t *gpu_addr); + + void (*submit_packet)(struct kernel_queue *kq); + void (*rollback_packet)(struct kernel_queue *kq); +@@ -82,7 +72,6 @@ struct kernel_queue { + struct kfd_dev *dev; + struct mqd_manager *mqd; + struct queue *queue; +- uint64_t pending_wptr64; + uint32_t pending_wptr; + unsigned int nop_packet; + +@@ -90,10 +79,7 @@ struct kernel_queue { + uint32_t *rptr_kernel; + uint64_t rptr_gpu_addr; + struct kfd_mem_obj *wptr_mem; +- union { +- uint64_t *wptr64_kernel; +- uint32_t *wptr_kernel; +- }; ++ uint32_t *wptr_kernel; + uint64_t wptr_gpu_addr; + struct kfd_mem_obj *pq; + uint64_t pq_gpu_addr; +@@ -111,6 +97,5 @@ struct kernel_queue { + + void kernel_queue_init_cik(struct kernel_queue_ops *ops); + void kernel_queue_init_vi(struct kernel_queue_ops *ops); +-void kernel_queue_init_v9(struct kernel_queue_ops *ops); + + #endif /* KFD_KERNEL_QUEUE_H_ */ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c +index 2808422..a90eb44 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c +@@ -22,19 +22,15 @@ + */ + + #include "kfd_kernel_queue.h" +-#include "kfd_pm4_headers.h" +-#include "kfd_pm4_opcodes.h" + + static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size); + static void uninitialize_cik(struct kernel_queue *kq); +-static void submit_packet_cik(struct kernel_queue 
*kq); + + void kernel_queue_init_cik(struct kernel_queue_ops *ops) + { + ops->initialize = initialize_cik; + ops->uninitialize = uninitialize_cik; +- ops->submit_packet = submit_packet_cik; + } + + static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, +@@ -46,127 +42,3 @@ static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, + static void uninitialize_cik(struct kernel_queue *kq) + { + } +- +-static void submit_packet_cik(struct kernel_queue *kq) +-{ +- *kq->wptr_kernel = kq->pending_wptr; +- write_kernel_doorbell(kq->queue->properties.doorbell_ptr, +- kq->pending_wptr); +-} +- +-static int pm_map_process_cik(struct packet_manager *pm, uint32_t *buffer, +- struct qcm_process_device *qpd) +-{ +- struct pm4_map_process *packet; +- +- packet = (struct pm4_map_process *)buffer; +- +- memset(buffer, 0, sizeof(struct pm4_map_process)); +- +- packet->header.u32all = pm_build_pm4_header(IT_MAP_PROCESS, +- sizeof(struct pm4_map_process)); +- packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; +- packet->bitfields2.process_quantum = 1; +- packet->bitfields2.pasid = qpd->pqm->process->pasid; +- packet->bitfields3.page_table_base = qpd->page_table_base; +- packet->bitfields10.gds_size = qpd->gds_size; +- packet->bitfields10.num_gws = qpd->num_gws; +- packet->bitfields10.num_oac = qpd->num_oac; +- packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; +- +- packet->sh_mem_config = qpd->sh_mem_config; +- packet->sh_mem_bases = qpd->sh_mem_bases; +- packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; +- packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; +- +- packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); +- packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); +- +- return 0; +-} +- +-static int pm_map_process_scratch_cik(struct packet_manager *pm, +- uint32_t *buffer, struct qcm_process_device *qpd) +-{ +- struct pm4_map_process_scratch_kv *packet; +- +- packet = (struct pm4_map_process_scratch_kv *)buffer; +- +- memset(buffer, 0, sizeof(struct pm4_map_process_scratch_kv)); +- +- packet->header.u32all = pm_build_pm4_header(IT_MAP_PROCESS, +- sizeof(struct pm4_map_process_scratch_kv)); +- packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; +- packet->bitfields2.process_quantum = 1; +- packet->bitfields2.pasid = qpd->pqm->process->pasid; +- packet->bitfields3.page_table_base = qpd->page_table_base; +- packet->bitfields14.gds_size = qpd->gds_size; +- packet->bitfields14.num_gws = qpd->num_gws; +- packet->bitfields14.num_oac = qpd->num_oac; +- packet->bitfields14.num_queues = (qpd->is_debug) ? 
0 : qpd->queue_count; +- +- packet->sh_mem_config = qpd->sh_mem_config; +- packet->sh_mem_bases = qpd->sh_mem_bases; +- packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; +- packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; +- +- packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; +- +- packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); +- packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); +- +- return 0; +-} +- +-static uint32_t pm_get_map_process_packet_size_cik(void) +-{ +- return sizeof(struct pm4_map_process); +-} +-static uint32_t pm_get_map_process_scratch_packet_size_cik(void) +-{ +- return sizeof(struct pm4_map_process_scratch_kv); +-} +- +- +-static struct packet_manager_funcs kfd_cik_pm_funcs = { +- .map_process = pm_map_process_cik, +- .runlist = pm_runlist_vi, +- .set_resources = pm_set_resources_vi, +- .map_queues = pm_map_queues_vi, +- .unmap_queues = pm_unmap_queues_vi, +- .query_status = pm_query_status_vi, +- .release_mem = pm_release_mem_vi, +- .get_map_process_packet_size = pm_get_map_process_packet_size_cik, +- .get_runlist_packet_size = pm_get_runlist_packet_size_vi, +- .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, +- .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi, +- .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi, +- .get_query_status_packet_size = pm_get_query_status_packet_size_vi, +- .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi, +-}; +- +-static struct packet_manager_funcs kfd_cik_scratch_pm_funcs = { +- .map_process = pm_map_process_scratch_cik, +- .runlist = pm_runlist_vi, +- .set_resources = pm_set_resources_vi, +- .map_queues = pm_map_queues_vi, +- .unmap_queues = pm_unmap_queues_vi, +- .query_status = pm_query_status_vi, +- .release_mem = pm_release_mem_vi, +- .get_map_process_packet_size = +- pm_get_map_process_scratch_packet_size_cik, +- .get_runlist_packet_size = pm_get_runlist_packet_size_vi, +- .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, +- .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi, +- .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi, +- .get_query_status_packet_size = pm_get_query_status_packet_size_vi, +- .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi, +-}; +- +-void kfd_pm_func_init_cik(struct packet_manager *pm, uint16_t fw_ver) +-{ +- if (fw_ver >= KFD_SCRATCH_KV_FW_VER) +- pm->pmf = &kfd_cik_scratch_pm_funcs; +- else +- pm->pmf = &kfd_cik_pm_funcs; +-} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c +deleted file mode 100644 +index 5fe4f60..0000000 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c ++++ /dev/null +@@ -1,377 +0,0 @@ +-/* +- * Copyright 2016 Advanced Micro Devices, Inc. +- * +- * Permission is hereby granted, free of charge, to any person obtaining a +- * copy of this software and associated documentation files (the "Software"), +- * to deal in the Software without restriction, including without limitation +- * the rights to use, copy, modify, merge, publish, distribute, sublicense, +- * and/or sell copies of the Software, and to permit persons to whom the +- * Software is furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in +- * all copies or substantial portions of the Software. 
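kfd_pm_func_init_cik() above picks one of two packet_manager_funcs tables at init time based on the MEC firmware version, so the rest of the driver never branches on the packet format again. A stripped-down illustration of that dispatch style; the struct, function names, sizes and version threshold are all illustrative stand-ins:

#include <stdint.h>

struct pm_funcs {
	uint32_t (*get_map_process_packet_size)(void);
};

static uint32_t legacy_size(void)  { return 120; }	/* illustrative */
static uint32_t scratch_size(void) { return 144; }	/* illustrative */

static const struct pm_funcs legacy_funcs = {
	.get_map_process_packet_size = legacy_size,
};
static const struct pm_funcs scratch_funcs = {
	.get_map_process_packet_size = scratch_size,
};

struct packet_manager { const struct pm_funcs *pmf; };

#define SCRATCH_FW_VER 400	/* stand-in for KFD_SCRATCH_KV_FW_VER */

static void pm_func_init(struct packet_manager *pm, uint16_t fw_ver)
{
	pm->pmf = (fw_ver >= SCRATCH_FW_VER) ? &scratch_funcs : &legacy_funcs;
}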
+- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +- * OTHER DEALINGS IN THE SOFTWARE. +- * +- */ +- +-#include "kfd_kernel_queue.h" +-#include "kfd_device_queue_manager.h" +-#include "kfd_pm4_headers_ai.h" +-#include "kfd_pm4_opcodes.h" +- +-static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev, +- enum kfd_queue_type type, unsigned int queue_size); +-static void uninitialize_v9(struct kernel_queue *kq); +-static void submit_packet_v9(struct kernel_queue *kq); +- +-void kernel_queue_init_v9(struct kernel_queue_ops *ops) +-{ +- ops->initialize = initialize_v9; +- ops->uninitialize = uninitialize_v9; +- ops->submit_packet = submit_packet_v9; +-} +- +-static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev, +- enum kfd_queue_type type, unsigned int queue_size) +-{ +- int retval; +- +- retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem); +- if (retval != 0) +- return false; +- +- kq->eop_gpu_addr = kq->eop_mem->gpu_addr; +- kq->eop_kernel_addr = kq->eop_mem->cpu_ptr; +- +- memset(kq->eop_kernel_addr, 0, PAGE_SIZE); +- +- return true; +-} +- +-static void uninitialize_v9(struct kernel_queue *kq) +-{ +- kfd_gtt_sa_free(kq->dev, kq->eop_mem); +-} +- +-static void submit_packet_v9(struct kernel_queue *kq) +-{ +- *kq->wptr64_kernel = kq->pending_wptr64; +- write_kernel_doorbell64(kq->queue->properties.doorbell_ptr, +- kq->pending_wptr64); +-} +- +-static int pm_map_process_v9(struct packet_manager *pm, +- uint32_t *buffer, struct qcm_process_device *qpd) +-{ +- struct pm4_mes_map_process *packet; +- uint64_t vm_page_table_base_addr = +- (uint64_t)(qpd->page_table_base) << 12; +- +- packet = (struct pm4_mes_map_process *)buffer; +- memset(buffer, 0, sizeof(struct pm4_mes_map_process)); +- +- packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, +- sizeof(struct pm4_mes_map_process)); +- packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; +- packet->bitfields2.process_quantum = 1; +- packet->bitfields2.pasid = qpd->pqm->process->pasid; +- packet->bitfields14.gds_size = qpd->gds_size; +- packet->bitfields14.num_gws = qpd->num_gws; +- packet->bitfields14.num_oac = qpd->num_oac; +- packet->bitfields14.sdma_enable = 1; +- packet->bitfields14.num_queues = (qpd->is_debug) ? 
0 : qpd->queue_count; +- +- packet->sh_mem_config = qpd->sh_mem_config; +- packet->sh_mem_bases = qpd->sh_mem_bases; +- packet->sq_shader_tba_lo = lower_32_bits(qpd->tba_addr >> 8); +- packet->sq_shader_tba_hi = upper_32_bits(qpd->tba_addr >> 8); +- packet->sq_shader_tma_lo = lower_32_bits(qpd->tma_addr >> 8); +- packet->sq_shader_tma_hi = upper_32_bits(qpd->tma_addr >> 8); +- +- packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); +- packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); +- +- packet->vm_context_page_table_base_addr_lo32 = +- lower_32_bits(vm_page_table_base_addr); +- packet->vm_context_page_table_base_addr_hi32 = +- upper_32_bits(vm_page_table_base_addr); +- +- return 0; +-} +- +-static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer, +- uint64_t ib, size_t ib_size_in_dwords, bool chain) +-{ +- struct pm4_mes_runlist *packet; +- +- int concurrent_proc_cnt = 0; +- struct kfd_dev *kfd = pm->dqm->dev; +- +- /* Determine the number of processes to map together to HW: +- * it can not exceed the number of VMIDs available to the +- * scheduler, and it is determined by the smaller of the number +- * of processes in the runlist and kfd module parameter +- * hws_max_conc_proc. +- * Note: the arbitration between the number of VMIDs and +- * hws_max_conc_proc has been done in +- * kgd2kfd_device_init(). +- */ +- concurrent_proc_cnt = min(pm->dqm->processes_count, +- kfd->max_proc_per_quantum); +- +- +- packet = (struct pm4_mes_runlist *)buffer; +- +- memset(buffer, 0, sizeof(struct pm4_mes_runlist)); +- packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, +- sizeof(struct pm4_mes_runlist)); +- +- packet->bitfields4.ib_size = ib_size_in_dwords; +- packet->bitfields4.chain = chain ? 1 : 0; +- packet->bitfields4.offload_polling = 0; +- packet->bitfields4.valid = 1; +- packet->bitfields4.process_cnt = concurrent_proc_cnt; +- packet->ordinal2 = lower_32_bits(ib); +- packet->ib_base_hi = upper_32_bits(ib); +- +- return 0; +-} +- +-static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, +- struct queue *q, bool is_static) +-{ +- struct pm4_mes_map_queues *packet; +- bool use_static = is_static; +- +- packet = (struct pm4_mes_map_queues *)buffer; +- memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); +- +- packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, +- sizeof(struct pm4_mes_map_queues)); +- packet->bitfields2.alloc_format = +- alloc_format__mes_map_queues__one_per_pipe_vi; +- packet->bitfields2.num_queues = 1; +- packet->bitfields2.queue_sel = +- queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; +- +- packet->bitfields2.engine_sel = +- engine_sel__mes_map_queues__compute_vi; +- packet->bitfields2.queue_type = +- queue_type__mes_map_queues__normal_compute_vi; +- +- switch (q->properties.type) { +- case KFD_QUEUE_TYPE_COMPUTE: +- if (use_static) +- packet->bitfields2.queue_type = +- queue_type__mes_map_queues__normal_latency_static_queue_vi; +- break; +- case KFD_QUEUE_TYPE_DIQ: +- packet->bitfields2.queue_type = +- queue_type__mes_map_queues__debug_interface_queue_vi; +- break; +- case KFD_QUEUE_TYPE_SDMA: +- packet->bitfields2.engine_sel = q->properties.sdma_engine_id + +- engine_sel__mes_map_queues__sdma0_vi; +- use_static = false; /* no static queues under SDMA */ +- break; +- default: +- WARN(1, "queue type %d", q->properties.type); +- return -EINVAL; +- } +- packet->bitfields3.doorbell_offset = +- q->properties.doorbell_off; +- +- packet->mqd_addr_lo = +- lower_32_bits(q->gart_mqd_addr); +- +- 
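pm_map_process_v9() above shows the two address encodings GFXv9 packets expect: the page table base is stored as a page frame number, so it is shifted left by 12 to recover the byte address, and the trap handler addresses (TBA/TMA) are stored 256-byte aligned, i.e. shifted right by 8, before being split into lo/hi dwords. The same arithmetic in isolation, with example values:

#include <stdint.h>
#include <stdio.h>

static uint32_t lower_32_bits(uint64_t v) { return (uint32_t)v; }
static uint32_t upper_32_bits(uint64_t v) { return (uint32_t)(v >> 32); }

int main(void)
{
	uint64_t page_table_base = 0x123456ULL;		/* PFN-style value */
	uint64_t vm_base = page_table_base << 12;	/* back to a byte address */
	uint64_t tba = 0x7f00aabbc000ULL;		/* example trap base */

	printf("vm base lo 0x%08x hi 0x%08x\n",
	       lower_32_bits(vm_base), upper_32_bits(vm_base));
	printf("tba     lo 0x%08x hi 0x%08x\n",
	       lower_32_bits(tba >> 8), upper_32_bits(tba >> 8));
	return 0;
}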
packet->mqd_addr_hi = +- upper_32_bits(q->gart_mqd_addr); +- +- packet->wptr_addr_lo = +- lower_32_bits((uint64_t)q->properties.write_ptr); +- +- packet->wptr_addr_hi = +- upper_32_bits((uint64_t)q->properties.write_ptr); +- +- return 0; +-} +- +-static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer, +- enum kfd_queue_type type, +- enum kfd_unmap_queues_filter filter, +- uint32_t filter_param, bool reset, +- unsigned int sdma_engine) +-{ +- struct pm4_mes_unmap_queues *packet; +- +- packet = (struct pm4_mes_unmap_queues *)buffer; +- memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); +- +- packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, +- sizeof(struct pm4_mes_unmap_queues)); +- switch (type) { +- case KFD_QUEUE_TYPE_COMPUTE: +- case KFD_QUEUE_TYPE_DIQ: +- packet->bitfields2.engine_sel = +- engine_sel__mes_unmap_queues__compute; +- break; +- case KFD_QUEUE_TYPE_SDMA: +- packet->bitfields2.engine_sel = +- engine_sel__mes_unmap_queues__sdma0 + sdma_engine; +- break; +- default: +- WARN(1, "queue type %d", type); +- return -EINVAL; +- } +- +- if (reset) +- packet->bitfields2.action = +- action__mes_unmap_queues__reset_queues; +- else +- packet->bitfields2.action = +- action__mes_unmap_queues__preempt_queues; +- +- switch (filter) { +- case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: +- packet->bitfields2.queue_sel = +- queue_sel__mes_unmap_queues__perform_request_on_specified_queues; +- packet->bitfields2.num_queues = 1; +- packet->bitfields3b.doorbell_offset0 = filter_param; +- break; +- case KFD_UNMAP_QUEUES_FILTER_BY_PASID: +- packet->bitfields2.queue_sel = +- queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; +- packet->bitfields3a.pasid = filter_param; +- break; +- case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: +- packet->bitfields2.queue_sel = +- queue_sel__mes_unmap_queues__unmap_all_queues; +- break; +- case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: +- /* in this case, we do not preempt static queues */ +- packet->bitfields2.queue_sel = +- queue_sel__mes_unmap_queues__unmap_all_non_static_queues; +- break; +- default: +- WARN(1, "filter %d", filter); +- return -EINVAL; +- } +- +- return 0; +- +-} +- +-static int pm_query_status_v9(struct packet_manager *pm, uint32_t *buffer, +- uint64_t fence_address, uint32_t fence_value) +-{ +- struct pm4_mes_query_status *packet; +- +- packet = (struct pm4_mes_query_status *)buffer; +- memset(buffer, 0, sizeof(struct pm4_mes_query_status)); +- +- +- packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, +- sizeof(struct pm4_mes_query_status)); +- +- packet->bitfields2.context_id = 0; +- packet->bitfields2.interrupt_sel = +- interrupt_sel__mes_query_status__completion_status; +- packet->bitfields2.command = +- command__mes_query_status__fence_only_after_write_ack; +- +- packet->addr_hi = upper_32_bits((uint64_t)fence_address); +- packet->addr_lo = lower_32_bits((uint64_t)fence_address); +- packet->data_hi = upper_32_bits((uint64_t)fence_value); +- packet->data_lo = lower_32_bits((uint64_t)fence_value); +- +- return 0; +-} +- +- +-static uint32_t pm_release_mem_v9(uint64_t gpu_addr, uint32_t *buffer) +-{ +- struct pm4_mec_release_mem *packet; +- +- packet = (struct pm4_mec_release_mem *)buffer; +- memset(buffer, 0, sizeof(struct pm4_mec_release_mem)); +- +- packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, +- sizeof(struct pm4_mec_release_mem)); +- +- packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; +- packet->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe; +- 
packet->bitfields2.tcl1_action_ena = 1; +- packet->bitfields2.tc_action_ena = 1; +- packet->bitfields2.cache_policy = cache_policy__mec_release_mem__lru; +- +- packet->bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low; +- packet->bitfields3.int_sel = +- int_sel__mec_release_mem__send_interrupt_after_write_confirm; +- +- packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; +- packet->address_hi = upper_32_bits(gpu_addr); +- +- packet->data_lo = 0; +- +- return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int); +-} +- +-static uint32_t pm_get_map_process_packet_size_v9(void) +-{ +- return sizeof(struct pm4_mes_map_process); +-} +- +-static uint32_t pm_get_runlist_packet_size_v9(void) +-{ +- return sizeof(struct pm4_mes_runlist); +-} +- +-static uint32_t pm_get_map_queues_packet_size_v9(void) +-{ +- return sizeof(struct pm4_mes_map_queues); +-} +- +-static uint32_t pm_get_unmap_queues_packet_size_v9(void) +-{ +- return sizeof(struct pm4_mes_unmap_queues); +-} +- +-static uint32_t pm_get_query_status_packet_size_v9(void) +-{ +- return sizeof(struct pm4_mes_query_status); +-} +- +-static uint32_t pm_get_release_mem_packet_size_v9(void) +-{ +- return sizeof(struct pm4_mec_release_mem); +-} +- +-static struct packet_manager_funcs kfd_v9_pm_funcs = { +- .map_process = pm_map_process_v9, +- .runlist = pm_runlist_v9, +- .set_resources = pm_set_resources_vi, +- .map_queues = pm_map_queues_v9, +- .unmap_queues = pm_unmap_queues_v9, +- .query_status = pm_query_status_v9, +- .release_mem = pm_release_mem_v9, +- .get_map_process_packet_size = pm_get_map_process_packet_size_v9, +- .get_runlist_packet_size = pm_get_runlist_packet_size_v9, +- .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, +- .get_map_queues_packet_size = pm_get_map_queues_packet_size_v9, +- .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_v9, +- .get_query_status_packet_size = pm_get_query_status_packet_size_v9, +- .get_release_mem_packet_size = pm_get_release_mem_packet_size_v9, +-}; +- +-void kfd_pm_func_init_v9(struct packet_manager *pm, uint16_t fw_ver) +-{ +- pm->pmf = &kfd_v9_pm_funcs; +-} +- +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c +index ecf4a33..f1d4828 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c +@@ -22,20 +22,15 @@ + */ + + #include "kfd_kernel_queue.h" +-#include "kfd_device_queue_manager.h" +-#include "kfd_pm4_headers_vi.h" +-#include "kfd_pm4_opcodes.h" + + static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size); + static void uninitialize_vi(struct kernel_queue *kq); +-static void submit_packet_vi(struct kernel_queue *kq); + + void kernel_queue_init_vi(struct kernel_queue_ops *ops) + { + ops->initialize = initialize_vi; + ops->uninitialize = uninitialize_vi; +- ops->submit_packet = submit_packet_vi; + } + + static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, +@@ -59,359 +54,3 @@ static void uninitialize_vi(struct kernel_queue *kq) + { + kfd_gtt_sa_free(kq->dev, kq->eop_mem); + } +- +-static void submit_packet_vi(struct kernel_queue *kq) +-{ +- *kq->wptr_kernel = kq->pending_wptr; +- write_kernel_doorbell(kq->queue->properties.doorbell_ptr, +- kq->pending_wptr); +-} +- +-static int pm_map_process_vi(struct packet_manager *pm, +- uint32_t *buffer, struct qcm_process_device *qpd) +-{ +- struct pm4_mes_map_process *packet; 
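pm_release_mem_v9() above packs the fence address the way RELEASE_MEM expects: the low 32 bits are stored as a dword index (the address must be 4-byte aligned, hence the >> 2), while the high 32 bits go in verbatim. A one-off check of that encoding:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t gpu_addr = 0x0000000412345678ULL;	/* dword-aligned example */
	uint32_t lo = (uint32_t)((gpu_addr & 0xffffffffULL) >> 2);
	uint32_t hi = (uint32_t)(gpu_addr >> 32);

	printf("address_lo_32b 0x%08x address_hi 0x%08x\n", lo, hi);
	return 0;
}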
+- +- packet = (struct pm4_mes_map_process *)buffer; +- memset(buffer, 0, sizeof(struct pm4_mes_map_process)); +- +- packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, +- sizeof(struct pm4_mes_map_process)); +- packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; +- packet->bitfields2.process_quantum = 1; +- packet->bitfields2.pasid = qpd->pqm->process->pasid; +- packet->bitfields3.page_table_base = qpd->page_table_base; +- packet->bitfields10.gds_size = qpd->gds_size; +- packet->bitfields10.num_gws = qpd->num_gws; +- packet->bitfields10.num_oac = qpd->num_oac; +- packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; +- +- packet->sh_mem_config = qpd->sh_mem_config; +- packet->sh_mem_bases = qpd->sh_mem_bases; +- packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; +- packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; +- +- packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; +- +- packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); +- packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); +- +- return 0; +-} +- +- +-unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size) +-{ +- union PM4_MES_TYPE_3_HEADER header; +- +- header.u32All = 0; +- header.opcode = opcode; +- header.count = packet_size/sizeof(uint32_t) - 2; +- header.type = PM4_TYPE_3; +- +- return header.u32All; +-} +- +-int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer, +- uint64_t ib, size_t ib_size_in_dwords, bool chain) +-{ +- struct pm4_mes_runlist *packet; +- +- int concurrent_proc_cnt = 0; +- struct kfd_dev *kfd = pm->dqm->dev; +- +- /* Determine the number of processes to map together to HW: +- * it can not exceed the number of VMIDs available to the +- * scheduler, and it is determined by the smaller of the number +- * of processes in the runlist and kfd module parameter +- * hws_max_conc_proc. +- * Note: the arbitration between the number of VMIDs and +- * hws_max_conc_proc has been done in +- * kgd2kfd_device_init(). +- */ +- concurrent_proc_cnt = min(pm->dqm->processes_count, +- kfd->max_proc_per_quantum); +- +- +- packet = (struct pm4_mes_runlist *)buffer; +- +- memset(buffer, 0, sizeof(struct pm4_mes_runlist)); +- packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, +- sizeof(struct pm4_mes_runlist)); +- +- packet->bitfields4.ib_size = ib_size_in_dwords; +- packet->bitfields4.chain = chain ? 
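pm_build_pm4_header() above packs a PM4 type-3 header through the PM4_MES_TYPE_3_HEADER bitfields, with the count field holding the packet length in dwords minus two, per the PM4 convention. The same packing written out with explicit shifts, assuming the standard PM4 layout (type in bits 31:30, count in 29:16, opcode in 15:8):

#include <stdint.h>
#include <stdio.h>

#define PM4_TYPE_3 3u
#define IT_NOP     0x10u	/* NOP opcode */

static uint32_t pm4_header(uint32_t opcode, uint32_t packet_dwords)
{
	return (PM4_TYPE_3 << 30) |
	       (((packet_dwords - 2) & 0x3FFFu) << 16) |
	       ((opcode & 0xFFu) << 8);
}

int main(void)
{
	/* Wrapping a 15-dword inline IB behind a NOP header, as
	 * acquire_inline_ib() earlier does: 16 dwords total -> count 14. */
	printf("0x%08x\n", pm4_header(IT_NOP, 16));
	return 0;
}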
1 : 0; +- packet->bitfields4.offload_polling = 0; +- packet->bitfields4.valid = 1; +- packet->bitfields4.process_cnt = concurrent_proc_cnt; +- packet->ordinal2 = lower_32_bits(ib); +- packet->bitfields3.ib_base_hi = upper_32_bits(ib); +- +- return 0; +-} +- +-int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, +- struct queue *q, bool is_static) +-{ +- struct pm4_mes_map_queues *packet; +- bool use_static = is_static; +- +- packet = (struct pm4_mes_map_queues *)buffer; +- memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); +- +- packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, +- sizeof(struct pm4_mes_map_queues)); +- packet->bitfields2.alloc_format = +- alloc_format__mes_map_queues__one_per_pipe_vi; +- packet->bitfields2.num_queues = 1; +- packet->bitfields2.queue_sel = +- queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; +- +- packet->bitfields2.engine_sel = +- engine_sel__mes_map_queues__compute_vi; +- packet->bitfields2.queue_type = +- queue_type__mes_map_queues__normal_compute_vi; +- +- switch (q->properties.type) { +- case KFD_QUEUE_TYPE_COMPUTE: +- if (use_static) +- packet->bitfields2.queue_type = +- queue_type__mes_map_queues__normal_latency_static_queue_vi; +- break; +- case KFD_QUEUE_TYPE_DIQ: +- packet->bitfields2.queue_type = +- queue_type__mes_map_queues__debug_interface_queue_vi; +- break; +- case KFD_QUEUE_TYPE_SDMA: +- packet->bitfields2.engine_sel = q->properties.sdma_engine_id + +- engine_sel__mes_map_queues__sdma0_vi; +- use_static = false; /* no static queues under SDMA */ +- break; +- default: +- WARN(1, "queue type %d", q->properties.type); +- return -EINVAL; +- } +- packet->bitfields3.doorbell_offset = +- q->properties.doorbell_off; +- +- packet->mqd_addr_lo = +- lower_32_bits(q->gart_mqd_addr); +- +- packet->mqd_addr_hi = +- upper_32_bits(q->gart_mqd_addr); +- +- packet->wptr_addr_lo = +- lower_32_bits((uint64_t)q->properties.write_ptr); +- +- packet->wptr_addr_hi = +- upper_32_bits((uint64_t)q->properties.write_ptr); +- +- return 0; +-} +- +-int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, +- struct scheduling_resources *res) +-{ +- struct pm4_mes_set_resources *packet; +- +- packet = (struct pm4_mes_set_resources *)buffer; +- memset(buffer, 0, sizeof(struct pm4_mes_set_resources)); +- +- packet->header.u32All = pm_build_pm4_header(IT_SET_RESOURCES, +- sizeof(struct pm4_mes_set_resources)); +- +- packet->bitfields2.queue_type = +- queue_type__mes_set_resources__hsa_interface_queue_hiq; +- packet->bitfields2.vmid_mask = res->vmid_mask; +- packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100; +- packet->bitfields7.oac_mask = res->oac_mask; +- packet->bitfields8.gds_heap_base = res->gds_heap_base; +- packet->bitfields8.gds_heap_size = res->gds_heap_size; +- +- packet->gws_mask_lo = lower_32_bits(res->gws_mask); +- packet->gws_mask_hi = upper_32_bits(res->gws_mask); +- +- packet->queue_mask_lo = lower_32_bits(res->queue_mask); +- packet->queue_mask_hi = upper_32_bits(res->queue_mask); +- +- return 0; +-} +- +-int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, +- enum kfd_queue_type type, +- enum kfd_unmap_queues_filter filter, +- uint32_t filter_param, bool reset, +- unsigned int sdma_engine) +-{ +- struct pm4_mes_unmap_queues *packet; +- +- packet = (struct pm4_mes_unmap_queues *)buffer; +- memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); +- +- packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, +- sizeof(struct pm4_mes_unmap_queues)); +- switch (type) { +- case 
KFD_QUEUE_TYPE_COMPUTE: +- case KFD_QUEUE_TYPE_DIQ: +- packet->bitfields2.engine_sel = +- engine_sel__mes_unmap_queues__compute; +- break; +- case KFD_QUEUE_TYPE_SDMA: +- packet->bitfields2.engine_sel = +- engine_sel__mes_unmap_queues__sdma0 + sdma_engine; +- break; +- default: +- WARN(1, "queue type %d", type); +- return -EINVAL; +- } +- +- if (reset) +- packet->bitfields2.action = +- action__mes_unmap_queues__reset_queues; +- else +- packet->bitfields2.action = +- action__mes_unmap_queues__preempt_queues; +- +- switch (filter) { +- case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: +- packet->bitfields2.queue_sel = +- queue_sel__mes_unmap_queues__perform_request_on_specified_queues; +- packet->bitfields2.num_queues = 1; +- packet->bitfields3b.doorbell_offset0 = filter_param; +- break; +- case KFD_UNMAP_QUEUES_FILTER_BY_PASID: +- packet->bitfields2.queue_sel = +- queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; +- packet->bitfields3a.pasid = filter_param; +- break; +- case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: +- packet->bitfields2.queue_sel = +- queue_sel__mes_unmap_queues__unmap_all_queues; +- break; +- case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: +- /* in this case, we do not preempt static queues */ +- packet->bitfields2.queue_sel = +- queue_sel__mes_unmap_queues__unmap_all_non_static_queues; +- break; +- default: +- WARN(1, "filter %d", filter); +- return -EINVAL; +- } +- +- return 0; +- +-} +- +-int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer, +- uint64_t fence_address, uint32_t fence_value) +-{ +- struct pm4_mes_query_status *packet; +- +- packet = (struct pm4_mes_query_status *)buffer; +- memset(buffer, 0, sizeof(struct pm4_mes_query_status)); +- +- +- packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, +- sizeof(struct pm4_mes_query_status)); +- +- packet->bitfields2.context_id = 0; +- packet->bitfields2.interrupt_sel = +- interrupt_sel__mes_query_status__completion_status; +- packet->bitfields2.command = +- command__mes_query_status__fence_only_after_write_ack; +- +- packet->addr_hi = upper_32_bits((uint64_t)fence_address); +- packet->addr_lo = lower_32_bits((uint64_t)fence_address); +- packet->data_hi = upper_32_bits((uint64_t)fence_value); +- packet->data_lo = lower_32_bits((uint64_t)fence_value); +- +- return 0; +-} +- +- +-uint32_t pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer) +-{ +- struct pm4_mec_release_mem *packet; +- +- packet = (struct pm4_mec_release_mem *)buffer; +- memset(buffer, 0, sizeof(struct pm4_mec_release_mem)); +- +- packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, +- sizeof(struct pm4_mec_release_mem)); +- +- packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; +- packet->bitfields2.event_index = event_index___release_mem__end_of_pipe; +- packet->bitfields2.tcl1_action_ena = 1; +- packet->bitfields2.tc_action_ena = 1; +- packet->bitfields2.cache_policy = cache_policy___release_mem__lru; +- packet->bitfields2.atc = 0; +- +- packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low; +- packet->bitfields3.int_sel = +- int_sel___release_mem__send_interrupt_after_write_confirm; +- +- packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; +- packet->address_hi = upper_32_bits(gpu_addr); +- +- packet->data_lo = 0; +- +- return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int); +-} +- +-uint32_t pm_get_map_process_packet_size_vi(void) +-{ +- return sizeof(struct pm4_mes_map_process); +-} +- +-uint32_t pm_get_runlist_packet_size_vi(void) +-{ +- return sizeof(struct 
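/*
 * Note on the address split in pm_release_mem_vi() above: the fence write
 * address is dword-granular, so the packet carries bits [31:2] of the low
 * word plus the full high word. With illustrative numbers:
 *
 *   gpu_addr       = 0x0000000123456748
 *   address_lo_32b = (gpu_addr & 0xffffffff) >> 2 = 0x08d159d2
 *   address_hi     = upper_32_bits(gpu_addr)      = 0x1
 *
 * which is why gpu_addr must be at least 4-byte aligned.
 */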
pm4_mes_runlist); +-} +- +-uint32_t pm_get_set_resources_packet_size_vi(void) +-{ +- return sizeof(struct pm4_mes_set_resources); +-} +- +-uint32_t pm_get_map_queues_packet_size_vi(void) +-{ +- return sizeof(struct pm4_mes_map_queues); +-} +- +-uint32_t pm_get_unmap_queues_packet_size_vi(void) +-{ +- return sizeof(struct pm4_mes_unmap_queues); +-} +- +-uint32_t pm_get_query_status_packet_size_vi(void) +-{ +- return sizeof(struct pm4_mes_query_status); +-} +- +-uint32_t pm_get_release_mem_packet_size_vi(void) +-{ +- return sizeof(struct pm4_mec_release_mem); +-} +- +- +-static struct packet_manager_funcs kfd_vi_pm_funcs = { +- .map_process = pm_map_process_vi, +- .runlist = pm_runlist_vi, +- .set_resources = pm_set_resources_vi, +- .map_queues = pm_map_queues_vi, +- .unmap_queues = pm_unmap_queues_vi, +- .query_status = pm_query_status_vi, +- .release_mem = pm_release_mem_vi, +- .get_map_process_packet_size = pm_get_map_process_packet_size_vi, +- .get_runlist_packet_size = pm_get_runlist_packet_size_vi, +- .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, +- .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi, +- .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi, +- .get_query_status_packet_size = pm_get_query_status_packet_size_vi, +- .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi, +-}; +- +-void kfd_pm_func_init_vi(struct packet_manager *pm, uint16_t fw_ver) +-{ +- pm->pmf = &kfd_vi_pm_funcs; +-} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c +index ba4d5de..850a562 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c +@@ -29,10 +29,10 @@ + #define KFD_DRIVER_AUTHOR "AMD Inc. and others" + + #define KFD_DRIVER_DESC "Standalone HSA driver for AMD's GPUs" +-#define KFD_DRIVER_DATE "20160408" +-#define KFD_DRIVER_MAJOR 2 +-#define KFD_DRIVER_MINOR 0 +-#define KFD_DRIVER_PATCHLEVEL 0 ++#define KFD_DRIVER_DATE "20150421" ++#define KFD_DRIVER_MAJOR 0 ++#define KFD_DRIVER_MINOR 7 ++#define KFD_DRIVER_PATCHLEVEL 2 + + static const struct kgd2kfd_calls kgd2kfd = { + .exit = kgd2kfd_exit, +@@ -42,10 +42,6 @@ static const struct kgd2kfd_calls kgd2kfd = { + .interrupt = kgd2kfd_interrupt, + .suspend = kgd2kfd_suspend, + .resume = kgd2kfd_resume, +- .quiesce_mm = kgd2kfd_quiesce_mm, +- .resume_mm = kgd2kfd_resume_mm, +- .schedule_evict_and_restore_process = +- kgd2kfd_schedule_evict_and_restore_process, + }; + + int sched_policy = KFD_SCHED_POLICY_HWS; +@@ -53,15 +49,6 @@ module_param(sched_policy, int, 0444); + MODULE_PARM_DESC(sched_policy, + "Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)"); + +-int hws_max_conc_proc = 8; +-module_param(hws_max_conc_proc, int, 0444); +-MODULE_PARM_DESC(hws_max_conc_proc, +- "Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency, #VMIDs for KFD = Maximum(default))"); +- +-int cwsr_enable = 1; +-module_param(cwsr_enable, int, 0444); +-MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))"); +- + int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT; + module_param(max_num_of_queues_per_device, int, 0444); + MODULE_PARM_DESC(max_num_of_queues_per_device, +@@ -74,28 +61,7 @@ MODULE_PARM_DESC(send_sigterm, + + static int amdkfd_init_completed; + +-int debug_largebar; +-module_param(debug_largebar, int, 0444); +-MODULE_PARM_DESC(debug_largebar, +- "Debug large-bar flag used to 
simulate large-bar capability on non-large bar machine (0 = disable, 1 = enable)"); +- +-int ignore_crat; +-module_param(ignore_crat, int, 0444); +-MODULE_PARM_DESC(ignore_crat, +- "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)"); +- +-int vega10_noretry; +-module_param_named(noretry, vega10_noretry, int, 0644); +-MODULE_PARM_DESC(noretry, +- "Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)"); +- +-int priv_cp_queues; +-module_param(priv_cp_queues, int, 0644); +-MODULE_PARM_DESC(priv_cp_queues, +- "Enable privileged mode for CP queues (0 = off (default), 1 = on)"); +- +-int kgd2kfd_init(unsigned int interface_version, +- const struct kgd2kfd_calls **g2f) ++int kgd2kfd_init(unsigned interface_version, const struct kgd2kfd_calls **g2f) + { + if (!amdkfd_init_completed) + return -EPROBE_DEFER; +@@ -124,7 +90,7 @@ static int __init kfd_module_init(void) + /* Verify module parameters */ + if ((sched_policy < KFD_SCHED_POLICY_HWS) || + (sched_policy > KFD_SCHED_POLICY_NO_HWS)) { +- pr_err("sched_policy has invalid value\n"); ++ pr_err("kfd: sched_policy has invalid value\n"); + return -1; + } + +@@ -132,13 +98,13 @@ static int __init kfd_module_init(void) + if ((max_num_of_queues_per_device < 1) || + (max_num_of_queues_per_device > + KFD_MAX_NUM_OF_QUEUES_PER_DEVICE)) { +- pr_err("max_num_of_queues_per_device must be between 1 to KFD_MAX_NUM_OF_QUEUES_PER_DEVICE\n"); ++ pr_err("kfd: max_num_of_queues_per_device must be between 1 to KFD_MAX_NUM_OF_QUEUES_PER_DEVICE\n"); + return -1; + } + + err = kfd_pasid_init(); + if (err < 0) +- return err; ++ goto err_pasid; + + err = kfd_chardev_init(); + if (err < 0) +@@ -148,16 +114,8 @@ static int __init kfd_module_init(void) + if (err < 0) + goto err_topology; + +- err = kfd_ipc_init(); +- if (err < 0) +- goto err_topology; +- + kfd_process_create_wq(); + +- kfd_init_peer_direct(); +- +- kfd_debugfs_init(); +- + amdkfd_init_completed = 1; + + dev_info(kfd_device, "Initialized module\n"); +@@ -168,6 +126,7 @@ static int __init kfd_module_init(void) + kfd_chardev_exit(); + err_ioctl: + kfd_pasid_exit(); ++err_pasid: + return err; + } + +@@ -175,8 +134,6 @@ static void __exit kfd_module_exit(void) + { + amdkfd_init_completed = 0; + +- kfd_debugfs_fini(); +- kfd_close_peer_direct(); + kfd_process_destroy_wq(); + kfd_topology_shutdown(); + kfd_chardev_exit(); +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +index 9eb2d54..b1ef136 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +@@ -23,68 +23,14 @@ + + #include "kfd_priv.h" + +-/* Mapping queue priority to pipe priority, indexed by queue priority */ +-int pipe_priority_map[] = { +- KFD_PIPE_PRIORITY_CS_LOW, +- KFD_PIPE_PRIORITY_CS_LOW, +- KFD_PIPE_PRIORITY_CS_LOW, +- KFD_PIPE_PRIORITY_CS_LOW, +- KFD_PIPE_PRIORITY_CS_LOW, +- KFD_PIPE_PRIORITY_CS_LOW, +- KFD_PIPE_PRIORITY_CS_LOW, +- KFD_PIPE_PRIORITY_CS_MEDIUM, +- KFD_PIPE_PRIORITY_CS_MEDIUM, +- KFD_PIPE_PRIORITY_CS_MEDIUM, +- KFD_PIPE_PRIORITY_CS_MEDIUM, +- KFD_PIPE_PRIORITY_CS_HIGH, +- KFD_PIPE_PRIORITY_CS_HIGH, +- KFD_PIPE_PRIORITY_CS_HIGH, +- KFD_PIPE_PRIORITY_CS_HIGH, +- KFD_PIPE_PRIORITY_CS_HIGH +-}; +- +-/* Mapping queue priority to SPI priority, indexed by queue priority +- * SPI priority 2 and 3 are reserved for trap handler context save +- */ +-int spi_priority_map[] = { +- KFD_SPI_PRIORITY_EXTRA_LOW, +- KFD_SPI_PRIORITY_EXTRA_LOW, +- 
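/*
 * Shape of the two queue-priority maps (16 levels each): queue priorities
 * 0-6 select pipe CS_LOW, 7-10 CS_MEDIUM and 11-15 CS_HIGH, while the SPI
 * map uses only EXTRA_LOW (0-7) and LOW (8-15) because SPI priorities 2
 * and 3 are reserved for trap-handler context save. So a queue created at
 * priority 11, for example, runs with
 * pipe_priority_map[11] == KFD_PIPE_PRIORITY_CS_HIGH and
 * spi_priority_map[11] == KFD_SPI_PRIORITY_LOW.
 */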
KFD_SPI_PRIORITY_EXTRA_LOW, +- KFD_SPI_PRIORITY_EXTRA_LOW, +- KFD_SPI_PRIORITY_EXTRA_LOW, +- KFD_SPI_PRIORITY_EXTRA_LOW, +- KFD_SPI_PRIORITY_EXTRA_LOW, +- KFD_SPI_PRIORITY_EXTRA_LOW, +- KFD_SPI_PRIORITY_LOW, +- KFD_SPI_PRIORITY_LOW, +- KFD_SPI_PRIORITY_LOW, +- KFD_SPI_PRIORITY_LOW, +- KFD_SPI_PRIORITY_LOW, +- KFD_SPI_PRIORITY_LOW, +- KFD_SPI_PRIORITY_LOW, +- KFD_SPI_PRIORITY_LOW +-}; +- + struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) + { + switch (dev->device_info->asic_family) { + case CHIP_KAVERI: + return mqd_manager_init_cik(type, dev); +- case CHIP_HAWAII: +- return mqd_manager_init_cik_hawaii(type, dev); + case CHIP_CARRIZO: + return mqd_manager_init_vi(type, dev); +- case CHIP_TONGA: +- case CHIP_FIJI: +- case CHIP_POLARIS10: +- case CHIP_POLARIS11: +- return mqd_manager_init_vi_tonga(type, dev); +- case CHIP_VEGA10: +- case CHIP_RAVEN: +- return mqd_manager_init_v9(type, dev); +- default: +- BUG(); + } + + return NULL; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h +index dcaeda8..213a71e 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h +@@ -43,9 +43,6 @@ + * + * @is_occupied: Checks if the relevant HQD slot is occupied. + * +- * @get_wave_state: Retrieves context save state and optionally copies the +- * control stack, if kept in the MQD, to the given userspace address. +- * + * @mqd_mutex: Mqd manager mutex. + * + * @dev: The kfd device structure coupled with this module. +@@ -62,8 +59,7 @@ + * per KFD_MQD_TYPE for each device. + * + */ +-extern int pipe_priority_map[]; +-extern int spi_priority_map[]; ++ + struct mqd_manager { + int (*init_mqd)(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, +@@ -71,8 +67,7 @@ struct mqd_manager { + + int (*load_mqd)(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, +- struct queue_properties *p, +- struct mm_struct *mms); ++ uint32_t __user *wptr); + + int (*update_mqd)(struct mqd_manager *mm, void *mqd, + struct queue_properties *q); +@@ -89,15 +84,6 @@ struct mqd_manager { + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id); + +- int (*get_wave_state)(struct mqd_manager *mm, void *mqd, +- void __user *ctl_stack, +- u32 *ctl_stack_used_size, +- u32 *save_area_used_size); +- +-#if defined(CONFIG_DEBUG_FS) +- int (*debugfs_show_mqd)(struct seq_file *m, void *data); +-#endif +- + struct mutex mqd_mutex; + struct kfd_dev *dev; + }; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +index 5724d33..6acc431 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +@@ -30,80 +30,12 @@ + #include "cik_regs.h" + #include "cik_structs.h" + #include "oss/oss_2_4_sh_mask.h" +-#include "gca/gfx_7_2_sh_mask.h" + + static inline struct cik_mqd *get_mqd(void *mqd) + { + return (struct cik_mqd *)mqd; + } + +-static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) +-{ +- return (struct cik_sdma_rlc_registers *)mqd; +-} +- +-static void update_cu_mask(struct mqd_manager *mm, void *mqd, +- struct queue_properties *q) +-{ +- struct cik_mqd *m; +- struct kfd_cu_info cu_info; +- uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ +- uint32_t cu_mask_count = q->cu_mask_count; +- const uint32_t *cu_mask = q->cu_mask; +- int se, cu_per_sh, cu_index, i; +- +- if (cu_mask_count == 0) +- return; 
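/*
 * Worked example of the per-SE split below, assuming a hypothetical part
 * with two shader engines of 8 CUs each and a user mask of 0x0ff0: SE0
 * consumes the low cu_per_sh bits of the flat mask and SE1 the next ones,
 * so
 *
 *   se_mask[0] = (0x0ff0 >> 0) & 0xff = 0xf0
 *   se_mask[1] = (0x0ff0 >> 8) & 0xff = 0x0f
 *
 * i.e. the user's flat CU mask is chopped into per-SE windows whose widths
 * come from the popcount of each engine's CU bitmaps.
 */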
+- +- m = get_mqd(mqd); +- m->compute_static_thread_mgmt_se0 = 0; +- m->compute_static_thread_mgmt_se1 = 0; +- m->compute_static_thread_mgmt_se2 = 0; +- m->compute_static_thread_mgmt_se3 = 0; +- +- mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); +- +- /* If # CU mask bits > # CUs, set it to the # of CUs */ +- if (cu_mask_count > cu_info.cu_active_number) +- cu_mask_count = cu_info.cu_active_number; +- +- cu_index = 0; +- for (se = 0; se < cu_info.num_shader_engines; se++) { +- cu_per_sh = 0; +- +- /* Get the number of CUs on this Shader Engine */ +- for (i = 0; i < 4; i++) +- cu_per_sh += hweight32(cu_info.cu_bitmap[se][i]); +- +- se_mask[se] = cu_mask[cu_index / 32] >> (cu_index % 32); +- if ((cu_per_sh + (cu_index % 32)) > 32) +- se_mask[se] |= cu_mask[(cu_index / 32) + 1] +- << (32 - (cu_index % 32)); +- se_mask[se] &= (1 << cu_per_sh) - 1; +- cu_index += cu_per_sh; +- } +- m->compute_static_thread_mgmt_se0 = se_mask[0]; +- m->compute_static_thread_mgmt_se1 = se_mask[1]; +- m->compute_static_thread_mgmt_se2 = se_mask[2]; +- m->compute_static_thread_mgmt_se3 = se_mask[3]; +- +- pr_debug("Update cu mask to %#x %#x %#x %#x\n", +- m->compute_static_thread_mgmt_se0, +- m->compute_static_thread_mgmt_se1, +- m->compute_static_thread_mgmt_se2, +- m->compute_static_thread_mgmt_se3); +-} +- +-static void set_priority(struct cik_mqd *m, struct queue_properties *q) +-{ +- m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; +- m->cp_hqd_queue_priority = q->priority; +- m->compute_pgm_rsrc1 = (m->compute_pgm_rsrc1 & +- (~COMPUTE_PGM_RSRC1__PRIORITY_MASK)) | +- (spi_priority_map[q->priority] << +- COMPUTE_PGM_RSRC1__PRIORITY__SHIFT); +-} +- + static int init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +@@ -112,6 +44,10 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, + struct cik_mqd *m; + int retval; + ++ BUG_ON(!mm || !q || !mqd); ++ ++ pr_debug("kfd: In func %s\n", __func__); ++ + retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct cik_mqd), + mqd_mem_obj); + +@@ -142,6 +78,10 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, + m->cp_mqd_base_addr_lo = lower_32_bits(addr); + m->cp_mqd_base_addr_hi = upper_32_bits(addr); + ++ m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE | IB_ATC_EN; ++ /* Although WinKFD writes this, I suspect it should not be necessary */ ++ m->cp_hqd_ib_control = IB_ATC_EN | DEFAULT_MIN_IB_AVAIL_SIZE; ++ + m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS | + QUANTUM_DURATION(10); + +@@ -154,17 +94,14 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, + * 1 = CS_MEDIUM (typically between HP3D and GFX + * 2 = CS_HIGH (typically above HP3D) + */ +- set_priority(m, q); ++ m->cp_hqd_pipe_priority = 1; ++ m->cp_hqd_queue_priority = 15; + + if (q->format == KFD_QUEUE_FORMAT_AQL) + m->cp_hqd_iq_rptr = AQL_ENABLE; + +- if (priv_cp_queues) +- m->cp_hqd_pq_control |= +- 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT; +- + *mqd = m; +- if (gart_addr) ++ if (gart_addr != NULL) + *gart_addr = addr; + retval = mm->update_mqd(mm, m, q); + +@@ -178,6 +115,8 @@ static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, + int retval; + struct cik_sdma_rlc_registers *m; + ++ BUG_ON(!mm || !mqd || !mqd_mem_obj); ++ + retval = kfd_gtt_sa_allocate(mm->dev, + sizeof(struct cik_sdma_rlc_registers), + mqd_mem_obj); +@@ -190,7 +129,7 @@ static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, + memset(m, 0, sizeof(struct cik_sdma_rlc_registers)); + + *mqd = m; +- if (gart_addr) 
++ if (gart_addr != NULL) + *gart_addr = (*mqd_mem_obj)->gpu_addr; + + retval = mm->update_mqd(mm, m, q); +@@ -201,50 +140,43 @@ static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, + static void uninit_mqd(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) + { ++ BUG_ON(!mm || !mqd); + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); + } + + static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) + { ++ BUG_ON(!mm || !mqd); + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); + } + + static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, +- uint32_t queue_id, struct queue_properties *p, +- struct mm_struct *mms) ++ uint32_t queue_id, uint32_t __user *wptr) + { +- /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ +- uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0); +- uint32_t wptr_mask = (uint32_t)((p->queue_size / sizeof(uint32_t)) - 1); +- +- return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, +- (uint32_t __user *)p->write_ptr, +- wptr_shift, wptr_mask, mms); ++ return mm->dev->kfd2kgd->hqd_load ++ (mm->dev->kgd, mqd, pipe_id, queue_id, wptr); + } + + static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, +- uint32_t pipe_id, uint32_t queue_id, +- struct queue_properties *p, struct mm_struct *mms) ++ uint32_t pipe_id, uint32_t queue_id, ++ uint32_t __user *wptr) + { +- return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, +- (uint32_t __user *)p->write_ptr, +- mms); ++ return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd); + } + +-static int __update_mqd(struct mqd_manager *mm, void *mqd, +- struct queue_properties *q, unsigned int atc_bit) ++static int update_mqd(struct mqd_manager *mm, void *mqd, ++ struct queue_properties *q) + { + struct cik_mqd *m; + ++ BUG_ON(!mm || !q || !mqd); ++ ++ pr_debug("kfd: In func %s\n", __func__); ++ + m = get_mqd(mqd); + m->cp_hqd_pq_control = DEFAULT_RPTR_BLOCK_SIZE | +- DEFAULT_MIN_AVAIL_SIZE; +- m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE; +- if (atc_bit) { +- m->cp_hqd_pq_control |= PQ_ATC_EN; +- m->cp_hqd_ib_control |= IB_ATC_EN; +- } ++ DEFAULT_MIN_AVAIL_SIZE | PQ_ATC_EN; + + /* + * Calculating queue size which is log base 2 of actual queue size -1 +@@ -256,47 +188,37 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); +- m->cp_hqd_pq_doorbell_control = DOORBELL_OFFSET(q->doorbell_off); ++ m->cp_hqd_pq_doorbell_control = DOORBELL_EN | ++ DOORBELL_OFFSET(q->doorbell_off); + + m->cp_hqd_vmid = q->vmid; + +- if (q->format == KFD_QUEUE_FORMAT_AQL) ++ if (q->format == KFD_QUEUE_FORMAT_AQL) { + m->cp_hqd_pq_control |= NO_UPDATE_RPTR; ++ } + +- update_cu_mask(mm, mqd, q); +- set_priority(m, q); +- ++ m->cp_hqd_active = 0; + q->is_active = false; + if (q->queue_size > 0 && + q->queue_address != 0 && +- q->queue_percent > 0 && +- !q->is_evicted) { ++ q->queue_percent > 0) { ++ m->cp_hqd_active = 1; + q->is_active = true; + } + + return 0; + } + +-static int update_mqd(struct mqd_manager *mm, void *mqd, +- struct queue_properties *q) +-{ +- return __update_mqd(mm, mqd, q, 1); +-} +- +-static int update_mqd_hawaii(struct mqd_manager *mm, void *mqd, +- struct queue_properties *q) +-{ +- return __update_mqd(mm, mqd, q, 0); +-} +- + static int update_mqd_sdma(struct mqd_manager *mm, void 
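/*
 * Units note for the wptr_shift/wptr_mask logic removed above: AQL write
 * pointers count 64-byte AQL packets while the CP consumes dword indices,
 * so AQL values are scaled by 16 (shift 4) and PM4 queues, which already
 * count dwords, use shift 0. E.g. an AQL wptr of 3 (three 64-byte packets)
 * becomes dword index 3 << 4 = 48. The mask wraps the pointer to the ring:
 * for a 4 KiB ring, (4096 / sizeof(uint32_t)) - 1 = 0x3ff.
 */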
*mqd, + struct queue_properties *q) + { + struct cik_sdma_rlc_registers *m; + ++ BUG_ON(!mm || !mqd || !q); ++ + m = get_sdma_mqd(mqd); +- m->sdma_rlc_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) +- << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | ++ m->sdma_rlc_rb_cntl = ffs(q->queue_size / sizeof(unsigned int)) << ++ SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | + q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | + 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | + 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; +@@ -305,8 +227,9 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, + m->sdma_rlc_rb_base_hi = upper_32_bits(q->queue_address >> 8); + m->sdma_rlc_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->sdma_rlc_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); +- m->sdma_rlc_doorbell = +- q->doorbell_off << SDMA0_RLC0_DOORBELL__OFFSET__SHIFT; ++ m->sdma_rlc_doorbell = q->doorbell_off << ++ SDMA0_RLC0_DOORBELL__OFFSET__SHIFT | ++ 1 << SDMA0_RLC0_DOORBELL__ENABLE__SHIFT; + + m->sdma_rlc_virtual_addr = q->sdma_vm_addr; + +@@ -316,8 +239,10 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, + q->is_active = false; + if (q->queue_size > 0 && + q->queue_address != 0 && +- q->queue_percent > 0 && +- !q->is_evicted) { ++ q->queue_percent > 0) { ++ m->sdma_rlc_rb_cntl |= ++ 1 << SDMA0_RLC0_RB_CNTL__RB_ENABLE__SHIFT; ++ + q->is_active = true; + } + +@@ -329,7 +254,7 @@ static int destroy_mqd(struct mqd_manager *mm, void *mqd, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) + { +- return mm->dev->kfd2kgd->hqd_destroy(mm->dev->kgd, mqd, type, timeout, ++ return mm->dev->kfd2kgd->hqd_destroy(mm->dev->kgd, type, timeout, + pipe_id, queue_id); + } + +@@ -376,6 +301,10 @@ static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct cik_mqd *m; + int retval; + ++ BUG_ON(!mm || !q || !mqd || !mqd_mem_obj); ++ ++ pr_debug("kfd: In func %s\n", __func__); ++ + retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct cik_mqd), + mqd_mem_obj); + +@@ -414,7 +343,8 @@ static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, + * 1 = CS_MEDIUM (typically between HP3D and GFX + * 2 = CS_HIGH (typically above HP3D) + */ +- set_priority(m, q); ++ m->cp_hqd_pipe_priority = 1; ++ m->cp_hqd_queue_priority = 15; + + *mqd = m; + if (gart_addr) +@@ -429,6 +359,10 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, + { + struct cik_mqd *m; + ++ BUG_ON(!mm || !q || !mqd); ++ ++ pr_debug("kfd: In func %s\n", __func__); ++ + m = get_mqd(mqd); + m->cp_hqd_pq_control = DEFAULT_RPTR_BLOCK_SIZE | + DEFAULT_MIN_AVAIL_SIZE | +@@ -445,50 +379,45 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); +- m->cp_hqd_pq_doorbell_control = DOORBELL_OFFSET(q->doorbell_off); ++ m->cp_hqd_pq_doorbell_control = DOORBELL_EN | ++ DOORBELL_OFFSET(q->doorbell_off); + + m->cp_hqd_vmid = q->vmid; + ++ m->cp_hqd_active = 0; + q->is_active = false; + if (q->queue_size > 0 && + q->queue_address != 0 && +- q->queue_percent > 0 && +- !q->is_evicted) { ++ q->queue_percent > 0) { ++ m->cp_hqd_active = 1; + q->is_active = true; + } + +- set_priority(m, q); + return 0; + } + +-#if defined(CONFIG_DEBUG_FS) +- +-static int debugfs_show_mqd(struct seq_file *m, void *data) ++struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) + { +- 
seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, +- data, sizeof(struct cik_mqd), false); +- return 0; +-} ++ struct cik_sdma_rlc_registers *m; + +-static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) +-{ +- seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, +- data, sizeof(struct cik_sdma_rlc_registers), false); +- return 0; +-} ++ BUG_ON(!mqd); + +-#endif ++ m = (struct cik_sdma_rlc_registers *)mqd; + ++ return m; ++} + + struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) + { + struct mqd_manager *mqd; + +- if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) +- return NULL; ++ BUG_ON(!dev); ++ BUG_ON(type >= KFD_MQD_TYPE_MAX); + +- mqd = kzalloc(sizeof(*mqd), GFP_NOIO); ++ pr_debug("kfd: In func %s\n", __func__); ++ ++ mqd = kzalloc(sizeof(struct mqd_manager), GFP_KERNEL); + if (!mqd) + return NULL; + +@@ -503,9 +432,6 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; +-#if defined(CONFIG_DEBUG_FS) +- mqd->debugfs_show_mqd = debugfs_show_mqd; +-#endif + break; + case KFD_MQD_TYPE_HIQ: + mqd->init_mqd = init_mqd_hiq; +@@ -514,9 +440,6 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, + mqd->update_mqd = update_mqd_hiq; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; +-#if defined(CONFIG_DEBUG_FS) +- mqd->debugfs_show_mqd = debugfs_show_mqd; +-#endif + break; + case KFD_MQD_TYPE_SDMA: + mqd->init_mqd = init_mqd_sdma; +@@ -525,9 +448,6 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, + mqd->update_mqd = update_mqd_sdma; + mqd->destroy_mqd = destroy_mqd_sdma; + mqd->is_occupied = is_occupied_sdma; +-#if defined(CONFIG_DEBUG_FS) +- mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; +-#endif + break; + default: + kfree(mqd); +@@ -537,15 +457,3 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, + return mqd; + } + +-struct mqd_manager *mqd_manager_init_cik_hawaii(enum KFD_MQD_TYPE type, +- struct kfd_dev *dev) +-{ +- struct mqd_manager *mqd; +- +- mqd = mqd_manager_init_cik(type, dev); +- if (!mqd) +- return NULL; +- if ((type == KFD_MQD_TYPE_CP) || (type == KFD_MQD_TYPE_COMPUTE)) +- mqd->update_mqd = update_mqd_hawaii; +- return mqd; +-} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +deleted file mode 100644 +index 6c302d2..0000000 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c ++++ /dev/null +@@ -1,528 +0,0 @@ +-/* +- * Copyright 2016 Advanced Micro Devices, Inc. +- * +- * Permission is hereby granted, free of charge, to any person obtaining a +- * copy of this software and associated documentation files (the "Software"), +- * to deal in the Software without restriction, including without limitation +- * the rights to use, copy, modify, merge, publish, distribute, sublicense, +- * and/or sell copies of the Software, and to permit persons to whom the +- * Software is furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in +- * all copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +- * OTHER DEALINGS IN THE SOFTWARE. +- * +- */ +- +-#include <linux/printk.h> +-#include <linux/slab.h> +-#include <linux/uaccess.h> +-#include "kfd_priv.h" +-#include "kfd_mqd_manager.h" +-#include "v9_structs.h" +-#include "vega10/GC/gc_9_0_offset.h" +-#include "vega10/GC/gc_9_0_sh_mask.h" +-#include "vega10/SDMA0/sdma0_4_0_sh_mask.h" +- +-static inline struct v9_mqd *get_mqd(void *mqd) +-{ +- return (struct v9_mqd *)mqd; +-} +- +-static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) +-{ +- return (struct v9_sdma_mqd *)mqd; +-} +- +-static void update_cu_mask(struct mqd_manager *mm, void *mqd, +- struct queue_properties *q) +-{ +- struct v9_mqd *m; +- struct kfd_cu_info cu_info; +- uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ +- uint32_t cu_mask_count = q->cu_mask_count; +- const uint32_t *cu_mask = q->cu_mask; +- int se, cu_per_sh, cu_index, i; +- +- if (cu_mask_count == 0) +- return; +- +- m = get_mqd(mqd); +- m->compute_static_thread_mgmt_se0 = 0; +- m->compute_static_thread_mgmt_se1 = 0; +- m->compute_static_thread_mgmt_se2 = 0; +- m->compute_static_thread_mgmt_se3 = 0; +- +- mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); +- +- /* If # CU mask bits > # CUs, set it to the # of CUs */ +- if (cu_mask_count > cu_info.cu_active_number) +- cu_mask_count = cu_info.cu_active_number; +- +- cu_index = 0; +- for (se = 0; se < cu_info.num_shader_engines; se++) { +- cu_per_sh = 0; +- +- /* Get the number of CUs on this Shader Engine */ +- for (i = 0; i < 4; i++) +- cu_per_sh += hweight32(cu_info.cu_bitmap[se][i]); +- +- se_mask[se] = cu_mask[cu_index / 32] >> (cu_index % 32); +- if ((cu_per_sh + (cu_index % 32)) > 32) +- se_mask[se] |= cu_mask[(cu_index / 32) + 1] +- << (32 - (cu_index % 32)); +- se_mask[se] &= (1 << cu_per_sh) - 1; +- cu_index += cu_per_sh; +- } +- m->compute_static_thread_mgmt_se0 = se_mask[0]; +- m->compute_static_thread_mgmt_se1 = se_mask[1]; +- m->compute_static_thread_mgmt_se2 = se_mask[2]; +- m->compute_static_thread_mgmt_se3 = se_mask[3]; +- +- pr_debug("update cu mask to %#x %#x %#x %#x\n", +- m->compute_static_thread_mgmt_se0, +- m->compute_static_thread_mgmt_se1, +- m->compute_static_thread_mgmt_se2, +- m->compute_static_thread_mgmt_se3); +-} +- +-static int init_mqd(struct mqd_manager *mm, void **mqd, +- struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, +- struct queue_properties *q) +-{ +- int retval; +- uint64_t addr; +- struct v9_mqd *m; +- struct kfd_dev *kfd = mm->dev; +- +- /* From V9, for CWSR, the control stack is located on the next page +- * boundary after the mqd, we will use the gtt allocation function +- * instead of sub-allocation function. 
+- */ +- if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) { +- *mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO); +- retval = kfd->kfd2kgd->init_gtt_mem_allocation(kfd->kgd, +- ALIGN(q->ctl_stack_size, PAGE_SIZE) + +- ALIGN(sizeof(struct v9_mqd), PAGE_SIZE), +- &((*mqd_mem_obj)->gtt_mem), +- &((*mqd_mem_obj)->gpu_addr), +- (void *)&((*mqd_mem_obj)->cpu_ptr)); +- } else +- retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd), +- mqd_mem_obj); +- if (retval != 0) +- return -ENOMEM; +- +- m = (struct v9_mqd *) (*mqd_mem_obj)->cpu_ptr; +- addr = (*mqd_mem_obj)->gpu_addr; +- +- memset(m, 0, sizeof(struct v9_mqd)); +- +- m->header = 0xC0310800; +- m->compute_pipelinestat_enable = 1; +- m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; +- m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; +- m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; +- m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; +- +- m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | +- 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; +- +- m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; +- +- m->cp_mqd_base_addr_lo = lower_32_bits(addr); +- m->cp_mqd_base_addr_hi = upper_32_bits(addr); +- +- m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | +- 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | +- 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; +- +- m->cp_hqd_pipe_priority = 1; +- m->cp_hqd_queue_priority = 15; +- +- if (q->format == KFD_QUEUE_FORMAT_AQL) { +- m->cp_hqd_aql_control = +- 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT; +- } +- +- if (q->tba_addr) { +- m->compute_pgm_rsrc2 |= +- (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); +- } +- +- if (mm->dev->cwsr_enabled) { +- m->cp_hqd_persistent_state |= +- (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); +- m->cp_hqd_ctx_save_base_addr_lo = +- lower_32_bits(q->ctx_save_restore_area_address); +- m->cp_hqd_ctx_save_base_addr_hi = +- upper_32_bits(q->ctx_save_restore_area_address); +- m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; +- m->cp_hqd_cntl_stack_size = q->ctl_stack_size; +- m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; +- m->cp_hqd_wg_state_offset = q->ctl_stack_size; +- } +- +- if (priv_cp_queues) +- m->cp_hqd_pq_control |= +- 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT; +- +- *mqd = m; +- if (gart_addr) +- *gart_addr = addr; +- retval = mm->update_mqd(mm, m, q); +- +- return retval; +-} +- +-static int load_mqd(struct mqd_manager *mm, void *mqd, +- uint32_t pipe_id, uint32_t queue_id, +- struct queue_properties *p, struct mm_struct *mms) +-{ +- /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ +- uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 
4 : 0); +- +- return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, +- (uint32_t __user *)p->write_ptr, +- wptr_shift, 0, mms); +-} +- +-static int update_mqd(struct mqd_manager *mm, void *mqd, +- struct queue_properties *q) +-{ +- struct v9_mqd *m; +- +- m = get_mqd(mqd); +- +- m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; +- m->cp_hqd_pq_control |= +- ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; +- pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); +- +- m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); +- m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); +- +- m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); +- m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); +- m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); +- m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); +- +- m->cp_hqd_pq_doorbell_control = +- q->doorbell_off << +- CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; +- pr_debug("cp_hqd_pq_doorbell_control 0x%x\n", +- m->cp_hqd_pq_doorbell_control); +- +- m->cp_hqd_ib_control = 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT; +- +- /* +- * HW does not clamp this field correctly. Maximum EOP queue size +- * is constrained by per-SE EOP done signal count, which is 8-bit. +- * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit +- * more than (EOP entry count - 1) so a queue size of 0x800 dwords +- * is safe, giving a maximum field value of 0xA. +- */ +- m->cp_hqd_eop_control = min(0xA, +- ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1); +- m->cp_hqd_eop_base_addr_lo = +- lower_32_bits(q->eop_ring_buffer_address >> 8); +- m->cp_hqd_eop_base_addr_hi = +- upper_32_bits(q->eop_ring_buffer_address >> 8); +- +- m->cp_hqd_iq_timer = 0; +- +- m->cp_hqd_vmid = q->vmid; +- +- if (q->format == KFD_QUEUE_FORMAT_AQL) { +- m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | +- 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT | +- 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT | +- 1 << CP_HQD_PQ_CONTROL__WPP_CLAMP_EN__SHIFT; +- m->cp_hqd_pq_doorbell_control |= +- 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT; +- } +- if (mm->dev->cwsr_enabled) +- m->cp_hqd_ctx_save_control = 0; +- +- update_cu_mask(mm, mqd, q); +- +- q->is_active = false; +- if (q->queue_size > 0 && +- q->queue_address != 0 && +- q->queue_percent > 0 && +- !q->is_evicted) { +- q->is_active = true; +- } +- +- return 0; +-} +- +- +-static int destroy_mqd(struct mqd_manager *mm, void *mqd, +- enum kfd_preempt_type type, +- unsigned int timeout, uint32_t pipe_id, +- uint32_t queue_id) +-{ +- return mm->dev->kfd2kgd->hqd_destroy +- (mm->dev->kgd, mqd, type, timeout, +- pipe_id, queue_id); +-} +- +-static void uninit_mqd(struct mqd_manager *mm, void *mqd, +- struct kfd_mem_obj *mqd_mem_obj) +-{ +- struct kfd_dev *kfd = mm->dev; +- +- if (mqd_mem_obj->gtt_mem) { +- kfd->kfd2kgd->free_gtt_mem(kfd->kgd, mqd_mem_obj->gtt_mem); +- kfree(mqd_mem_obj); +- } else { +- kfd_gtt_sa_free(mm->dev, mqd_mem_obj); +- } +-} +- +-static bool is_occupied(struct mqd_manager *mm, void *mqd, +- uint64_t queue_address, uint32_t pipe_id, +- uint32_t queue_id) +-{ +- return mm->dev->kfd2kgd->hqd_is_occupied( +- mm->dev->kgd, queue_address, +- pipe_id, queue_id); +-} +- +-static int get_wave_state(struct mqd_manager *mm, void *mqd, +- void __user *ctl_stack, +- u32 *ctl_stack_used_size, +- u32 *save_area_used_size) +-{ +- 
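/*
 * Buffer layout assumed by get_wave_state() below (the per-queue CWSR
 * area set up in init_mqd() above, with the control stack starting one
 * page after the MQD):
 *
 *   used control stack = cp_hqd_cntl_stack_size - cp_hqd_cntl_stack_offset
 *   used save area     = cp_hqd_wg_state_offset - cp_hqd_cntl_stack_size
 *
 * The reading that the control stack fills downward from the end of its
 * region, with the offset marking how far it has grown, is our inference;
 * the patch itself does not spell it out.
 */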
struct v9_mqd *m; +- +- /* Control stack is located one page after MQD. */ +- void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE); +- +- m = get_mqd(mqd); +- +- *ctl_stack_used_size = m->cp_hqd_cntl_stack_size - +- m->cp_hqd_cntl_stack_offset; +- *save_area_used_size = m->cp_hqd_wg_state_offset - +- m->cp_hqd_cntl_stack_size; +- +- if (copy_to_user(ctl_stack, mqd_ctl_stack, m->cp_hqd_cntl_stack_size)) +- return -EFAULT; +- +- return 0; +-} +- +-static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, +- struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, +- struct queue_properties *q) +-{ +- struct v9_mqd *m; +- int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); +- +- if (retval != 0) +- return retval; +- +- m = get_mqd(*mqd); +- +- m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | +- 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; +- +- return retval; +-} +- +-static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, +- struct queue_properties *q) +-{ +- struct v9_mqd *m; +- int retval = update_mqd(mm, mqd, q); +- +- if (retval != 0) +- return retval; +- +- /* TODO: what's the point? update_mqd already does this. */ +- m = get_mqd(mqd); +- m->cp_hqd_vmid = q->vmid; +- return retval; +-} +- +-static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, +- struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, +- struct queue_properties *q) +-{ +- int retval; +- struct v9_sdma_mqd *m; +- +- +- retval = kfd_gtt_sa_allocate(mm->dev, +- sizeof(struct v9_sdma_mqd), +- mqd_mem_obj); +- +- if (retval != 0) +- return -ENOMEM; +- +- m = (struct v9_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; +- +- memset(m, 0, sizeof(struct v9_sdma_mqd)); +- +- *mqd = m; +- if (gart_addr) +- *gart_addr = (*mqd_mem_obj)->gpu_addr; +- +- retval = mm->update_mqd(mm, m, q); +- +- return retval; +-} +- +-static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, +- struct kfd_mem_obj *mqd_mem_obj) +-{ +- kfd_gtt_sa_free(mm->dev, mqd_mem_obj); +-} +- +-static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, +- uint32_t pipe_id, uint32_t queue_id, +- struct queue_properties *p, struct mm_struct *mms) +-{ +- return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, +- (uint32_t __user *)p->write_ptr, +- mms); +-} +- +-#define SDMA_RLC_DUMMY_DEFAULT 0xf +- +-static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, +- struct queue_properties *q) +-{ +- struct v9_sdma_mqd *m; +- +- m = get_sdma_mqd(mqd); +- m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) +- << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | +- q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | +- 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | +- 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; +- +- m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); +- m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); +- m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); +- m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); +- m->sdmax_rlcx_doorbell_offset = +- q->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT; +- +- m->sdma_engine_id = q->sdma_engine_id; +- m->sdma_queue_id = q->sdma_queue_id; +- m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT; +- +- q->is_active = false; +- if (q->queue_size > 0 && +- q->queue_address != 0 && +- q->queue_percent > 0 && +- !q->is_evicted) { +- q->is_active = true; +- } +- +- return 0; +-} +- +-/* +- * * preempt type here is ignored because there is only one way +- * * to preempt sdma queue +- */ 
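/*
 * A minimal standalone sketch of the RB_SIZE encoding used by
 * update_mqd_sdma() above: the register field stores log2 of the ring
 * size in dwords, which the driver derives with ffs() (1-based index of
 * the lowest set bit, so a power-of-two size yields log2 + 1). The helper
 * name is ours, for illustration only.
 */
static unsigned int sdma_rb_size_field_sketch(unsigned int queue_size_bytes)
{
	/* e.g. a 4 KiB ring is 1024 dwords: ffs(1024) - 1 = 10 = log2(1024) */
	return ffs(queue_size_bytes / sizeof(unsigned int)) - 1;
}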
+-static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, +- enum kfd_preempt_type type, +- unsigned int timeout, uint32_t pipe_id, +- uint32_t queue_id) +-{ +- return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); +-} +- +-static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, +- uint64_t queue_address, uint32_t pipe_id, +- uint32_t queue_id) +-{ +- return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); +-} +- +-#if defined(CONFIG_DEBUG_FS) +- +-static int debugfs_show_mqd(struct seq_file *m, void *data) +-{ +- seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, +- data, sizeof(struct v9_mqd), false); +- return 0; +-} +- +-static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) +-{ +- seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, +- data, sizeof(struct v9_sdma_mqd), false); +- return 0; +-} +- +-#endif +- +-struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, +- struct kfd_dev *dev) +-{ +- struct mqd_manager *mqd; +- +- if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) +- return NULL; +- +- mqd = kzalloc(sizeof(*mqd), GFP_NOIO); +- if (!mqd) +- return NULL; +- +- mqd->dev = dev; +- +- switch (type) { +- case KFD_MQD_TYPE_CP: +- case KFD_MQD_TYPE_COMPUTE: +- mqd->init_mqd = init_mqd; +- mqd->uninit_mqd = uninit_mqd; +- mqd->load_mqd = load_mqd; +- mqd->update_mqd = update_mqd; +- mqd->destroy_mqd = destroy_mqd; +- mqd->is_occupied = is_occupied; +- mqd->get_wave_state = get_wave_state; +-#if defined(CONFIG_DEBUG_FS) +- mqd->debugfs_show_mqd = debugfs_show_mqd; +-#endif +- break; +- case KFD_MQD_TYPE_HIQ: +- mqd->init_mqd = init_mqd_hiq; +- mqd->uninit_mqd = uninit_mqd; +- mqd->load_mqd = load_mqd; +- mqd->update_mqd = update_mqd_hiq; +- mqd->destroy_mqd = destroy_mqd; +- mqd->is_occupied = is_occupied; +-#if defined(CONFIG_DEBUG_FS) +- mqd->debugfs_show_mqd = debugfs_show_mqd; +-#endif +- break; +- case KFD_MQD_TYPE_SDMA: +- mqd->init_mqd = init_mqd_sdma; +- mqd->uninit_mqd = uninit_mqd_sdma; +- mqd->load_mqd = load_mqd_sdma; +- mqd->update_mqd = update_mqd_sdma; +- mqd->destroy_mqd = destroy_mqd_sdma; +- mqd->is_occupied = is_occupied_sdma; +-#if defined(CONFIG_DEBUG_FS) +- mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; +-#endif +- break; +- default: +- kfree(mqd); +- return NULL; +- } +- +- return mqd; +-} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +index 5c26e5a..a9b9882 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +@@ -30,7 +30,6 @@ + #include "vi_structs.h" + #include "gca/gfx_8_0_sh_mask.h" + #include "gca/gfx_8_0_enum.h" +-#include "oss/oss_3_0_sh_mask.h" + + #define CP_MQD_CONTROL__PRIV_STATE__SHIFT 0x8 + +@@ -39,73 +38,6 @@ static inline struct vi_mqd *get_mqd(void *mqd) + return (struct vi_mqd *)mqd; + } + +-static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) +-{ +- return (struct vi_sdma_mqd *)mqd; +-} +- +-static void update_cu_mask(struct mqd_manager *mm, void *mqd, +- struct queue_properties *q) +-{ +- struct vi_mqd *m; +- struct kfd_cu_info cu_info; +- uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ +- uint32_t cu_mask_count = q->cu_mask_count; +- const uint32_t *cu_mask = q->cu_mask; +- int se, cu_per_sh, cu_index, i; +- +- if (cu_mask_count == 0) +- return; +- +- m = get_mqd(mqd); +- m->compute_static_thread_mgmt_se0 = 0; +- m->compute_static_thread_mgmt_se1 = 0; +- m->compute_static_thread_mgmt_se2 = 0; +- m->compute_static_thread_mgmt_se3 = 0; +- +- 
mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); +- +- /* If # CU mask bits > # CUs, set it to the # of CUs */ +- if (cu_mask_count > cu_info.cu_active_number) +- cu_mask_count = cu_info.cu_active_number; +- +- cu_index = 0; +- for (se = 0; se < cu_info.num_shader_engines; se++) { +- cu_per_sh = 0; +- +- /* Get the number of CUs on this Shader Engine */ +- for (i = 0; i < 4; i++) +- cu_per_sh += hweight32(cu_info.cu_bitmap[se][i]); +- +- se_mask[se] = cu_mask[cu_index / 32] >> (cu_index % 32); +- if ((cu_per_sh + (cu_index % 32)) > 32) +- se_mask[se] |= cu_mask[(cu_index / 32) + 1] +- << (32 - (cu_index % 32)); +- se_mask[se] &= (1 << cu_per_sh) - 1; +- cu_index += cu_per_sh; +- } +- m->compute_static_thread_mgmt_se0 = se_mask[0]; +- m->compute_static_thread_mgmt_se1 = se_mask[1]; +- m->compute_static_thread_mgmt_se2 = se_mask[2]; +- m->compute_static_thread_mgmt_se3 = se_mask[3]; +- +- pr_debug("Update cu mask to %#x %#x %#x %#x\n", +- m->compute_static_thread_mgmt_se0, +- m->compute_static_thread_mgmt_se1, +- m->compute_static_thread_mgmt_se2, +- m->compute_static_thread_mgmt_se3); +-} +- +-static void set_priority(struct vi_mqd *m, struct queue_properties *q) +-{ +- m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; +- m->cp_hqd_queue_priority = q->priority; +- m->compute_pgm_rsrc1 = (m->compute_pgm_rsrc1 & +- (~COMPUTE_PGM_RSRC1__PRIORITY_MASK)) | +- (spi_priority_map[q->priority] << +- COMPUTE_PGM_RSRC1__PRIORITY__SHIFT); +-} +- + static int init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +@@ -144,40 +76,16 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, + 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | + 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; + +- set_priority(m, q); ++ m->cp_hqd_pipe_priority = 1; ++ m->cp_hqd_queue_priority = 15; ++ + m->cp_hqd_eop_rptr = 1 << CP_HQD_EOP_RPTR__INIT_FETCHER__SHIFT; + + if (q->format == KFD_QUEUE_FORMAT_AQL) + m->cp_hqd_iq_rptr = 1; + +- if (q->tba_addr) { +- m->compute_tba_lo = lower_32_bits(q->tba_addr >> 8); +- m->compute_tba_hi = upper_32_bits(q->tba_addr >> 8); +- m->compute_tma_lo = lower_32_bits(q->tma_addr >> 8); +- m->compute_tma_hi = upper_32_bits(q->tma_addr >> 8); +- m->compute_pgm_rsrc2 |= +- (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); +- } +- +- if (mm->dev->cwsr_enabled) { +- m->cp_hqd_persistent_state |= +- (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); +- m->cp_hqd_ctx_save_base_addr_lo = +- lower_32_bits(q->ctx_save_restore_area_address); +- m->cp_hqd_ctx_save_base_addr_hi = +- upper_32_bits(q->ctx_save_restore_area_address); +- m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; +- m->cp_hqd_cntl_stack_size = q->ctl_stack_size; +- m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; +- m->cp_hqd_wg_state_offset = q->ctl_stack_size; +- } +- +- if (priv_cp_queues) +- m->cp_hqd_pq_control |= +- 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT; +- + *mqd = m; +- if (gart_addr) ++ if (gart_addr != NULL) + *gart_addr = addr; + retval = mm->update_mqd(mm, m, q); + +@@ -186,15 +94,10 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, + + static int load_mqd(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, +- struct queue_properties *p, struct mm_struct *mms) ++ uint32_t __user *wptr) + { +- /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ +- uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 
4 : 0); +- uint32_t wptr_mask = (uint32_t)((p->queue_size / sizeof(uint32_t)) - 1); +- +- return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, +- (uint32_t __user *)p->write_ptr, +- wptr_shift, wptr_mask, mms); ++ return mm->dev->kfd2kgd->hqd_load ++ (mm->dev->kgd, mqd, pipe_id, queue_id, wptr); + } + + static int __update_mqd(struct mqd_manager *mm, void *mqd, +@@ -203,6 +106,10 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, + { + struct vi_mqd *m; + ++ BUG_ON(!mm || !q || !mqd); ++ ++ pr_debug("kfd: In func %s\n", __func__); ++ + m = get_mqd(mqd); + + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT | +@@ -210,20 +117,19 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, + mtype << CP_HQD_PQ_CONTROL__MTYPE__SHIFT; + m->cp_hqd_pq_control |= + ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; +- pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); ++ pr_debug("kfd: cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); + + m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); +- m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); +- m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); + + m->cp_hqd_pq_doorbell_control = ++ 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_EN__SHIFT | + q->doorbell_off << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; +- pr_debug("cp_hqd_pq_doorbell_control 0x%x\n", ++ pr_debug("kfd: cp_hqd_pq_doorbell_control 0x%x\n", + m->cp_hqd_pq_doorbell_control); + + m->cp_hqd_eop_control = atc_bit << CP_HQD_EOP_CONTROL__EOP_ATC__SHIFT | +@@ -233,15 +139,8 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, + 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT | + mtype << CP_HQD_IB_CONTROL__MTYPE__SHIFT; + +- /* +- * HW does not clamp this field correctly. Maximum EOP queue size +- * is constrained by per-SE EOP done signal count, which is 8-bit. +- * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit +- * more than (EOP entry count - 1) so a queue size of 0x800 dwords +- * is safe, giving a maximum field value of 0xA. 
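 * Worked numbers (illustrative): an 8 KiB EOP buffer is 0x800 dwords,
 * and ffs(0x800) - 1 - 1 = 0xA, exactly the clamp ceiling; any larger
 * ring could otherwise overflow the 8-bit per-SE EOP done count.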
+- */ +- m->cp_hqd_eop_control |= min(0xA, +- ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1); ++ m->cp_hqd_eop_control |= ++ ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1; + m->cp_hqd_eop_base_addr_lo = + lower_32_bits(q->eop_ring_buffer_address >> 8); + m->cp_hqd_eop_base_addr_hi = +@@ -256,19 +155,13 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | + 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT; + } +- if (mm->dev->cwsr_enabled) +- m->cp_hqd_ctx_save_control = +- atc_bit << CP_HQD_CTX_SAVE_CONTROL__ATC__SHIFT | +- mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT; +- +- update_cu_mask(mm, mqd, q); +- set_priority(m, q); + ++ m->cp_hqd_active = 0; + q->is_active = false; + if (q->queue_size > 0 && + q->queue_address != 0 && +- q->queue_percent > 0 && +- !q->is_evicted) { ++ q->queue_percent > 0) { ++ m->cp_hqd_active = 1; + q->is_active = true; + } + +@@ -282,25 +175,20 @@ static int update_mqd(struct mqd_manager *mm, void *mqd, + return __update_mqd(mm, mqd, q, MTYPE_CC, 1); + } + +-static int update_mqd_tonga(struct mqd_manager *mm, void *mqd, +- struct queue_properties *q) +-{ +- return __update_mqd(mm, mqd, q, MTYPE_UC, 0); +-} +- + static int destroy_mqd(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) + { + return mm->dev->kfd2kgd->hqd_destroy +- (mm->dev->kgd, mqd, type, timeout, ++ (mm->dev->kgd, type, timeout, + pipe_id, queue_id); + } + + static void uninit_mqd(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) + { ++ BUG_ON(!mm || !mqd); + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); + } + +@@ -313,28 +201,6 @@ static bool is_occupied(struct mqd_manager *mm, void *mqd, + pipe_id, queue_id); + } + +-static int get_wave_state(struct mqd_manager *mm, void *mqd, +- void __user *ctl_stack, +- u32 *ctl_stack_used_size, +- u32 *save_area_used_size) +-{ +- struct vi_mqd *m; +- +- m = get_mqd(mqd); +- +- *ctl_stack_used_size = m->cp_hqd_cntl_stack_size - +- m->cp_hqd_cntl_stack_offset; +- *save_area_used_size = m->cp_hqd_wg_state_offset - +- m->cp_hqd_cntl_stack_size; +- +- /* Control stack is not copied to user mode for GFXv8 because +- * it's part of the context save area that is already +- * accessible to user mode +- */ +- +- return 0; +-} +- + static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +@@ -367,130 +233,17 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, + return retval; + } + +-static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, +- struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, +- struct queue_properties *q) +-{ +- int retval; +- struct vi_sdma_mqd *m; +- +- +- retval = kfd_gtt_sa_allocate(mm->dev, +- sizeof(struct vi_sdma_mqd), +- mqd_mem_obj); +- +- if (retval != 0) +- return -ENOMEM; +- +- m = (struct vi_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; +- +- memset(m, 0, sizeof(struct vi_sdma_mqd)); +- +- *mqd = m; +- if (gart_addr) +- *gart_addr = (*mqd_mem_obj)->gpu_addr; +- +- retval = mm->update_mqd(mm, m, q); +- +- return retval; +-} +- +-static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, +- struct kfd_mem_obj *mqd_mem_obj) +-{ +- kfd_gtt_sa_free(mm->dev, mqd_mem_obj); +-} +- +-static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, +- uint32_t pipe_id, uint32_t queue_id, +- struct queue_properties *p, struct mm_struct *mms) +-{ +- return 
mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, +- (uint32_t __user *)p->write_ptr, +- mms); +-} +- +-static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, +- struct queue_properties *q) +-{ +- struct vi_sdma_mqd *m; +- +- m = get_sdma_mqd(mqd); +- m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) +- << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | +- q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | +- 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | +- 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; +- +- m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); +- m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); +- m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); +- m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); +- m->sdmax_rlcx_doorbell = +- q->doorbell_off << SDMA0_RLC0_DOORBELL__OFFSET__SHIFT; +- +- m->sdmax_rlcx_virtual_addr = q->sdma_vm_addr; +- +- m->sdma_engine_id = q->sdma_engine_id; +- m->sdma_queue_id = q->sdma_queue_id; +- +- q->is_active = false; +- if (q->queue_size > 0 && +- q->queue_address != 0 && +- q->queue_percent > 0 && +- !q->is_evicted) { +- q->is_active = true; +- } +- +- return 0; +-} +- +-/* +- * * preempt type here is ignored because there is only one way +- * * to preempt sdma queue +- */ +-static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, +- enum kfd_preempt_type type, +- unsigned int timeout, uint32_t pipe_id, +- uint32_t queue_id) +-{ +- return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); +-} +- +-static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, +- uint64_t queue_address, uint32_t pipe_id, +- uint32_t queue_id) +-{ +- return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); +-} +- +-#if defined(CONFIG_DEBUG_FS) +- +-static int debugfs_show_mqd(struct seq_file *m, void *data) +-{ +- seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, +- data, sizeof(struct vi_mqd), false); +- return 0; +-} +- +-static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) +-{ +- seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, +- data, sizeof(struct vi_sdma_mqd), false); +- return 0; +-} +- +-#endif +- + struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) + { + struct mqd_manager *mqd; + +- if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) +- return NULL; ++ BUG_ON(!dev); ++ BUG_ON(type >= KFD_MQD_TYPE_MAX); ++ ++ pr_debug("kfd: In func %s\n", __func__); + +- mqd = kzalloc(sizeof(*mqd), GFP_NOIO); ++ mqd = kzalloc(sizeof(struct mqd_manager), GFP_KERNEL); + if (!mqd) + return NULL; + +@@ -505,10 +258,6 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; +- mqd->get_wave_state = get_wave_state; +-#if defined(CONFIG_DEBUG_FS) +- mqd->debugfs_show_mqd = debugfs_show_mqd; +-#endif + break; + case KFD_MQD_TYPE_HIQ: + mqd->init_mqd = init_mqd_hiq; +@@ -517,20 +266,8 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, + mqd->update_mqd = update_mqd_hiq; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; +-#if defined(CONFIG_DEBUG_FS) +- mqd->debugfs_show_mqd = debugfs_show_mqd; +-#endif + break; + case KFD_MQD_TYPE_SDMA: +- mqd->init_mqd = init_mqd_sdma; +- mqd->uninit_mqd = uninit_mqd_sdma; +- mqd->load_mqd = load_mqd_sdma; +- mqd->update_mqd = update_mqd_sdma; +- mqd->destroy_mqd = destroy_mqd_sdma; +- mqd->is_occupied = is_occupied_sdma; +-#if 
defined(CONFIG_DEBUG_FS) +- mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; +-#endif + break; + default: + kfree(mqd); +@@ -539,17 +276,3 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, + + return mqd; + } +- +-struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, +- struct kfd_dev *dev) +-{ +- struct mqd_manager *mqd; +- +- mqd = mqd_manager_init_vi(type, dev); +- if (!mqd) +- return NULL; +- if ((type == KFD_MQD_TYPE_CP) || (type == KFD_MQD_TYPE_COMPUTE)) +- mqd->update_mqd = update_mqd_tonga; +- return mqd; +-} +- +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +index 7cca7b4..7e92921 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +@@ -26,6 +26,8 @@ + #include "kfd_device_queue_manager.h" + #include "kfd_kernel_queue.h" + #include "kfd_priv.h" ++#include "kfd_pm4_headers.h" ++#include "kfd_pm4_headers_vi.h" + #include "kfd_pm4_opcodes.h" + + static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, +@@ -33,45 +35,47 @@ static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, + { + unsigned int temp = *wptr + increment_bytes / sizeof(uint32_t); + +- WARN((temp * sizeof(uint32_t)) > buffer_size_bytes, +- "Runlist IB overflow"); ++ BUG_ON((temp * sizeof(uint32_t)) > buffer_size_bytes); + *wptr = temp; + } + ++static unsigned int build_pm4_header(unsigned int opcode, size_t packet_size) ++{ ++ union PM4_MES_TYPE_3_HEADER header; ++ ++ header.u32all = 0; ++ header.opcode = opcode; ++ header.count = packet_size/sizeof(uint32_t) - 2; ++ header.type = PM4_TYPE_3; ++ ++ return header.u32all; ++} ++ + static void pm_calc_rlib_size(struct packet_manager *pm, + unsigned int *rlib_size, + bool *over_subscription) + { +- unsigned int process_count, queue_count, compute_queue_count; ++ unsigned int process_count, queue_count; + unsigned int map_queue_size; +- unsigned int max_proc_per_quantum = 1; + +- struct kfd_dev *dev = pm->dqm->dev; ++ BUG_ON(!pm || !rlib_size || !over_subscription); + + process_count = pm->dqm->processes_count; + queue_count = pm->dqm->queue_count; +- compute_queue_count = queue_count - pm->dqm->sdma_queue_count; +- +- /* check if there is over subscription +- * Note: the arbitration between the number of VMIDs and +- * hws_max_conc_proc has been done in +- * kgd2kfd_device_init(). +- */ + ++ /* check if there is over subscription*/ + *over_subscription = false; +- +- if (dev->max_proc_per_quantum > 1) +- max_proc_per_quantum = dev->max_proc_per_quantum; +- +- if ((process_count > max_proc_per_quantum) || +- compute_queue_count > get_queues_num(pm->dqm)) { ++ if ((process_count > 1) || queue_count > get_queues_num(pm->dqm)) { + *over_subscription = true; +- pr_debug("Over subscribed runlist\n"); ++ pr_debug("kfd: over subscribed runlist\n"); + } + +- map_queue_size = pm->pmf->get_map_queues_packet_size(); ++ map_queue_size = ++ (pm->dqm->dev->device_info->asic_family == CHIP_CARRIZO) ? 
++ sizeof(struct pm4_mes_map_queues) : ++ sizeof(struct pm4_map_queues); + /* calculate run list ib allocation size */ +- *rlib_size = process_count * pm->pmf->get_map_process_packet_size() + ++ *rlib_size = process_count * sizeof(struct pm4_map_process) + + queue_count * map_queue_size; + + /* +@@ -79,9 +83,9 @@ static void pm_calc_rlib_size(struct packet_manager *pm, + * when over subscription + */ + if (*over_subscription) +- *rlib_size += pm->pmf->get_runlist_packet_size(); ++ *rlib_size += sizeof(struct pm4_runlist); + +- pr_debug("runlist ib size %d\n", *rlib_size); ++ pr_debug("kfd: runlist ib size %d\n", *rlib_size); + } + + static int pm_allocate_runlist_ib(struct packet_manager *pm, +@@ -92,19 +96,18 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, + { + int retval; + +- if (WARN_ON(pm->allocated)) +- return -EINVAL; ++ BUG_ON(!pm); ++ BUG_ON(pm->allocated); ++ BUG_ON(is_over_subscription == NULL); + + pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription); + +- mutex_lock(&pm->lock); +- + retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size, + &pm->ib_buffer_obj); + +- if (retval) { +- pr_err("Failed to allocate runlist IB\n"); +- goto out; ++ if (retval != 0) { ++ pr_err("kfd: failed to allocate runlist IB\n"); ++ return retval; + } + + *(void **)rl_buffer = pm->ib_buffer_obj->cpu_ptr; +@@ -112,12 +115,198 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, + + memset(*rl_buffer, 0, *rl_buffer_size); + pm->allocated = true; +- +-out: +- mutex_unlock(&pm->lock); + return retval; + } + ++static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer, ++ uint64_t ib, size_t ib_size_in_dwords, bool chain) ++{ ++ struct pm4_runlist *packet; ++ ++ BUG_ON(!pm || !buffer || !ib); ++ ++ packet = (struct pm4_runlist *)buffer; ++ ++ memset(buffer, 0, sizeof(struct pm4_runlist)); ++ packet->header.u32all = build_pm4_header(IT_RUN_LIST, ++ sizeof(struct pm4_runlist)); ++ ++ packet->bitfields4.ib_size = ib_size_in_dwords; ++ packet->bitfields4.chain = chain ? 1 : 0; ++ packet->bitfields4.offload_polling = 0; ++ packet->bitfields4.valid = 1; ++ packet->ordinal2 = lower_32_bits(ib); ++ packet->bitfields3.ib_base_hi = upper_32_bits(ib); ++ ++ return 0; ++} ++ ++static int pm_create_map_process(struct packet_manager *pm, uint32_t *buffer, ++ struct qcm_process_device *qpd) ++{ ++ struct pm4_map_process *packet; ++ struct queue *cur; ++ uint32_t num_queues; ++ ++ BUG_ON(!pm || !buffer || !qpd); ++ ++ packet = (struct pm4_map_process *)buffer; ++ ++ pr_debug("kfd: In func %s\n", __func__); ++ ++ memset(buffer, 0, sizeof(struct pm4_map_process)); ++ ++ packet->header.u32all = build_pm4_header(IT_MAP_PROCESS, ++ sizeof(struct pm4_map_process)); ++ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; ++ packet->bitfields2.process_quantum = 1; ++ packet->bitfields2.pasid = qpd->pqm->process->pasid; ++ packet->bitfields3.page_table_base = qpd->page_table_base; ++ packet->bitfields10.gds_size = qpd->gds_size; ++ packet->bitfields10.num_gws = qpd->num_gws; ++ packet->bitfields10.num_oac = qpd->num_oac; ++ num_queues = 0; ++ list_for_each_entry(cur, &qpd->queues_list, list) ++ num_queues++; ++ packet->bitfields10.num_queues = (qpd->is_debug) ? 
0 : num_queues; ++ ++ packet->sh_mem_config = qpd->sh_mem_config; ++ packet->sh_mem_bases = qpd->sh_mem_bases; ++ packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; ++ packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; ++ ++ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); ++ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); ++ ++ return 0; ++} ++ ++static int pm_create_map_queue_vi(struct packet_manager *pm, uint32_t *buffer, ++ struct queue *q, bool is_static) ++{ ++ struct pm4_mes_map_queues *packet; ++ bool use_static = is_static; ++ ++ BUG_ON(!pm || !buffer || !q); ++ ++ pr_debug("kfd: In func %s\n", __func__); ++ ++ packet = (struct pm4_mes_map_queues *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_map_queues)); ++ ++ packet->header.u32all = build_pm4_header(IT_MAP_QUEUES, ++ sizeof(struct pm4_map_queues)); ++ packet->bitfields2.alloc_format = ++ alloc_format__mes_map_queues__one_per_pipe_vi; ++ packet->bitfields2.num_queues = 1; ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; ++ ++ packet->bitfields2.engine_sel = ++ engine_sel__mes_map_queues__compute_vi; ++ packet->bitfields2.queue_type = ++ queue_type__mes_map_queues__normal_compute_vi; ++ ++ switch (q->properties.type) { ++ case KFD_QUEUE_TYPE_COMPUTE: ++ if (use_static) ++ packet->bitfields2.queue_type = ++ queue_type__mes_map_queues__normal_latency_static_queue_vi; ++ break; ++ case KFD_QUEUE_TYPE_DIQ: ++ packet->bitfields2.queue_type = ++ queue_type__mes_map_queues__debug_interface_queue_vi; ++ break; ++ case KFD_QUEUE_TYPE_SDMA: ++ packet->bitfields2.engine_sel = ++ engine_sel__mes_map_queues__sdma0_vi; ++ use_static = false; /* no static queues under SDMA */ ++ break; ++ default: ++ pr_err("kfd: in %s queue type %d\n", __func__, ++ q->properties.type); ++ BUG(); ++ break; ++ } ++ packet->bitfields3.doorbell_offset = ++ q->properties.doorbell_off; ++ ++ packet->mqd_addr_lo = ++ lower_32_bits(q->gart_mqd_addr); ++ ++ packet->mqd_addr_hi = ++ upper_32_bits(q->gart_mqd_addr); ++ ++ packet->wptr_addr_lo = ++ lower_32_bits((uint64_t)q->properties.write_ptr); ++ ++ packet->wptr_addr_hi = ++ upper_32_bits((uint64_t)q->properties.write_ptr); ++ ++ return 0; ++} ++ ++static int pm_create_map_queue(struct packet_manager *pm, uint32_t *buffer, ++ struct queue *q, bool is_static) ++{ ++ struct pm4_map_queues *packet; ++ bool use_static = is_static; ++ ++ BUG_ON(!pm || !buffer || !q); ++ ++ pr_debug("kfd: In func %s\n", __func__); ++ ++ packet = (struct pm4_map_queues *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_map_queues)); ++ ++ packet->header.u32all = build_pm4_header(IT_MAP_QUEUES, ++ sizeof(struct pm4_map_queues)); ++ packet->bitfields2.alloc_format = ++ alloc_format__mes_map_queues__one_per_pipe; ++ packet->bitfields2.num_queues = 1; ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_map_queues__map_to_hws_determined_queue_slots; ++ ++ packet->bitfields2.vidmem = (q->properties.is_interop) ? 
++ vidmem__mes_map_queues__uses_video_memory : ++ vidmem__mes_map_queues__uses_no_video_memory; ++ ++ switch (q->properties.type) { ++ case KFD_QUEUE_TYPE_COMPUTE: ++ case KFD_QUEUE_TYPE_DIQ: ++ packet->bitfields2.engine_sel = ++ engine_sel__mes_map_queues__compute; ++ break; ++ case KFD_QUEUE_TYPE_SDMA: ++ packet->bitfields2.engine_sel = ++ engine_sel__mes_map_queues__sdma0; ++ use_static = false; /* no static queues under SDMA */ ++ break; ++ default: ++ BUG(); ++ break; ++ } ++ ++ packet->mes_map_queues_ordinals[0].bitfields3.doorbell_offset = ++ q->properties.doorbell_off; ++ ++ packet->mes_map_queues_ordinals[0].bitfields3.is_static = ++ (use_static) ? 1 : 0; ++ ++ packet->mes_map_queues_ordinals[0].mqd_addr_lo = ++ lower_32_bits(q->gart_mqd_addr); ++ ++ packet->mes_map_queues_ordinals[0].mqd_addr_hi = ++ upper_32_bits(q->gart_mqd_addr); ++ ++ packet->mes_map_queues_ordinals[0].wptr_addr_lo = ++ lower_32_bits((uint64_t)q->properties.write_ptr); ++ ++ packet->mes_map_queues_ordinals[0].wptr_addr_hi = ++ upper_32_bits((uint64_t)q->properties.write_ptr); ++ ++ return 0; ++} + + static int pm_create_runlist_ib(struct packet_manager *pm, + struct list_head *queues, +@@ -133,17 +322,19 @@ static int pm_create_runlist_ib(struct packet_manager *pm, + struct kernel_queue *kq; + bool is_over_subscription; + ++ BUG_ON(!pm || !queues || !rl_size_bytes || !rl_gpu_addr); ++ + rl_wptr = retval = proccesses_mapped = 0; + + retval = pm_allocate_runlist_ib(pm, &rl_buffer, rl_gpu_addr, + &alloc_size_bytes, &is_over_subscription); +- if (retval) ++ if (retval != 0) + return retval; + + *rl_size_bytes = alloc_size_bytes; +- pm->ib_size_bytes = alloc_size_bytes; + +- pr_debug("Building runlist ib process count: %d queues count %d\n", ++ pr_debug("kfd: In func %s\n", __func__); ++ pr_debug("kfd: building runlist ib process count: %d queues count %d\n", + pm->dqm->processes_count, pm->dqm->queue_count); + + /* build the run list ib packet */ +@@ -151,35 +342,42 @@ static int pm_create_runlist_ib(struct packet_manager *pm, + qpd = cur->qpd; + /* build map process packet */ + if (proccesses_mapped >= pm->dqm->processes_count) { +- pr_debug("Not enough space left in runlist IB\n"); ++ pr_debug("kfd: not enough space left in runlist IB\n"); + pm_release_ib(pm); + return -ENOMEM; + } + +- retval = pm->pmf->map_process(pm, &rl_buffer[rl_wptr], qpd); +- if (retval) ++ retval = pm_create_map_process(pm, &rl_buffer[rl_wptr], qpd); ++ if (retval != 0) + return retval; + + proccesses_mapped++; +- inc_wptr(&rl_wptr, pm->pmf->get_map_process_packet_size(), ++ inc_wptr(&rl_wptr, sizeof(struct pm4_map_process), + alloc_size_bytes); + + list_for_each_entry(kq, &qpd->priv_queue_list, list) { + if (!kq->queue->properties.is_active) + continue; + +- pr_debug("static_queue, mapping kernel q %d, is debug status %d\n", ++ pr_debug("kfd: static_queue, mapping kernel q %d, is debug status %d\n", + kq->queue->queue, qpd->is_debug); + +- retval = pm->pmf->map_queues(pm, ++ if (pm->dqm->dev->device_info->asic_family == ++ CHIP_CARRIZO) ++ retval = pm_create_map_queue_vi(pm, ++ &rl_buffer[rl_wptr], ++ kq->queue, ++ qpd->is_debug); ++ else ++ retval = pm_create_map_queue(pm, + &rl_buffer[rl_wptr], + kq->queue, + qpd->is_debug); +- if (retval) ++ if (retval != 0) + return retval; + + inc_wptr(&rl_wptr, +- pm->pmf->get_map_queues_packet_size(), ++ sizeof(struct pm4_map_queues), + alloc_size_bytes); + } + +@@ -187,74 +385,63 @@ static int pm_create_runlist_ib(struct packet_manager *pm, + if (!q->properties.is_active) + continue; + +- 
pr_debug("static_queue, mapping user queue %d, is debug status %d\n", ++ pr_debug("kfd: static_queue, mapping user queue %d, is debug status %d\n", + q->queue, qpd->is_debug); + +- retval = pm->pmf->map_queues(pm, ++ if (pm->dqm->dev->device_info->asic_family == ++ CHIP_CARRIZO) ++ retval = pm_create_map_queue_vi(pm, + &rl_buffer[rl_wptr], + q, + qpd->is_debug); +- if (retval) ++ else ++ retval = pm_create_map_queue(pm, ++ &rl_buffer[rl_wptr], ++ q, ++ qpd->is_debug); ++ ++ if (retval != 0) + return retval; + + inc_wptr(&rl_wptr, +- pm->pmf->get_map_queues_packet_size(), ++ sizeof(struct pm4_map_queues), + alloc_size_bytes); + } + } + +- pr_debug("Finished map process and queues to runlist\n"); ++ pr_debug("kfd: finished map process and queues to runlist\n"); + + if (is_over_subscription) +- retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr], +- *rl_gpu_addr, +- alloc_size_bytes / sizeof(uint32_t), +- true); ++ pm_create_runlist(pm, &rl_buffer[rl_wptr], *rl_gpu_addr, ++ alloc_size_bytes / sizeof(uint32_t), true); + + for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++) + pr_debug("0x%2X ", rl_buffer[i]); + pr_debug("\n"); + +- return retval; ++ return 0; + } + +-int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, +- uint16_t fw_ver) ++int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) + { ++ BUG_ON(!dqm); ++ + pm->dqm = dqm; + mutex_init(&pm->lock); + pm->priv_queue = kernel_queue_init(dqm->dev, KFD_QUEUE_TYPE_HIQ); +- if (!pm->priv_queue) { ++ if (pm->priv_queue == NULL) { + mutex_destroy(&pm->lock); + return -ENOMEM; + } + pm->allocated = false; + +- switch (pm->dqm->dev->device_info->asic_family) { +- case CHIP_KAVERI: +- case CHIP_HAWAII: +- kfd_pm_func_init_cik(pm, fw_ver); +- break; +- case CHIP_CARRIZO: +- case CHIP_TONGA: +- case CHIP_FIJI: +- case CHIP_POLARIS10: +- case CHIP_POLARIS11: +- kfd_pm_func_init_vi(pm, fw_ver); +- break; +- case CHIP_VEGA10: +- case CHIP_RAVEN: +- kfd_pm_func_init_v9(pm, fw_ver); +- break; +- default: +- BUG(); +- } +- + return 0; + } + + void pm_uninit(struct packet_manager *pm) + { ++ BUG_ON(!pm); ++ + mutex_destroy(&pm->lock); + kernel_queue_uninit(pm->priv_queue); + } +@@ -262,30 +449,45 @@ void pm_uninit(struct packet_manager *pm) + int pm_send_set_resources(struct packet_manager *pm, + struct scheduling_resources *res) + { +- uint32_t *buffer, size; +- int retval = 0; ++ struct pm4_set_resources *packet; ++ ++ BUG_ON(!pm || !res); ++ ++ pr_debug("kfd: In func %s\n", __func__); + +- size = pm->pmf->get_set_resources_packet_size(); + mutex_lock(&pm->lock); + pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, +- size / sizeof(uint32_t), +- (unsigned int **)&buffer); +- if (!buffer) { +- pr_err("Failed to allocate buffer on kernel queue\n"); +- retval = -ENOMEM; +- goto out; ++ sizeof(*packet) / sizeof(uint32_t), ++ (unsigned int **)&packet); ++ if (packet == NULL) { ++ mutex_unlock(&pm->lock); ++ pr_err("kfd: failed to allocate buffer on kernel queue\n"); ++ return -ENOMEM; + } + +- retval = pm->pmf->set_resources(pm, buffer, res); +- if (!retval) +- pm->priv_queue->ops.submit_packet(pm->priv_queue); +- else +- pm->priv_queue->ops.rollback_packet(pm->priv_queue); ++ memset(packet, 0, sizeof(struct pm4_set_resources)); ++ packet->header.u32all = build_pm4_header(IT_SET_RESOURCES, ++ sizeof(struct pm4_set_resources)); ++ ++ packet->bitfields2.queue_type = ++ queue_type__mes_set_resources__hsa_interface_queue_hiq; ++ packet->bitfields2.vmid_mask = res->vmid_mask; ++ packet->bitfields2.unmap_latency = 
KFD_UNMAP_LATENCY; ++ packet->bitfields7.oac_mask = res->oac_mask; ++ packet->bitfields8.gds_heap_base = res->gds_heap_base; ++ packet->bitfields8.gds_heap_size = res->gds_heap_size; ++ ++ packet->gws_mask_lo = lower_32_bits(res->gws_mask); ++ packet->gws_mask_hi = upper_32_bits(res->gws_mask); ++ ++ packet->queue_mask_lo = lower_32_bits(res->queue_mask); ++ packet->queue_mask_hi = upper_32_bits(res->queue_mask); ++ ++ pm->priv_queue->ops.submit_packet(pm->priv_queue); + +-out: + mutex_unlock(&pm->lock); + +- return retval; ++ return 0; + } + + int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) +@@ -295,25 +497,26 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) + size_t rl_ib_size, packet_size_dwords; + int retval; + ++ BUG_ON(!pm || !dqm_queues); ++ + retval = pm_create_runlist_ib(pm, dqm_queues, &rl_gpu_ib_addr, + &rl_ib_size); +- if (retval) ++ if (retval != 0) + goto fail_create_runlist_ib; + +- pr_debug("runlist IB address: 0x%llX\n", rl_gpu_ib_addr); ++ pr_debug("kfd: runlist IB address: 0x%llX\n", rl_gpu_ib_addr); + +- packet_size_dwords = pm->pmf->get_runlist_packet_size() / +- sizeof(uint32_t); ++ packet_size_dwords = sizeof(struct pm4_runlist) / sizeof(uint32_t); + mutex_lock(&pm->lock); + + retval = pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, + packet_size_dwords, &rl_buffer); +- if (retval) ++ if (retval != 0) + goto fail_acquire_packet_buffer; + +- retval = pm->pmf->runlist(pm, rl_buffer, rl_gpu_ib_addr, +- rl_ib_size / sizeof(uint32_t), false); +- if (retval) ++ retval = pm_create_runlist(pm, rl_buffer, rl_gpu_ib_addr, ++ rl_ib_size / sizeof(uint32_t), false); ++ if (retval != 0) + goto fail_create_runlist; + + pm->priv_queue->ops.submit_packet(pm->priv_queue); +@@ -327,72 +530,138 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) + fail_acquire_packet_buffer: + mutex_unlock(&pm->lock); + fail_create_runlist_ib: +- pm_release_ib(pm); ++ if (pm->allocated) ++ pm_release_ib(pm); + return retval; + } + + int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, + uint32_t fence_value) + { +- uint32_t *buffer, size; +- int retval = 0; ++ int retval; ++ struct pm4_query_status *packet; + +- if (WARN_ON(!fence_address)) +- return -EFAULT; ++ BUG_ON(!pm || !fence_address); + +- size = pm->pmf->get_query_status_packet_size(); + mutex_lock(&pm->lock); +- pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, +- size / sizeof(uint32_t), (unsigned int **)&buffer); +- if (!buffer) { +- pr_err("Failed to allocate buffer on kernel queue\n"); +- retval = -ENOMEM; +- goto out; +- } ++ retval = pm->priv_queue->ops.acquire_packet_buffer( ++ pm->priv_queue, ++ sizeof(struct pm4_query_status) / sizeof(uint32_t), ++ (unsigned int **)&packet); ++ if (retval != 0) ++ goto fail_acquire_packet_buffer; + +- retval = pm->pmf->query_status(pm, buffer, fence_address, fence_value); +- if (!retval) +- pm->priv_queue->ops.submit_packet(pm->priv_queue); +- else +- pm->priv_queue->ops.rollback_packet(pm->priv_queue); ++ packet->header.u32all = build_pm4_header(IT_QUERY_STATUS, ++ sizeof(struct pm4_query_status)); ++ ++ packet->bitfields2.context_id = 0; ++ packet->bitfields2.interrupt_sel = ++ interrupt_sel__mes_query_status__completion_status; ++ packet->bitfields2.command = ++ command__mes_query_status__fence_only_after_write_ack; ++ ++ packet->addr_hi = upper_32_bits((uint64_t)fence_address); ++ packet->addr_lo = lower_32_bits((uint64_t)fence_address); ++ packet->data_hi = 
upper_32_bits((uint64_t)fence_value); ++ packet->data_lo = lower_32_bits((uint64_t)fence_value); ++ ++ pm->priv_queue->ops.submit_packet(pm->priv_queue); ++ mutex_unlock(&pm->lock); ++ ++ return 0; + +-out: ++fail_acquire_packet_buffer: + mutex_unlock(&pm->lock); + return retval; + } + + int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, +- enum kfd_unmap_queues_filter filter, ++ enum kfd_preempt_type_filter mode, + uint32_t filter_param, bool reset, + unsigned int sdma_engine) + { +- uint32_t *buffer, size; +- int retval = 0; ++ int retval; ++ uint32_t *buffer; ++ struct pm4_unmap_queues *packet; ++ ++ BUG_ON(!pm); + +- size = pm->pmf->get_unmap_queues_packet_size(); + mutex_lock(&pm->lock); +- pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, +- size / sizeof(uint32_t), (unsigned int **)&buffer); +- if (!buffer) { +- pr_err("Failed to allocate buffer on kernel queue\n"); +- retval = -ENOMEM; +- goto out; ++ retval = pm->priv_queue->ops.acquire_packet_buffer( ++ pm->priv_queue, ++ sizeof(struct pm4_unmap_queues) / sizeof(uint32_t), ++ &buffer); ++ if (retval != 0) ++ goto err_acquire_packet_buffer; ++ ++ packet = (struct pm4_unmap_queues *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_unmap_queues)); ++ pr_debug("kfd: static_queue: unmapping queues: mode is %d , reset is %d , type is %d\n", ++ mode, reset, type); ++ packet->header.u32all = build_pm4_header(IT_UNMAP_QUEUES, ++ sizeof(struct pm4_unmap_queues)); ++ switch (type) { ++ case KFD_QUEUE_TYPE_COMPUTE: ++ case KFD_QUEUE_TYPE_DIQ: ++ packet->bitfields2.engine_sel = ++ engine_sel__mes_unmap_queues__compute; ++ break; ++ case KFD_QUEUE_TYPE_SDMA: ++ packet->bitfields2.engine_sel = ++ engine_sel__mes_unmap_queues__sdma0 + sdma_engine; ++ break; ++ default: ++ BUG(); ++ break; + } + +- retval = pm->pmf->unmap_queues(pm, buffer, type, filter, filter_param, +- reset, sdma_engine); +- if (!retval) +- pm->priv_queue->ops.submit_packet(pm->priv_queue); ++ if (reset) ++ packet->bitfields2.action = ++ action__mes_unmap_queues__reset_queues; + else +- pm->priv_queue->ops.rollback_packet(pm->priv_queue); ++ packet->bitfields2.action = ++ action__mes_unmap_queues__preempt_queues; ++ ++ switch (mode) { ++ case KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE: ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_unmap_queues__perform_request_on_specified_queues; ++ packet->bitfields2.num_queues = 1; ++ packet->bitfields3b.doorbell_offset0 = filter_param; ++ break; ++ case KFD_PREEMPT_TYPE_FILTER_BY_PASID: ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; ++ packet->bitfields3a.pasid = filter_param; ++ break; ++ case KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES: ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_unmap_queues__perform_request_on_all_active_queues; ++ break; ++ case KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES: ++ /* in this case, we do not preempt static queues */ ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_unmap_queues__perform_request_on_dynamic_queues_only; ++ break; ++ default: ++ BUG(); ++ break; ++ } ++ ++ pm->priv_queue->ops.submit_packet(pm->priv_queue); + +-out: ++ mutex_unlock(&pm->lock); ++ return 0; ++ ++err_acquire_packet_buffer: + mutex_unlock(&pm->lock); + return retval; + } + + void pm_release_ib(struct packet_manager *pm) + { ++ BUG_ON(!pm); ++ + mutex_lock(&pm->lock); + if (pm->allocated) { + kfd_gtt_sa_free(pm->dqm->dev, pm->ib_buffer_obj); +@@ -400,18 +669,3 @@ void pm_release_ib(struct packet_manager *pm) + } + mutex_unlock(&pm->lock); + } +- +-int 
pm_debugfs_runlist(struct seq_file *m, void *data) +-{ +- struct packet_manager *pm = data; +- +- if (!pm->allocated) { +- seq_puts(m, " No active runlist\n"); +- return 0; +- } +- +- seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, +- pm->ib_buffer_obj->cpu_ptr, pm->ib_size_bytes, false); +- +- return 0; +-} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c +index 1e06de0..6cfe7f1 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c +@@ -32,8 +32,7 @@ int kfd_pasid_init(void) + { + pasid_limit = KFD_MAX_NUM_OF_PROCESSES; + +- pasid_bitmap = kcalloc(BITS_TO_LONGS(pasid_limit), sizeof(long), +- GFP_KERNEL); ++ pasid_bitmap = kcalloc(BITS_TO_LONGS(pasid_limit), sizeof(long), GFP_KERNEL); + if (!pasid_bitmap) + return -ENOMEM; + +@@ -92,6 +91,6 @@ unsigned int kfd_pasid_alloc(void) + + void kfd_pasid_free(unsigned int pasid) + { +- if (!WARN_ON(pasid == 0 || pasid >= pasid_limit)) +- clear_bit(pasid, pasid_bitmap); ++ BUG_ON(pasid == 0 || pasid >= pasid_limit); ++ clear_bit(pasid, pasid_bitmap); + } +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c b/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c +deleted file mode 100644 +index 543ed83..0000000 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c ++++ /dev/null +@@ -1,513 +0,0 @@ +-/* +- * Copyright 2016 Advanced Micro Devices, Inc. +- * +- * Permission is hereby granted, free of charge, to any person obtaining a +- * copy of this software and associated documentation files (the "Software"), +- * to deal in the Software without restriction, including without limitation +- * the rights to use, copy, modify, merge, publish, distribute, sublicense, +- * and/or sell copies of the Software, and to permit persons to whom the +- * Software is furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in +- * all copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +- * OTHER DEALINGS IN THE SOFTWARE. +- */ +- +- +-/* NOTE: +- * +- * This file contains logic to dynamically detect and enable PeerDirect +- * support. PeerDirect support is delivered e.g. as part of OFED +- * from Mellanox. Because we are not able to rely on the fact that the +- * corresponding OFED will be installed, we should: +- * - copy PeerDirect definitions locally to avoid dependency on +- * corresponding header file +- * - try to dynamically detect the addresses of PeerDirect function +- * pointers. +- * +- * If dynamic detection fails, then PeerDirect support should be +- * enabled using the standard PeerDirect bridge driver from: +- * https://github.com/RadeonOpenCompute/ROCnRDMA +- * +- * +- * Logic to support PeerDirect relies only on the official public API, to be +- * as non-intrusive as possible.
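The NOTE above describes the detection strategy: resolve the PeerDirect entry points at runtime and degrade gracefully when they are absent. A minimal userspace analogue of that pattern, using dlsym() in place of the kernel's symbol_request(); the typedef and policy here are illustrative, not the driver's API:

    #include <stdio.h>
    #include <dlfcn.h>

    /* Signature modelled loosely on ib_register_peer_memory_client(). */
    typedef void *(*register_client_fn)(void *client, void **invalidate_cb);

    int main(void) /* link with -ldl */
    {
        void *self = dlopen(NULL, RTLD_NOW); /* search the running image */
        register_client_fn reg;

        if (!self)
            return 1;

        reg = (register_client_fn)dlsym(self, "ib_register_peer_memory_client");
        if (!reg) {
            /* Same policy as kfd_init_peer_direct(): a missing provider
             * is not an error, the feature simply stays disabled. */
            printf("PeerDirect interface was not detected\n");
            return 0;
        }
        printf("PeerDirect provider bound\n");
        return 0;
    }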
+- * +- **/ +- +-#include <linux/device.h> +-#include <linux/export.h> +-#include <linux/pid.h> +-#include <linux/err.h> +-#include <linux/slab.h> +-#include <linux/scatterlist.h> +-#include <linux/module.h> +- +-#include "kfd_priv.h" +- +- +- +-/* ----------------------- PeerDirect interface ------------------------------*/ +- +-/* +- * Copyright (c) 2013, Mellanox Technologies. All rights reserved. +- * +- * This software is available to you under a choice of one of two +- * licenses. You may choose to be licensed under the terms of the GNU +- * General Public License (GPL) Version 2, available from the file +- * COPYING in the main directory of this source tree, or the +- * OpenIB.org BSD license below: +- * +- * Redistribution and use in source and binary forms, with or +- * without modification, are permitted provided that the following +- * conditions are met: +- * +- * - Redistributions of source code must retain the above +- * copyright notice, this list of conditions and the following +- * disclaimer. +- * +- * - Redistributions in binary form must reproduce the above +- * copyright notice, this list of conditions and the following +- * disclaimer in the documentation and/or other materials +- * provided with the distribution. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +-#define IB_PEER_MEMORY_NAME_MAX 64 +-#define IB_PEER_MEMORY_VER_MAX 16 +- +-struct peer_memory_client { +- char name[IB_PEER_MEMORY_NAME_MAX]; +- char version[IB_PEER_MEMORY_VER_MAX]; +- /* acquire return code: 1-mine, 0-not mine */ +- int (*acquire)(unsigned long addr, size_t size, +- void *peer_mem_private_data, +- char *peer_mem_name, +- void **client_context); +- int (*get_pages)(unsigned long addr, +- size_t size, int write, int force, +- struct sg_table *sg_head, +- void *client_context, void *core_context); +- int (*dma_map)(struct sg_table *sg_head, void *client_context, +- struct device *dma_device, int dmasync, int *nmap); +- int (*dma_unmap)(struct sg_table *sg_head, void *client_context, +- struct device *dma_device); +- void (*put_pages)(struct sg_table *sg_head, void *client_context); +- unsigned long (*get_page_size)(void *client_context); +- void (*release)(void *client_context); +- void* (*get_context_private_data)(u64 peer_id); +- void (*put_context_private_data)(void *context); +-}; +- +-typedef int (*invalidate_peer_memory)(void *reg_handle, +- void *core_context); +- +-void *ib_register_peer_memory_client(struct peer_memory_client *peer_client, +- invalidate_peer_memory *invalidate_callback); +-void ib_unregister_peer_memory_client(void *reg_handle); +- +- +-/*------------------- PeerDirect bridge driver ------------------------------*/ +- +-#define AMD_PEER_BRIDGE_DRIVER_VERSION "1.0" +-#define AMD_PEER_BRIDGE_DRIVER_NAME "amdkfd" +- +- +-static void* (*pfn_ib_register_peer_memory_client)(struct peer_memory_client +- *peer_client, +- invalidate_peer_memory +- *invalidate_callback); +- +-static void (*pfn_ib_unregister_peer_memory_client)(void *reg_handle); +- +-static const struct amd_rdma_interface *rdma_interface; +- +-static 
invalidate_peer_memory ib_invalidate_callback; +-static void *ib_reg_handle; +- +-struct amd_mem_context { +- uint64_t va; +- uint64_t size; +- struct pid *pid; +- +- struct amd_p2p_info *p2p_info; +- +- /* Flag that free callback was called */ +- int free_callback_called; +- +- /* Context received from PeerDirect call */ +- void *core_context; +-}; +- +- +-static void free_callback(void *client_priv) +-{ +- struct amd_mem_context *mem_context = +- (struct amd_mem_context *)client_priv; +- +- pr_debug("data 0x%p\n", mem_context); +- +- if (!mem_context) { +- pr_warn("Invalid client context\n"); +- return; +- } +- +- pr_debug("mem_context->core_context 0x%p\n", mem_context->core_context); +- +- /* Call back IB stack asking to invalidate memory */ +- (*ib_invalidate_callback) (ib_reg_handle, mem_context->core_context); +- +- /* amdkfd will free resources when we return from this callback. +- * Set flag to inform that there is nothing to do on "put_pages", etc. +- */ +- ACCESS_ONCE(mem_context->free_callback_called) = 1; +-} +- +- +-static int amd_acquire(unsigned long addr, size_t size, +- void *peer_mem_private_data, +- char *peer_mem_name, void **client_context) +-{ +- int ret; +- struct amd_mem_context *mem_context; +- struct pid *pid; +- +- /* Get pointer to structure describing current process */ +- pid = get_task_pid(current, PIDTYPE_PID); +- +- pr_debug("addr:0x%lx,size:0x%x, pid 0x%p\n", +- addr, (unsigned int)size, pid); +- +- /* Check if address is handled by AMD GPU driver */ +- ret = rdma_interface->is_gpu_address(addr, pid); +- +- if (!ret) { +- pr_debug("Not GPU Address\n"); +- /* This is not a GPU address */ +- return 0; +- } +- +- pr_debug("GPU address\n"); +- +- /* Initialize context used for operation with given address */ +- mem_context = kzalloc(sizeof(*mem_context), GFP_KERNEL); +- +- if (!mem_context) +- return 0; /* Error case handled as not GPU address */ +- +- mem_context->free_callback_called = 0; +- mem_context->va = addr; +- mem_context->size = size; +- +- /* Save PID. It is guaranteed that the function will be +- * called in the correct process context, as opposed to the others.
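The free_callback() comment above documents a handshake: the callback fires while the owner is about to reclaim the resources, so it only records that it ran, and the later put_pages() path must check that flag before touching the mapping. A compact userspace sketch of the same pattern, substituting C11 atomics for the kernel's ACCESS_ONCE() (struct and function names are illustrative):

    #include <stdatomic.h>
    #include <stdio.h>

    struct mem_ctx {
        atomic_int free_callback_called;
        int mapped;
    };

    /* The owner reclaims resources right after this returns, so only
     * record that put_pages() has nothing left to do. */
    static void free_callback(struct mem_ctx *ctx)
    {
        atomic_store(&ctx->free_callback_called, 1);
    }

    static void put_pages(struct mem_ctx *ctx)
    {
        if (atomic_load(&ctx->free_callback_called)) {
            printf("free callback already ran, nothing to do\n");
            return;
        }
        ctx->mapped = 0;
        printf("pages released\n");
    }

    int main(void)
    {
        struct mem_ctx ctx = { .free_callback_called = 0, .mapped = 1 };

        free_callback(&ctx);
        put_pages(&ctx); /* takes the early-return path */
        return 0;
    }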
+- */ +- mem_context->pid = pid; +- +- pr_debug("Client context %p\n", mem_context); +- +- /* Return pointer to allocated context */ +- *client_context = mem_context; +- +- /* Return 1 to inform that this address will be handled +- * by the AMD GPU driver +- */ +- return 1; +-} +- +-static int amd_get_pages(unsigned long addr, size_t size, int write, int force, +- struct sg_table *sg_head, +- void *client_context, void *core_context) +-{ +- int ret; +- struct amd_mem_context *mem_context = +- (struct amd_mem_context *)client_context; +- +- pr_debug("addr:0x%lx,size:0x%x, core_context:%p\n", +- addr, (unsigned int)size, core_context); +- +- if (!mem_context) { +- pr_warn("Invalid client context\n"); +- return -EINVAL; +- } +- +- pr_debug("pid :0x%p\n", mem_context->pid); +- +- +- if (addr != mem_context->va) { +- pr_warn("Context address (0x%llx) is not the same\n", +- mem_context->va); +- return -EINVAL; +- } +- +- if (size != mem_context->size) { +- pr_warn("Context size (0x%llx) is not the same\n", +- mem_context->size); +- return -EINVAL; +- } +- +- ret = rdma_interface->get_pages(addr, +- size, +- mem_context->pid, +- &mem_context->p2p_info, +- free_callback, +- mem_context); +- +- if (ret || !mem_context->p2p_info) { +- pr_err("rdma::get_pages failed: %d\n", ret); +- return ret; +- } +- +- mem_context->core_context = core_context; +- +- /* Note: At this stage it is OK not to fill sg_table */ +- return 0; +-} +- +- +-static int amd_dma_map(struct sg_table *sg_head, void *client_context, +- struct device *dma_device, int dmasync, int *nmap) +-{ +- /* +- * NOTE/TODO: +- * We could potentially have three cases for real memory +- * location: +- * - all memory in local memory +- * - all memory in the system (RAM) +- * - memory is spread (s/g) between local and system. +- * +- * In the case of all memory in the system we could use the +- * iommu driver to build DMA addresses, but not in the case +- * of local memory, because currently the iommu driver doesn't +- * deal with local/device memory addresses (it requires "struct +- * page"). +- * +- * Accordingly, returning here assumes that iommu functionality +- * is disabled, so we can assume that the sg_table already +- * contains DMA addresses.
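The NOTE/TODO above argues that, with the IOMMU out of the picture, the dma_map callback degenerates into handing back the table the p2p layer already built. A simplified model of that shortcut; struct sg_table here is a flattened stand-in, not the kernel definition:

    #include <stdint.h>
    #include <stdio.h>

    /* Simplified stand-in for the kernel's struct sg_table. */
    struct sg_table {
        uint64_t *dma_addrs; /* already bus addresses, per the NOTE above */
        unsigned int nents;
    };

    static int dma_map_model(struct sg_table *sg_head,
                             const struct sg_table *p2p_pages, int *nmap)
    {
        if (!p2p_pages)
            return -1; /* mirrors the -EINVAL path in amd_dma_map() */

        *sg_head = *p2p_pages; /* shallow copy: no IOMMU work to do */
        *nmap = (int)p2p_pages->nents;
        return 0;
    }

    int main(void)
    {
        uint64_t addrs[2] = { 0x100000000ull, 0x100001000ull };
        struct sg_table p2p = { addrs, 2 }, out;
        int nmap;

        if (dma_map_model(&out, &p2p, &nmap) == 0)
            printf("mapped %d entries, first at 0x%llx\n", nmap,
                   (unsigned long long)out.dma_addrs[0]);
        return 0;
    }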
+- * +- */ +- struct amd_mem_context *mem_context = +- (struct amd_mem_context *)client_context; +- +- pr_debug("Context 0x%p, sg_head 0x%p\n", +- client_context, sg_head); +- +- pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n", +- mem_context->pid, +- mem_context->va, +- mem_context->size); +- +- if (!mem_context->p2p_info) { +- pr_err("No sg table was allocated\n"); +- return -EINVAL; +- } +- +- /* Copy information about previously allocated sg_table */ +- *sg_head = *mem_context->p2p_info->pages; +- +- /* Return number of pages */ +- *nmap = mem_context->p2p_info->pages->nents; +- +- return 0; +-} +- +-static int amd_dma_unmap(struct sg_table *sg_head, void *client_context, +- struct device *dma_device) +-{ +- struct amd_mem_context *mem_context = +- (struct amd_mem_context *)client_context; +- +- pr_debug("Context 0x%p, sg_table 0x%p\n", +- client_context, sg_head); +- +- pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n", +- mem_context->pid, +- mem_context->va, +- mem_context->size); +- +- /* Assume success */ +- return 0; +-} +-static void amd_put_pages(struct sg_table *sg_head, void *client_context) +-{ +- int ret = 0; +- struct amd_mem_context *mem_context = +- (struct amd_mem_context *)client_context; +- +- pr_debug("sg_head %p client_context: 0x%p\n", +- sg_head, client_context); +- pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n", +- mem_context->pid, +- mem_context->va, +- mem_context->size); +- +- pr_debug("mem_context->p2p_info %p\n", +- mem_context->p2p_info); +- +- if (ACCESS_ONCE(mem_context->free_callback_called)) { +- pr_debug("Free callback was called\n"); +- return; +- } +- +- if (mem_context->p2p_info) { +- ret = rdma_interface->put_pages(&mem_context->p2p_info); +- mem_context->p2p_info = NULL; +- +- if (ret) +- pr_err("Failure: %d (callback status %d)\n", +- ret, mem_context->free_callback_called); +- } else +- pr_err("Pointer to p2p info is null\n"); +-} +-static unsigned long amd_get_page_size(void *client_context) +-{ +- unsigned long page_size; +- int result; +- struct amd_mem_context *mem_context = +- (struct amd_mem_context *)client_context; +- +- pr_debug("context: %p\n", client_context); +- pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n", +- mem_context->pid, +- mem_context->va, +- mem_context->size); +- +- +- result = rdma_interface->get_page_size( +- mem_context->va, +- mem_context->size, +- mem_context->pid, +- &page_size); +- +- if (result) { +- pr_err("Could not get page size. %d\n", result); +- /* If we failed to get the page size then we do not know what to do. +- * Let's return some default value. +- */ +- return PAGE_SIZE; +- } +- +- return page_size; +-} +- +-static void amd_release(void *client_context) +-{ +- struct amd_mem_context *mem_context = +- (struct amd_mem_context *)client_context; +- +- pr_debug("context: 0x%p\n", client_context); +- pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n", +- mem_context->pid, +- mem_context->va, +- mem_context->size); +- +- kfree(mem_context); +-} +- +- +-static struct peer_memory_client amd_mem_client = { +- .acquire = amd_acquire, +- .get_pages = amd_get_pages, +- .dma_map = amd_dma_map, +- .dma_unmap = amd_dma_unmap, +- .put_pages = amd_put_pages, +- .get_page_size = amd_get_page_size, +- .release = amd_release, +- .get_context_private_data = NULL, +- .put_context_private_data = NULL, +-}; +- +-/** Initialize PeerDirect interface with RDMA Network stack. +- * +- * Because the network stack could potentially be loaded later, we check for +- * the presence of PeerDirect when an HSA process is created.
If PeerDirect was +- * already initialized, we do nothing; otherwise we try to detect and register it. +- */ +-void kfd_init_peer_direct(void) +-{ +- int result; +- +- if (pfn_ib_unregister_peer_memory_client) { +- pr_debug("PeerDirect support was already initialized\n"); +- return; +- } +- +- pr_debug("Try to initialize PeerDirect support\n"); +- +- pfn_ib_register_peer_memory_client = +- (void *(*)(struct peer_memory_client *, +- invalidate_peer_memory *)) +- symbol_request(ib_register_peer_memory_client); +- +- pfn_ib_unregister_peer_memory_client = (void (*)(void *)) +- symbol_request(ib_unregister_peer_memory_client); +- +- if (!pfn_ib_register_peer_memory_client || +- !pfn_ib_unregister_peer_memory_client) { +- pr_debug("PeerDirect interface was not detected\n"); +- /* Do cleanup */ +- kfd_close_peer_direct(); +- return; +- } +- +- result = amdkfd_query_rdma_interface(&rdma_interface); +- +- if (result < 0) { +- pr_err("Cannot get RDMA Interface (result = %d)\n", result); +- return; +- } +- +- strcpy(amd_mem_client.name, AMD_PEER_BRIDGE_DRIVER_NAME); +- strcpy(amd_mem_client.version, AMD_PEER_BRIDGE_DRIVER_VERSION); +- +- ib_reg_handle = pfn_ib_register_peer_memory_client(&amd_mem_client, +- &ib_invalidate_callback); +- +- if (!ib_reg_handle) { +- pr_err("Cannot register peer memory client\n"); +- /* Do cleanup */ +- kfd_close_peer_direct(); +- return; +- } +- +- pr_info("PeerDirect support was initialized successfully\n"); +-} +- +-/** +- * Close the connection with the PeerDirect interface of the RDMA Network stack. +- * +- */ +-void kfd_close_peer_direct(void) +-{ +- if (pfn_ib_unregister_peer_memory_client) { +- if (ib_reg_handle) +- pfn_ib_unregister_peer_memory_client(ib_reg_handle); +- +- symbol_put(ib_unregister_peer_memory_client); +- } +- +- if (pfn_ib_register_peer_memory_client) +- symbol_put(ib_register_peer_memory_client); +- +- +- /* Reset pointers to be safe */ +- pfn_ib_unregister_peer_memory_client = NULL; +- pfn_ib_register_peer_memory_client = NULL; +- ib_reg_handle = NULL; +-} +- +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h +index e50f73d..5b393f3 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h +@@ -28,19 +28,112 @@ + #define PM4_MES_HEADER_DEFINED + union PM4_MES_TYPE_3_HEADER { + struct { +- /* reserved */ +- uint32_t reserved1:8; +- /* IT opcode */ +- uint32_t opcode:8; +- /* number of DWORDs - 1 in the information body */ +- uint32_t count:14; +- /* packet identifier. It should be 3 for type 3 packets */ +- uint32_t type:2; ++ uint32_t reserved1:8; /* < reserved */ ++ uint32_t opcode:8; /* < IT opcode */ ++ uint32_t count:14; /* < number of DWORDs - 1 ++ * in the information body. ++ */ ++ uint32_t type:2; /* < packet identifier.
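The PM4_MES_TYPE_3_HEADER union above, together with build_pm4_header() in kfd_packet_manager.c, fixes the header layout: 8 reserved bits, 8 opcode bits, a 14-bit count equal to the packet's dword size minus two, and type 3 in the top two bits. A standalone sketch that packs and checks such a header; the opcode value is illustrative, and the bitfield layout assumes the same little-endian ABI the kernel header relies on:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    union pm4_type_3_header {
        struct {
            uint32_t reserved1:8;
            uint32_t opcode:8;
            uint32_t count:14; /* body dwords - 1 */
            uint32_t type:2;   /* always 3 for type-3 packets */
        };
        uint32_t u32all;
    };

    static uint32_t build_header(unsigned int opcode, size_t packet_size)
    {
        union pm4_type_3_header h = { .u32all = 0 };

        h.opcode = opcode;
        h.count = packet_size / sizeof(uint32_t) - 2;
        h.type = 3; /* PM4_TYPE_3 */
        return h.u32all;
    }

    int main(void)
    {
        /* A 60-byte packet is 15 dwords: 1 header + 14 body, so count is 13. */
        union pm4_type_3_header h = { .u32all = build_header(0x21, 60) };

        assert(h.count == 13 && h.type == 3);
        printf("header 0x%08X\n", (unsigned)h.u32all);
        return 0;
    }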
++ * It should be 3 for type 3 packets ++ */ + }; + uint32_t u32all; + }; + #endif /* PM4_MES_HEADER_DEFINED */ + ++/* --------------------MES_SET_RESOURCES-------------------- */ ++ ++#ifndef PM4_MES_SET_RESOURCES_DEFINED ++#define PM4_MES_SET_RESOURCES_DEFINED ++enum set_resources_queue_type_enum { ++ queue_type__mes_set_resources__kernel_interface_queue_kiq = 0, ++ queue_type__mes_set_resources__hsa_interface_queue_hiq = 1, ++ queue_type__mes_set_resources__hsa_debug_interface_queue = 4 ++}; ++ ++struct pm4_set_resources { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; ++ }; ++ ++ union { ++ struct { ++ uint32_t vmid_mask:16; ++ uint32_t unmap_latency:8; ++ uint32_t reserved1:5; ++ enum set_resources_queue_type_enum queue_type:3; ++ } bitfields2; ++ uint32_t ordinal2; ++ }; ++ ++ uint32_t queue_mask_lo; ++ uint32_t queue_mask_hi; ++ uint32_t gws_mask_lo; ++ uint32_t gws_mask_hi; ++ ++ union { ++ struct { ++ uint32_t oac_mask:16; ++ uint32_t reserved2:16; ++ } bitfields7; ++ uint32_t ordinal7; ++ }; ++ ++ union { ++ struct { ++ uint32_t gds_heap_base:6; ++ uint32_t reserved3:5; ++ uint32_t gds_heap_size:6; ++ uint32_t reserved4:15; ++ } bitfields8; ++ uint32_t ordinal8; ++ }; ++ ++}; ++#endif ++ ++/*--------------------MES_RUN_LIST-------------------- */ ++ ++#ifndef PM4_MES_RUN_LIST_DEFINED ++#define PM4_MES_RUN_LIST_DEFINED ++ ++struct pm4_runlist { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; ++ }; ++ ++ union { ++ struct { ++ uint32_t reserved1:2; ++ uint32_t ib_base_lo:30; ++ } bitfields2; ++ uint32_t ordinal2; ++ }; ++ ++ union { ++ struct { ++ uint32_t ib_base_hi:16; ++ uint32_t reserved2:16; ++ } bitfields3; ++ uint32_t ordinal3; ++ }; ++ ++ union { ++ struct { ++ uint32_t ib_size:20; ++ uint32_t chain:1; ++ uint32_t offload_polling:1; ++ uint32_t reserved3:1; ++ uint32_t valid:1; ++ uint32_t reserved4:8; ++ } bitfields4; ++ uint32_t ordinal4; ++ }; ++ ++}; ++#endif + + /*--------------------MES_MAP_PROCESS-------------------- */ + +@@ -93,58 +186,217 @@ struct pm4_map_process { + }; + #endif + +-#ifndef PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH +-#define PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH ++/*--------------------MES_MAP_QUEUES--------------------*/ ++ ++#ifndef PM4_MES_MAP_QUEUES_DEFINED ++#define PM4_MES_MAP_QUEUES_DEFINED ++enum map_queues_queue_sel_enum { ++ queue_sel__mes_map_queues__map_to_specified_queue_slots = 0, ++ queue_sel__mes_map_queues__map_to_hws_determined_queue_slots = 1, ++ queue_sel__mes_map_queues__enable_process_queues = 2 ++}; + +-struct pm4_map_process_scratch_kv { ++enum map_queues_vidmem_enum { ++ vidmem__mes_map_queues__uses_no_video_memory = 0, ++ vidmem__mes_map_queues__uses_video_memory = 1 ++}; ++ ++enum map_queues_alloc_format_enum { ++ alloc_format__mes_map_queues__one_per_pipe = 0, ++ alloc_format__mes_map_queues__all_on_one_pipe = 1 ++}; ++ ++enum map_queues_engine_sel_enum { ++ engine_sel__mes_map_queues__compute = 0, ++ engine_sel__mes_map_queues__sdma0 = 2, ++ engine_sel__mes_map_queues__sdma1 = 3 ++}; ++ ++struct pm4_map_queues { + union { +- union PM4_MES_TYPE_3_HEADER header; /* header */ +- uint32_t ordinal1; ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; + }; + + union { + struct { +- uint32_t pasid:16; +- uint32_t reserved1:8; +- uint32_t diq_enable:1; +- uint32_t process_quantum:7; ++ uint32_t reserved1:4; ++ enum map_queues_queue_sel_enum queue_sel:2; ++ uint32_t reserved2:2; ++ uint32_t vmid:4; ++ uint32_t reserved3:4; ++ enum 
map_queues_vidmem_enum vidmem:2; ++ uint32_t reserved4:6; ++ enum map_queues_alloc_format_enum alloc_format:2; ++ enum map_queues_engine_sel_enum engine_sel:3; ++ uint32_t num_queues:3; + } bitfields2; + uint32_t ordinal2; + }; + ++ struct { ++ union { ++ struct { ++ uint32_t is_static:1; ++ uint32_t reserved5:1; ++ uint32_t doorbell_offset:21; ++ uint32_t reserved6:3; ++ uint32_t queue:6; ++ } bitfields3; ++ uint32_t ordinal3; ++ }; ++ ++ uint32_t mqd_addr_lo; ++ uint32_t mqd_addr_hi; ++ uint32_t wptr_addr_lo; ++ uint32_t wptr_addr_hi; ++ ++ } mes_map_queues_ordinals[1]; /* 1..N of these ordinal groups */ ++ ++}; ++#endif ++ ++/*--------------------MES_QUERY_STATUS--------------------*/ ++ ++#ifndef PM4_MES_QUERY_STATUS_DEFINED ++#define PM4_MES_QUERY_STATUS_DEFINED ++enum query_status_interrupt_sel_enum { ++ interrupt_sel__mes_query_status__completion_status = 0, ++ interrupt_sel__mes_query_status__process_status = 1, ++ interrupt_sel__mes_query_status__queue_status = 2 ++}; ++ ++enum query_status_command_enum { ++ command__mes_query_status__interrupt_only = 0, ++ command__mes_query_status__fence_only_immediate = 1, ++ command__mes_query_status__fence_only_after_write_ack = 2, ++ command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3 ++}; ++ ++enum query_status_engine_sel_enum { ++ engine_sel__mes_query_status__compute = 0, ++ engine_sel__mes_query_status__sdma0_queue = 2, ++ engine_sel__mes_query_status__sdma1_queue = 3 ++}; ++ ++struct pm4_query_status { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; ++ }; ++ + union { + struct { +- uint32_t page_table_base:28; +- uint32_t reserved2:4; +- } bitfields3; ++ uint32_t context_id:28; ++ enum query_status_interrupt_sel_enum interrupt_sel:2; ++ enum query_status_command_enum command:2; ++ } bitfields2; ++ uint32_t ordinal2; ++ }; ++ ++ union { ++ struct { ++ uint32_t pasid:16; ++ uint32_t reserved1:16; ++ } bitfields3a; ++ struct { ++ uint32_t reserved2:2; ++ uint32_t doorbell_offset:21; ++ uint32_t reserved3:3; ++ enum query_status_engine_sel_enum engine_sel:3; ++ uint32_t reserved4:3; ++ } bitfields3b; + uint32_t ordinal3; + }; + +- uint32_t reserved3; +- uint32_t sh_mem_bases; +- uint32_t sh_mem_config; +- uint32_t sh_mem_ape1_base; +- uint32_t sh_mem_ape1_limit; +- uint32_t sh_hidden_private_base_vmid; +- uint32_t reserved4; +- uint32_t reserved5; +- uint32_t gds_addr_lo; +- uint32_t gds_addr_hi; ++ uint32_t addr_lo; ++ uint32_t addr_hi; ++ uint32_t data_lo; ++ uint32_t data_hi; ++}; ++#endif ++ ++/*--------------------MES_UNMAP_QUEUES--------------------*/ ++ ++#ifndef PM4_MES_UNMAP_QUEUES_DEFINED ++#define PM4_MES_UNMAP_QUEUES_DEFINED ++enum unmap_queues_action_enum { ++ action__mes_unmap_queues__preempt_queues = 0, ++ action__mes_unmap_queues__reset_queues = 1, ++ action__mes_unmap_queues__disable_process_queues = 2 ++}; ++ ++enum unmap_queues_queue_sel_enum { ++ queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0, ++ queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1, ++ queue_sel__mes_unmap_queues__perform_request_on_all_active_queues = 2, ++ queue_sel__mes_unmap_queues__perform_request_on_dynamic_queues_only = 3 ++}; ++ ++enum unmap_queues_engine_sel_enum { ++ engine_sel__mes_unmap_queues__compute = 0, ++ engine_sel__mes_unmap_queues__sdma0 = 2, ++ engine_sel__mes_unmap_queues__sdma1 = 3 ++}; ++ ++struct pm4_unmap_queues { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; ++ }; ++ ++ union { ++ struct { ++ enum 
unmap_queues_action_enum action:2; ++ uint32_t reserved1:2; ++ enum unmap_queues_queue_sel_enum queue_sel:2; ++ uint32_t reserved2:20; ++ enum unmap_queues_engine_sel_enum engine_sel:3; ++ uint32_t num_queues:3; ++ } bitfields2; ++ uint32_t ordinal2; ++ }; ++ ++ union { ++ struct { ++ uint32_t pasid:16; ++ uint32_t reserved3:16; ++ } bitfields3a; ++ struct { ++ uint32_t reserved4:2; ++ uint32_t doorbell_offset0:21; ++ uint32_t reserved5:9; ++ } bitfields3b; ++ uint32_t ordinal3; ++ }; + + union { + struct { +- uint32_t num_gws:6; + uint32_t reserved6:2; +- uint32_t num_oac:4; +- uint32_t reserved7:4; +- uint32_t gds_size:6; +- uint32_t num_queues:10; +- } bitfields14; +- uint32_t ordinal14; ++ uint32_t doorbell_offset1:21; ++ uint32_t reserved7:9; ++ } bitfields4; ++ uint32_t ordinal4; ++ }; ++ ++ union { ++ struct { ++ uint32_t reserved8:2; ++ uint32_t doorbell_offset2:21; ++ uint32_t reserved9:9; ++ } bitfields5; ++ uint32_t ordinal5; ++ }; ++ ++ union { ++ struct { ++ uint32_t reserved10:2; ++ uint32_t doorbell_offset3:21; ++ uint32_t reserved11:9; ++ } bitfields6; ++ uint32_t ordinal6; + }; + +- uint32_t completion_signal_lo32; +-uint32_t completion_signal_hi32; + }; + #endif + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h +deleted file mode 100644 +index ddad9be..0000000 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h ++++ /dev/null +@@ -1,583 +0,0 @@ +-/* +- * Copyright 2016 Advanced Micro Devices, Inc. +- * +- * Permission is hereby granted, free of charge, to any person obtaining a +- * copy of this software and associated documentation files (the "Software"), +- * to deal in the Software without restriction, including without limitation +- * the rights to use, copy, modify, merge, publish, distribute, sublicense, +- * and/or sell copies of the Software, and to permit persons to whom the +- * Software is furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in +- * all copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +- * OTHER DEALINGS IN THE SOFTWARE. +- * +- */ +- +-#ifndef F32_MES_PM4_PACKETS_H +-#define F32_MES_PM4_PACKETS_H +- +-#ifndef PM4_MES_HEADER_DEFINED +-#define PM4_MES_HEADER_DEFINED +-union PM4_MES_TYPE_3_HEADER { +- struct { +- uint32_t reserved1 : 8; /* < reserved */ +- uint32_t opcode : 8; /* < IT opcode */ +- uint32_t count : 14;/* < number of DWORDs - 1 in the +- * information body. +- */ +- uint32_t type : 2; /* < packet identifier. 
+- * It should be 3 for type 3 packets +- */ +- }; +- uint32_t u32All; +-}; +-#endif /* PM4_MES_HEADER_DEFINED */ +- +-/*--------------------MES_SET_RESOURCES--------------------*/ +- +-#ifndef PM4_MES_SET_RESOURCES_DEFINED +-#define PM4_MES_SET_RESOURCES_DEFINED +-enum mes_set_resources_queue_type_enum { +- queue_type__mes_set_resources__kernel_interface_queue_kiq = 0, +- queue_type__mes_set_resources__hsa_interface_queue_hiq = 1, +- queue_type__mes_set_resources__hsa_debug_interface_queue = 4 +-}; +- +- +-struct pm4_mes_set_resources { +- union { +- union PM4_MES_TYPE_3_HEADER header; /* header */ +- uint32_t ordinal1; +- }; +- +- union { +- struct { +- uint32_t vmid_mask:16; +- uint32_t unmap_latency:8; +- uint32_t reserved1:5; +- enum mes_set_resources_queue_type_enum queue_type:3; +- } bitfields2; +- uint32_t ordinal2; +- }; +- +- uint32_t queue_mask_lo; +- uint32_t queue_mask_hi; +- uint32_t gws_mask_lo; +- uint32_t gws_mask_hi; +- +- union { +- struct { +- uint32_t oac_mask:16; +- uint32_t reserved2:16; +- } bitfields7; +- uint32_t ordinal7; +- }; +- +- union { +- struct { +- uint32_t gds_heap_base:6; +- uint32_t reserved3:5; +- uint32_t gds_heap_size:6; +- uint32_t reserved4:15; +- } bitfields8; +- uint32_t ordinal8; +- }; +- +-}; +-#endif +- +-/*--------------------MES_RUN_LIST--------------------*/ +- +-#ifndef PM4_MES_RUN_LIST_DEFINED +-#define PM4_MES_RUN_LIST_DEFINED +- +-struct pm4_mes_runlist { +- union { +- union PM4_MES_TYPE_3_HEADER header; /* header */ +- uint32_t ordinal1; +- }; +- +- union { +- struct { +- uint32_t reserved1:2; +- uint32_t ib_base_lo:30; +- } bitfields2; +- uint32_t ordinal2; +- }; +- +- uint32_t ib_base_hi; +- +- union { +- struct { +- uint32_t ib_size:20; +- uint32_t chain:1; +- uint32_t offload_polling:1; +- uint32_t reserved2:1; +- uint32_t valid:1; +- uint32_t process_cnt:4; +- uint32_t reserved3:4; +- } bitfields4; +- uint32_t ordinal4; +- }; +- +-}; +-#endif +- +-/*--------------------MES_MAP_PROCESS--------------------*/ +- +-#ifndef PM4_MES_MAP_PROCESS_DEFINED +-#define PM4_MES_MAP_PROCESS_DEFINED +- +-struct pm4_mes_map_process { +- union { +- union PM4_MES_TYPE_3_HEADER header; /* header */ +- uint32_t ordinal1; +- }; +- +- union { +- struct { +- uint32_t pasid:16; +- uint32_t reserved1:8; +- uint32_t diq_enable:1; +- uint32_t process_quantum:7; +- } bitfields2; +- uint32_t ordinal2; +- }; +- +- uint32_t vm_context_page_table_base_addr_lo32; +- +- uint32_t vm_context_page_table_base_addr_hi32; +- +- uint32_t sh_mem_bases; +- +- uint32_t sh_mem_config; +- +- uint32_t sq_shader_tba_lo; +- +- uint32_t sq_shader_tba_hi; +- +- uint32_t sq_shader_tma_lo; +- +- uint32_t sq_shader_tma_hi; +- +- uint32_t reserved6; +- +- uint32_t gds_addr_lo; +- +- uint32_t gds_addr_hi; +- +- union { +- struct { +- uint32_t num_gws:6; +- uint32_t reserved7:1; +- uint32_t sdma_enable:1; +- uint32_t num_oac:4; +- uint32_t reserved8:4; +- uint32_t gds_size:6; +- uint32_t num_queues:10; +- } bitfields14; +- uint32_t ordinal14; +- }; +- +- uint32_t completion_signal_lo; +- +- uint32_t completion_signal_hi; +- +-}; +- +-#endif +- +-/*--------------------MES_MAP_PROCESS_VM--------------------*/ +- +-#ifndef PM4_MES_MAP_PROCESS_VM_DEFINED +-#define PM4_MES_MAP_PROCESS_VM_DEFINED +- +-struct PM4_MES_MAP_PROCESS_VM { +- union { +- union PM4_MES_TYPE_3_HEADER header; /* header */ +- uint32_t ordinal1; +- }; +- +- uint32_t reserved1; +- +- uint32_t vm_context_cntl; +- +- uint32_t reserved2; +- +- uint32_t vm_context_page_table_end_addr_lo32; +- +- uint32_t 
vm_context_page_table_end_addr_hi32; +- +- uint32_t vm_context_page_table_start_addr_lo32; +- +- uint32_t vm_context_page_table_start_addr_hi32; +- +- uint32_t reserved3; +- +- uint32_t reserved4; +- +- uint32_t reserved5; +- +- uint32_t reserved6; +- +- uint32_t reserved7; +- +- uint32_t reserved8; +- +- uint32_t completion_signal_lo32; +- +- uint32_t completion_signal_hi32; +- +-}; +-#endif +- +-/*--------------------MES_MAP_QUEUES--------------------*/ +- +-#ifndef PM4_MES_MAP_QUEUES_VI_DEFINED +-#define PM4_MES_MAP_QUEUES_VI_DEFINED +-enum mes_map_queues_queue_sel_enum { +- queue_sel__mes_map_queues__map_to_specified_queue_slots_vi = 0, +-queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi = 1 +-}; +- +-enum mes_map_queues_queue_type_enum { +- queue_type__mes_map_queues__normal_compute_vi = 0, +- queue_type__mes_map_queues__debug_interface_queue_vi = 1, +- queue_type__mes_map_queues__normal_latency_static_queue_vi = 2, +-queue_type__mes_map_queues__low_latency_static_queue_vi = 3 +-}; +- +-enum mes_map_queues_alloc_format_enum { +- alloc_format__mes_map_queues__one_per_pipe_vi = 0, +-alloc_format__mes_map_queues__all_on_one_pipe_vi = 1 +-}; +- +-enum mes_map_queues_engine_sel_enum { +- engine_sel__mes_map_queues__compute_vi = 0, +- engine_sel__mes_map_queues__sdma0_vi = 2, +- engine_sel__mes_map_queues__sdma1_vi = 3 +-}; +- +- +-struct pm4_mes_map_queues { +- union { +- union PM4_MES_TYPE_3_HEADER header; /* header */ +- uint32_t ordinal1; +- }; +- +- union { +- struct { +- uint32_t reserved1:4; +- enum mes_map_queues_queue_sel_enum queue_sel:2; +- uint32_t reserved2:15; +- enum mes_map_queues_queue_type_enum queue_type:3; +- enum mes_map_queues_alloc_format_enum alloc_format:2; +- enum mes_map_queues_engine_sel_enum engine_sel:3; +- uint32_t num_queues:3; +- } bitfields2; +- uint32_t ordinal2; +- }; +- +- union { +- struct { +- uint32_t reserved3:1; +- uint32_t check_disable:1; +- uint32_t doorbell_offset:26; +- uint32_t reserved4:4; +- } bitfields3; +- uint32_t ordinal3; +- }; +- +- uint32_t mqd_addr_lo; +- uint32_t mqd_addr_hi; +- uint32_t wptr_addr_lo; +- uint32_t wptr_addr_hi; +-}; +-#endif +- +-/*--------------------MES_QUERY_STATUS--------------------*/ +- +-#ifndef PM4_MES_QUERY_STATUS_DEFINED +-#define PM4_MES_QUERY_STATUS_DEFINED +-enum mes_query_status_interrupt_sel_enum { +- interrupt_sel__mes_query_status__completion_status = 0, +- interrupt_sel__mes_query_status__process_status = 1, +- interrupt_sel__mes_query_status__queue_status = 2 +-}; +- +-enum mes_query_status_command_enum { +- command__mes_query_status__interrupt_only = 0, +- command__mes_query_status__fence_only_immediate = 1, +- command__mes_query_status__fence_only_after_write_ack = 2, +- command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3 +-}; +- +-enum mes_query_status_engine_sel_enum { +- engine_sel__mes_query_status__compute = 0, +- engine_sel__mes_query_status__sdma0_queue = 2, +- engine_sel__mes_query_status__sdma1_queue = 3 +-}; +- +-struct pm4_mes_query_status { +- union { +- union PM4_MES_TYPE_3_HEADER header; /* header */ +- uint32_t ordinal1; +- }; +- +- union { +- struct { +- uint32_t context_id:28; +- enum mes_query_status_interrupt_sel_enum interrupt_sel:2; +- enum mes_query_status_command_enum command:2; +- } bitfields2; +- uint32_t ordinal2; +- }; +- +- union { +- struct { +- uint32_t pasid:16; +- uint32_t reserved1:16; +- } bitfields3a; +- struct { +- uint32_t reserved2:2; +- uint32_t doorbell_offset:26; +- enum mes_query_status_engine_sel_enum engine_sel:3; +- 
uint32_t reserved3:1; +- } bitfields3b; +- uint32_t ordinal3; +- }; +- +- uint32_t addr_lo; +- uint32_t addr_hi; +- uint32_t data_lo; +- uint32_t data_hi; +-}; +-#endif +- +-/*--------------------MES_UNMAP_QUEUES--------------------*/ +- +-#ifndef PM4_MES_UNMAP_QUEUES_DEFINED +-#define PM4_MES_UNMAP_QUEUES_DEFINED +-enum mes_unmap_queues_action_enum { +- action__mes_unmap_queues__preempt_queues = 0, +- action__mes_unmap_queues__reset_queues = 1, +- action__mes_unmap_queues__disable_process_queues = 2, +- action__mes_unmap_queues__reserved = 3 +-}; +- +-enum mes_unmap_queues_queue_sel_enum { +- queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0, +- queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1, +- queue_sel__mes_unmap_queues__unmap_all_queues = 2, +- queue_sel__mes_unmap_queues__unmap_all_non_static_queues = 3 +-}; +- +-enum mes_unmap_queues_engine_sel_enum { +- engine_sel__mes_unmap_queues__compute = 0, +- engine_sel__mes_unmap_queues__sdma0 = 2, +- engine_sel__mes_unmap_queues__sdmal = 3 +-}; +- +-struct pm4_mes_unmap_queues { +- union { +- union PM4_MES_TYPE_3_HEADER header; /* header */ +- uint32_t ordinal1; +- }; +- +- union { +- struct { +- enum mes_unmap_queues_action_enum action:2; +- uint32_t reserved1:2; +- enum mes_unmap_queues_queue_sel_enum queue_sel:2; +- uint32_t reserved2:20; +- enum mes_unmap_queues_engine_sel_enum engine_sel:3; +- uint32_t num_queues:3; +- } bitfields2; +- uint32_t ordinal2; +- }; +- +- union { +- struct { +- uint32_t pasid:16; +- uint32_t reserved3:16; +- } bitfields3a; +- struct { +- uint32_t reserved4:2; +- uint32_t doorbell_offset0:26; +- int32_t reserved5:4; +- } bitfields3b; +- uint32_t ordinal3; +- }; +- +- union { +- struct { +- uint32_t reserved6:2; +- uint32_t doorbell_offset1:26; +- uint32_t reserved7:4; +- } bitfields4; +- uint32_t ordinal4; +- }; +- +- union { +- struct { +- uint32_t reserved8:2; +- uint32_t doorbell_offset2:26; +- uint32_t reserved9:4; +- } bitfields5; +- uint32_t ordinal5; +- }; +- +- union { +- struct { +- uint32_t reserved10:2; +- uint32_t doorbell_offset3:26; +- uint32_t reserved11:4; +- } bitfields6; +- uint32_t ordinal6; +- }; +-}; +-#endif +- +-#ifndef PM4_MEC_RELEASE_MEM_DEFINED +-#define PM4_MEC_RELEASE_MEM_DEFINED +- +-enum mec_release_mem_event_index_enum { +- event_index__mec_release_mem__end_of_pipe = 5, +- event_index__mec_release_mem__shader_done = 6 +-}; +- +-enum mec_release_mem_cache_policy_enum { +- cache_policy__mec_release_mem__lru = 0, +- cache_policy__mec_release_mem__stream = 1 +-}; +- +-enum mec_release_mem_pq_exe_status_enum { +- pq_exe_status__mec_release_mem__default = 0, +- pq_exe_status__mec_release_mem__phase_update = 1 +-}; +- +-enum mec_release_mem_dst_sel_enum { +- dst_sel__mec_release_mem__memory_controller = 0, +- dst_sel__mec_release_mem__tc_l2 = 1, +- dst_sel__mec_release_mem__queue_write_pointer_register = 2, +- dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit = 3 +-}; +- +-enum mec_release_mem_int_sel_enum { +- int_sel__mec_release_mem__none = 0, +- int_sel__mec_release_mem__send_interrupt_only = 1, +- int_sel__mec_release_mem__send_interrupt_after_write_confirm = 2, +- int_sel__mec_release_mem__send_data_after_write_confirm = 3, +- int_sel__mec_release_mem__unconditionally_send_int_ctxid = 4, +- int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare = 5, +- int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare = 6 +-}; +- +-enum mec_release_mem_data_sel_enum { +- 
data_sel__mec_release_mem__none = 0, +- data_sel__mec_release_mem__send_32_bit_low = 1, +- data_sel__mec_release_mem__send_64_bit_data = 2, +- data_sel__mec_release_mem__send_gpu_clock_counter = 3, +- data_sel__mec_release_mem__send_cp_perfcounter_hi_lo = 4, +- data_sel__mec_release_mem__store_gds_data_to_memory = 5 +-}; +- +-struct pm4_mec_release_mem { +- union { +- union PM4_MES_TYPE_3_HEADER header; /*header */ +- unsigned int ordinal1; +- }; +- +- union { +- struct { +- unsigned int event_type:6; +- unsigned int reserved1:2; +- enum mec_release_mem_event_index_enum event_index:4; +- unsigned int tcl1_vol_action_ena:1; +- unsigned int tc_vol_action_ena:1; +- unsigned int reserved2:1; +- unsigned int tc_wb_action_ena:1; +- unsigned int tcl1_action_ena:1; +- unsigned int tc_action_ena:1; +- uint32_t reserved3:1; +- uint32_t tc_nc_action_ena:1; +- uint32_t tc_wc_action_ena:1; +- uint32_t tc_md_action_ena:1; +- uint32_t reserved4:3; +- enum mec_release_mem_cache_policy_enum cache_policy:2; +- uint32_t reserved5:2; +- enum mec_release_mem_pq_exe_status_enum pq_exe_status:1; +- uint32_t reserved6:2; +- } bitfields2; +- unsigned int ordinal2; +- }; +- +- union { +- struct { +- uint32_t reserved7:16; +- enum mec_release_mem_dst_sel_enum dst_sel:2; +- uint32_t reserved8:6; +- enum mec_release_mem_int_sel_enum int_sel:3; +- uint32_t reserved9:2; +- enum mec_release_mem_data_sel_enum data_sel:3; +- } bitfields3; +- unsigned int ordinal3; +- }; +- +- union { +- struct { +- uint32_t reserved10:2; +- unsigned int address_lo_32b:30; +- } bitfields4; +- struct { +- uint32_t reserved11:3; +- uint32_t address_lo_64b:29; +- } bitfields4b; +- uint32_t reserved12; +- unsigned int ordinal4; +- }; +- +- union { +- uint32_t address_hi; +- uint32_t reserved13; +- uint32_t ordinal5; +- }; +- +- union { +- uint32_t data_lo; +- uint32_t cmp_data_lo; +- struct { +- uint32_t dw_offset:16; +- uint32_t num_dwords:16; +- } bitfields6c; +- uint32_t reserved14; +- uint32_t ordinal6; +- }; +- +- union { +- uint32_t data_hi; +- uint32_t cmp_data_hi; +- uint32_t reserved15; +- uint32_t reserved16; +- uint32_t ordinal7; +- }; +- +- uint32_t int_ctxid; +- +-}; +- +-#endif +- +-enum { +- CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 +-}; +-#endif +- +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h +index 0b314a8..a0ff348 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h +@@ -77,6 +77,103 @@ struct pm4__indirect_buffer_pasid { + + #endif + ++/*--------------------_RELEASE_MEM-------------------- */ ++ ++#ifndef _PM4__RELEASE_MEM_DEFINED ++#define _PM4__RELEASE_MEM_DEFINED ++enum _RELEASE_MEM_event_index_enum { ++ event_index___release_mem__end_of_pipe = 5, ++ event_index___release_mem__shader_done = 6 ++}; ++ ++enum _RELEASE_MEM_cache_policy_enum { ++ cache_policy___release_mem__lru = 0, ++ cache_policy___release_mem__stream = 1, ++ cache_policy___release_mem__bypass = 2 ++}; ++ ++enum _RELEASE_MEM_dst_sel_enum { ++ dst_sel___release_mem__memory_controller = 0, ++ dst_sel___release_mem__tc_l2 = 1, ++ dst_sel___release_mem__queue_write_pointer_register = 2, ++ dst_sel___release_mem__queue_write_pointer_poll_mask_bit = 3 ++}; ++ ++enum _RELEASE_MEM_int_sel_enum { ++ int_sel___release_mem__none = 0, ++ int_sel___release_mem__send_interrupt_only = 1, ++ int_sel___release_mem__send_interrupt_after_write_confirm = 2, ++ int_sel___release_mem__send_data_after_write_confirm = 3 ++}; ++ ++enum 
_RELEASE_MEM_data_sel_enum { ++ data_sel___release_mem__none = 0, ++ data_sel___release_mem__send_32_bit_low = 1, ++ data_sel___release_mem__send_64_bit_data = 2, ++ data_sel___release_mem__send_gpu_clock_counter = 3, ++ data_sel___release_mem__send_cp_perfcounter_hi_lo = 4, ++ data_sel___release_mem__store_gds_data_to_memory = 5 ++}; ++ ++struct pm4__release_mem { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /*header */ ++ unsigned int ordinal1; ++ }; ++ ++ union { ++ struct { ++ unsigned int event_type:6; ++ unsigned int reserved1:2; ++ enum _RELEASE_MEM_event_index_enum event_index:4; ++ unsigned int tcl1_vol_action_ena:1; ++ unsigned int tc_vol_action_ena:1; ++ unsigned int reserved2:1; ++ unsigned int tc_wb_action_ena:1; ++ unsigned int tcl1_action_ena:1; ++ unsigned int tc_action_ena:1; ++ unsigned int reserved3:6; ++ unsigned int atc:1; ++ enum _RELEASE_MEM_cache_policy_enum cache_policy:2; ++ unsigned int reserved4:5; ++ } bitfields2; ++ unsigned int ordinal2; ++ }; ++ ++ union { ++ struct { ++ unsigned int reserved5:16; ++ enum _RELEASE_MEM_dst_sel_enum dst_sel:2; ++ unsigned int reserved6:6; ++ enum _RELEASE_MEM_int_sel_enum int_sel:3; ++ unsigned int reserved7:2; ++ enum _RELEASE_MEM_data_sel_enum data_sel:3; ++ } bitfields3; ++ unsigned int ordinal3; ++ }; ++ ++ union { ++ struct { ++ unsigned int reserved8:2; ++ unsigned int address_lo_32b:30; ++ } bitfields4; ++ struct { ++ unsigned int reserved9:3; ++ unsigned int address_lo_64b:29; ++ } bitfields5; ++ unsigned int ordinal4; ++ }; ++ ++ unsigned int address_hi; ++ ++ unsigned int data_lo; ++ ++ unsigned int data_hi; ++ ++}; ++#endif ++ ++ + /*--------------------_SET_CONFIG_REG-------------------- */ + + #ifndef _PM4__SET_CONFIG_REG_DEFINED +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h +index 7c8d9b3..08c7219 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h +@@ -30,12 +30,10 @@ union PM4_MES_TYPE_3_HEADER { + struct { + uint32_t reserved1 : 8; /* < reserved */ + uint32_t opcode : 8; /* < IT opcode */ +- uint32_t count : 14;/* < Number of DWORDS - 1 in the +- * information body +- */ +- uint32_t type : 2; /* < packet identifier +- * It should be 3 for type 3 packets +- */ ++ uint32_t count : 14;/* < number of DWORDs - 1 in the ++ information body. */ ++ uint32_t type : 2; /* < packet identifier. 
++ It should be 3 for type 3 packets */ + }; + uint32_t u32All; + }; +@@ -126,10 +124,9 @@ struct pm4_mes_runlist { + uint32_t ib_size:20; + uint32_t chain:1; + uint32_t offload_polling:1; +- uint32_t reserved2:1; ++ uint32_t reserved3:1; + uint32_t valid:1; +- uint32_t process_cnt:4; +- uint32_t reserved3:4; ++ uint32_t reserved4:8; + } bitfields4; + uint32_t ordinal4; + }; +@@ -144,8 +141,8 @@ struct pm4_mes_runlist { + + struct pm4_mes_map_process { + union { +- union PM4_MES_TYPE_3_HEADER header; /* header */ +- uint32_t ordinal1; ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; + }; + + union { +@@ -156,48 +153,36 @@ struct pm4_mes_map_process { + uint32_t process_quantum:7; + } bitfields2; + uint32_t ordinal2; +- }; ++}; + + union { + struct { + uint32_t page_table_base:28; +- uint32_t reserved3:4; ++ uint32_t reserved2:4; + } bitfields3; + uint32_t ordinal3; + }; + +- uint32_t reserved; +- + uint32_t sh_mem_bases; +- uint32_t sh_mem_config; + uint32_t sh_mem_ape1_base; + uint32_t sh_mem_ape1_limit; +- +- uint32_t sh_hidden_private_base_vmid; +- +- uint32_t reserved2; +- uint32_t reserved3; +- ++ uint32_t sh_mem_config; + uint32_t gds_addr_lo; + uint32_t gds_addr_hi; + + union { + struct { + uint32_t num_gws:6; +- uint32_t reserved4:2; ++ uint32_t reserved3:2; + uint32_t num_oac:4; +- uint32_t reserved5:4; ++ uint32_t reserved4:4; + uint32_t gds_size:6; + uint32_t num_queues:10; + } bitfields10; + uint32_t ordinal10; + }; + +- uint32_t completion_signal_lo; +- uint32_t completion_signal_hi; +- + }; +- + #endif + + /*--------------------MES_MAP_QUEUES--------------------*/ +@@ -350,7 +335,7 @@ enum mes_unmap_queues_engine_sel_enum { + engine_sel__mes_unmap_queues__sdmal = 3 + }; + +-struct pm4_mes_unmap_queues { ++struct PM4_MES_UNMAP_QUEUES { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; +@@ -410,101 +395,4 @@ struct pm4_mes_unmap_queues { + }; + #endif + +-#ifndef PM4_MEC_RELEASE_MEM_DEFINED +-#define PM4_MEC_RELEASE_MEM_DEFINED +-enum RELEASE_MEM_event_index_enum { +- event_index___release_mem__end_of_pipe = 5, +- event_index___release_mem__shader_done = 6 +-}; +- +-enum RELEASE_MEM_cache_policy_enum { +- cache_policy___release_mem__lru = 0, +- cache_policy___release_mem__stream = 1, +- cache_policy___release_mem__bypass = 2 +-}; +- +-enum RELEASE_MEM_dst_sel_enum { +- dst_sel___release_mem__memory_controller = 0, +- dst_sel___release_mem__tc_l2 = 1, +- dst_sel___release_mem__queue_write_pointer_register = 2, +- dst_sel___release_mem__queue_write_pointer_poll_mask_bit = 3 +-}; +- +-enum RELEASE_MEM_int_sel_enum { +- int_sel___release_mem__none = 0, +- int_sel___release_mem__send_interrupt_only = 1, +- int_sel___release_mem__send_interrupt_after_write_confirm = 2, +- int_sel___release_mem__send_data_after_write_confirm = 3 +-}; +- +-enum RELEASE_MEM_data_sel_enum { +- data_sel___release_mem__none = 0, +- data_sel___release_mem__send_32_bit_low = 1, +- data_sel___release_mem__send_64_bit_data = 2, +- data_sel___release_mem__send_gpu_clock_counter = 3, +- data_sel___release_mem__send_cp_perfcounter_hi_lo = 4, +- data_sel___release_mem__store_gds_data_to_memory = 5 +-}; +- +-struct pm4_mec_release_mem { +- union { +- union PM4_MES_TYPE_3_HEADER header; /*header */ +- unsigned int ordinal1; +- }; +- +- union { +- struct { +- unsigned int event_type:6; +- unsigned int reserved1:2; +- enum RELEASE_MEM_event_index_enum event_index:4; +- unsigned int tcl1_vol_action_ena:1; +- unsigned int tc_vol_action_ena:1; +- unsigned int 
reserved2:1; +- unsigned int tc_wb_action_ena:1; +- unsigned int tcl1_action_ena:1; +- unsigned int tc_action_ena:1; +- unsigned int reserved3:6; +- unsigned int atc:1; +- enum RELEASE_MEM_cache_policy_enum cache_policy:2; +- unsigned int reserved4:5; +- } bitfields2; +- unsigned int ordinal2; +- }; +- +- union { +- struct { +- unsigned int reserved5:16; +- enum RELEASE_MEM_dst_sel_enum dst_sel:2; +- unsigned int reserved6:6; +- enum RELEASE_MEM_int_sel_enum int_sel:3; +- unsigned int reserved7:2; +- enum RELEASE_MEM_data_sel_enum data_sel:3; +- } bitfields3; +- unsigned int ordinal3; +- }; +- +- union { +- struct { +- unsigned int reserved8:2; +- unsigned int address_lo_32b:30; +- } bitfields4; +- struct { +- unsigned int reserved9:3; +- unsigned int address_lo_64b:29; +- } bitfields5; +- unsigned int ordinal4; +- }; +- +- unsigned int address_hi; +- +- unsigned int data_lo; +- +- unsigned int data_hi; +-}; +-#endif +- +-enum { +- CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 +-}; +- + #endif +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +old mode 100755 +new mode 100644 +index 88fdfc9..4750cab +--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +@@ -30,49 +30,13 @@ + #include <linux/atomic.h> + #include <linux/workqueue.h> + #include <linux/spinlock.h> +-#include <linux/idr.h> + #include <linux/kfd_ioctl.h> +-#include <linux/pid.h> +-#include <linux/interval_tree.h> +-#include <linux/seq_file.h> +-#include <linux/kref.h> +-#include <linux/kfifo.h> + #include <kgd_kfd_interface.h> + +-#include <drm/amd_rdma.h> +-#include "amd_shared.h" +- + #define KFD_SYSFS_FILE_MODE 0444 + +-/* GPU ID hash width in bits */ +-#define KFD_GPU_ID_HASH_WIDTH 16 +- +-/* Use upper bits of mmap offset to store KFD driver specific information. +- * BITS[63:62] - Encode MMAP type +- * BITS[61:46] - Encode gpu_id. To identify to which GPU the offset belongs to +- * BITS[45:40] - Reserved. Not Used. +- * BITS[39:0] - MMAP offset value. Used by TTM. +- * +- * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. 
Hence, these +- * defines are w.r.t to PAGE_SIZE +- */ +-#define KFD_MMAP_TYPE_SHIFT (62 - PAGE_SHIFT) +-#define KFD_MMAP_TYPE_MASK (0x3ULL << KFD_MMAP_TYPE_SHIFT) +-#define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT) +-#define KFD_MMAP_TYPE_EVENTS (0x2ULL << KFD_MMAP_TYPE_SHIFT) +-#define KFD_MMAP_TYPE_MAP_BO (0x1ULL << KFD_MMAP_TYPE_SHIFT) +-#define KFD_MMAP_TYPE_RESERVED_MEM (0x0ULL << KFD_MMAP_TYPE_SHIFT) +- +-#define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT) +-#define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \ +- << KFD_MMAP_GPU_ID_SHIFT) +-#define KFD_MMAP_GPU_ID(gpu_id) ((((uint64_t)gpu_id) << KFD_MMAP_GPU_ID_SHIFT)\ +- & KFD_MMAP_GPU_ID_MASK) +-#define KFD_MMAP_GPU_ID_GET(offset) ((offset & KFD_MMAP_GPU_ID_MASK) \ +- >> KFD_MMAP_GPU_ID_SHIFT) +- +-#define KFD_MMAP_OFFSET_VALUE_MASK (0xFFFFFFFFFFULL >> PAGE_SHIFT) +-#define KFD_MMAP_OFFSET_VALUE_GET(offset) (offset & KFD_MMAP_OFFSET_VALUE_MASK) ++#define KFD_MMAP_DOORBELL_MASK 0x8000000000000 ++#define KFD_MMAP_EVENTS_MASK 0x4000000000000 + + /* + * When working with cp scheduler we should assign the HIQ manually or via +@@ -84,6 +48,8 @@ + #define KFD_CIK_HIQ_PIPE 4 + #define KFD_CIK_HIQ_QUEUE 0 + ++/* GPU ID hash width in bits */ ++#define KFD_GPU_ID_HASH_WIDTH 16 + + /* Macro for allocating structures */ + #define kfd_alloc_struct(ptr_to_struct) \ +@@ -108,42 +74,12 @@ extern int max_num_of_queues_per_device; + /* Kernel module parameter to specify the scheduling policy */ + extern int sched_policy; + +-extern int cwsr_enable; +- +-/* +- * Kernel module parameter to specify the maximum process +- * number per HW scheduler +- */ +-extern int hws_max_conc_proc; +- + /* + * Kernel module parameter to specify whether to send sigterm to HSA process on + * unhandled exception + */ + extern int send_sigterm; + +-/* +- * This kernel module is used to simulate large bar machine on non-large bar +- * enabled machines. 
+- */ +-extern int debug_largebar; +- +-/* +- * Ignore CRAT table during KFD initialization, can be used to work around +- * broken CRAT tables on some AMD systems +- */ +-extern int ignore_crat; +- +-/* +- * Set sh_mem_config.retry_disable on Vega10 +- */ +-extern int vega10_noretry; +- +-/* +- * Enable privileged mode for all CP queues including user queues +- */ +-extern int priv_cp_queues; +- + /** + * enum kfd_sched_policy + * +@@ -176,28 +112,26 @@ enum cache_policy { + cache_policy_noncoherent + }; + +-#define KFD_IS_SOC15(chip) ((chip) >= CHIP_VEGA10) ++enum asic_family_type { ++ CHIP_KAVERI = 0, ++ CHIP_CARRIZO ++}; + + struct kfd_event_interrupt_class { + bool (*interrupt_isr)(struct kfd_dev *dev, +- const uint32_t *ih_ring_entry, uint32_t *patched_ihre, +- bool *patched_flag); ++ const uint32_t *ih_ring_entry); + void (*interrupt_wq)(struct kfd_dev *dev, +- const uint32_t *ih_ring_entry); ++ const uint32_t *ih_ring_entry); + }; + + struct kfd_device_info { +- enum amd_asic_type asic_family; ++ unsigned int asic_family; + const struct kfd_event_interrupt_class *event_interrupt_class; + unsigned int max_pasid_bits; + unsigned int max_no_of_hqd; +- unsigned int doorbell_size; + size_t ih_ring_entry_size; + uint8_t num_of_watch_points; + uint16_t mqd_size_aligned; +- bool is_need_iommu_device; +- bool supports_cwsr; +- bool needs_pci_atomics; + }; + + struct kfd_mem_obj { +@@ -205,13 +139,6 @@ struct kfd_mem_obj { + uint32_t range_end; + uint64_t gpu_addr; + uint32_t *cpu_ptr; +- void *gtt_mem; +-}; +- +-struct kfd_vmid_info { +- uint32_t first_vmid_kfd; +- uint32_t last_vmid_kfd; +- uint32_t vmid_num_kfd; + }; + + struct kfd_dev { +@@ -238,12 +165,11 @@ struct kfd_dev { + */ + + struct kgd2kfd_shared_resources shared_resources; +- struct kfd_vmid_info vm_info; + + const struct kfd2kgd_calls *kfd2kgd; + struct mutex doorbell_mutex; +- unsigned long doorbell_available_index[DIV_ROUND_UP( +- KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, BITS_PER_LONG)]; ++ DECLARE_BITMAP(doorbell_available_index, ++ KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); + + void *gtt_mem; + uint64_t gtt_start_gpu_addr; +@@ -253,17 +179,18 @@ struct kfd_dev { + unsigned int gtt_sa_chunk_size; + unsigned int gtt_sa_num_of_chunks; + +- /* QCM Device instance */ +- struct device_queue_manager *dqm; +- +- bool init_complete; +- + /* Interrupts */ +- struct kfifo ih_fifo; +- struct workqueue_struct *ih_wq; ++ void *interrupt_ring; ++ size_t interrupt_ring_size; ++ atomic_t interrupt_ring_rptr; ++ atomic_t interrupt_ring_wptr; + struct work_struct interrupt_work; + spinlock_t interrupt_lock; + ++ /* QCM Device instance */ ++ struct device_queue_manager *dqm; ++ ++ bool init_complete; + /* + * Interrupts of interest to KFD are copied + * from the HW ring into a SW ring. 
+@@ -271,32 +198,7 @@ struct kfd_dev { + bool interrupts_active; + + /* Debug manager */ +- struct kfd_dbgmgr *dbgmgr; +- +- /* MEC firmware version*/ +- uint16_t mec_fw_version; +- +- /* Maximum process number mapped to HW scheduler */ +- unsigned int max_proc_per_quantum; +- +- /* cwsr */ +- bool cwsr_enabled; +- struct page *cwsr_pages; +- uint32_t cwsr_size; +- uint32_t tma_offset; /*Offset for TMA from the start of cwsr_mem*/ +- +- /* IB usage */ +- uint32_t ib_size; +-}; +- +-struct kfd_ipc_obj; +- +-struct kfd_bo { +- void *mem; +- struct interval_tree_node it; +- struct kfd_dev *dev; +- struct list_head cb_data_head; +- struct kfd_ipc_obj *kfd_ipc_obj; ++ struct kfd_dbgmgr *dbgmgr; + }; + + /* KGD2KFD callbacks */ +@@ -319,22 +221,27 @@ void kfd_chardev_exit(void); + struct device *kfd_chardev(void); + + /** +- * enum kfd_unmap_queues_filter ++ * enum kfd_preempt_type_filter + * +- * @KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: Preempts single queue. ++ * @KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE: Preempts single queue. + * +- * @KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: Preempts all queues in the ++ * @KFD_PRERMPT_TYPE_FILTER_ALL_QUEUES: Preempts all queues in the + * running queues list. + * +- * @KFD_UNMAP_QUEUES_FILTER_BY_PASID: Preempts queues that belongs to ++ * @KFD_PRERMPT_TYPE_FILTER_BY_PASID: Preempts queues that belongs to + * specific process. + * + */ +-enum kfd_unmap_queues_filter { +- KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE, +- KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, +- KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, +- KFD_UNMAP_QUEUES_FILTER_BY_PASID ++enum kfd_preempt_type_filter { ++ KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE, ++ KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES, ++ KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES, ++ KFD_PREEMPT_TYPE_FILTER_BY_PASID ++}; ++ ++enum kfd_preempt_type { ++ KFD_PREEMPT_TYPE_WAVEFRONT, ++ KFD_PREEMPT_TYPE_WAVEFRONT_RESET + }; + + /** +@@ -360,11 +267,6 @@ enum kfd_queue_format { + KFD_QUEUE_FORMAT_AQL + }; + +-enum KFD_QUEUE_PRIORITY { +- KFD_QUEUE_PRIORITY_MINIMUM = 0, +- KFD_QUEUE_PRIORITY_MAXIMUM = 15 +-}; +- + /** + * struct queue_properties + * +@@ -392,13 +294,13 @@ enum KFD_QUEUE_PRIORITY { + * @write_ptr: Defines the number of dwords written to the ring buffer. + * + * @doorbell_ptr: This field aim is to notify the H/W of new packet written to +- * the queue ring buffer. This field should be similar to write_ptr and the +- * user should update this field after he updated the write_ptr. ++ * the queue ring buffer. This field should be similar to write_ptr and the user ++ * should update this field after he updated the write_ptr. + * + * @doorbell_off: The doorbell offset in the doorbell pci-bar. + * +- * @is_interop: Defines if this is a interop queue. Interop queue means that +- * the queue can access both graphics and compute resources. ++ * @is_interop: Defines if this is a interop queue. Interop queue means that the ++ * queue can access both graphics and compute resources. + * + * @is_active: Defines if the queue is active or not. 
+ * +@@ -419,10 +321,9 @@ struct queue_properties { + uint32_t queue_percent; + uint32_t *read_ptr; + uint32_t *write_ptr; +- void __iomem *doorbell_ptr; ++ uint32_t __iomem *doorbell_ptr; + uint32_t doorbell_off; + bool is_interop; +- bool is_evicted; /* true -> queue is evicted */ + bool is_active; + /* Not relevant for user mode queues in cp scheduling */ + unsigned int vmid; +@@ -435,12 +336,6 @@ struct queue_properties { + uint32_t eop_ring_buffer_size; + uint64_t ctx_save_restore_area_address; + uint32_t ctx_save_restore_area_size; +- uint32_t ctl_stack_size; +- uint64_t tba_addr; +- uint64_t tma_addr; +- /* Relevant for CU */ +- uint32_t cu_mask_count; /* Must be a multiple of 32 */ +- uint32_t *cu_mask; + }; + + /** +@@ -457,10 +352,9 @@ struct queue_properties { + * @properties: The queue properties. + * + * @mec: Used only in no cp scheduling mode and identifies to micro engine id +- * that the queue should be execute on. ++ * that the queue should be execute on. + * +- * @pipe: Used only in no cp scheduling mode and identifies the queue's pipe +- * id. ++ * @pipe: Used only in no cp scheduling mode and identifies the queue's pipe id. + * + * @queue: Used only in no cp scheduliong mode and identifies the queue's slot. + * +@@ -485,7 +379,6 @@ struct queue { + uint32_t queue; + + unsigned int sdma_id; +- unsigned int doorbell_id; + + struct kfd_process *process; + struct kfd_dev *device; +@@ -502,19 +395,6 @@ enum KFD_MQD_TYPE { + KFD_MQD_TYPE_MAX + }; + +-enum KFD_PIPE_PRIORITY { +- KFD_PIPE_PRIORITY_CS_LOW = 0, +- KFD_PIPE_PRIORITY_CS_MEDIUM, +- KFD_PIPE_PRIORITY_CS_HIGH +-}; +- +-enum KFD_SPI_PRIORITY { +- KFD_SPI_PRIORITY_EXTRA_LOW = 0, +- KFD_SPI_PRIORITY_LOW, +- KFD_SPI_PRIORITY_MEDIUM, +- KFD_SPI_PRIORITY_HIGH +-}; +- + struct scheduling_resources { + unsigned int vmid_mask; + enum kfd_queue_type type; +@@ -528,6 +408,7 @@ struct scheduling_resources { + struct process_queue_manager { + /* data */ + struct kfd_process *process; ++ unsigned int num_concurrent_processes; + struct list_head queues; + unsigned long *queue_slot_bitmap; + }; +@@ -543,13 +424,6 @@ struct qcm_process_device { + unsigned int queue_count; + unsigned int vmid; + bool is_debug; +- unsigned int evicted; /* eviction counter, 0=active */ +- +- /* This flag tells if we should reset all wavefronts on +- * process termination +- */ +- bool reset_wavefronts; +- + /* + * All the memory management data should be here too + */ +@@ -562,55 +436,6 @@ struct qcm_process_device { + uint32_t gds_size; + uint32_t num_gws; + uint32_t num_oac; +- uint32_t sh_hidden_private_base; +- +- /*cwsr memory*/ +- uint64_t cwsr_base; +- uint64_t tba_addr; +- uint64_t tma_addr; +- void *cwsr_kaddr; +- struct page *cwsr_pages; +- +- /* IB memory */ +- uint64_t ib_base; /* ib_base+ib_size must be below cwsr_base */ +- void *ib_kaddr; +- +- /*doorbell resources per process per device*/ +- unsigned long *doorbell_bitmap; +-}; +- +-/* KFD Memory Eviction */ +-struct kfd_eviction_work { +- struct delayed_work dwork; +- struct dma_fence *quiesce_fence; +-}; +- +-/* Approx. wait time before attempting to restore evicted BOs */ +-#define PROCESS_RESTORE_TIME_MS 100 +-/* Approx. back off time if restore fails due to lack of memory */ +-#define PROCESS_BACK_OFF_TIME_MS 100 +-/* Approx. 
time before evicting the process again */ +-#define PROCESS_ACTIVE_TIME_MS 10 +- +-void kfd_evict_bo_worker(struct work_struct *work); +-void kfd_restore_bo_worker(struct work_struct *work); +-int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, +- struct dma_fence *fence); +-int quiesce_process_mm(struct kfd_process *p); +- +- +-/* 8 byte handle containing GPU ID in the most significant 4 bytes and +- * idr_handle in the least significant 4 bytes +- */ +-#define MAKE_HANDLE(gpu_id, idr_handle) \ +- (((uint64_t)(gpu_id) << 32) + idr_handle) +-#define GET_GPU_ID(handle) (handle >> 32) +-#define GET_IDR_HANDLE(handle) (handle & 0xFFFFFFFF) +- +-enum kfd_pdd_bound { +- PDD_UNBOUND = 0, +- PDD_BOUND, +- PDD_BOUND_SUSPENDED, + }; + + /* Data that is per-process-per device. */ +@@ -624,8 +449,6 @@ struct kfd_process_device { + /* The device that owns this data. */ + struct kfd_dev *dev; + +- /* The process that owns this kfd_process_device. */ +- struct kfd_process *process; + + /* per-process-per device QCM data structure */ + struct qcm_process_device qpd; +@@ -637,27 +460,14 @@ struct kfd_process_device { + uint64_t gpuvm_limit; + uint64_t scratch_base; + uint64_t scratch_limit; +- uint64_t dgpu_base; +- uint64_t dgpu_limit; +- +- uint64_t sh_hidden_private_base_vmid; +- +- /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) +- */ +- enum kfd_pdd_bound bound; + +- /* VM context for GPUVM allocations */ +- void *vm; ++ /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */ ++ bool bound; + +- /* GPUVM allocations storage */ +- struct idr alloc_idr; +- +- /* Flag used to tell the pdd has dequeued from the dqm. +- * This is used to prevent dev->dqm->ops.process_termination() from +- * being called twice when it is already called in IOMMU callback +- * function. ++ /* This flag tells if we should reset all ++ * wavefronts on process termination + */ +- bool already_dequeued; ++ bool reset_wavefronts; + }; + + #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) +@@ -670,15 +480,7 @@ struct kfd_process { + */ + struct hlist_node kfd_processes; + +- /* +- * Opaque pointer to mm_struct. We don't hold a reference to +- * it so it should never be dereferenced from here. This is +- * only used for looking up processes by their mm. +- */ +- void *mm; +- +- struct kref ref; +- struct work_struct release_work; ++ struct mm_struct *mm; + + struct mutex mutex; + +@@ -686,8 +488,6 @@ struct kfd_process { + * In any process, the thread that started main() is the lead + * thread and outlives the rest. + * It is here because amd_iommu_bind_pasid wants a task_struct. +- * It can also be used for safely getting a reference to the +- * mm_struct of the process. + */ + struct task_struct *lead_thread; + +@@ -707,8 +507,11 @@ struct kfd_process { + + struct process_queue_manager pqm; + +- unsigned long allocated_queue_bitmap[DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, +- BITS_PER_LONG)]; ++ /* The process's queues. */ ++ size_t queue_array_size; ++ ++ /* Size is queue_array_size, up to MAX_PROCESS_QUEUES. */ ++ struct kfd_queue **queues; + + /*Is the user space process 32 bit?*/ + bool is_32bit_user_mode; +@@ -717,29 +520,10 @@ struct kfd_process { + struct mutex event_mutex; + /* All events in process hashed by ID, linked on kfd_event.events. */ + DECLARE_HASHTABLE(events, 4); +- /* struct slot_page_header.event_pages */ +- struct list_head signal_event_pages; ++ struct list_head signal_event_pages; /* struct slot_page_header. 
++ event_pages */ + u32 next_nonsignal_event_id; + size_t signal_event_count; +- bool signal_event_limit_reached; +- +- struct rb_root_cached bo_interval_tree; +- +- /* Information used for memory eviction */ +- void *process_info; +- /* Eviction fence that is attached to all the BOs of this process. The +- * fence will be triggered during eviction and new one will be created +- * during restore +- */ +- struct dma_fence *ef; +- +- /* Work items for evicting and restoring BOs */ +- struct kfd_eviction_work eviction_work; +- struct delayed_work restore_work; +- /* Approx. the last timestamp (in jiffies) when the process was +- * restored after an eviction +- */ +- unsigned long last_restore_timestamp; + }; + + /** +@@ -762,55 +546,21 @@ struct amdkfd_ioctl_desc { + + void kfd_process_create_wq(void); + void kfd_process_destroy_wq(void); +-struct kfd_process *kfd_create_process(struct file *filep); +-struct kfd_process *kfd_get_process(const struct task_struct *task); ++struct kfd_process *kfd_create_process(const struct task_struct *); ++struct kfd_process *kfd_get_process(const struct task_struct *); + struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); +-struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm); +-void kfd_unref_process(struct kfd_process *p); +-void kfd_suspend_all_processes(void); +-int kfd_resume_all_processes(void); + + struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, + struct kfd_process *p); +-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) +-int kfd_bind_processes_to_device(struct kfd_dev *dev); +-void kfd_unbind_processes_from_device(struct kfd_dev *dev); +-#endif +-void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid); ++void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid); + struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, + struct kfd_process *p); + struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, + struct kfd_process *p); + +-int kfd_reserved_mem_mmap(struct kfd_process *process, +- struct vm_area_struct *vma); +- +-/* KFD process API for creating and translating handles */ +-int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, +- void *mem, uint64_t start, +- uint64_t length, +- struct kfd_ipc_obj *ipc_obj); +-void *kfd_process_device_translate_handle(struct kfd_process_device *p, +- int handle); +-struct kfd_bo *kfd_process_device_find_bo(struct kfd_process_device *pdd, +- int handle); +-void *kfd_process_find_bo_from_interval(struct kfd_process *p, +- uint64_t start_addr, +- uint64_t last_addr); +-void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, +- int handle); +- +-void run_rdma_free_callback(struct kfd_bo *buf_obj); +-struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid); +- +-/* kfd dgpu memory */ +-int kfd_unmap_memory_from_gpu(void *mem, struct kfd_process_device *pdd); +- + /* Process device data iterator */ +-struct kfd_process_device *kfd_get_first_process_device_data( +- struct kfd_process *p); +-struct kfd_process_device *kfd_get_next_process_device_data( +- struct kfd_process *p, ++struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p); ++struct kfd_process_device *kfd_get_next_process_device_data(struct kfd_process *p, + struct kfd_process_device *pdd); + bool kfd_has_process_device_data(struct kfd_process *p); + +@@ -823,20 +573,16 @@ unsigned int kfd_pasid_alloc(void); + void 
kfd_pasid_free(unsigned int pasid); + + /* Doorbells */ +-size_t kfd_doorbell_process_slice(struct kfd_dev *kfd); +-int kfd_doorbell_init(struct kfd_dev *kfd); +-void kfd_doorbell_fini(struct kfd_dev *kfd); +-int kfd_doorbell_mmap(struct kfd_dev *kfd, struct kfd_process *process, +- struct vm_area_struct *vma); +-void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, ++void kfd_doorbell_init(struct kfd_dev *kfd); ++int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma); ++u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, + unsigned int *doorbell_off); + void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr); + u32 read_kernel_doorbell(u32 __iomem *db); +-void write_kernel_doorbell(void __iomem *db, u32 value); +-void write_kernel_doorbell64(void __iomem *db, u64 value); +-unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd, ++void write_kernel_doorbell(u32 __iomem *db, u32 value); ++unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, + struct kfd_process *process, +- unsigned int doorbell_id); ++ unsigned int queue_id); + + /* GTT Sub-Allocator */ + +@@ -852,22 +598,16 @@ int kfd_topology_init(void); + void kfd_topology_shutdown(void); + int kfd_topology_add_device(struct kfd_dev *gpu); + int kfd_topology_remove_device(struct kfd_dev *gpu); +-struct kfd_topology_device *kfd_topology_device_by_proximity_domain( +- uint32_t proximity_domain); + struct kfd_dev *kfd_device_by_id(uint32_t gpu_id); + struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev); +-struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd); +-int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev); +-int kfd_numa_node_to_apic_id(int numa_node_id); ++struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx); + + /* Interrupts */ + int kfd_interrupt_init(struct kfd_dev *dev); + void kfd_interrupt_exit(struct kfd_dev *dev); + void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry); + bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry); +-bool interrupt_is_wanted(struct kfd_dev *dev, +- const uint32_t *ih_ring_entry, +- uint32_t *patched_ihre, bool *flag); ++bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry); + + /* Power Management */ + void kgd2kfd_suspend(struct kfd_dev *kfd); +@@ -875,10 +615,10 @@ int kgd2kfd_resume(struct kfd_dev *kfd); + + /* amdkfd Apertures */ + int kfd_init_apertures(struct kfd_process *process); +-int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd, +- uint64_t base, uint64_t limit); + + /* Queue Context Management */ ++struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd); ++ + int init_queue(struct queue **q, const struct queue_properties *properties); + void uninit_queue(struct queue *q); + void print_queue_properties(struct queue_properties *q); +@@ -888,20 +628,13 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); + struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); +-struct mqd_manager *mqd_manager_init_cik_hawaii(enum KFD_MQD_TYPE type, +- struct kfd_dev *dev); + struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); +-struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, +- struct kfd_dev *dev); +-struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, +- struct kfd_dev *dev); + struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); + void 
device_queue_manager_uninit(struct device_queue_manager *dqm); + struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, + enum kfd_queue_type type); + void kernel_queue_uninit(struct kernel_queue *kq); +-int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid); + + /* Process Queue Manager */ + struct process_queue_node { +@@ -910,36 +643,32 @@ struct process_queue_node { + struct list_head process_queue_list; + }; + +-void kfd_process_dequeue_from_device(struct kfd_process_device *pdd); +-void kfd_process_dequeue_from_all_devices(struct kfd_process *p); + int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p); + void pqm_uninit(struct process_queue_manager *pqm); + int pqm_create_queue(struct process_queue_manager *pqm, + struct kfd_dev *dev, + struct file *f, + struct queue_properties *properties, ++ unsigned int flags, ++ enum kfd_queue_type type, + unsigned int *qid); + int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid); + int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, + struct queue_properties *p); +-int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, +- struct queue_properties *p); + struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm, + unsigned int qid); +-int pqm_get_wave_state(struct process_queue_manager *pqm, +- unsigned int qid, +- void __user *ctl_stack, +- u32 *ctl_stack_used_size, +- u32 *save_area_used_size); +-int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm); +-int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm); ++ ++int amdkfd_fence_wait_timeout(unsigned int *fence_addr, ++ unsigned int fence_value, ++ unsigned long timeout); + + /* Packet Manager */ + ++#define KFD_HIQ_TIMEOUT (500) ++ + #define KFD_FENCE_COMPLETED (100) + #define KFD_FENCE_INIT (10) +- +-struct packet_manager_func; ++#define KFD_UNMAP_LATENCY (150) + + struct packet_manager { + struct device_queue_manager *dqm; +@@ -947,42 +676,9 @@ struct packet_manager { + struct mutex lock; + bool allocated; + struct kfd_mem_obj *ib_buffer_obj; +- unsigned int ib_size_bytes; +- +- struct packet_manager_funcs *pmf; +-}; +- +-struct packet_manager_funcs { +- /* Support different firmware versions for PM4 packets */ +- int (*map_process)(struct packet_manager *pm, uint32_t *buffer, +- struct qcm_process_device *qpd); +- int (*runlist)(struct packet_manager *pm, uint32_t *buffer, +- uint64_t ib, size_t ib_size_in_dwords, bool chain); +- int (*set_resources)(struct packet_manager *pm, uint32_t *buffer, +- struct scheduling_resources *res); +- int (*map_queues)(struct packet_manager *pm, uint32_t *buffer, +- struct queue *q, bool is_static); +- int (*unmap_queues)(struct packet_manager *pm, uint32_t *buffer, +- enum kfd_queue_type type, +- enum kfd_unmap_queues_filter mode, +- uint32_t filter_param, bool reset, +- unsigned int sdma_engine); +- int (*query_status)(struct packet_manager *pm, uint32_t *buffer, +- uint64_t fence_address, uint32_t fence_value); +- uint32_t (*release_mem)(uint64_t gpu_addr, uint32_t *buffer); +- +- uint32_t (*get_map_process_packet_size)(void); +- uint32_t (*get_runlist_packet_size)(void); +- uint32_t (*get_set_resources_packet_size)(void); +- uint32_t (*get_map_queues_packet_size)(void); +- uint32_t (*get_unmap_queues_packet_size)(void); +- uint32_t (*get_query_status_packet_size)(void); +- uint32_t (*get_release_mem_packet_size)(void); +- + }; + +-int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, +- 
uint16_t fw_ver); ++int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); + void pm_uninit(struct packet_manager *pm); + int pm_send_set_resources(struct packet_manager *pm, + struct scheduling_resources *res); +@@ -991,55 +687,18 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, + uint32_t fence_value); + + int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, +- enum kfd_unmap_queues_filter mode, ++ enum kfd_preempt_type_filter mode, + uint32_t filter_param, bool reset, + unsigned int sdma_engine); + + void pm_release_ib(struct packet_manager *pm); + +-/* Following PM funcs can be shared among CIK and VI */ +-unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size); +-int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer, +- uint64_t ib, size_t ib_size_in_dwords, bool chain); +-int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, +- struct queue *q, bool is_static); +-int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, +- struct scheduling_resources *res); +-int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, +- enum kfd_queue_type type, +- enum kfd_unmap_queues_filter filter, +- uint32_t filter_param, bool reset, +- unsigned int sdma_engine); +-int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer, +- uint64_t fence_address, uint32_t fence_value); +-uint32_t pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer); +- +-uint32_t pm_get_map_process_packet_size_vi(void); +-uint32_t pm_get_runlist_packet_size_vi(void); +-uint32_t pm_get_set_resources_packet_size_vi(void); +-uint32_t pm_get_map_queues_packet_size_vi(void); +-uint32_t pm_get_unmap_queues_packet_size_vi(void); +-uint32_t pm_get_query_status_packet_size_vi(void); +-uint32_t pm_get_release_mem_packet_size_vi(void); +- +- +-void kfd_pm_func_init_vi(struct packet_manager *pm, uint16_t fw_ver); +-void kfd_pm_func_init_cik(struct packet_manager *pm, uint16_t fw_ver); +- +-void kfd_pm_func_init_v9(struct packet_manager *pm, uint16_t fw_ver); +- +- + uint64_t kfd_get_number_elems(struct kfd_dev *kfd); + phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, + struct kfd_process *process); +-int amdkfd_fence_wait_timeout(unsigned int *fence_addr, +- unsigned int fence_value, +- unsigned long timeout_ms); + + /* Events */ + extern const struct kfd_event_interrupt_class event_interrupt_class_cik; +-extern const struct kfd_event_interrupt_class event_interrupt_class_v9; +- + extern const struct kfd_device_global_init_class device_global_init_class_cik; + + enum kfd_event_wait_result { +@@ -1057,55 +716,18 @@ int kfd_wait_on_events(struct kfd_process *p, + enum kfd_event_wait_result *wait_result); + void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, + uint32_t valid_id_bits); +-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) + void kfd_signal_iommu_event(struct kfd_dev *dev, + unsigned int pasid, unsigned long address, + bool is_write_requested, bool is_execute_requested); +-#endif + void kfd_signal_hw_exception_event(unsigned int pasid); + int kfd_set_event(struct kfd_process *p, uint32_t event_id); + int kfd_reset_event(struct kfd_process *p, uint32_t event_id); + int kfd_event_create(struct file *devkfd, struct kfd_process *p, + uint32_t event_type, bool auto_reset, uint32_t node_id, + uint32_t *event_id, uint32_t *event_trigger_data, +- uint64_t *event_page_offset, uint32_t *event_slot_index, +- void *kern_addr); ++ uint64_t 
*event_page_offset, uint32_t *event_slot_index); + int kfd_event_destroy(struct kfd_process *p, uint32_t event_id); +-void kfd_free_signal_page_dgpu(struct kfd_process *p, uint64_t handle); +- +-void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, +- struct kfd_vm_fault_info *info); +- +-void kfd_flush_tlb(struct kfd_dev *dev, uint32_t pasid); + + int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); + +-#define KFD_SCRATCH_KV_FW_VER 413 +- +-/* PeerDirect support */ +-void kfd_init_peer_direct(void); +-void kfd_close_peer_direct(void); +- +-/* IPC Support */ +-int kfd_ipc_init(void); +- +-/* Debugfs */ +-#if defined(CONFIG_DEBUG_FS) +- +-void kfd_debugfs_init(void); +-void kfd_debugfs_fini(void); +-int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data); +-int pqm_debugfs_mqds(struct seq_file *m, void *data); +-int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data); +-int device_queue_manager_debugfs_hqds(struct seq_file *m, void *data); +-int kfd_debugfs_rls_by_device(struct seq_file *m, void *data); +-int pm_debugfs_runlist(struct seq_file *m, void *data); +- +-#else +- +-static inline void kfd_debugfs_init(void) {} +-static inline void kfd_debugfs_fini(void) {} +- +-#endif +- + #endif +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c +index c798fa3..035bbc9 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c +@@ -24,16 +24,10 @@ + #include <linux/log2.h> + #include <linux/sched.h> + #include <linux/sched/mm.h> +-#include <linux/sched/task.h> + #include <linux/slab.h> +-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) + #include <linux/amd-iommu.h> +-#endif + #include <linux/notifier.h> + #include <linux/compat.h> +-#include <linux/mman.h> +-#include <linux/highmem.h> +-#include "kfd_ipc.h" + + struct mm_struct; + +@@ -41,6 +35,13 @@ struct mm_struct; + #include "kfd_dbgmgr.h" + + /* ++ * Initial size for the array of queues. ++ * The allocated size is doubled each time ++ * it is exceeded up to MAX_PROCESS_QUEUES. ++ */ ++#define INITIAL_QUEUE_ARRAY_SIZE 16 ++ ++/* + * List of struct kfd_process (field kfd_process). 
+ * Unique/indexed by mm_struct* + */ +@@ -52,16 +53,13 @@ DEFINE_STATIC_SRCU(kfd_processes_srcu); + + static struct workqueue_struct *kfd_process_wq; + +-#define MIN_IDR_ID 1 +-#define MAX_IDR_ID 0 /*0 - for unlimited*/ +- +-static struct kfd_process *find_process(const struct task_struct *thread, +- bool ref); +-static void kfd_process_ref_release(struct kref *ref); +-static struct kfd_process *create_process(const struct task_struct *thread, +- struct file *filep); +-static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep); ++struct kfd_process_release_work { ++ struct work_struct kfd_work; ++ struct kfd_process *p; ++}; + ++static struct kfd_process *find_process(const struct task_struct *thread); ++static struct kfd_process *create_process(const struct task_struct *thread); + + void kfd_process_create_wq(void) + { +@@ -77,144 +75,22 @@ void kfd_process_destroy_wq(void) + } + } + +-static void kfd_process_free_gpuvm(struct kgd_mem *mem, +- struct kfd_process_device *pdd) +-{ +- kfd_unmap_memory_from_gpu(mem, pdd); +- pdd->dev->kfd2kgd->free_memory_of_gpu(pdd->dev->kgd, mem, pdd->vm); +-} +- +-/* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process +- * This function should be only called right after the process +- * is created and when kfd_processes_mutex is still being held +- * to avoid concurrency. Because of that exclusiveness, we do +- * not need to take p->mutex. +- */ +-static int kfd_process_alloc_gpuvm(struct kfd_process *p, +- struct kfd_dev *kdev, uint64_t gpu_va, uint32_t size, +- void **kptr, struct kfd_process_device *pdd, uint32_t flags) +-{ +- int err; +- void *mem = NULL; +- int handle; +- +- err = kdev->kfd2kgd->alloc_memory_of_gpu(kdev->kgd, gpu_va, size, +- pdd->vm, +- (struct kgd_mem **)&mem, NULL, flags); +- if (err) +- goto err_alloc_mem; +- +- err = kdev->kfd2kgd->map_memory_to_gpu( +- kdev->kgd, (struct kgd_mem *)mem, pdd->vm); +- if (err) +- goto err_map_mem; +- +- err = kdev->kfd2kgd->sync_memory(kdev->kgd, (struct kgd_mem *) mem, +- true); +- if (err) { +- pr_debug("Sync memory failed, wait interrupted by user signal\n"); +- goto sync_memory_failed; +- } +- +- kfd_flush_tlb(kdev, p->pasid); +- +- /* Create an obj handle so kfd_process_device_remove_obj_handle +- * will take care of the bo removal when the process finishes. +- * We do not need to take p->mutex, because the process is just +- * created and the ioctls have not had the chance to run. +- */ +- handle = kfd_process_device_create_obj_handle( +- pdd, mem, gpu_va, size, NULL); +- +- if (handle < 0) { +- err = handle; +- goto free_gpuvm; +- } +- +- if (kptr) { +- err = kdev->kfd2kgd->map_gtt_bo_to_kernel(kdev->kgd, +- (struct kgd_mem *)mem, kptr); +- if (err) { +- pr_debug("Map GTT BO to kernel failed\n"); +- goto free_obj_handle; +- } +- } +- +- return err; +- +-free_obj_handle: +- kfd_process_device_remove_obj_handle(pdd, handle); +-free_gpuvm: +-sync_memory_failed: +- kfd_process_free_gpuvm(mem, pdd); +- return err; +- +-err_map_mem: +- kdev->kfd2kgd->free_memory_of_gpu(kdev->kgd, mem, pdd->vm); +-err_alloc_mem: +- *kptr = NULL; +- return err; +-} +- +-/* kfd_process_reserve_ib_mem - Reserve memory inside the process for IB usage +- * The memory reserved is for KFD to submit IB to AMDGPU from kernel. +- * If the memory is reserved successfully, ib_kaddr_assigned will have +- * the CPU/kernel address. Check ib_kaddr_assigned before accessing the +- * memory. 
+- */ +-static int kfd_process_reserve_ib_mem(struct kfd_process *p) +-{ +- int ret = 0; +- struct kfd_process_device *temp, *pdd = NULL; +- struct kfd_dev *kdev = NULL; +- struct qcm_process_device *qpd = NULL; +- void *kaddr; +- uint32_t flags = ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_NONPAGED | +- ALLOC_MEM_FLAGS_NO_SUBSTITUTE | +- ALLOC_MEM_FLAGS_EXECUTE_ACCESS; +- +- list_for_each_entry_safe(pdd, temp, &p->per_device_data, +- per_device_list) { +- kdev = pdd->dev; +- qpd = &pdd->qpd; +- if (!kdev->ib_size || qpd->ib_kaddr) +- continue; +- +- if (qpd->ib_base) { /* is dGPU */ +- ret = kfd_process_alloc_gpuvm(p, kdev, +- qpd->ib_base, kdev->ib_size, +- &kaddr, pdd, flags); +- if (!ret) +- qpd->ib_kaddr = kaddr; +- else +- /* In case of error, the kfd_bos for some pdds +- * which are already allocated successfully +- * will be freed in upper level function +- * i.e. create_process(). +- */ +- return ret; +- } else { +- /* FIXME: Support APU */ +- continue; +- } +- } +- +- return 0; +-} +- +-struct kfd_process *kfd_create_process(struct file *filep) ++struct kfd_process *kfd_create_process(const struct task_struct *thread) + { + struct kfd_process *process; + +- struct task_struct *thread = current; ++ BUG_ON(!kfd_process_wq); + +- if (!thread->mm) ++ if (thread->mm == NULL) + return ERR_PTR(-EINVAL); + + /* Only the pthreads threading model is supported. */ + if (thread->group_leader->mm != thread->mm) + return ERR_PTR(-EINVAL); + ++ /* Take mmap_sem because we call __mmu_notifier_register inside */ ++ down_write(&thread->mm->mmap_sem); ++ + /* + * take kfd processes mutex before starting of process creation + * so there won't be a case where two threads of the same process +@@ -223,14 +99,17 @@ struct kfd_process *kfd_create_process(struct file *filep) + mutex_lock(&kfd_processes_mutex); + + /* A prior open of /dev/kfd could have already created the process. */ +- process = find_process(thread, false); ++ process = find_process(thread); + if (process) +- pr_debug("Process already found\n"); +- else +- process = create_process(thread, filep); ++ pr_debug("kfd: process already found\n"); ++ ++ if (!process) ++ process = create_process(thread); + + mutex_unlock(&kfd_processes_mutex); + ++ up_write(&thread->mm->mmap_sem); ++ + return process; + } + +@@ -238,14 +117,14 @@ struct kfd_process *kfd_get_process(const struct task_struct *thread) + { + struct kfd_process *process; + +- if (!thread->mm) ++ if (thread->mm == NULL) + return ERR_PTR(-EINVAL); + + /* Only the pthreads threading model is supported. */ + if (thread->group_leader->mm != thread->mm) + return ERR_PTR(-EINVAL); + +- process = find_process(thread, false); ++ process = find_process(thread); + + return process; + } +@@ -262,158 +141,81 @@ static struct kfd_process *find_process_by_mm(const struct mm_struct *mm) + return NULL; + } + +-static struct kfd_process *find_process(const struct task_struct *thread, +- bool ref) ++static struct kfd_process *find_process(const struct task_struct *thread) + { + struct kfd_process *p; + int idx; + + idx = srcu_read_lock(&kfd_processes_srcu); + p = find_process_by_mm(thread->mm); +- if (p && ref) +- kref_get(&p->ref); + srcu_read_unlock(&kfd_processes_srcu, idx); + + return p; + } + +-void kfd_unref_process(struct kfd_process *p) +-{ +- kref_put(&p->ref, kfd_process_ref_release); +-} +- +-/* This increments the process->ref counter. 
*/ +-struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid) ++static void kfd_process_wq_release(struct work_struct *work) + { +- struct task_struct *task = NULL; +- struct kfd_process *p = NULL; +- +- if (!pid) +- task = current; +- else +- task = get_pid_task(pid, PIDTYPE_PID); ++ struct kfd_process_release_work *my_work; ++ struct kfd_process_device *pdd, *temp; ++ struct kfd_process *p; + +- if (task) +- p = find_process(task, true); ++ my_work = (struct kfd_process_release_work *) work; + +- return p; +-} ++ p = my_work->p; + +-static void kfd_process_free_outstanding_kfd_bos(struct kfd_process *p) +-{ +- struct kfd_process_device *pdd, *peer_pdd; +- struct kfd_bo *buf_obj; +- int id; +- +- list_for_each_entry(pdd, &p->per_device_data, per_device_list) { +- /* +- * Remove all handles from idr and release appropriate +- * local memory object +- */ +- idr_for_each_entry(&pdd->alloc_idr, buf_obj, id) { +- list_for_each_entry(peer_pdd, &p->per_device_data, +- per_device_list) { +- peer_pdd->dev->kfd2kgd->unmap_memory_to_gpu( +- peer_pdd->dev->kgd, +- buf_obj->mem, peer_pdd->vm); +- } +- +- run_rdma_free_callback(buf_obj); +- pdd->dev->kfd2kgd->free_memory_of_gpu( +- pdd->dev->kgd, buf_obj->mem, pdd->vm); +- kfd_process_device_remove_obj_handle(pdd, id); +- } +- } +-} ++ pr_debug("Releasing process (pasid %d) in workqueue\n", ++ p->pasid); + +-static void kfd_process_destroy_pdds(struct kfd_process *p) +-{ +- struct kfd_process_device *pdd, *temp; ++ mutex_lock(&p->mutex); + + list_for_each_entry_safe(pdd, temp, &p->per_device_data, +- per_device_list) { +- kfd_flush_tlb(pdd->dev, p->pasid); +- /* Destroy the GPUVM VM context */ +- if (pdd->vm) { +- dma_fence_put(p->ef); +- pdd->dev->kfd2kgd->destroy_process_vm( +- pdd->dev->kgd, pdd->vm); +- } +- list_del(&pdd->per_device_list); ++ per_device_list) { ++ pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", ++ pdd->dev->id, p->pasid); + +- if (pdd->qpd.cwsr_pages) { +- kunmap(pdd->qpd.cwsr_pages); +- __free_pages(pdd->qpd.cwsr_pages, +- get_order(pdd->dev->cwsr_size)); +- } ++ if (pdd->reset_wavefronts) ++ dbgdev_wave_reset_wavefronts(pdd->dev, p); + +- kfree(pdd->qpd.doorbell_bitmap); +- idr_destroy(&pdd->alloc_idr); ++ amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); ++ list_del(&pdd->per_device_list); + + kfree(pdd); + } +-} +- +-/* No process locking is needed in this function, because the process +- * is not findable any more. We must assume that no other thread is +- * using it any more, otherwise we couldn't safely free the process +- * structure in the end. 
+- */ +-static void kfd_process_wq_release(struct work_struct *work) +-{ +- struct kfd_process *p = container_of(work, struct kfd_process, +- release_work); +-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) +- struct kfd_process_device *pdd; +- +- pr_debug("Releasing process (pasid %d)\n", +- p->pasid); +- +- list_for_each_entry(pdd, &p->per_device_data, per_device_list) { +- pr_debug("Releasing pdd (topology id %d) for process (pasid %d)\n", +- pdd->dev->id, p->pasid); +- +- if (pdd->dev->device_info->is_need_iommu_device) { +- if (pdd->bound == PDD_BOUND) { +- amd_iommu_unbind_pasid(pdd->dev->pdev, +- p->pasid); +- pdd->bound = PDD_UNBOUND; +- } +- } +- } +-#endif +- +- kfd_process_free_outstanding_kfd_bos(p); +- +- kfd_process_destroy_pdds(p); + + kfd_event_free_process(p); + + kfd_pasid_free(p->pasid); + ++ mutex_unlock(&p->mutex); ++ + mutex_destroy(&p->mutex); + +- put_task_struct(p->lead_thread); ++ kfree(p->queues); + + kfree(p); ++ ++ kfree(work); + } + +-static void kfd_process_ref_release(struct kref *ref) ++static void kfd_process_destroy_delayed(struct rcu_head *rcu) + { +- struct kfd_process *p = container_of(ref, struct kfd_process, ref); ++ struct kfd_process_release_work *work; ++ struct kfd_process *p; + +- if (WARN_ON(!kfd_process_wq)) +- return; ++ BUG_ON(!kfd_process_wq); + +- INIT_WORK(&p->release_work, kfd_process_wq_release); +- queue_work(kfd_process_wq, &p->release_work); +-} ++ p = container_of(rcu, struct kfd_process, rcu); ++ BUG_ON(atomic_read(&p->mm->mm_count) <= 0); + +-static void kfd_process_destroy_delayed(struct rcu_head *rcu) +-{ +- struct kfd_process *p = container_of(rcu, struct kfd_process, rcu); ++ mmdrop(p->mm); ++ ++ work = kmalloc(sizeof(struct kfd_process_release_work), GFP_ATOMIC); + +- kfd_unref_process(p); ++ if (work) { ++ INIT_WORK((struct work_struct *) work, kfd_process_wq_release); ++ work->p = p; ++ queue_work(kfd_process_wq, (struct work_struct *) work); ++ } + } + + static void kfd_process_notifier_release(struct mmu_notifier *mn, +@@ -421,19 +223,13 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, + { + struct kfd_process *p; + struct kfd_process_device *pdd = NULL; +- struct kfd_dev *dev = NULL; +- long status = -EFAULT; + + /* + * The kfd_process structure can not be free because the + * mmu_notifier srcu is read locked + */ + p = container_of(mn, struct kfd_process, mmu_notifier); +- if (WARN_ON(p->mm != mm)) +- return; +- +- cancel_delayed_work_sync(&p->eviction_work.dwork); +- cancel_delayed_work_sync(&p->restore_work); ++ BUG_ON(p->mm != mm); + + mutex_lock(&kfd_processes_mutex); + hash_del_rcu(&p->kfd_processes); +@@ -442,46 +238,33 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, + + mutex_lock(&p->mutex); + +- /* Iterate over all process device data structures and if the pdd is in +- * debug mode,we should first force unregistration, then we will be +- * able to destroy the queues +- */ +- list_for_each_entry(pdd, &p->per_device_data, per_device_list) { +- dev = pdd->dev; +- mutex_lock(kfd_get_dbgmgr_mutex()); +- +- if (dev && dev->dbgmgr && (dev->dbgmgr->pasid == p->pasid)) { +- +- status = kfd_dbgmgr_unregister(dev->dbgmgr, p); +- if (status == 0) { +- kfd_dbgmgr_destroy(dev->dbgmgr); +- dev->dbgmgr = NULL; +- } +- } +- mutex_unlock(kfd_get_dbgmgr_mutex()); +- } +- +- kfd_process_dequeue_from_all_devices(p); +- +- /* now we can uninit the pqm: */ ++ /* In case our notifier is called before IOMMU notifier */ + pqm_uninit(&p->pqm); + + /* Iterate over all 
process device data structure and check +- * if we should delete debug managers ++ * if we should delete debug managers and reset all wavefronts + */ + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + if ((pdd->dev->dbgmgr) && + (pdd->dev->dbgmgr->pasid == p->pasid)) + kfd_dbgmgr_destroy(pdd->dev->dbgmgr); + ++ if (pdd->reset_wavefronts) { ++ pr_warn("amdkfd: Resetting all wave fronts\n"); ++ dbgdev_wave_reset_wavefronts(pdd->dev, p); ++ pdd->reset_wavefronts = false; ++ } + } + +- /* Indicate to other users that MM is no longer valid */ +- p->mm = NULL; +- + mutex_unlock(&p->mutex); + +- mmu_notifier_unregister_no_release(&p->mmu_notifier, mm); ++ /* ++ * Because we drop mm_count inside kfd_process_destroy_delayed ++ * and because the mmu_notifier_unregister function also drop ++ * mm_count we need to take an extra count here. ++ */ ++ mmgrab(p->mm); ++ mmu_notifier_unregister_no_release(&p->mmu_notifier, p->mm); + mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed); + } + +@@ -489,68 +272,7 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = { + .release = kfd_process_notifier_release, + }; + +-static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep) +-{ +- int ret; +- unsigned long offset; +- struct kfd_process_device *temp, *pdd = NULL; +- struct kfd_dev *dev = NULL; +- struct qcm_process_device *qpd = NULL; +- void *kaddr; +- uint32_t flags = ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_NONPAGED | +- ALLOC_MEM_FLAGS_NO_SUBSTITUTE | +- ALLOC_MEM_FLAGS_READONLY | +- ALLOC_MEM_FLAGS_EXECUTE_ACCESS; +- +- list_for_each_entry_safe(pdd, temp, &p->per_device_data, +- per_device_list) { +- dev = pdd->dev; +- qpd = &pdd->qpd; +- if (!dev->cwsr_enabled || qpd->cwsr_kaddr) +- continue; +- if (qpd->cwsr_base) { +- /* cwsr_base is only set for DGPU */ +- ret = kfd_process_alloc_gpuvm(p, dev, qpd->cwsr_base, +- dev->cwsr_size, &kaddr, pdd, flags); +- if (!ret) { +- qpd->cwsr_kaddr = kaddr; +- qpd->tba_addr = qpd->cwsr_base; +- } else +- /* In case of error, the kfd_bos for some pdds +- * which are already allocated successfully +- * will be freed in upper level function +- * i.e. create_process(). +- */ +- return ret; +- } else { +- offset = (dev->id | +- KFD_MMAP_TYPE_RESERVED_MEM) << PAGE_SHIFT; +- qpd->tba_addr = (uint64_t)vm_mmap(filep, 0, +- dev->cwsr_size, PROT_READ | PROT_EXEC, +- MAP_SHARED, offset); +- +- if (IS_ERR_VALUE(qpd->tba_addr)) { +- pr_err("Failure to set tba address. 
error -%d.\n", +- (int)qpd->tba_addr); +- qpd->tba_addr = 0; +- qpd->cwsr_kaddr = NULL; +- return -ENOMEM; +- } +- } +- +- memcpy(qpd->cwsr_kaddr, kmap(dev->cwsr_pages), PAGE_SIZE); +- kunmap(dev->cwsr_pages); +- +- qpd->tma_addr = qpd->tba_addr + dev->tma_offset; +- pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n", +- qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr); +- } +- +- return 0; +-} +- +-static struct kfd_process *create_process(const struct task_struct *thread, +- struct file *filep) ++static struct kfd_process *create_process(const struct task_struct *thread) + { + struct kfd_process *process; + int err = -ENOMEM; +@@ -560,20 +282,22 @@ static struct kfd_process *create_process(const struct task_struct *thread, + if (!process) + goto err_alloc_process; + +- process->bo_interval_tree = RB_ROOT_CACHED; ++ process->queues = kmalloc_array(INITIAL_QUEUE_ARRAY_SIZE, ++ sizeof(process->queues[0]), GFP_KERNEL); ++ if (!process->queues) ++ goto err_alloc_queues; + + process->pasid = kfd_pasid_alloc(); + if (process->pasid == 0) + goto err_alloc_pasid; + +- kref_init(&process->ref); + mutex_init(&process->mutex); + + process->mm = thread->mm; + + /* register notifier */ + process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops; +- err = mmu_notifier_register(&process->mmu_notifier, process->mm); ++ err = __mmu_notifier_register(&process->mmu_notifier, process->mm); + if (err) + goto err_mmu_notifier; + +@@ -581,7 +305,8 @@ static struct kfd_process *create_process(const struct task_struct *thread, + (uintptr_t)process->mm); + + process->lead_thread = thread->group_leader; +- get_task_struct(process->lead_thread); ++ ++ process->queue_array_size = INITIAL_QUEUE_ARRAY_SIZE; + + INIT_LIST_HEAD(&process->per_device_data); + +@@ -597,28 +322,8 @@ static struct kfd_process *create_process(const struct task_struct *thread, + if (err != 0) + goto err_init_apertures; + +- err = kfd_process_reserve_ib_mem(process); +- if (err) +- goto err_reserve_ib_mem; +- err = kfd_process_init_cwsr(process, filep); +- if (err) +- goto err_init_cwsr; +- +- INIT_DELAYED_WORK(&process->eviction_work.dwork, kfd_evict_bo_worker); +- INIT_DELAYED_WORK(&process->restore_work, kfd_restore_bo_worker); +- process->last_restore_timestamp = get_jiffies_64(); +- +- /* If PeerDirect interface was not detected try to detect it again +- * in case if network driver was loaded later. 
+- */ +- kfd_init_peer_direct(); +- + return process; + +-err_init_cwsr: +-err_reserve_ib_mem: +- kfd_process_free_outstanding_kfd_bos(process); +- kfd_process_destroy_pdds(process); + err_init_apertures: + pqm_uninit(&process->pqm); + err_process_pqm_init: +@@ -629,36 +334,13 @@ static struct kfd_process *create_process(const struct task_struct *thread, + mutex_destroy(&process->mutex); + kfd_pasid_free(process->pasid); + err_alloc_pasid: ++ kfree(process->queues); ++err_alloc_queues: + kfree(process); + err_alloc_process: + return ERR_PTR(err); + } + +-static int init_doorbell_bitmap(struct qcm_process_device *qpd, +- struct kfd_dev *dev) +-{ +- unsigned int i; +- +- if (!KFD_IS_SOC15(dev->device_info->asic_family)) +- return 0; +- +- qpd->doorbell_bitmap = +- kzalloc(DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, +- BITS_PER_BYTE), GFP_KERNEL); +- if (!qpd->doorbell_bitmap) +- return -ENOMEM; +- +- /* Mask out any reserved doorbells */ +- for (i = 0; i < KFD_MAX_NUM_OF_QUEUES_PER_PROCESS; i++) +- if ((dev->shared_resources.reserved_doorbell_mask & i) == +- dev->shared_resources.reserved_doorbell_val) { +- set_bit(i, qpd->doorbell_bitmap); +- pr_debug("reserved doorbell 0x%03x\n", i); +- } +- +- return 0; +-} +- + struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, + struct kfd_process *p) + { +@@ -666,9 +348,9 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) + if (pdd->dev == dev) +- return pdd; ++ break; + +- return NULL; ++ return pdd; + } + + struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, +@@ -677,41 +359,16 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, + struct kfd_process_device *pdd = NULL; + + pdd = kzalloc(sizeof(*pdd), GFP_KERNEL); +- if (!pdd) +- return NULL; +- +- pdd->dev = dev; +- INIT_LIST_HEAD(&pdd->qpd.queues_list); +- INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); +- pdd->qpd.dqm = dev->dqm; +- pdd->qpd.pqm = &p->pqm; +- pdd->qpd.evicted = 0; +- pdd->process = p; +- pdd->bound = PDD_UNBOUND; +- pdd->already_dequeued = false; +- list_add(&pdd->per_device_list, &p->per_device_data); +- +- /* Init idr used for memory handle translation */ +- idr_init(&pdd->alloc_idr); +- if (init_doorbell_bitmap(&pdd->qpd, dev)) { +- pr_err("Failed to init doorbell for process\n"); +- goto err_create_pdd; ++ if (pdd != NULL) { ++ pdd->dev = dev; ++ INIT_LIST_HEAD(&pdd->qpd.queues_list); ++ INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); ++ pdd->qpd.dqm = dev->dqm; ++ pdd->reset_wavefronts = false; ++ list_add(&pdd->per_device_list, &p->per_device_data); + } + +- /* Create the GPUVM context for this specific device */ +- if (dev->kfd2kgd->create_process_vm(dev->kgd, &pdd->vm, +- &p->process_info, &p->ef)) { +- pr_err("Failed to create process VM object\n"); +- goto err_create_pdd; +- } + return pdd; +- +-err_create_pdd: +- kfree(pdd->qpd.doorbell_bitmap); +- idr_destroy(&pdd->alloc_idr); +- list_del(&pdd->per_device_list); +- kfree(pdd); +- return NULL; + } + + /* +@@ -725,6 +382,7 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, + struct kfd_process *p) + { + struct kfd_process_device *pdd; ++ int err; + + pdd = kfd_get_process_device_data(dev, p); + if (!pdd) { +@@ -732,89 +390,24 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, + return ERR_PTR(-ENOMEM); + } + +- if (pdd->bound == PDD_BOUND) ++ if (pdd->bound) + return pdd; + +- if (pdd->bound == 
PDD_BOUND_SUSPENDED) { +- pr_err("Binding PDD_BOUND_SUSPENDED pdd is unexpected!\n"); +- return ERR_PTR(-EINVAL); +- } ++ err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread); ++ if (err < 0) ++ return ERR_PTR(err); + +-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) +- if (dev->device_info->is_need_iommu_device) { +- int err = amd_iommu_bind_pasid(dev->pdev, p->pasid, +- p->lead_thread); +- if (err < 0) +- return ERR_PTR(err); +- } +-#endif +- +- pdd->bound = PDD_BOUND; ++ pdd->bound = true; + + return pdd; + } + +-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) +-int kfd_bind_processes_to_device(struct kfd_dev *dev) ++void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid) + { +- struct kfd_process_device *pdd; + struct kfd_process *p; +- unsigned int temp; +- int err = 0; +- +- int idx = srcu_read_lock(&kfd_processes_srcu); +- +- hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { +- mutex_lock(&p->mutex); +- pdd = kfd_get_process_device_data(dev, p); +- if (pdd->bound != PDD_BOUND_SUSPENDED) { +- mutex_unlock(&p->mutex); +- continue; +- } +- +- err = amd_iommu_bind_pasid(dev->pdev, p->pasid, +- p->lead_thread); +- if (err < 0) { +- pr_err("Unexpected pasid %d binding failure\n", +- p->pasid); +- mutex_unlock(&p->mutex); +- break; +- } +- +- pdd->bound = PDD_BOUND; +- mutex_unlock(&p->mutex); +- } +- +- srcu_read_unlock(&kfd_processes_srcu, idx); +- +- return err; +-} +- +-void kfd_unbind_processes_from_device(struct kfd_dev *dev) +-{ + struct kfd_process_device *pdd; +- struct kfd_process *p; +- unsigned int temp; +- +- int idx = srcu_read_lock(&kfd_processes_srcu); +- +- +- hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { +- mutex_lock(&p->mutex); +- pdd = kfd_get_process_device_data(dev, p); + +- if (pdd->bound == PDD_BOUND) +- pdd->bound = PDD_BOUND_SUSPENDED; +- mutex_unlock(&p->mutex); +- } +- +- srcu_read_unlock(&kfd_processes_srcu, idx); +-} +- +-void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid) +-{ +- struct kfd_process *p; +- struct kfd_process_device *pdd; ++ BUG_ON(dev == NULL); + + /* + * Look for the process that matches the pasid. If there is no such +@@ -827,43 +420,43 @@ void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid) + + pr_debug("Unbinding process %d from IOMMU\n", pasid); + +- mutex_lock(kfd_get_dbgmgr_mutex()); ++ if ((dev->dbgmgr) && (dev->dbgmgr->pasid == p->pasid)) ++ kfd_dbgmgr_destroy(dev->dbgmgr); + +- if (dev->dbgmgr && (dev->dbgmgr->pasid == p->pasid)) { ++ pqm_uninit(&p->pqm); + +- if (kfd_dbgmgr_unregister(dev->dbgmgr, p) == 0) { +- kfd_dbgmgr_destroy(dev->dbgmgr); +- dev->dbgmgr = NULL; +- } +- } ++ pdd = kfd_get_process_device_data(dev, p); + +- mutex_unlock(kfd_get_dbgmgr_mutex()); ++ if (!pdd) { ++ mutex_unlock(&p->mutex); ++ return; ++ } + +- mutex_lock(&p->mutex); ++ if (pdd->reset_wavefronts) { ++ dbgdev_wave_reset_wavefronts(pdd->dev, p); ++ pdd->reset_wavefronts = false; ++ } + +- pdd = kfd_get_process_device_data(dev, p); +- if (pdd) +- /* For GPU relying on IOMMU, we need to dequeue here +- * when PASID is still bound. +- */ +- kfd_process_dequeue_from_device(pdd); ++ /* ++ * Just mark pdd as unbound, because we still need it ++ * to call amd_iommu_unbind_pasid() in when the ++ * process exits. ++ * We don't call amd_iommu_unbind_pasid() here ++ * because the IOMMU called us. 
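++ *
++ * [Editor's note, illustration only: in this restored code the real
++ * unbind is deferred to process teardown, where the workqueue
++ * handler above does, roughly:
++ *
++ *   amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid);
++ *   list_del(&pdd->per_device_list);
++ *   kfree(pdd);
++ *
++ * Calling amd_iommu_unbind_pasid() from this path would re-enter
++ * the IOMMU driver that invoked this callback.]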
++ */ ++ pdd->bound = false; + + mutex_unlock(&p->mutex); +- +- kfd_unref_process(p); + } +-#endif /* CONFIG_AMD_IOMMU_V2 */ + +-struct kfd_process_device *kfd_get_first_process_device_data( +- struct kfd_process *p) ++struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p) + { + return list_first_entry(&p->per_device_data, + struct kfd_process_device, + per_device_list); + } + +-struct kfd_process_device *kfd_get_next_process_device_data( +- struct kfd_process *p, ++struct kfd_process_device *kfd_get_next_process_device_data(struct kfd_process *p, + struct kfd_process_device *pdd) + { + if (list_is_last(&pdd->per_device_list, &p->per_device_data)) +@@ -876,272 +469,22 @@ bool kfd_has_process_device_data(struct kfd_process *p) + return !(list_empty(&p->per_device_data)); + } + +-/* Create specific handle mapped to mem from process local memory idr +- * Assumes that the process lock is held. +- */ +-int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, +- void *mem, uint64_t start, +- uint64_t length, +- struct kfd_ipc_obj *ipc_obj) +-{ +- int handle; +- struct kfd_bo *buf_obj; +- struct kfd_process *p; +- +- p = pdd->process; +- +- buf_obj = kzalloc(sizeof(*buf_obj), GFP_KERNEL); +- +- if (!buf_obj) +- return -ENOMEM; +- +- buf_obj->it.start = start; +- buf_obj->it.last = start + length - 1; +- interval_tree_insert(&buf_obj->it, &p->bo_interval_tree); +- +- buf_obj->mem = mem; +- buf_obj->dev = pdd->dev; +- buf_obj->kfd_ipc_obj = ipc_obj; +- +- INIT_LIST_HEAD(&buf_obj->cb_data_head); +- +- idr_preload(GFP_KERNEL); +- +- handle = idr_alloc(&pdd->alloc_idr, buf_obj, MIN_IDR_ID, MAX_IDR_ID, +- GFP_NOWAIT); +- +- idr_preload_end(); +- +- if (handle < 0) +- kfree(buf_obj); +- +- return handle; +-} +- +-struct kfd_bo *kfd_process_device_find_bo(struct kfd_process_device *pdd, +- int handle) +-{ +- if (handle < 0) +- return NULL; +- +- return (struct kfd_bo *)idr_find(&pdd->alloc_idr, handle); +-} +- +-/* Translate specific handle from process local memory idr +- * Assumes that the process lock is held. +- */ +-void *kfd_process_device_translate_handle(struct kfd_process_device *pdd, +- int handle) +-{ +- struct kfd_bo *buf_obj; +- +- buf_obj = kfd_process_device_find_bo(pdd, handle); +- +- return buf_obj->mem; +-} +- +-void *kfd_process_find_bo_from_interval(struct kfd_process *p, +- uint64_t start_addr, +- uint64_t last_addr) +-{ +- struct interval_tree_node *it_node; +- struct kfd_bo *buf_obj; +- +- it_node = interval_tree_iter_first(&p->bo_interval_tree, +- start_addr, last_addr); +- if (!it_node) { +- pr_err("0x%llx-0x%llx does not relate to an existing buffer\n", +- start_addr, last_addr); +- return NULL; +- } +- +- if (interval_tree_iter_next(it_node, start_addr, last_addr)) { +- pr_err("0x%llx-0x%llx spans more than a single BO\n", +- start_addr, last_addr); +- return NULL; +- } +- +- buf_obj = container_of(it_node, struct kfd_bo, it); +- +- return buf_obj; +-} +- +-/* Remove specific handle from process local memory idr +- * Assumes that the process lock is held. 
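+- *
+- * [Editor's aside, a sketch of the handle lifecycle implemented by
+- * the helpers removed in this hunk; all names are taken from this
+- * file, error handling elided:
+- *
+- *   int h = kfd_process_device_create_obj_handle(pdd, mem,
+- *                                                start, len, NULL);
+- *   void *m = kfd_process_device_translate_handle(pdd, h);
+- *   kfd_process_device_remove_obj_handle(pdd, h);
+- *
+- * create inserts the kfd_bo into both the per-device IDR and the
+- * per-process interval tree, so remove must take it out of both.]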
+- */ +-void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, +- int handle) +-{ +- struct kfd_bo *buf_obj; +- struct kfd_process *p; +- +- p = pdd->process; +- +- if (handle < 0) +- return; +- +- buf_obj = kfd_process_device_find_bo(pdd, handle); +- +- if (buf_obj->kfd_ipc_obj) +- ipc_obj_put(&buf_obj->kfd_ipc_obj); +- +- idr_remove(&pdd->alloc_idr, handle); +- +- interval_tree_remove(&buf_obj->it, &p->bo_interval_tree); +- +- kfree(buf_obj); +-} +- +-/* This increments the process->ref counter. */ ++/* This returns with process->mutex locked. */ + struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) + { +- struct kfd_process *p, *ret_p = NULL; ++ struct kfd_process *p; + unsigned int temp; + + int idx = srcu_read_lock(&kfd_processes_srcu); + + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + if (p->pasid == pasid) { +- kref_get(&p->ref); +- ret_p = p; ++ mutex_lock(&p->mutex); + break; + } + } + + srcu_read_unlock(&kfd_processes_srcu, idx); + +- return ret_p; +-} +- +-void kfd_suspend_all_processes(void) +-{ +- struct kfd_process *p; +- unsigned int temp; +- int idx = srcu_read_lock(&kfd_processes_srcu); +- +- hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { +- if (cancel_delayed_work_sync(&p->eviction_work.dwork)) +- dma_fence_put(p->eviction_work.quiesce_fence); +- cancel_delayed_work_sync(&p->restore_work); +- +- if (quiesce_process_mm(p)) +- pr_err("Failed to suspend process %d\n", p->pasid); +- dma_fence_signal(p->ef); +- dma_fence_put(p->ef); +- p->ef = NULL; +- } +- srcu_read_unlock(&kfd_processes_srcu, idx); +-} +- +-int kfd_resume_all_processes(void) +-{ +- struct kfd_process *p; +- unsigned int temp; +- int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu); +- +- hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { +- if (!schedule_delayed_work(&p->restore_work, 0)) { +- pr_err("Restore process %d failed during resume\n", +- p->pasid); +- ret = -EFAULT; +- } +- } +- srcu_read_unlock(&kfd_processes_srcu, idx); +- return ret; +-} +- +-/* This increments the process->ref counter. 
*/ +-struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm) +-{ +- struct kfd_process *p; +- +- int idx = srcu_read_lock(&kfd_processes_srcu); +- +- p = find_process_by_mm(mm); +- if (p) +- kref_get(&p->ref); +- +- srcu_read_unlock(&kfd_processes_srcu, idx); +- + return p; + } +- +-int kfd_reserved_mem_mmap(struct kfd_process *process, +- struct vm_area_struct *vma) +-{ +- unsigned long pfn, i; +- int ret = 0; +- struct kfd_dev *dev = kfd_device_by_id(vma->vm_pgoff); +- struct kfd_process_device *temp, *pdd = NULL; +- struct qcm_process_device *qpd = NULL; +- +- if (!dev) +- return -EINVAL; +- if (((vma->vm_end - vma->vm_start) != dev->cwsr_size) || +- (vma->vm_start & (PAGE_SIZE - 1)) || +- (vma->vm_end & (PAGE_SIZE - 1))) { +- pr_err("KFD only support page aligned memory map and correct size.\n"); +- return -EINVAL; +- } +- +- pr_debug("kfd reserved mem mmap been called.\n"); +- +- list_for_each_entry_safe(pdd, temp, &process->per_device_data, +- per_device_list) { +- if (dev == pdd->dev) { +- qpd = &pdd->qpd; +- break; +- } +- } +- if (!qpd) +- return -EINVAL; +- +- qpd->cwsr_pages = alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, +- get_order(dev->cwsr_size)); +- if (!qpd->cwsr_pages) { +- pr_err("amdkfd: error alloc CWSR isa memory per process.\n"); +- return -ENOMEM; +- } +- qpd->cwsr_kaddr = kmap(qpd->cwsr_pages); +- +- vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND +- | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP; +- for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); ++i) { +- pfn = page_to_pfn(&qpd->cwsr_pages[i]); +- /* mapping the page to user process */ +- ret = remap_pfn_range(vma, vma->vm_start + (i << PAGE_SHIFT), +- pfn, PAGE_SIZE, vma->vm_page_prot); +- if (ret) +- break; +- } +- return ret; +-} +- +-#if defined(CONFIG_DEBUG_FS) +- +-int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data) +-{ +- struct kfd_process *p; +- unsigned int temp; +- int r = 0; +- +- int idx = srcu_read_lock(&kfd_processes_srcu); +- +- hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { +- seq_printf(m, "Process %d PASID %d:\n", +- p->lead_thread->tgid, p->pasid); +- +- mutex_lock(&p->mutex); +- r = pqm_debugfs_mqds(m, &p->pqm); +- mutex_unlock(&p->mutex); +- +- if (r != 0) +- break; +- } +- +- srcu_read_unlock(&kfd_processes_srcu, idx); +- +- return r; +-} +- +-#endif +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +index a87fcab..46f497e 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +@@ -32,9 +32,12 @@ static inline struct process_queue_node *get_queue_by_qid( + { + struct process_queue_node *pqn; + ++ BUG_ON(!pqm); ++ + list_for_each_entry(pqn, &pqm->queues, process_queue_list) { +- if ((pqn->q && pqn->q->properties.queue_id == qid) || +- (pqn->kq && pqn->kq->queue->properties.queue_id == qid)) ++ if (pqn->q && pqn->q->properties.queue_id == qid) ++ return pqn; ++ if (pqn->kq && pqn->kq->queue->properties.queue_id == qid) + return pqn; + } + +@@ -46,13 +49,17 @@ static int find_available_queue_slot(struct process_queue_manager *pqm, + { + unsigned long found; + ++ BUG_ON(!pqm || !qid); ++ ++ pr_debug("kfd: in %s\n", __func__); ++ + found = find_first_zero_bit(pqm->queue_slot_bitmap, + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); + +- pr_debug("The new slot id %lu\n", found); ++ pr_debug("kfd: the new slot id %lu\n", found); + + if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) { +- pr_info("Cannot open 
more queues for process with pasid %d\n", ++ pr_info("amdkfd: Can not open more queues for process with pasid %d\n", + pqm->process->pasid); + return -ENOMEM; + } +@@ -63,33 +70,15 @@ static int find_available_queue_slot(struct process_queue_manager *pqm, + return 0; + } + +-void kfd_process_dequeue_from_device(struct kfd_process_device *pdd) +-{ +- struct kfd_dev *dev = pdd->dev; +- int retval; +- +- if (pdd->already_dequeued) +- return; +- +- retval = dev->dqm->ops.process_termination(dev->dqm, &pdd->qpd); +- pdd->already_dequeued = true; +-} +- +-void kfd_process_dequeue_from_all_devices(struct kfd_process *p) +-{ +- struct kfd_process_device *pdd; +- +- list_for_each_entry(pdd, &p->per_device_data, per_device_list) +- kfd_process_dequeue_from_device(pdd); +-} +- + int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p) + { ++ BUG_ON(!pqm); ++ + INIT_LIST_HEAD(&pqm->queues); + pqm->queue_slot_bitmap = + kzalloc(DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, + BITS_PER_BYTE), GFP_KERNEL); +- if (!pqm->queue_slot_bitmap) ++ if (pqm->queue_slot_bitmap == NULL) + return -ENOMEM; + pqm->process = p; + +@@ -98,14 +87,25 @@ int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p) + + void pqm_uninit(struct process_queue_manager *pqm) + { ++ int retval; + struct process_queue_node *pqn, *next; + ++ BUG_ON(!pqm); ++ ++ pr_debug("In func %s\n", __func__); ++ + list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) { +- uninit_queue(pqn->q); +- list_del(&pqn->process_queue_list); +- kfree(pqn); ++ retval = pqm_destroy_queue( ++ pqm, ++ (pqn->q != NULL) ? ++ pqn->q->properties.queue_id : ++ pqn->kq->queue->properties.queue_id); ++ ++ if (retval != 0) { ++ pr_err("kfd: failed to destroy queue\n"); ++ return; ++ } + } +- + kfree(pqm->queue_slot_bitmap); + pqm->queue_slot_bitmap = NULL; + } +@@ -117,39 +117,54 @@ static int create_cp_queue(struct process_queue_manager *pqm, + { + int retval; + ++ retval = 0; ++ + /* Doorbell initialized in user space*/ + q_properties->doorbell_ptr = NULL; + ++ q_properties->doorbell_off = ++ kfd_queue_id_to_doorbell(dev, pqm->process, qid); ++ + /* let DQM handle it*/ + q_properties->vmid = 0; + q_properties->queue_id = qid; + + retval = init_queue(q, q_properties); + if (retval != 0) +- return retval; ++ goto err_init_queue; + + (*q)->device = dev; + (*q)->process = pqm->process; + +- pr_debug("PQM After init queue"); ++ pr_debug("kfd: PQM After init queue"); + + return retval; ++ ++err_init_queue: ++ return retval; + } + + int pqm_create_queue(struct process_queue_manager *pqm, + struct kfd_dev *dev, + struct file *f, + struct queue_properties *properties, ++ unsigned int flags, ++ enum kfd_queue_type type, + unsigned int *qid) + { + int retval; + struct kfd_process_device *pdd; ++ struct queue_properties q_properties; + struct queue *q; + struct process_queue_node *pqn; + struct kernel_queue *kq; +- enum kfd_queue_type type = properties->type; +- unsigned int max_queues = 127; /* HWS limit */ ++ int num_queues = 0; ++ struct queue *cur; ++ ++ BUG_ON(!pqm || !dev || !properties || !qid); + ++ memset(&q_properties, 0, sizeof(struct queue_properties)); ++ memcpy(&q_properties, properties, sizeof(struct queue_properties)); + q = NULL; + kq = NULL; + +@@ -165,21 +180,24 @@ int pqm_create_queue(struct process_queue_manager *pqm, + * If we are just about to create DIQ, the is_debug flag is not set yet + * Hence we also check the type as well + */ +- if ((pdd->qpd.is_debug) || (type == KFD_QUEUE_TYPE_DIQ)) +- max_queues = 
dev->device_info->max_no_of_hqd/2; +- +- if (pdd->qpd.queue_count >= max_queues) +- return -ENOSPC; ++ if ((pdd->qpd.is_debug) || ++ (type == KFD_QUEUE_TYPE_DIQ)) { ++ list_for_each_entry(cur, &pdd->qpd.queues_list, list) ++ num_queues++; ++ if (num_queues >= dev->device_info->max_no_of_hqd/2) ++ return (-ENOSPC); ++ } + + retval = find_available_queue_slot(pqm, qid); + if (retval != 0) + return retval; + +- if (list_empty(&pdd->qpd.queues_list) && +- list_empty(&pdd->qpd.priv_queue_list)) ++ if (list_empty(&pqm->queues)) { ++ pdd->qpd.pqm = pqm; + dev->dqm->ops.register_process(dev->dqm, &pdd->qpd); ++ } + +- pqn = kzalloc(sizeof(*pqn), GFP_KERNEL); ++ pqn = kzalloc(sizeof(struct process_queue_node), GFP_KERNEL); + if (!pqn) { + retval = -ENOMEM; + goto err_allocate_pqn; +@@ -187,35 +205,18 @@ int pqm_create_queue(struct process_queue_manager *pqm, + + switch (type) { + case KFD_QUEUE_TYPE_SDMA: +- if (dev->dqm->sdma_queue_count >= CIK_SDMA_QUEUES) { +- pr_err("Over-subscription is not allowed for SDMA\n"); +- retval = -EPERM; +- goto err_create_queue; +- } +- +- retval = create_cp_queue(pqm, dev, &q, properties, f, *qid); +- if (retval != 0) +- goto err_create_queue; +- pqn->q = q; +- pqn->kq = NULL; +- retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd, +- &q->properties.vmid); +- pr_debug("DQM returned %d for create_queue\n", retval); +- print_queue(q); +- break; + + case KFD_QUEUE_TYPE_COMPUTE: + /* check if there is over subscription */ +- if ((dev->dqm->sched_policy == +- KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) && +- ((dev->dqm->processes_count >= dev->vm_info.vmid_num_kfd) || ++ if ((sched_policy == KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) && ++ ((dev->dqm->processes_count >= VMID_PER_DEVICE) || + (dev->dqm->queue_count >= get_queues_num(dev->dqm)))) { +- pr_err("Over-subscription is not allowed in radeon_kfd.sched_policy == 1\n"); ++ pr_err("kfd: over-subscription is not allowed in radeon_kfd.sched_policy == 1\n"); + retval = -EPERM; + goto err_create_queue; + } + +- retval = create_cp_queue(pqm, dev, &q, properties, f, *qid); ++ retval = create_cp_queue(pqm, dev, &q, &q_properties, f, *qid); + if (retval != 0) + goto err_create_queue; + pqn->q = q; +@@ -227,7 +228,7 @@ int pqm_create_queue(struct process_queue_manager *pqm, + break; + case KFD_QUEUE_TYPE_DIQ: + kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_DIQ); +- if (!kq) { ++ if (kq == NULL) { + retval = -ENOMEM; + goto err_create_queue; + } +@@ -238,31 +239,23 @@ int pqm_create_queue(struct process_queue_manager *pqm, + kq, &pdd->qpd); + break; + default: +- WARN(1, "Invalid queue type %d", type); +- retval = -EINVAL; ++ BUG(); ++ break; + } + + if (retval != 0) { +- pr_err("DQM create queue failed\n"); ++ pr_debug("Error dqm create queue\n"); + goto err_create_queue; + } + +- if (q) +- /* Return the doorbell offset within the doorbell page +- * to the caller so it can be passed up to user mode +- * (in bytes). 
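+- *
+- * [Editor's note, a worked example under the assumption of 4-byte
+- * doorbells and an 8 KiB per-process doorbell slice: a queue with
+- * doorbell_off == 5 (in dwords) yields
+- *
+- *   (5 * sizeof(uint32_t)) & (8192 - 1) == 20
+- *
+- * i.e. byte offset 20 within the process's doorbell pages.]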
+- */ +- properties->doorbell_off = +- (q->properties.doorbell_off * sizeof(uint32_t)) & +- (kfd_doorbell_process_slice(dev) - 1); +- +- pr_debug("PQM After DQM create queue\n"); ++ pr_debug("kfd: PQM After DQM create queue\n"); + + list_add(&pqn->process_queue_list, &pqm->queues); + + if (q) { +- pr_debug("PQM done creating queue\n"); +- print_queue_properties(&q->properties); ++ *properties = q->properties; ++ pr_debug("kfd: PQM done creating queue\n"); ++ print_queue_properties(properties); + } + + return retval; +@@ -272,8 +265,7 @@ int pqm_create_queue(struct process_queue_manager *pqm, + err_allocate_pqn: + /* check if queues list is empty unregister process from device */ + clear_bit(*qid, pqm->queue_slot_bitmap); +- if (list_empty(&pdd->qpd.queues_list) && +- list_empty(&pdd->qpd.priv_queue_list)) ++ if (list_empty(&pqm->queues)) + dev->dqm->ops.unregister_process(dev->dqm, &pdd->qpd); + return retval; + } +@@ -288,11 +280,14 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) + + dqm = NULL; + ++ BUG_ON(!pqm); + retval = 0; + ++ pr_debug("kfd: In Func %s\n", __func__); ++ + pqn = get_queue_by_qid(pqm, qid); +- if (!pqn) { +- pr_err("Queue id does not match any known queue\n"); ++ if (pqn == NULL) { ++ pr_err("kfd: queue id does not match any known queue\n"); + return -EINVAL; + } + +@@ -301,8 +296,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) + dev = pqn->kq->dev; + if (pqn->q) + dev = pqn->q->device; +- if (WARN_ON(!dev)) +- return -ENODEV; ++ BUG_ON(!dev); + + pdd = kfd_get_process_device_data(dev, pqm->process); + if (!pdd) { +@@ -319,9 +313,10 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) + + if (pqn->q) { + dqm = pqn->q->device->dqm; +- kfree(pqn->q->properties.cu_mask); +- pqn->q->properties.cu_mask = NULL; + retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); ++ if (retval != 0) ++ return retval; ++ + uninit_queue(pqn->q); + } + +@@ -329,8 +324,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) + kfree(pqn); + clear_bit(qid, pqm->queue_slot_bitmap); + +- if (list_empty(&pdd->qpd.queues_list) && +- list_empty(&pdd->qpd.priv_queue_list)) ++ if (list_empty(&pqm->queues)) + dqm->ops.unregister_process(dqm, &pdd->qpd); + + return retval; +@@ -342,9 +336,12 @@ int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, + int retval; + struct process_queue_node *pqn; + ++ BUG_ON(!pqm); ++ + pqn = get_queue_by_qid(pqm, qid); + if (!pqn) { +- pr_debug("No queue %d exists for update operation\n", qid); ++ pr_debug("amdkfd: No queue %d exists for update operation\n", ++ qid); + return -EFAULT; + } + +@@ -361,40 +358,14 @@ int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, + return 0; + } + +-int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, +- struct queue_properties *p) +-{ +- int retval; +- struct process_queue_node *pqn; +- +- pqn = get_queue_by_qid(pqm, qid); +- if (!pqn) { +- pr_debug("No queue %d exists for update operation\n", qid); +- return -EFAULT; +- } +- +- /* Free the old CU mask memory if it is already allocated, then +- * allocate memory for the new CU mask. 
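+- *
+- * [Editor's note: no allocation actually happens below; the new
+- * mask in p->cu_mask was presumably allocated by the set-CU-mask
+- * ioctl handler, and ownership simply transfers into the queue:
+- *
+- *   kfree(pqn->q->properties.cu_mask);        // drop the old mask
+- *   pqn->q->properties.cu_mask = p->cu_mask;  // adopt the new one
+- *
+- * which is also why pqm_destroy_queue() kfree()s it.]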
+- */ +- kfree(pqn->q->properties.cu_mask); +- +- pqn->q->properties.cu_mask_count = p->cu_mask_count; +- pqn->q->properties.cu_mask = p->cu_mask; +- +- retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm, +- pqn->q); +- if (retval != 0) +- return retval; +- +- return 0; +-} +- + struct kernel_queue *pqm_get_kernel_queue( + struct process_queue_manager *pqm, + unsigned int qid) + { + struct process_queue_node *pqn; + ++ BUG_ON(!pqm); ++ + pqn = get_queue_by_qid(pqm, qid); + if (pqn && pqn->kq) + return pqn->kq; +@@ -402,89 +373,4 @@ struct kernel_queue *pqm_get_kernel_queue( + return NULL; + } + +-int pqm_get_wave_state(struct process_queue_manager *pqm, +- unsigned int qid, +- void __user *ctl_stack, +- u32 *ctl_stack_used_size, +- u32 *save_area_used_size) +-{ +- struct process_queue_node *pqn; +- +- pqn = get_queue_by_qid(pqm, qid); +- if (!pqn) { +- pr_debug("amdkfd: No queue %d exists for operation\n", +- qid); +- return -EFAULT; +- } +- +- return pqn->q->device->dqm->ops.get_wave_state(pqn->q->device->dqm, +- pqn->q, +- ctl_stack, +- ctl_stack_used_size, +- save_area_used_size); +-} +- +-#if defined(CONFIG_DEBUG_FS) +- +-int pqm_debugfs_mqds(struct seq_file *m, void *data) +-{ +- struct process_queue_manager *pqm = data; +- struct process_queue_node *pqn; +- struct queue *q; +- enum KFD_MQD_TYPE mqd_type; +- struct mqd_manager *mqd_manager; +- int r = 0; +- +- list_for_each_entry(pqn, &pqm->queues, process_queue_list) { +- if (pqn->q) { +- q = pqn->q; +- switch (q->properties.type) { +- case KFD_QUEUE_TYPE_SDMA: +- seq_printf(m, " SDMA queue on device %x\n", +- q->device->id); +- mqd_type = KFD_MQD_TYPE_SDMA; +- break; +- case KFD_QUEUE_TYPE_COMPUTE: +- seq_printf(m, " Compute queue on device %x\n", +- q->device->id); +- mqd_type = KFD_MQD_TYPE_CP; +- break; +- default: +- seq_printf(m, +- " Bad user queue type %d on device %x\n", +- q->properties.type, q->device->id); +- continue; +- } +- mqd_manager = q->device->dqm->ops.get_mqd_manager( +- q->device->dqm, mqd_type); +- } else if (pqn->kq) { +- q = pqn->kq->queue; +- mqd_manager = pqn->kq->mqd; +- switch (q->properties.type) { +- case KFD_QUEUE_TYPE_DIQ: +- seq_printf(m, " DIQ on device %x\n", +- pqn->kq->dev->id); +- mqd_type = KFD_MQD_TYPE_HIQ; +- break; +- default: +- seq_printf(m, +- " Bad kernel queue type %d on device %x\n", +- q->properties.type, +- pqn->kq->dev->id); +- continue; +- } +- } else { +- seq_printf(m, +- " Weird: Queue node with neither kernel nor user queue\n"); +- continue; +- } +- +- r = mqd_manager->debugfs_show_mqd(m, q->mqd); +- if (r != 0) +- break; +- } +- +- return r; +-} + +-#endif +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +index a5315d4..0ab1970 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +@@ -65,15 +65,17 @@ void print_queue(struct queue *q) + + int init_queue(struct queue **q, const struct queue_properties *properties) + { +- struct queue *tmp_q; ++ struct queue *tmp; + +- tmp_q = kzalloc(sizeof(*tmp_q), GFP_KERNEL); +- if (!tmp_q) ++ BUG_ON(!q); ++ ++ tmp = kzalloc(sizeof(struct queue), GFP_KERNEL); ++ if (!tmp) + return -ENOMEM; + +- memcpy(&tmp_q->properties, properties, sizeof(*properties)); ++ memcpy(&tmp->properties, properties, sizeof(struct queue_properties)); + +- *q = tmp_q; ++ *q = tmp; + return 0; + } + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c b/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c +deleted file mode 100644 +index 2f5cdb9..0000000 +--- 
a/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c ++++ /dev/null +@@ -1,294 +0,0 @@ +-/* +- * Copyright 2015 Advanced Micro Devices, Inc. +- * +- * Permission is hereby granted, free of charge, to any person obtaining a +- * copy of this software and associated documentation files (the "Software"), +- * to deal in the Software without restriction, including without limitation +- * the rights to use, copy, modify, merge, publish, distribute, sublicense, +- * and/or sell copies of the Software, and to permit persons to whom the +- * Software is furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in +- * all copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +- * OTHER DEALINGS IN THE SOFTWARE. +- */ +- +-#include <linux/device.h> +-#include <linux/export.h> +-#include <linux/pid.h> +-#include <linux/err.h> +-#include <linux/slab.h> +-#include "kfd_priv.h" +- +- +-struct rdma_cb { +- struct list_head node; +- struct amd_p2p_info amd_p2p_data; +- void (*free_callback)(void *client_priv); +- void *client_priv; +-}; +- +-/** +- * This function makes the pages underlying a range of GPU virtual memory +- * accessible for DMA operations from another PCIe device +- * +- * \param address - The start address in the Unified Virtual Address +- * space in the specified process +- * \param length - The length of requested mapping +- * \param pid - Pointer to structure pid to which address belongs. +- * Could be NULL for current process address space. +- * \param p2p_data - On return: Pointer to structure describing +- * underlying pages/locations +- * \param free_callback - Pointer to callback which will be called when access +- * to such memory must be stopped immediately: Memory +- * was freed, GECC events, etc. +- * Client should immediately stop any transfer +- * operations and returned as soon as possible. +- * After return all resources associated with address +- * will be release and no access will be allowed. 
+- * \param client_priv - Pointer to be passed as parameter on +- * 'free_callback; +- * +- * \return 0 if operation was successful +- */ +-static int get_pages(uint64_t address, uint64_t length, struct pid *pid, +- struct amd_p2p_info **amd_p2p_data, +- void (*free_callback)(void *client_priv), +- void *client_priv) +-{ +- struct kfd_bo *buf_obj; +- struct kgd_mem *mem; +- struct sg_table *sg_table_tmp; +- struct kfd_dev *dev; +- uint64_t last = address + length - 1; +- uint64_t offset; +- struct kfd_process *p; +- struct rdma_cb *rdma_cb_data; +- int ret = 0; +- +- p = kfd_lookup_process_by_pid(pid); +- if (!p) { +- pr_err("Could not find the process\n"); +- return -EINVAL; +- } +- mutex_lock(&p->mutex); +- +- buf_obj = kfd_process_find_bo_from_interval(p, address, last); +- if (!buf_obj) { +- pr_err("Cannot find a kfd_bo for the range\n"); +- ret = -EINVAL; +- goto out; +- } +- +- rdma_cb_data = kmalloc(sizeof(*rdma_cb_data), GFP_KERNEL); +- if (!rdma_cb_data) { +- *amd_p2p_data = NULL; +- ret = -ENOMEM; +- goto out; +- } +- +- mem = buf_obj->mem; +- dev = buf_obj->dev; +- offset = address - buf_obj->it.start; +- +- ret = dev->kfd2kgd->pin_get_sg_table_bo(dev->kgd, mem, +- offset, length, &sg_table_tmp); +- +- if (ret) { +- pr_err("pin_get_sg_table_bo failed.\n"); +- *amd_p2p_data = NULL; +- goto free_mem; +- } +- +- rdma_cb_data->amd_p2p_data.va = address; +- rdma_cb_data->amd_p2p_data.size = length; +- rdma_cb_data->amd_p2p_data.pid = pid; +- rdma_cb_data->amd_p2p_data.priv = buf_obj; +- rdma_cb_data->amd_p2p_data.pages = sg_table_tmp; +- +- rdma_cb_data->free_callback = free_callback; +- rdma_cb_data->client_priv = client_priv; +- +- list_add(&rdma_cb_data->node, &buf_obj->cb_data_head); +- +- *amd_p2p_data = &rdma_cb_data->amd_p2p_data; +- +- goto out; +- +-free_mem: +- kfree(rdma_cb_data); +-out: +- mutex_unlock(&p->mutex); +- kfd_unref_process(p); +- +- return ret; +-} +- +-static int put_pages_helper(struct amd_p2p_info *p2p_data) +-{ +- struct kfd_bo *buf_obj; +- struct kfd_dev *dev; +- struct sg_table *sg_table_tmp; +- struct rdma_cb *rdma_cb_data; +- +- if (!p2p_data) { +- pr_err("amd_p2p_info pointer is invalid.\n"); +- return -EINVAL; +- } +- +- rdma_cb_data = container_of(p2p_data, struct rdma_cb, amd_p2p_data); +- +- buf_obj = p2p_data->priv; +- dev = buf_obj->dev; +- sg_table_tmp = p2p_data->pages; +- +- list_del(&rdma_cb_data->node); +- kfree(rdma_cb_data); +- +- dev->kfd2kgd->unpin_put_sg_table_bo(buf_obj->mem, sg_table_tmp); +- +- +- return 0; +-} +- +-void run_rdma_free_callback(struct kfd_bo *buf_obj) +-{ +- struct rdma_cb *tmp, *rdma_cb_data; +- +- list_for_each_entry_safe(rdma_cb_data, tmp, +- &buf_obj->cb_data_head, node) { +- if (rdma_cb_data->free_callback) +- rdma_cb_data->free_callback( +- rdma_cb_data->client_priv); +- +- put_pages_helper(&rdma_cb_data->amd_p2p_data); +- } +-} +- +-/** +- * +- * This function release resources previously allocated by get_pages() call. +- * +- * \param p_p2p_data - A pointer to pointer to amd_p2p_info entries +- * allocated by get_pages() call. 
+- * +- * \return 0 if operation was successful +- */ +-static int put_pages(struct amd_p2p_info **p_p2p_data) +-{ +- struct kfd_process *p = NULL; +- int ret = 0; +- +- if (!(*p_p2p_data)) { +- pr_err("amd_p2p_info pointer is invalid.\n"); +- return -EINVAL; +- } +- +- p = kfd_lookup_process_by_pid((*p_p2p_data)->pid); +- if (!p) { +- pr_err("Could not find the process\n"); +- return -EINVAL; +- } +- +- ret = put_pages_helper(*p_p2p_data); +- +- if (!ret) +- *p_p2p_data = NULL; +- +- kfd_unref_process(p); +- +- return ret; +-} +- +-/** +- * Check if given address belongs to GPU address space. +- * +- * \param address - Address to check +- * \param pid - Process to which given address belongs. +- * Could be NULL if current one. +- * +- * \return 0 - This is not GPU address managed by AMD driver +- * 1 - This is GPU address managed by AMD driver +- */ +-static int is_gpu_address(uint64_t address, struct pid *pid) +-{ +- struct kfd_bo *buf_obj; +- struct kfd_process *p; +- +- p = kfd_lookup_process_by_pid(pid); +- if (!p) { +- pr_debug("Could not find the process\n"); +- return 0; +- } +- +- buf_obj = kfd_process_find_bo_from_interval(p, address, address); +- +- kfd_unref_process(p); +- if (!buf_obj) +- return 0; +- +- return 1; +-} +- +-/** +- * Return the single page size to be used when building scatter/gather table +- * for given range. +- * +- * \param address - Address +- * \param length - Range length +- * \param pid - Process id structure. Could be NULL if current one. +- * \param page_size - On return: Page size +- * +- * \return 0 if operation was successful +- */ +-static int get_page_size(uint64_t address, uint64_t length, struct pid *pid, +- unsigned long *page_size) +-{ +- /* +- * As local memory is always consecutive, we can assume the local +- * memory page size to be arbitrary. +- * Currently we assume the local memory page size to be the same +- * as system memory, which is 4KB. +- */ +- *page_size = PAGE_SIZE; +- +- return 0; +-} +- +- +-/** +- * Singleton object: rdma interface function pointers +- */ +-static const struct amd_rdma_interface rdma_ops = { +- .get_pages = get_pages, +- .put_pages = put_pages, +- .is_gpu_address = is_gpu_address, +- .get_page_size = get_page_size, +-}; +- +-/** +- * amdkfd_query_rdma_interface - Return interface (function pointers table) for +- * rdma interface +- * +- * +- * \param interace - OUT: Pointer to interface +- * +- * \return 0 if operation was successful. 
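+- *
+- * [Editor's aside, a hypothetical client sketch using only this
+- * interface; my_free_cb and my_priv are placeholder names and all
+- * error handling is elided:
+- *
+- *   const struct amd_rdma_interface *ops;
+- *   struct amd_p2p_info *info;
+- *
+- *   amdkfd_query_rdma_interface(&ops);
+- *   ops->get_pages(va, len, pid, &info, my_free_cb, my_priv);
+- *   // ... DMA against info->pages (a struct sg_table) ...
+- *   ops->put_pages(&info);]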
+- */ +-int amdkfd_query_rdma_interface(const struct amd_rdma_interface **ops) +-{ +- *ops = &rdma_ops; +- +- return 0; +-} +-EXPORT_SYMBOL(amdkfd_query_rdma_interface); +- +- +- +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +index d08e3de..1e50647 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +@@ -28,32 +28,27 @@ + #include <linux/hash.h> + #include <linux/cpufreq.h> + #include <linux/log2.h> +-#include <linux/dmi.h> +-#include <linux/atomic.h> + + #include "kfd_priv.h" + #include "kfd_crat.h" + #include "kfd_topology.h" +-#include "kfd_device_queue_manager.h" + +-/* topology_device_list - Master list of all topology devices */ + static struct list_head topology_device_list; ++static int topology_crat_parsed; + static struct kfd_system_properties sys_props; + + static DECLARE_RWSEM(topology_lock); +-static atomic_t topology_crat_proximity_domain; + +-struct kfd_topology_device *kfd_topology_device_by_proximity_domain( +- uint32_t proximity_domain) ++struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) + { + struct kfd_topology_device *top_dev; +- struct kfd_topology_device *device = NULL; ++ struct kfd_dev *device = NULL; + + down_read(&topology_lock); + + list_for_each_entry(top_dev, &topology_device_list, list) +- if (top_dev->proximity_domain == proximity_domain) { +- device = top_dev; ++ if (top_dev->gpu_id == gpu_id) { ++ device = top_dev->gpu; + break; + } + +@@ -62,7 +57,7 @@ struct kfd_topology_device *kfd_topology_device_by_proximity_domain( + return device; + } + +-struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) ++struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) + { + struct kfd_topology_device *top_dev; + struct kfd_dev *device = NULL; +@@ -70,7 +65,7 @@ struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) + down_read(&topology_lock); + + list_for_each_entry(top_dev, &topology_device_list, list) +- if (top_dev->gpu_id == gpu_id) { ++ if (top_dev->gpu->pdev == pdev) { + device = top_dev->gpu; + break; + } +@@ -80,49 +75,300 @@ struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) + return device; + } + +-struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) ++static int kfd_topology_get_crat_acpi(void *crat_image, size_t *size) + { +- struct kfd_topology_device *top_dev; +- struct kfd_dev *device = NULL; ++ struct acpi_table_header *crat_table; ++ acpi_status status; + +- down_read(&topology_lock); ++ if (!size) ++ return -EINVAL; + +- list_for_each_entry(top_dev, &topology_device_list, list) +- if (top_dev->gpu && top_dev->gpu->pdev == pdev) { +- device = top_dev->gpu; ++ /* ++ * Fetch the CRAT table from ACPI ++ */ ++ status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); ++ if (status == AE_NOT_FOUND) { ++ pr_warn("CRAT table not found\n"); ++ return -ENODATA; ++ } else if (ACPI_FAILURE(status)) { ++ const char *err = acpi_format_exception(status); ++ ++ pr_err("CRAT table error: %s\n", err); ++ return -EINVAL; ++ } ++ ++ if (*size >= crat_table->length && crat_image != NULL) ++ memcpy(crat_image, crat_table, crat_table->length); ++ ++ *size = crat_table->length; ++ ++ return 0; ++} ++ ++static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, ++ struct crat_subtype_computeunit *cu) ++{ ++ BUG_ON(!dev); ++ BUG_ON(!cu); ++ ++ dev->node_props.cpu_cores_count = cu->num_cpu_cores; ++ dev->node_props.cpu_core_id_base = cu->processor_id_low; ++ if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) ++ dev->node_props.capability |= 
HSA_CAP_ATS_PRESENT; ++ ++ pr_info("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, ++ cu->processor_id_low); ++} ++ ++static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, ++ struct crat_subtype_computeunit *cu) ++{ ++ BUG_ON(!dev); ++ BUG_ON(!cu); ++ ++ dev->node_props.simd_id_base = cu->processor_id_low; ++ dev->node_props.simd_count = cu->num_simd_cores; ++ dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; ++ dev->node_props.max_waves_per_simd = cu->max_waves_simd; ++ dev->node_props.wave_front_size = cu->wave_front_size; ++ dev->node_props.mem_banks_count = cu->num_banks; ++ dev->node_props.array_count = cu->num_arrays; ++ dev->node_props.cu_per_simd_array = cu->num_cu_per_array; ++ dev->node_props.simd_per_cu = cu->num_simd_per_cu; ++ dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; ++ if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) ++ dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; ++ pr_info("CU GPU: simds=%d id_base=%d\n", cu->num_simd_cores, ++ cu->processor_id_low); ++} ++ ++/* kfd_parse_subtype_cu is called when the topology mutex is already acquired */ ++static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu) ++{ ++ struct kfd_topology_device *dev; ++ int i = 0; ++ ++ BUG_ON(!cu); ++ ++ pr_info("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", ++ cu->proximity_domain, cu->hsa_capability); ++ list_for_each_entry(dev, &topology_device_list, list) { ++ if (cu->proximity_domain == i) { ++ if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) ++ kfd_populated_cu_info_cpu(dev, cu); ++ ++ if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) ++ kfd_populated_cu_info_gpu(dev, cu); + break; + } ++ i++; ++ } + +- up_read(&topology_lock); ++ return 0; ++} + +- return device; ++/* ++ * kfd_parse_subtype_mem is called when the topology mutex is ++ * already acquired ++ */ ++static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem) ++{ ++ struct kfd_mem_properties *props; ++ struct kfd_topology_device *dev; ++ int i = 0; ++ ++ BUG_ON(!mem); ++ ++ pr_info("Found memory entry in CRAT table with proximity_domain=%d\n", ++ mem->promixity_domain); ++ list_for_each_entry(dev, &topology_device_list, list) { ++ if (mem->promixity_domain == i) { ++ props = kfd_alloc_struct(props); ++ if (props == NULL) ++ return -ENOMEM; ++ ++ if (dev->node_props.cpu_cores_count == 0) ++ props->heap_type = HSA_MEM_HEAP_TYPE_FB_PRIVATE; ++ else ++ props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; ++ ++ if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) ++ props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; ++ if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) ++ props->flags |= HSA_MEM_FLAGS_NON_VOLATILE; ++ ++ props->size_in_bytes = ++ ((uint64_t)mem->length_high << 32) + ++ mem->length_low; ++ props->width = mem->width; ++ ++ dev->mem_bank_count++; ++ list_add_tail(&props->list, &dev->mem_props); ++ ++ break; ++ } ++ i++; ++ } ++ ++ return 0; + } + +-struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd) ++/* ++ * kfd_parse_subtype_cache is called when the topology mutex ++ * is already acquired ++ */ ++static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache) + { +- struct kfd_topology_device *top_dev; +- struct kfd_dev *device = NULL; ++ struct kfd_cache_properties *props; ++ struct kfd_topology_device *dev; ++ uint32_t id; + +- down_read(&topology_lock); ++ BUG_ON(!cache); ++ ++ id = cache->processor_id_low; ++ ++ pr_info("Found cache entry in CRAT table with processor_id=%d\n", id); ++ list_for_each_entry(dev, &topology_device_list, list) ++ if (id == 
dev->node_props.cpu_core_id_base || ++ id == dev->node_props.simd_id_base) { ++ props = kfd_alloc_struct(props); ++ if (props == NULL) ++ return -ENOMEM; ++ ++ props->processor_id_low = id; ++ props->cache_level = cache->cache_level; ++ props->cache_size = cache->cache_size; ++ props->cacheline_size = cache->cache_line_size; ++ props->cachelines_per_tag = cache->lines_per_tag; ++ props->cache_assoc = cache->associativity; ++ props->cache_latency = cache->cache_latency; ++ ++ if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) ++ props->cache_type |= HSA_CACHE_TYPE_DATA; ++ if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) ++ props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; ++ if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) ++ props->cache_type |= HSA_CACHE_TYPE_CPU; ++ if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) ++ props->cache_type |= HSA_CACHE_TYPE_HSACU; ++ ++ dev->cache_count++; ++ dev->node_props.caches_count++; ++ list_add_tail(&props->list, &dev->cache_props); + +- list_for_each_entry(top_dev, &topology_device_list, list) +- if (top_dev->gpu && top_dev->gpu->kgd == kgd) { +- device = top_dev->gpu; + break; + } + +- up_read(&topology_lock); ++ return 0; ++} + +- return device; ++/* ++ * kfd_parse_subtype_iolink is called when the topology mutex ++ * is already acquired ++ */ ++static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink) ++{ ++ struct kfd_iolink_properties *props; ++ struct kfd_topology_device *dev; ++ uint32_t i = 0; ++ uint32_t id_from; ++ uint32_t id_to; ++ ++ BUG_ON(!iolink); ++ ++ id_from = iolink->proximity_domain_from; ++ id_to = iolink->proximity_domain_to; ++ ++ pr_info("Found IO link entry in CRAT table with id_from=%d\n", id_from); ++ list_for_each_entry(dev, &topology_device_list, list) { ++ if (id_from == i) { ++ props = kfd_alloc_struct(props); ++ if (props == NULL) ++ return -ENOMEM; ++ ++ props->node_from = id_from; ++ props->node_to = id_to; ++ props->ver_maj = iolink->version_major; ++ props->ver_min = iolink->version_minor; ++ ++ /* ++ * weight factor (derived from CDIR), currently always 1 ++ */ ++ props->weight = 1; ++ ++ props->min_latency = iolink->minimum_latency; ++ props->max_latency = iolink->maximum_latency; ++ props->min_bandwidth = iolink->minimum_bandwidth_mbs; ++ props->max_bandwidth = iolink->maximum_bandwidth_mbs; ++ props->rec_transfer_size = ++ iolink->recommended_transfer_size; ++ ++ dev->io_link_count++; ++ dev->node_props.io_links_count++; ++ list_add_tail(&props->list, &dev->io_link_props); ++ ++ break; ++ } ++ i++; ++ } ++ ++ return 0; ++} ++ ++static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr) ++{ ++ struct crat_subtype_computeunit *cu; ++ struct crat_subtype_memory *mem; ++ struct crat_subtype_cache *cache; ++ struct crat_subtype_iolink *iolink; ++ int ret = 0; ++ ++ BUG_ON(!sub_type_hdr); ++ ++ switch (sub_type_hdr->type) { ++ case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: ++ cu = (struct crat_subtype_computeunit *)sub_type_hdr; ++ ret = kfd_parse_subtype_cu(cu); ++ break; ++ case CRAT_SUBTYPE_MEMORY_AFFINITY: ++ mem = (struct crat_subtype_memory *)sub_type_hdr; ++ ret = kfd_parse_subtype_mem(mem); ++ break; ++ case CRAT_SUBTYPE_CACHE_AFFINITY: ++ cache = (struct crat_subtype_cache *)sub_type_hdr; ++ ret = kfd_parse_subtype_cache(cache); ++ break; ++ case CRAT_SUBTYPE_TLB_AFFINITY: ++ /* ++ * For now, nothing to do here ++ */ ++ pr_info("Found TLB entry in CRAT table (not processing)\n"); ++ break; ++ case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: ++ /* ++ * For now, nothing to do here ++ */ ++ pr_info("Found 
CCOMPUTE entry in CRAT table (not processing)\n"); ++ break; ++ case CRAT_SUBTYPE_IOLINK_AFFINITY: ++ iolink = (struct crat_subtype_iolink *)sub_type_hdr; ++ ret = kfd_parse_subtype_iolink(iolink); ++ break; ++ default: ++ pr_warn("Unknown subtype (%d) in CRAT\n", ++ sub_type_hdr->type); ++ } ++ ++ return ret; + } + +-/* Called with write topology_lock acquired */ + static void kfd_release_topology_device(struct kfd_topology_device *dev) + { + struct kfd_mem_properties *mem; + struct kfd_cache_properties *cache; + struct kfd_iolink_properties *iolink; +- struct kfd_perf_properties *perf; ++ ++ BUG_ON(!dev); + + list_del(&dev->list); + +@@ -147,40 +393,30 @@ static void kfd_release_topology_device(struct kfd_topology_device *dev) + kfree(iolink); + } + +- while (dev->perf_props.next != &dev->perf_props) { +- perf = container_of(dev->perf_props.next, +- struct kfd_perf_properties, list); +- list_del(&perf->list); +- kfree(perf); +- } +- + kfree(dev); ++ ++ sys_props.num_devices--; + } + +-void kfd_release_topology_device_list(struct list_head *device_list) ++static void kfd_release_live_view(void) + { + struct kfd_topology_device *dev; + +- while (!list_empty(device_list)) { +- dev = list_first_entry(device_list, +- struct kfd_topology_device, list); ++ while (topology_device_list.next != &topology_device_list) { ++ dev = container_of(topology_device_list.next, ++ struct kfd_topology_device, list); + kfd_release_topology_device(dev); +- } + } + +-static void kfd_release_live_view(void) +-{ +- kfd_release_topology_device_list(&topology_device_list); + memset(&sys_props, 0, sizeof(sys_props)); + } + +-struct kfd_topology_device *kfd_create_topology_device( +- struct list_head *device_list) ++static struct kfd_topology_device *kfd_create_topology_device(void) + { + struct kfd_topology_device *dev; + + dev = kfd_alloc_struct(dev); +- if (!dev) { ++ if (dev == NULL) { + pr_err("No memory to allocate a topology device"); + return NULL; + } +@@ -188,13 +424,66 @@ struct kfd_topology_device *kfd_create_topology_device( + INIT_LIST_HEAD(&dev->mem_props); + INIT_LIST_HEAD(&dev->cache_props); + INIT_LIST_HEAD(&dev->io_link_props); +- INIT_LIST_HEAD(&dev->perf_props); + +- list_add_tail(&dev->list, device_list); ++ list_add_tail(&dev->list, &topology_device_list); ++ sys_props.num_devices++; + + return dev; + } + ++static int kfd_parse_crat_table(void *crat_image) ++{ ++ struct kfd_topology_device *top_dev; ++ struct crat_subtype_generic *sub_type_hdr; ++ uint16_t node_id; ++ int ret; ++ struct crat_header *crat_table = (struct crat_header *)crat_image; ++ uint16_t num_nodes; ++ uint32_t image_len; ++ ++ if (!crat_image) ++ return -EINVAL; ++ ++ num_nodes = crat_table->num_domains; ++ image_len = crat_table->length; ++ ++ pr_info("Parsing CRAT table with %d nodes\n", num_nodes); ++ ++ for (node_id = 0; node_id < num_nodes; node_id++) { ++ top_dev = kfd_create_topology_device(); ++ if (!top_dev) { ++ kfd_release_live_view(); ++ return -ENOMEM; ++ } ++ } ++ ++ sys_props.platform_id = ++ (*((uint64_t *)crat_table->oem_id)) & CRAT_OEMID_64BIT_MASK; ++ sys_props.platform_oem = *((uint64_t *)crat_table->oem_table_id); ++ sys_props.platform_rev = crat_table->revision; ++ ++ sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); ++ while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) < ++ ((char *)crat_image) + image_len) { ++ if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) { ++ ret = kfd_parse_subtype(sub_type_hdr); ++ if (ret != 0) { ++ kfd_release_live_view(); ++ return ret; ++ } ++ 
} ++ ++ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + ++ sub_type_hdr->length); ++ } ++ ++ sys_props.generation_count++; ++ topology_crat_parsed = 1; ++ ++ return 0; ++} ++ ++ + #define sysfs_show_gen_prop(buffer, fmt, ...) \ + snprintf(buffer, PAGE_SIZE, "%s"fmt, buffer, __VA_ARGS__) + #define sysfs_show_32bit_prop(buffer, name, value) \ +@@ -203,8 +492,6 @@ struct kfd_topology_device *kfd_create_topology_device( + sysfs_show_gen_prop(buffer, "%s %llu\n", name, value) + #define sysfs_show_32bit_val(buffer, value) \ + sysfs_show_gen_prop(buffer, "%u\n", value) +-#define sysfs_show_64bit_val(buffer, value) \ +- sysfs_show_gen_prop(buffer, "%llu\n", value) + #define sysfs_show_str_val(buffer, value) \ + sysfs_show_gen_prop(buffer, "%s\n", value) + +@@ -232,17 +519,11 @@ static ssize_t sysprops_show(struct kobject *kobj, struct attribute *attr, + return ret; + } + +-static void kfd_topology_kobj_release(struct kobject *kobj) +-{ +- kfree(kobj); +-} +- + static const struct sysfs_ops sysprops_ops = { + .show = sysprops_show, + }; + + static struct kobj_type sysprops_type = { +- .release = kfd_topology_kobj_release, + .sysfs_ops = &sysprops_ops, + }; + +@@ -278,7 +559,6 @@ static const struct sysfs_ops iolink_ops = { + }; + + static struct kobj_type iolink_type = { +- .release = kfd_topology_kobj_release, + .sysfs_ops = &iolink_ops, + }; + +@@ -287,23 +567,11 @@ static ssize_t mem_show(struct kobject *kobj, struct attribute *attr, + { + ssize_t ret; + struct kfd_mem_properties *mem; +- uint64_t used_mem; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; + +- if (strcmp(attr->name, "used_memory") == 0) { +- mem = container_of(attr, struct kfd_mem_properties, +- attr_used); +- if (mem->gpu) { +- used_mem = mem->gpu->kfd2kgd->get_vram_usage(mem->gpu->kgd); +- return sysfs_show_64bit_val(buffer, used_mem); +- } +- /* TODO: Report APU/CPU-allocated memory; For now return 0 */ +- return 0; +- } +- +- mem = container_of(attr, struct kfd_mem_properties, attr_props); ++ mem = container_of(attr, struct kfd_mem_properties, attr); + sysfs_show_32bit_prop(buffer, "heap_type", mem->heap_type); + sysfs_show_64bit_prop(buffer, "size_in_bytes", mem->size_in_bytes); + sysfs_show_32bit_prop(buffer, "flags", mem->flags); +@@ -318,7 +586,6 @@ static const struct sysfs_ops mem_ops = { + }; + + static struct kobj_type mem_type = { +- .release = kfd_topology_kobj_release, + .sysfs_ops = &mem_ops, + }; + +@@ -326,7 +593,7 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, + char *buffer) + { + ssize_t ret; +- uint32_t i, j; ++ uint32_t i; + struct kfd_cache_properties *cache; + + /* Making sure that the buffer is an empty string */ +@@ -344,18 +611,12 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, + sysfs_show_32bit_prop(buffer, "latency", cache->cache_latency); + sysfs_show_32bit_prop(buffer, "type", cache->cache_type); + snprintf(buffer, PAGE_SIZE, "%ssibling_map ", buffer); +- for (i = 0; i < CRAT_SIBLINGMAP_SIZE; i++) +- for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++) { +- /* Check each bit */ +- if (cache->sibling_map[i] & (1 << j)) +- ret = snprintf(buffer, PAGE_SIZE, +- "%s%d%s", buffer, 1, ","); +- else +- ret = snprintf(buffer, PAGE_SIZE, +- "%s%d%s", buffer, 0, ","); +- } +- /* Replace the last "," with end of line */ +- *(buffer + strlen(buffer) - 1) = 0xA; ++ for (i = 0; i < KFD_TOPOLOGY_CPU_SIBLINGS; i++) ++ ret = snprintf(buffer, PAGE_SIZE, "%s%d%s", ++ buffer, cache->sibling_map[i], ++ (i == 
KFD_TOPOLOGY_CPU_SIBLINGS-1) ? ++ "\n" : ","); ++ + return ret; + } + +@@ -364,43 +625,9 @@ static const struct sysfs_ops cache_ops = { + }; + + static struct kobj_type cache_type = { +- .release = kfd_topology_kobj_release, + .sysfs_ops = &cache_ops, + }; + +-/****** Sysfs of Performance Counters ******/ +- +-struct kfd_perf_attr { +- struct kobj_attribute attr; +- uint32_t data; +-}; +- +-static ssize_t perf_show(struct kobject *kobj, struct kobj_attribute *attrs, +- char *buf) +-{ +- struct kfd_perf_attr *attr; +- +- buf[0] = 0; +- attr = container_of(attrs, struct kfd_perf_attr, attr); +- if (!attr->data) /* invalid data for PMC */ +- return 0; +- else +- return sysfs_show_32bit_val(buf, attr->data); +-} +- +-#define KFD_PERF_DESC(_name, _data) \ +-{ \ +- .attr = __ATTR(_name, 0444, perf_show, NULL), \ +- .data = _data, \ +-} +- +-static struct kfd_perf_attr perf_attr_iommu[] = { +- KFD_PERF_DESC(max_concurrent, 0), +- KFD_PERF_DESC(num_counters, 0), +- KFD_PERF_DESC(counter_ids, 0), +-}; +-/****************************************/ +- + static ssize_t node_show(struct kobject *kobj, struct attribute *attr, + char *buffer) + { +@@ -408,7 +635,6 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, + char public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; + uint32_t i; + uint32_t log_max_watch_addr; +- struct kfd_local_mem_info local_mem_info; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; +@@ -438,8 +664,18 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, + dev->node_props.cpu_cores_count); + sysfs_show_32bit_prop(buffer, "simd_count", + dev->node_props.simd_count); +- sysfs_show_32bit_prop(buffer, "mem_banks_count", +- dev->node_props.mem_banks_count); ++ ++ if (dev->mem_bank_count < dev->node_props.mem_banks_count) { ++ pr_info_once("kfd: mem_banks_count truncated from %d to %d\n", ++ dev->node_props.mem_banks_count, ++ dev->mem_bank_count); ++ sysfs_show_32bit_prop(buffer, "mem_banks_count", ++ dev->mem_bank_count); ++ } else { ++ sysfs_show_32bit_prop(buffer, "mem_banks_count", ++ dev->node_props.mem_banks_count); ++ } ++ + sysfs_show_32bit_prop(buffer, "caches_count", + dev->node_props.caches_count); + sysfs_show_32bit_prop(buffer, "io_links_count", +@@ -487,28 +723,17 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, + HSA_CAP_WATCH_POINTS_TOTALBITS_MASK); + } + +- if (dev->gpu->device_info->asic_family == CHIP_TONGA) +- dev->node_props.capability |= +- HSA_CAP_AQL_QUEUE_DOUBLE_MAP; +- + sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute", +- dev->node_props.max_engine_clk_fcompute); ++ dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz( ++ dev->gpu->kgd)); + +- /* +- * If the ASIC is CZ, set local memory size to 0 to disable +- * local memory support +- */ +- if (dev->gpu->device_info->asic_family != CHIP_CARRIZO) { +- dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd, +- &local_mem_info); +- sysfs_show_64bit_prop(buffer, "local_mem_size", +- local_mem_info.local_mem_size_private + +- local_mem_info.local_mem_size_public); +- } else +- sysfs_show_64bit_prop(buffer, "local_mem_size", 0ULL); ++ sysfs_show_64bit_prop(buffer, "local_mem_size", ++ (unsigned long long int) 0); + + sysfs_show_32bit_prop(buffer, "fw_version", +- dev->gpu->mec_fw_version); ++ dev->gpu->kfd2kgd->get_fw_version( ++ dev->gpu->kgd, ++ KGD_ENGINE_MEC1)); + sysfs_show_32bit_prop(buffer, "capability", + dev->node_props.capability); + } +@@ -522,7 +747,6 @@ static const struct sysfs_ops node_ops = { + }; + + static struct 
kobj_type node_type = { +- .release = kfd_topology_kobj_release, + .sysfs_ops = &node_ops, + }; + +@@ -538,7 +762,8 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) + struct kfd_iolink_properties *iolink; + struct kfd_cache_properties *cache; + struct kfd_mem_properties *mem; +- struct kfd_perf_properties *perf; ++ ++ BUG_ON(!dev); + + if (dev->kobj_iolink) { + list_for_each_entry(iolink, &dev->io_link_props, list) +@@ -567,12 +792,7 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) + if (dev->kobj_mem) { + list_for_each_entry(mem, &dev->mem_props, list) + if (mem->kobj) { +- /* TODO: Remove when CPU/APU supported */ +- if (dev->node_props.cpu_cores_count == 0) +- sysfs_remove_file(mem->kobj, +- &mem->attr_used); +- kfd_remove_sysfs_file(mem->kobj, +- &mem->attr_props); ++ kfd_remove_sysfs_file(mem->kobj, &mem->attr); + mem->kobj = NULL; + } + kobject_del(dev->kobj_mem); +@@ -580,16 +800,6 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) + dev->kobj_mem = NULL; + } + +- if (dev->kobj_perf) { +- list_for_each_entry(perf, &dev->perf_props, list) { +- kfree(perf->attr_group); +- perf->attr_group = NULL; +- } +- kobject_del(dev->kobj_perf); +- kobject_put(dev->kobj_perf); +- dev->kobj_perf = NULL; +- } +- + if (dev->kobj_node) { + sysfs_remove_file(dev->kobj_node, &dev->attr_gpuid); + sysfs_remove_file(dev->kobj_node, &dev->attr_name); +@@ -606,18 +816,15 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, + struct kfd_iolink_properties *iolink; + struct kfd_cache_properties *cache; + struct kfd_mem_properties *mem; +- struct kfd_perf_properties *perf; +- uint32_t num_attrs; +- struct attribute **attrs; + int ret; + uint32_t i; + +- if (WARN_ON(dev->kobj_node)) +- return -EEXIST; ++ BUG_ON(!dev); + + /* + * Creating the sysfs folders + */ ++ BUG_ON(dev->kobj_node); + dev->kobj_node = kfd_alloc_struct(dev->kobj_node); + if (!dev->kobj_node) + return -ENOMEM; +@@ -639,10 +846,6 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, + if (!dev->kobj_iolink) + return -ENOMEM; + +- dev->kobj_perf = kobject_create_and_add("perf", dev->kobj_node); +- if (!dev->kobj_perf) +- return -ENOMEM; +- + /* + * Creating sysfs files for node properties + */ +@@ -675,23 +878,12 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, + if (ret < 0) + return ret; + +- mem->attr_props.name = "properties"; +- mem->attr_props.mode = KFD_SYSFS_FILE_MODE; +- sysfs_attr_init(&mem->attr_props); +- ret = sysfs_create_file(mem->kobj, &mem->attr_props); ++ mem->attr.name = "properties"; ++ mem->attr.mode = KFD_SYSFS_FILE_MODE; ++ sysfs_attr_init(&mem->attr); ++ ret = sysfs_create_file(mem->kobj, &mem->attr); + if (ret < 0) + return ret; +- +- /* TODO: Support APU/CPU memory usage */ +- if (dev->node_props.cpu_cores_count == 0) { +- mem->attr_used.name = "used_memory"; +- mem->attr_used.mode = KFD_SYSFS_FILE_MODE; +- sysfs_attr_init(&mem->attr_used); +- ret = sysfs_create_file(mem->kobj, &mem->attr_used); +- if (ret < 0) +- return ret; +- } +- + i++; + } + +@@ -731,38 +923,11 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, + if (ret < 0) + return ret; + i++; +- } +- +- /* All hardware blocks have the same number of attributes. 
*/ +- num_attrs = sizeof(perf_attr_iommu)/sizeof(struct kfd_perf_attr); +- list_for_each_entry(perf, &dev->perf_props, list) { +- perf->attr_group = kzalloc(sizeof(struct kfd_perf_attr) +- * num_attrs + sizeof(struct attribute_group), +- GFP_KERNEL); +- if (!perf->attr_group) +- return -ENOMEM; +- +- attrs = (struct attribute **)(perf->attr_group + 1); +- if (!strcmp(perf->block_name, "iommu")) { +- /* Information of IOMMU's num_counters and counter_ids is shown +- * under /sys/bus/event_source/devices/amd_iommu. We don't +- * duplicate here. +- */ +- perf_attr_iommu[0].data = perf->max_concurrent; +- for (i = 0; i < num_attrs; i++) +- attrs[i] = &perf_attr_iommu[i].attr.attr; +- } +- perf->attr_group->name = perf->block_name; +- perf->attr_group->attrs = attrs; +- ret = sysfs_create_group(dev->kobj_perf, perf->attr_group); +- if (ret < 0) +- return ret; +- } ++} + + return 0; + } + +-/* Called with write topology lock acquired */ + static int kfd_build_sysfs_node_tree(void) + { + struct kfd_topology_device *dev; +@@ -779,7 +944,6 @@ static int kfd_build_sysfs_node_tree(void) + return 0; + } + +-/* Called with write topology lock acquired */ + static void kfd_remove_sysfs_node_tree(void) + { + struct kfd_topology_device *dev; +@@ -793,7 +957,7 @@ static int kfd_topology_update_sysfs(void) + int ret; + + pr_info("Creating topology SYSFS entries\n"); +- if (!sys_props.kobj_topology) { ++ if (sys_props.kobj_topology == NULL) { + sys_props.kobj_topology = + kfd_alloc_struct(sys_props.kobj_topology); + if (!sys_props.kobj_topology) +@@ -851,251 +1015,75 @@ static void kfd_topology_release_sysfs(void) + } + } + +-/* Called with write topology_lock acquired */ +-static void kfd_topology_update_device_list(struct list_head *temp_list, +- struct list_head *master_list) +-{ +- while (!list_empty(temp_list)) { +- list_move_tail(temp_list->next, master_list); +- sys_props.num_devices++; +- } +-} +- +-static void kfd_debug_print_topology(void) +-{ +- struct kfd_topology_device *dev; +- +- down_read(&topology_lock); +- +- dev = list_last_entry(&topology_device_list, +- struct kfd_topology_device, list); +- if (dev) { +- if (dev->node_props.cpu_cores_count && +- dev->node_props.simd_count) { +- pr_info("Topology: Add APU node [0x%0x:0x%0x]\n", +- dev->node_props.device_id, +- dev->node_props.vendor_id); +- } else if (dev->node_props.cpu_cores_count) +- pr_info("Topology: Add CPU node\n"); +- else if (dev->node_props.simd_count) +- pr_info("Topology: Add dGPU node [0x%0x:0x%0x]\n", +- dev->node_props.device_id, +- dev->node_props.vendor_id); +- } +- up_read(&topology_lock); +-} +- +-/* Helper function for intializing platform_xx members of kfd_system_properties +- */ +-static void kfd_update_system_properties(void) +-{ +- struct kfd_topology_device *dev; +- +- down_read(&topology_lock); +- dev = list_last_entry(&topology_device_list, +- struct kfd_topology_device, list); +- if (dev) { +- sys_props.platform_id = +- (*((uint64_t *)dev->oem_id)) & CRAT_OEMID_64BIT_MASK; +- sys_props.platform_oem = *((uint64_t *)dev->oem_table_id); +- sys_props.platform_rev = dev->oem_revision; +- } +- up_read(&topology_lock); +-} +- +-static void find_system_memory(const struct dmi_header *dm, +- void *private) +-{ +- struct kfd_mem_properties *mem; +- u16 mem_width, mem_clock; +- struct kfd_topology_device *kdev = +- (struct kfd_topology_device *)private; +- const u8 *dmi_data = (const u8 *)(dm + 1); +- +- if (dm->type == DMI_ENTRY_MEM_DEVICE && dm->length >= 0x15) { +- mem_width = (u16)(*(const u16 *)(dmi_data + 0x6)); +- 
mem_clock = (u16)(*(const u16 *)(dmi_data + 0x11)); +- list_for_each_entry(mem, &kdev->mem_props, list) { +- if (mem_width != 0xFFFF && mem_width != 0) +- mem->width = mem_width; +- if (mem_clock != 0) +- mem->mem_clk_max = mem_clock; +- } +- } +-} +- +-/* +- * Performance counters information is not part of CRAT but we would like to +- * put them in the sysfs under topology directory for Thunk to get the data. +- * This function is called before updating the sysfs. +- */ +-static int kfd_add_perf_to_topology(struct kfd_topology_device *kdev) +-{ +-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) +- struct kfd_perf_properties *props; +- +- if (amd_iommu_pc_supported()) { +- props = kfd_alloc_struct(props); +- if (!props) +- return -ENOMEM; +- strcpy(props->block_name, "iommu"); +- props->max_concurrent = amd_iommu_pc_get_max_banks(0) * +- amd_iommu_pc_get_max_counters(0); /* assume one iommu */ +- list_add_tail(&props->list, &kdev->perf_props); +- } +-#endif +- +- return 0; +-} +- +-/* kfd_add_non_crat_information - Add information that is not currently +- * defined in CRAT but is necessary for KFD topology +- * @dev - topology device to which addition info is added +- */ +-static void kfd_add_non_crat_information(struct kfd_topology_device *kdev) +-{ +- /* Check if CPU only node. */ +- if (!kdev->gpu) { +- /* Add system memory information */ +- dmi_walk(find_system_memory, kdev); +- } +- /* TODO: For GPU node, rearrange code from kfd_topology_add_device */ +-} +- +-#ifdef CONFIG_ACPI +-/* kfd_is_acpi_crat_invalid - CRAT from ACPI is valid only for AMD APU devices. +- * Ignore CRAT for all other devices. AMD APU is identified if both CPU +- * and GPU cores are present. +- * @device_list - topology device list created by parsing ACPI CRAT table. +- * @return - TRUE if invalid, FALSE is valid. +- */ +-static bool kfd_is_acpi_crat_invalid(struct list_head *device_list) +-{ +- struct kfd_topology_device *dev; +- +- list_for_each_entry(dev, device_list, list) { +- if (dev->node_props.cpu_cores_count && +- dev->node_props.simd_count) +- return false; +- } +- pr_info("Ignoring ACPI CRAT on non-APU system\n"); +- return true; +-} +-#endif +- + int kfd_topology_init(void) + { + void *crat_image = NULL; + size_t image_size = 0; + int ret; +- struct list_head temp_topology_device_list; +- int cpu_only_node = 0; +- struct kfd_topology_device *kdev; +- int proximity_domain; +- +- /* topology_device_list - Master list of all topology devices +- * temp_topology_device_list - temporary list created while parsing CRAT +- * or VCRAT. Once parsing is complete the contents of list is moved to +- * topology_device_list +- */ + +- /* Initialize the head for the both the lists */ ++ /* ++ * Initialize the head for the topology device list ++ */ + INIT_LIST_HEAD(&topology_device_list); +- INIT_LIST_HEAD(&temp_topology_device_list); + init_rwsem(&topology_lock); ++ topology_crat_parsed = 0; + + memset(&sys_props, 0, sizeof(sys_props)); + +- /* Proximity domains in ACPI CRAT tables start counting at +- * 0. The same should be true for virtual CRAT tables created +- * at this stage. GPUs added later in kfd_topology_add_device +- * use a counter. +- */ +- proximity_domain = 0; +- + /* +- * Get the CRAT image from the ACPI. If ACPI doesn't have one +- * or if ACPI CRAT is invalid create a virtual CRAT. +- * NOTE: The current implementation expects all AMD APUs to have +- * CRAT. 
If no CRAT is available, it is assumed to be a CPU ++ * Get the CRAT image from the ACPI + */ +-#ifdef CONFIG_ACPI +- ret = kfd_create_crat_image_acpi(&crat_image, &image_size); +- if (ret == 0) { +- ret = kfd_parse_crat_table(crat_image, +- &temp_topology_device_list, +- proximity_domain); +- if (ret || +- kfd_is_acpi_crat_invalid(&temp_topology_device_list)) { +- +- kfd_release_topology_device_list( +- &temp_topology_device_list); +- kfd_destroy_crat_image(crat_image); +- crat_image = NULL; +- } +- } +-#endif +- if (!crat_image) { +- ret = kfd_create_crat_image_virtual(&crat_image, &image_size, +- COMPUTE_UNIT_CPU, NULL, +- proximity_domain); +- cpu_only_node = 1; +- if (ret) { +- pr_err("Error creating VCRAT table for CPU\n"); +- return ret; +- } +- +- ret = kfd_parse_crat_table(crat_image, +- &temp_topology_device_list, +- proximity_domain); +- if (ret) { +- pr_err("Error parsing VCRAT table for CPU\n"); ++ ret = kfd_topology_get_crat_acpi(crat_image, &image_size); ++ if (ret == 0 && image_size > 0) { ++ pr_info("Found CRAT image with size=%zd\n", image_size); ++ crat_image = kmalloc(image_size, GFP_KERNEL); ++ if (!crat_image) { ++ ret = -ENOMEM; ++ pr_err("No memory for allocating CRAT image\n"); + goto err; + } +- } +- +- kdev = list_first_entry(&temp_topology_device_list, +- struct kfd_topology_device, list); +- kfd_add_perf_to_topology(kdev); +- +- down_write(&topology_lock); +- kfd_topology_update_device_list(&temp_topology_device_list, +- &topology_device_list); +- atomic_set(&topology_crat_proximity_domain, sys_props.num_devices-1); +- ret = kfd_topology_update_sysfs(); +- up_write(&topology_lock); +- +- if (ret == 0) { +- sys_props.generation_count++; +- kfd_update_system_properties(); +- kfd_debug_print_topology(); +- pr_info("Finished initializing topology\n"); +- } else +- pr_err("Failed to update topology in sysfs ret=%d\n", ret); +- +- /* For nodes with GPU, this information gets added +- * when GPU is detected (kfd_topology_add_device). +- */ +- if (cpu_only_node) { +- /* Add additional information to CPU only node created above */ +- down_write(&topology_lock); +- kdev = list_first_entry(&topology_device_list, +- struct kfd_topology_device, list); +- up_write(&topology_lock); +- kfd_add_non_crat_information(kdev); ++ ret = kfd_topology_get_crat_acpi(crat_image, &image_size); ++ ++ if (ret == 0) { ++ down_write(&topology_lock); ++ ret = kfd_parse_crat_table(crat_image); ++ if (ret == 0) ++ ret = kfd_topology_update_sysfs(); ++ up_write(&topology_lock); ++ } else { ++ pr_err("Couldn't get CRAT table size from ACPI\n"); ++ } ++ kfree(crat_image); ++ } else if (ret == -ENODATA) { ++ ret = 0; ++ } else { ++ pr_err("Couldn't get CRAT table size from ACPI\n"); + } + + err: +- kfd_destroy_crat_image(crat_image); ++ pr_info("Finished initializing topology ret=%d\n", ret); + return ret; + } + + void kfd_topology_shutdown(void) + { +- down_write(&topology_lock); + kfd_topology_release_sysfs(); + kfd_release_live_view(); +- up_write(&topology_lock); ++} ++ ++static void kfd_debug_print_topology(void) ++{ ++ struct kfd_topology_device *dev; ++ uint32_t i = 0; ++ ++ pr_info("DEBUG PRINT OF TOPOLOGY:"); ++ list_for_each_entry(dev, &topology_device_list, list) { ++ pr_info("Node: %d\n", i); ++ pr_info("\tGPU assigned: %s\n", (dev->gpu ? 
"yes" : "no")); ++ pr_info("\tCPU count: %d\n", dev->node_props.cpu_cores_count); ++ pr_info("\tSIMD count: %d", dev->node_props.simd_count); ++ i++; ++ } + } + + static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) +@@ -1104,15 +1092,11 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) + uint32_t buf[7]; + uint64_t local_mem_size; + int i; +- struct kfd_local_mem_info local_mem_info; + + if (!gpu) + return 0; + +- gpu->kfd2kgd->get_local_mem_info(gpu->kgd, &local_mem_info); +- +- local_mem_size = local_mem_info.local_mem_size_private + +- local_mem_info.local_mem_size_public; ++ local_mem_size = gpu->kfd2kgd->get_vmem_size(gpu->kgd); + + buf[0] = gpu->pdev->devfn; + buf[1] = gpu->pdev->subsystem_vendor; +@@ -1127,32 +1111,20 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) + + return hashout; + } +-/* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If +- * the GPU device is not already present in the topology device +- * list then return NULL. This means a new topology device has to +- * be created for this GPU. +- * TODO: Rather than assiging @gpu to first topology device withtout +- * gpu attached, it will better to have more stringent check. +- */ ++ + static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu) + { + struct kfd_topology_device *dev; + struct kfd_topology_device *out_dev = NULL; +- struct kfd_mem_properties *mem; + +- down_write(&topology_lock); ++ BUG_ON(!gpu); ++ + list_for_each_entry(dev, &topology_device_list, list) +- if (!dev->gpu && (dev->node_props.simd_count > 0)) { ++ if (dev->gpu == NULL && dev->node_props.simd_count > 0) { + dev->gpu = gpu; + out_dev = dev; +- +- /* Assign mem->gpu */ +- list_for_each_entry(mem, &dev->mem_props, list) +- mem->gpu = dev->gpu; +- + break; + } +- up_write(&topology_lock); + + return out_dev; + } +@@ -1165,202 +1137,88 @@ static void kfd_notify_gpu_change(uint32_t gpu_id, int arrival) + */ + } + +-/* kfd_fill_mem_clk_max_info - Since CRAT doesn't have memory clock info, +- * patch this after CRAT parsing. +- */ +-static void kfd_fill_mem_clk_max_info(struct kfd_topology_device *dev) +-{ +- struct kfd_mem_properties *mem; +- struct kfd_local_mem_info local_mem_info; +- +- if (!dev) +- return; +- +- /* Currently, amdgpu driver (amdgpu_mc) deals only with GPUs with +- * single bank of VRAM local memory. 
+- * for dGPUs - VCRAT reports only one bank of Local Memory +- * for APUs - If CRAT from ACPI reports more than one bank, then +- * all the banks will report the same mem_clk_max information +- */ +- dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd, +- &local_mem_info); +- +- list_for_each_entry(mem, &dev->mem_props, list) +- mem->mem_clk_max = local_mem_info.mem_clk_max; +-} +- +-static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev) +-{ +- struct kfd_iolink_properties *link; +- +- if (!dev || !dev->gpu) +- return; +- +- /* GPU only creates direck links so apply flags setting to all */ +- if (dev->gpu->device_info->asic_family == CHIP_HAWAII) +- list_for_each_entry(link, &dev->io_link_props, list) +- link->flags = CRAT_IOLINK_FLAGS_ENABLED | +- CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT | +- CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT; +-} +- + int kfd_topology_add_device(struct kfd_dev *gpu) + { + uint32_t gpu_id; + struct kfd_topology_device *dev; +- struct kfd_cu_info cu_info; +- int res = 0; +- struct list_head temp_topology_device_list; +- void *crat_image = NULL; +- size_t image_size = 0; +- int proximity_domain; ++ int res; + +- INIT_LIST_HEAD(&temp_topology_device_list); ++ BUG_ON(!gpu); + + gpu_id = kfd_generate_gpu_id(gpu); + +- pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id); +- +- proximity_domain = atomic_inc_return(& +- topology_crat_proximity_domain); ++ pr_debug("kfd: Adding new GPU (ID: 0x%x) to topology\n", gpu_id); + +- /* Check to see if this gpu device exists in the topology_device_list. +- * If so, assign the gpu to that device, +- * else create a Virtual CRAT for this gpu device and then parse that +- * CRAT to create a new topology device. Once created assign the gpu to +- * that topology device ++ down_write(&topology_lock); ++ /* ++ * Try to assign the GPU to existing topology device (generated from ++ * CRAT table + */ + dev = kfd_assign_gpu(gpu); + if (!dev) { +- res = kfd_create_crat_image_virtual(&crat_image, &image_size, +- COMPUTE_UNIT_GPU, +- gpu, proximity_domain); +- if (res) { +- pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n", +- gpu_id); +- return res; +- } +- res = kfd_parse_crat_table(crat_image, +- &temp_topology_device_list, proximity_domain); +- if (res) { +- pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n", +- gpu_id); ++ pr_info("GPU was not found in the current topology. Extending.\n"); ++ kfd_debug_print_topology(); ++ dev = kfd_create_topology_device(); ++ if (!dev) { ++ res = -ENOMEM; + goto err; + } ++ dev->gpu = gpu; + +- down_write(&topology_lock); +- kfd_topology_update_device_list(&temp_topology_device_list, +- &topology_device_list); ++ /* ++ * TODO: Make a call to retrieve topology information from the ++ * GPU vBIOS ++ */ + +- /* Update the SYSFS tree, since we added another topology +- * device ++ /* ++ * Update the SYSFS tree, since we added another topology device + */ +- res = kfd_topology_update_sysfs(); +- up_write(&topology_lock); +- +- if (res == 0) +- sys_props.generation_count++; +- else +- pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. 
res=%d\n", +- gpu_id, res); +- dev = kfd_assign_gpu(gpu); +- if (!dev) { +- pr_err("Could not assign GPU\n"); +- res = -ENODEV; +- goto err; +- } ++ if (kfd_topology_update_sysfs() < 0) ++ kfd_topology_release_sysfs(); ++ + } + + dev->gpu_id = gpu_id; + gpu->id = gpu_id; +- +- /* TODO: Move the following lines to function +- * kfd_add_non_crat_information +- */ +- +- /* Fill-in additional information that is not available in CRAT but +- * needed for the topology +- */ +- +- dev->gpu->kfd2kgd->get_cu_info(dev->gpu->kgd, &cu_info); +- dev->node_props.simd_arrays_per_engine = +- cu_info.num_shader_arrays_per_engine; +- + dev->node_props.vendor_id = gpu->pdev->vendor; + dev->node_props.device_id = gpu->pdev->device; +- dev->node_props.location_id = PCI_DEVID(gpu->pdev->bus->number, +- gpu->pdev->devfn); +- dev->node_props.max_engine_clk_fcompute = +- dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz(dev->gpu->kgd); +- dev->node_props.max_engine_clk_ccompute = +- cpufreq_quick_get_max(0) / 1000; +- +- kfd_fill_mem_clk_max_info(dev); +- kfd_fill_iolink_non_crat_info(dev); +- +- switch (dev->gpu->device_info->asic_family) { +- case CHIP_KAVERI: +- case CHIP_HAWAII: +- case CHIP_TONGA: +- dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_PRE_1_0 << +- HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & +- HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); +- break; +- case CHIP_CARRIZO: +- case CHIP_FIJI: +- case CHIP_POLARIS10: +- case CHIP_POLARIS11: +- pr_debug("Adding doorbell packet type capability\n"); +- dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_1_0 << +- HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & +- HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); +- break; +- case CHIP_VEGA10: +- case CHIP_RAVEN: +- dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 << +- HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & +- HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); +- break; +- default: +- BUG(); +- } +- +- /* Fix errors in CZ CRAT. +- * simd_count: Carrizo CRAT reports wrong simd_count, probably because +- * it doesn't consider masked out CUs +- * max_waves_per_simd: Carrizo reports wrong max_waves_per_simd. +- * capability flag: Carrizo CRAT doesn't report IOMMU flags. 
++ dev->node_props.location_id = (gpu->pdev->bus->number << 24) + ++ (gpu->pdev->devfn & 0xffffff); ++ /* ++ * TODO: Retrieve max engine clock values from KGD + */ ++ + if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) { +- dev->node_props.simd_count = +- cu_info.simd_per_cu * cu_info.cu_active_number; +- dev->node_props.max_waves_per_simd = 10; +- dev->node_props.capability |= HSA_CAP_ATS_PRESENT; ++ dev->node_props.capability |= HSA_CAP_DOORBELL_PACKET_TYPE; ++ pr_info("amdkfd: adding doorbell packet type capability\n"); + } + +- kfd_debug_print_topology(); ++ res = 0; + +- if (!res) +- kfd_notify_gpu_change(gpu_id, 1); + err: +- kfd_destroy_crat_image(crat_image); ++ up_write(&topology_lock); ++ ++ if (res == 0) ++ kfd_notify_gpu_change(gpu_id, 1); ++ + return res; + } + + int kfd_topology_remove_device(struct kfd_dev *gpu) + { +- struct kfd_topology_device *dev, *tmp; ++ struct kfd_topology_device *dev; + uint32_t gpu_id; + int res = -ENODEV; + ++ BUG_ON(!gpu); ++ + down_write(&topology_lock); + +- list_for_each_entry_safe(dev, tmp, &topology_device_list, list) ++ list_for_each_entry(dev, &topology_device_list, list) + if (dev->gpu == gpu) { + gpu_id = dev->gpu_id; + kfd_remove_sysfs_node_entry(dev); + kfd_release_topology_device(dev); +- sys_props.num_devices--; + res = 0; + if (kfd_topology_update_sysfs() < 0) + kfd_topology_release_sysfs(); +@@ -1375,26 +1233,22 @@ int kfd_topology_remove_device(struct kfd_dev *gpu) + return res; + } + +-/* kfd_topology_enum_kfd_devices - Enumerate through all devices in KFD +- * topology. If GPU device is found @idx, then valid kfd_dev pointer is +- * returned through @kdev +- * Return - 0: On success (@kdev will be NULL for non GPU nodes) +- * -1: If end of list ++/* ++ * When idx is out of bounds, the function will return NULL + */ +-int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev) ++struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx) + { + + struct kfd_topology_device *top_dev; ++ struct kfd_dev *device = NULL; + uint8_t device_idx = 0; + +- *kdev = NULL; + down_read(&topology_lock); + + list_for_each_entry(top_dev, &topology_device_list, list) { + if (device_idx == idx) { +- *kdev = top_dev->gpu; +- up_read(&topology_lock); +- return 0; ++ device = top_dev->gpu; ++ break; + } + + device_idx++; +@@ -1402,89 +1256,6 @@ int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev) + + up_read(&topology_lock); + +- return -1; +- +-} +- +-static int kfd_cpumask_to_apic_id(const struct cpumask *cpumask) +-{ +- int first_cpu_of_numa_node; +- +- if (!cpumask || (cpumask == cpu_none_mask)) +- return -1; +- first_cpu_of_numa_node = cpumask_first(cpumask); +- if (first_cpu_of_numa_node >= nr_cpu_ids) +- return -1; +-#ifdef CONFIG_X86_64 +- return cpu_data(first_cpu_of_numa_node).apicid; +-#else +- return first_cpu_of_numa_node; +-#endif +-} +- +-/* kfd_numa_node_to_apic_id - Returns the APIC ID of the first logical processor +- * of the given NUMA node (numa_node_id) +- * Return -1 on failure +- */ +-int kfd_numa_node_to_apic_id(int numa_node_id) +-{ +- if (numa_node_id == -1) { +- pr_warn("Invalid NUMA Node. 
Use online CPU mask\n"); +- return kfd_cpumask_to_apic_id(cpu_online_mask); +- } +- return kfd_cpumask_to_apic_id(cpumask_of_node(numa_node_id)); +-} +- +-#if defined(CONFIG_DEBUG_FS) +- +-int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data) +-{ +- struct kfd_topology_device *dev; +- unsigned int i = 0; +- int r = 0; +- +- down_read(&topology_lock); +- +- list_for_each_entry(dev, &topology_device_list, list) { +- if (!dev->gpu) { +- i++; +- continue; +- } +- +- seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id); +- r = device_queue_manager_debugfs_hqds(m, dev->gpu->dqm); +- if (r != 0) +- break; +- } +- +- up_read(&topology_lock); +- +- return r; +-} +- +-int kfd_debugfs_rls_by_device(struct seq_file *m, void *data) +-{ +- struct kfd_topology_device *dev; +- unsigned int i = 0; +- int r = 0; +- +- down_read(&topology_lock); +- +- list_for_each_entry(dev, &topology_device_list, list) { +- if (!dev->gpu) { +- i++; +- continue; +- } +- +- seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id); +- r = pm_debugfs_runlist(m, &dev->gpu->dqm->packets); +- if (r != 0) +- break; +- } +- +- up_read(&topology_lock); ++ return device; + +- return r; + } +- +-#endif +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +index f22d420..c3ddb9b 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +@@ -39,17 +39,8 @@ + #define HSA_CAP_WATCH_POINTS_SUPPORTED 0x00000080 + #define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00 + #define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8 +-#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK 0x00003000 +-#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT 12 +-#define HSA_CAP_RESERVED 0xffffc000 +- +-#define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0 +-#define HSA_CAP_DOORBELL_TYPE_1_0 0x1 +-#define HSA_CAP_DOORBELL_TYPE_2_0 0x2 +-#define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00 +-#define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8 ++#define HSA_CAP_RESERVED 0xfffff000 + #define HSA_CAP_DOORBELL_PACKET_TYPE 0x00001000 +-#define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000 + + struct kfd_node_properties { + uint32_t cpu_cores_count; +@@ -97,11 +88,11 @@ struct kfd_mem_properties { + uint32_t width; + uint32_t mem_clk_max; + struct kobject *kobj; +- struct kfd_dev *gpu; +- struct attribute attr_props; +- struct attribute attr_used; ++ struct attribute attr; + }; + ++#define KFD_TOPOLOGY_CPU_SIBLINGS 256 ++ + #define HSA_CACHE_TYPE_DATA 0x00000001 + #define HSA_CACHE_TYPE_INSTRUCTION 0x00000002 + #define HSA_CACHE_TYPE_CPU 0x00000004 +@@ -118,7 +109,7 @@ struct kfd_cache_properties { + uint32_t cache_assoc; + uint32_t cache_latency; + uint32_t cache_type; +- uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE]; ++ uint8_t sibling_map[KFD_TOPOLOGY_CPU_SIBLINGS]; + struct kobject *kobj; + struct attribute attr; + }; +@@ -141,36 +132,24 @@ struct kfd_iolink_properties { + struct attribute attr; + }; + +-struct kfd_perf_properties { +- struct list_head list; +- char block_name[16]; +- uint32_t max_concurrent; +- struct attribute_group *attr_group; +-}; +- + struct kfd_topology_device { + struct list_head list; + uint32_t gpu_id; +- uint32_t proximity_domain; + struct kfd_node_properties node_props; ++ uint32_t mem_bank_count; + struct list_head mem_props; + uint32_t cache_count; + struct list_head cache_props; + uint32_t io_link_count; + struct list_head io_link_props; +- struct list_head perf_props; + struct kfd_dev *gpu; + struct kobject *kobj_node; + struct kobject *kobj_mem; + struct kobject 
*kobj_cache; + struct kobject *kobj_iolink; +- struct kobject *kobj_perf; + struct attribute attr_gpuid; + struct attribute attr_name; + struct attribute attr_props; +- uint8_t oem_id[CRAT_OEMID_LENGTH]; +- uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH]; +- uint32_t oem_revision; + }; + + struct kfd_system_properties { +@@ -185,14 +164,6 @@ struct kfd_system_properties { + struct attribute attr_props; + }; + +-struct kfd_topology_device *kfd_create_topology_device( +- struct list_head *device_list); +-void kfd_release_topology_device_list(struct list_head *device_list); + +-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) +-extern bool amd_iommu_pc_supported(void); +-extern u8 amd_iommu_pc_get_max_banks(u16 devid); +-extern u8 amd_iommu_pc_get_max_counters(u16 devid); +-#endif + + #endif /* __KFD_TOPOLOGY_H__ */ +diff --git a/drivers/gpu/drm/amd/amdkfd/soc15_int.h b/drivers/gpu/drm/amd/amdkfd/soc15_int.h +deleted file mode 100644 +index e00d03d..0000000 +--- a/drivers/gpu/drm/amd/amdkfd/soc15_int.h ++++ /dev/null +@@ -1,84 +0,0 @@ +-/* +- * Copyright 2016 Advanced Micro Devices, Inc. +- * +- * Permission is hereby granted, free of charge, to any person obtaining a +- * copy of this software and associated documentation files (the "Software"), +- * to deal in the Software without restriction, including without limitation +- * the rights to use, copy, modify, merge, publish, distribute, sublicense, +- * and/or sell copies of the Software, and to permit persons to whom the +- * Software is furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in +- * all copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +- * OTHER DEALINGS IN THE SOFTWARE. 
+- */ +- +-#ifndef HSA_SOC15_INT_H_INCLUDED +-#define HSA_SOC15_INT_H_INCLUDED +-/* +- * vega10+ IH clients +- */ +-enum soc15_ih_client_id { +- SOC15_IH_CLIENTID_IH = 0x00, +- SOC15_IH_CLIENTID_ACP = 0x01, +- SOC15_IH_CLIENTID_ATHUB = 0x02, +- SOC15_IH_CLIENTID_BIF = 0x03, +- SOC15_IH_CLIENTID_DCE = 0x04, +- SOC15_IH_CLIENTID_ISP = 0x05, +- SOC15_IH_CLIENTID_PCIE0 = 0x06, +- SOC15_IH_CLIENTID_RLC = 0x07, +- SOC15_IH_CLIENTID_SDMA0 = 0x08, +- SOC15_IH_CLIENTID_SDMA1 = 0x09, +- SOC15_IH_CLIENTID_SE0SH = 0x0a, +- SOC15_IH_CLIENTID_SE1SH = 0x0b, +- SOC15_IH_CLIENTID_SE2SH = 0x0c, +- SOC15_IH_CLIENTID_SE3SH = 0x0d, +- SOC15_IH_CLIENTID_SYSHUB = 0x0e, +- SOC15_IH_CLIENTID_THM = 0x0f, +- SOC15_IH_CLIENTID_UVD = 0x10, +- SOC15_IH_CLIENTID_VCE0 = 0x11, +- SOC15_IH_CLIENTID_VMC = 0x12, +- SOC15_IH_CLIENTID_XDMA = 0x13, +- SOC15_IH_CLIENTID_GRBM_CP = 0x14, +- SOC15_IH_CLIENTID_ATS = 0x15, +- SOC15_IH_CLIENTID_ROM_SMUIO = 0x16, +- SOC15_IH_CLIENTID_DF = 0x17, +- SOC15_IH_CLIENTID_VCE1 = 0x18, +- SOC15_IH_CLIENTID_PWR = 0x19, +- SOC15_IH_CLIENTID_UTCL2 = 0x1b, +- SOC15_IH_CLIENTID_EA = 0x1c, +- SOC15_IH_CLIENTID_UTCL2LOG = 0x1d, +- SOC15_IH_CLIENTID_MP0 = 0x1e, +- SOC15_IH_CLIENTID_MP1 = 0x1f, +- +- SOC15_IH_CLIENTID_MAX +-}; +- +- +-#define SOC15_INTSRC_CP_END_OF_PIPE 181 +-#define SOC15_INTSRC_CP_BAD_OPCODE 183 +-#define SOC15_INTSRC_SQ_INTERRUPT_MSG 239 +-#define SOC15_INTSRC_VMC_FAULT 0 +-#define SOC15_INTSRC_SDMA_TRAP 224 +- +- +-#define SOC15_CLIENT_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) & 0xff) +-#define SOC15_SOURCE_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 8 & 0xff) +-#define SOC15_RING_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 16 & 0xff) +-#define SOC15_VMID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 24 & 0xf) +-#define SOC15_VMID_TYPE_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 31 & 0x1) +-#define SOC15_PASID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[3]) & 0xffff) +-#define SOC15_CONTEXT_ID0_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[4])) +-#define SOC15_CONTEXT_ID1_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[5])) +-#define SOC15_CONTEXT_ID2_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[6])) +-#define SOC15_CONTEXT_ID3_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[7])) +- +-#endif +- +diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h +old mode 100755 +new mode 100644 +index b6cf2d5..36f3766 +--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h ++++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h +@@ -30,7 +30,6 @@ + + #include <linux/types.h> + #include <linux/bitmap.h> +-#include <linux/dma-buf.h> + + struct pci_dev; + +@@ -41,46 +40,6 @@ struct kfd_dev; + struct kgd_dev; + + struct kgd_mem; +-struct kfd_process_device; +-struct amdgpu_bo; +- +-enum kfd_preempt_type { +- KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN = 0, +- KFD_PREEMPT_TYPE_WAVEFRONT_RESET, +-}; +- +-struct kfd_vm_fault_info { +- uint64_t page_addr; +- uint32_t vmid; +- uint32_t mc_id; +- uint32_t status; +- bool prot_valid; +- bool prot_read; +- bool prot_write; +- bool prot_exec; +-}; +- +-struct kfd_cu_info { +- uint32_t num_shader_engines; +- uint32_t num_shader_arrays_per_engine; +- uint32_t num_cu_per_sh; +- uint32_t cu_active_number; +- uint32_t cu_ao_mask; +- uint32_t simd_per_cu; +- uint32_t max_waves_per_simd; +- uint32_t wave_front_size; +- uint32_t max_scratch_slots_per_cu; +- uint32_t lds_size; +- uint32_t cu_bitmap[4][4]; +-}; +- +-/* For getting GPU local memory information from KGD */ +-struct kfd_local_mem_info { +- uint64_t 
local_mem_size_private; +- uint64_t local_mem_size_public; +- uint32_t vram_width; +- uint32_t mem_clk_max; +-}; + + enum kgd_memory_pool { + KGD_POOL_SYSTEM_CACHEABLE = 1, +@@ -113,21 +72,6 @@ struct kgd2kfd_shared_resources { + /* Bit n == 1 means Queue n is available for KFD */ + DECLARE_BITMAP(queue_bitmap, KGD_MAX_QUEUES); + +- /* Doorbell assignments (SOC15 and later chips only). Only +- * specific doorbells are routed to each SDMA engine. Others +- * are routed to IH and VCN. They are not usable by the CP. +- * +- * Any doorbell number D that satisfies the following condition +- * is reserved: (D & reserved_doorbell_mask) == reserved_doorbell_val +- * +- * KFD currently uses 1024 (= 0x3ff) doorbells per process. If +- * doorbells 0x0f0-0x0f7 and 0x2f-0x2f7 are reserved, that means +- * mask would be set to 0x1f8 and val set to 0x0f0. +- */ +- unsigned int sdma_doorbell[2][2]; +- unsigned int reserved_doorbell_mask; +- unsigned int reserved_doorbell_val; +- + /* Base address of doorbell aperture. */ + phys_addr_t doorbell_physical_address; + +@@ -136,41 +80,8 @@ struct kgd2kfd_shared_resources { + + /* Number of bytes at start of aperture reserved for KGD. */ + size_t doorbell_start_offset; +- +- /* GPUVM address space size in bytes */ +- uint64_t gpuvm_size; + }; + +-struct tile_config { +- uint32_t *tile_config_ptr; +- uint32_t *macro_tile_config_ptr; +- uint32_t num_tile_configs; +- uint32_t num_macro_tile_configs; +- +- uint32_t gb_addr_config; +- uint32_t num_banks; +- uint32_t num_ranks; +-}; +- +-/* +- * Allocation flag domains currently only VRAM and GTT domain supported +- */ +-#define ALLOC_MEM_FLAGS_VRAM (1 << 0) +-#define ALLOC_MEM_FLAGS_GTT (1 << 1) +-#define ALLOC_MEM_FLAGS_USERPTR (1 << 2) +-#define ALLOC_MEM_FLAGS_DOORBELL (1 << 3) +- +-/* +- * Allocation flags attributes/access options. +- */ +-#define ALLOC_MEM_FLAGS_NONPAGED (1 << 31) +-#define ALLOC_MEM_FLAGS_READONLY (1 << 30) +-#define ALLOC_MEM_FLAGS_PUBLIC (1 << 29) +-#define ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28) +-#define ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27) +-#define ALLOC_MEM_FLAGS_EXECUTE_ACCESS (1 << 26) +-#define ALLOC_MEM_FLAGS_COHERENT (1 << 25) +- + /** + * struct kfd2kgd_calls + * +@@ -179,7 +90,7 @@ struct tile_config { + * + * @free_gtt_mem: Frees a buffer that was allocated on the gart aperture + * +- * @get_local_mem_info: Retrieves information about GPU local memory ++ * @get_vmem_size: Retrieves (physical) size of VRAM + * + * @get_gpu_clock_counter: Retrieves GPU clock counter + * +@@ -201,12 +112,6 @@ struct tile_config { + * @hqd_sdma_load: Loads the SDMA mqd structure to a H/W SDMA hqd slot. + * used only for no HWS mode. + * +- * @hqd_dump: Dumps CPC HQD registers to an array of address-value pairs. +- * Array is allocated with kmalloc, needs to be freed with kfree by caller. +- * +- * @hqd_sdma_dump: Dumps SDMA HQD registers to an array of address-value pairs. +- * Array is allocated with kmalloc, needs to be freed with kfree by caller. +- * + * @hqd_is_occupies: Checks if a hqd slot is occupied. + * + * @hqd_destroy: Destructs and preempts the queue assigned to that hqd slot. +@@ -216,34 +121,8 @@ struct tile_config { + * @hqd_sdma_destroy: Destructs and preempts the SDMA queue assigned to that + * SDMA hqd slot. 
+ * +- * @map_memory_to_gpu: Allocates and pins BO, PD and all related PTs +- * +- * @unmap_memory_to_gpu: Releases and unpins BO, PD and all related PTs +- * + * @get_fw_version: Returns FW versions from the header + * +- * @set_num_of_requests: Sets number of Peripheral Page Request (PPR) sent to +- * IOMMU when address translation failed +- * +- * @get_cu_info: Retrieves activated cu info +- * +- * @get_dmabuf_info: Returns information about a dmabuf if it was +- * created by the GPU driver +- * +- * @import_dmabuf: Imports a DMA buffer, creating a new kgd_mem object +- * Supports only DMA buffers created by GPU driver on the same GPU +- * +- * @export_dmabuf: Emports a KFD BO for sharing with other process +- * +- * @submit_ib: Submits an IB to the engine specified by inserting the IB to +- * the corresonded ring (ring type). +- * +- * @restore_process_bos: Restore all BOs that belongs to the process +- * +- * @copy_mem_to_mem: Copies size bytes from source BO to destination BO +- * +- * @get_vram_usage: Returns current VRAM usage +- * + * This structure contains function pointers to services that the kgd driver + * provides to amdkfd driver. + * +@@ -255,23 +134,11 @@ struct kfd2kgd_calls { + + void (*free_gtt_mem)(struct kgd_dev *kgd, void *mem_obj); + +- void(*get_local_mem_info)(struct kgd_dev *kgd, +- struct kfd_local_mem_info *mem_info); ++ uint64_t (*get_vmem_size)(struct kgd_dev *kgd); + uint64_t (*get_gpu_clock_counter)(struct kgd_dev *kgd); + + uint32_t (*get_max_engine_clock_in_mhz)(struct kgd_dev *kgd); + +- int (*create_process_vm)(struct kgd_dev *kgd, void **vm, +- void **process_info, struct dma_fence **ef); +- void (*destroy_process_vm)(struct kgd_dev *kgd, void *vm); +- +- int (*create_process_gpumem)(struct kgd_dev *kgd, uint64_t va, size_t size, void *vm, struct kgd_mem **mem); +- void (*destroy_process_gpumem)(struct kgd_dev *kgd, struct kgd_mem *mem); +- +- uint32_t (*get_process_page_dir)(void *vm); +- +- int (*open_graphic_handle)(struct kgd_dev *kgd, uint64_t va, void *vm, int fd, uint32_t handle, struct kgd_mem **mem); +- + /* Register access functions */ + void (*program_sh_mem_settings)(struct kgd_dev *kgd, uint32_t vmid, + uint32_t sh_mem_config, uint32_t sh_mem_ape1_base, +@@ -284,28 +151,16 @@ struct kfd2kgd_calls { + uint32_t hpd_size, uint64_t hpd_gpu_addr); + + int (*init_interrupts)(struct kgd_dev *kgd, uint32_t pipe_id); +- + + int (*hqd_load)(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, +- uint32_t queue_id, uint32_t __user *wptr, +- uint32_t wptr_shift, uint32_t wptr_mask, +- struct mm_struct *mm); +- +- int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd, +- uint32_t __user *wptr, struct mm_struct *mm); +- +- int (*hqd_dump)(struct kgd_dev *kgd, +- uint32_t pipe_id, uint32_t queue_id, +- uint32_t (**dump)[2], uint32_t *n_regs); ++ uint32_t queue_id, uint32_t __user *wptr); + +- int (*hqd_sdma_dump)(struct kgd_dev *kgd, +- uint32_t engine_id, uint32_t queue_id, +- uint32_t (**dump)[2], uint32_t *n_regs); ++ int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd); + + bool (*hqd_is_occupied)(struct kgd_dev *kgd, uint64_t queue_address, + uint32_t pipe_id, uint32_t queue_id); + +- int (*hqd_destroy)(struct kgd_dev *kgd, void *mqd, uint32_t reset_type, ++ int (*hqd_destroy)(struct kgd_dev *kgd, uint32_t reset_type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id); + +@@ -313,7 +168,7 @@ struct kfd2kgd_calls { + + int (*hqd_sdma_destroy)(struct kgd_dev *kgd, void *mqd, + unsigned int timeout); +- ++ + int (*address_watch_disable)(struct 
kgd_dev *kgd); + int (*address_watch_execute)(struct kgd_dev *kgd, + unsigned int watch_point_id, +@@ -332,72 +187,11 @@ struct kfd2kgd_calls { + uint16_t (*get_atc_vmid_pasid_mapping_pasid)( + struct kgd_dev *kgd, + uint8_t vmid); +- uint32_t (*read_vmid_from_vmfault_reg)(struct kgd_dev *kgd); + void (*write_vmid_invalidate_request)(struct kgd_dev *kgd, + uint8_t vmid); + +- int (*invalidate_tlbs)(struct kgd_dev *kgd, uint16_t pasid); +- +- int (*sync_memory)(struct kgd_dev *kgd, struct kgd_mem *mem, bool intr); +- +- int (*alloc_memory_of_gpu)(struct kgd_dev *kgd, uint64_t va, +- uint64_t size, void *vm, +- struct kgd_mem **mem, uint64_t *offset, +- uint32_t flags); +- int (*free_memory_of_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem, +- void *vm); +- int (*map_memory_to_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem, +- void *vm); +- int (*unmap_memory_to_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem, +- void *vm); +- + uint16_t (*get_fw_version)(struct kgd_dev *kgd, + enum kgd_engine_type type); +- +- void (*set_num_of_requests)(struct kgd_dev *kgd, +- uint8_t num_of_requests); +- int (*alloc_memory_of_scratch)(struct kgd_dev *kgd, +- uint64_t va, uint32_t vmid); +- int (*write_config_static_mem)(struct kgd_dev *kgd, bool swizzle_enable, +- uint8_t element_size, uint8_t index_stride, uint8_t mtype); +- void (*get_cu_info)(struct kgd_dev *kgd, +- struct kfd_cu_info *cu_info); +- int (*mmap_bo)(struct kgd_dev *kgd, struct vm_area_struct *vma); +- int (*map_gtt_bo_to_kernel)(struct kgd_dev *kgd, +- struct kgd_mem *mem, void **kptr); +- void (*set_vm_context_page_table_base)(struct kgd_dev *kgd, uint32_t vmid, +- uint32_t page_table_base); +- +- int (*pin_get_sg_table_bo)(struct kgd_dev *kgd, +- struct kgd_mem *mem, uint64_t offset, +- uint64_t size, struct sg_table **ret_sg); +- void (*unpin_put_sg_table_bo)(struct kgd_mem *mem, +- struct sg_table *sg); +- +- int (*get_dmabuf_info)(struct kgd_dev *kgd, int dma_buf_fd, +- struct kgd_dev **dma_buf_kgd, uint64_t *bo_size, +- void *metadata_buffer, size_t buffer_size, +- uint32_t *metadata_size, uint32_t *flags); +- int (*import_dmabuf)(struct kgd_dev *kgd, struct dma_buf *dmabuf, +- uint64_t va, void *vm, struct kgd_mem **mem, +- uint64_t *size, uint64_t *mmap_offset); +- int (*export_dmabuf)(struct kgd_dev *kgd, void *vm, struct kgd_mem *mem, +- struct dma_buf **dmabuf); +- +- int (*get_vm_fault_info)(struct kgd_dev *kgd, +- struct kfd_vm_fault_info *info); +- int (*submit_ib)(struct kgd_dev *kgd, enum kgd_engine_type engine, +- uint32_t vmid, uint64_t gpu_addr, +- uint32_t *ib_cmd, uint32_t ib_len); +- int (*get_tile_config)(struct kgd_dev *kgd, +- struct tile_config *config); +- +- int (*restore_process_bos)(void *process_info, struct dma_fence **ef); +- int (*copy_mem_to_mem)(struct kgd_dev *kgd, struct kgd_mem *src_mem, +- uint64_t src_offset, struct kgd_mem *dst_mem, +- uint64_t dest_offset, uint64_t size, +- struct dma_fence **f, uint64_t *actual_size); +- uint64_t (*get_vram_usage)(struct kgd_dev *kgd); + }; + + /** +@@ -416,13 +210,6 @@ struct kfd2kgd_calls { + * + * @resume: Notifies amdkfd about a resume action done to a kgd device + * +- * @quiesce_mm: Quiesce all user queue access to specified MM address space +- * +- * @resume_mm: Resume user queue access to specified MM address space +- * +- * @schedule_evict_and_restore_process: Schedules work queue that will prepare +- * for safe eviction of KFD BOs that belong to the specified process. 
+- * + * This structure contains function callback pointers so the kgd driver + * will notify to the amdkfd about certain status changes. + * +@@ -437,13 +224,9 @@ struct kgd2kfd_calls { + void (*interrupt)(struct kfd_dev *kfd, const void *ih_ring_entry); + void (*suspend)(struct kfd_dev *kfd); + int (*resume)(struct kfd_dev *kfd); +- int (*quiesce_mm)(struct kfd_dev *kfd, struct mm_struct *mm); +- int (*resume_mm)(struct kfd_dev *kfd, struct mm_struct *mm); +- int (*schedule_evict_and_restore_process)(struct mm_struct *mm, +- struct dma_fence *fence); + }; + + int kgd2kfd_init(unsigned interface_version, + const struct kgd2kfd_calls **g2f); + +-#endif /* KGD_KFD_INTERFACE_H_INCLUDED */ ++#endif /* KGD_KFD_INTERFACE_H_INCLUDED */ +diff --git a/drivers/gpu/drm/drm_pci.c b/drivers/gpu/drm/drm_pci.c +index 1235c98..7e5a1fe 100644 +--- a/drivers/gpu/drm/drm_pci.c ++++ b/drivers/gpu/drm/drm_pci.c +@@ -149,6 +149,7 @@ int drm_pci_set_busid(struct drm_device *dev, struct drm_master *master) + master->unique_len = strlen(master->unique); + return 0; + } ++EXPORT_SYMBOL(drm_pci_set_busid); + + static int drm_pci_irq_by_busid(struct drm_device *dev, struct drm_irq_busid *p) + { +diff --git a/drivers/gpu/drm/radeon/radeon_kfd.c b/drivers/gpu/drm/radeon/radeon_kfd.c +index c7d2e7a..a2ab6dc 100755 +--- a/drivers/gpu/drm/radeon/radeon_kfd.c ++++ b/drivers/gpu/drm/radeon/radeon_kfd.c +@@ -75,15 +75,12 @@ static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, + uint32_t hpd_size, uint64_t hpd_gpu_addr); + static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); + static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, +- uint32_t queue_id, uint32_t __user *wptr, +- uint32_t wptr_shift, uint32_t wptr_mask, +- struct mm_struct *mm); +-static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, +- uint32_t __user *wptr, struct mm_struct *mm); ++ uint32_t queue_id, uint32_t __user *wptr); ++static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd); + static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, + uint32_t pipe_id, uint32_t queue_id); + +-static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t reset_type, ++static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id); + static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); +@@ -110,6 +107,7 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); + static const struct kfd2kgd_calls kfd2kgd = { + .init_gtt_mem_allocation = alloc_gtt_mem, + .free_gtt_mem = free_gtt_mem, ++ .get_vmem_size = get_vmem_size, + .get_gpu_clock_counter = get_gpu_clock_counter, + .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, + .program_sh_mem_settings = kgd_program_sh_mem_settings, +@@ -484,9 +482,7 @@ static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) + } + + static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, +- uint32_t queue_id, uint32_t __user *wptr, +- uint32_t wptr_shift, uint32_t wptr_mask, +- struct mm_struct *mm) ++ uint32_t queue_id, uint32_t __user *wptr) + { + uint32_t wptr_shadow, is_wptr_shadow_valid; + struct cik_mqd *m; +@@ -562,8 +558,7 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, + return 0; + } + +-static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, +- uint32_t __user *wptr, struct mm_struct *mm) ++static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) + { + struct 
cik_sdma_rlc_registers *m; + uint32_t sdma_base_addr; +@@ -641,7 +636,7 @@ static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) + return false; + } + +-static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t reset_type, ++static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) + { +diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h +index 3053049..f08273c 100644 +--- a/include/drm/drm_drv.h ++++ b/include/drm/drm_drv.h +@@ -174,6 +174,8 @@ struct drm_driver { + * to finalize the device and then freeing the struct themselves. + */ + void (*release) (struct drm_device *); ++ ++ int (*set_busid)(struct drm_device *dev, struct drm_master *master); + + /** + * @get_vblank_counter: +diff --git a/include/drm/drm_pci.h b/include/drm/drm_pci.h +index 6745990..4d5daa8 100644 +--- a/include/drm/drm_pci.h ++++ b/include/drm/drm_pci.h +@@ -49,6 +49,7 @@ void drm_legacy_pci_exit(struct drm_driver *driver, struct pci_driver *pdriver); + int drm_get_pci_dev(struct pci_dev *pdev, + const struct pci_device_id *ent, + struct drm_driver *driver); ++int drm_pci_set_busid(struct drm_device *dev, struct drm_master *master); + #else + static inline int drm_get_pci_dev(struct pci_dev *pdev, + const struct pci_device_id *ent, +@@ -56,6 +57,12 @@ static inline int drm_get_pci_dev(struct pci_dev *pdev, + { + return -ENOSYS; + } ++ ++static inline int drm_pci_set_busid(struct drm_device *dev, ++ struct drm_master *master) ++{ ++ return -ENOSYS; ++} + #endif + + #define DRM_PCIE_SPEED_25 1 +-- +2.7.4 + |
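
Note on the interface change visible in the kfd_topology.c hunks above, with a
small illustration; the snippet below is not part of the patch, and use_gpu()
is a hypothetical consumer used only for the example. The revert changes the
shape of kfd_topology_enum_kfd_devices(): the restored version returns a
struct kfd_dev pointer directly, NULL when idx runs off the end of the
topology list, while the version being removed returned 0/-1 and handed the
device back through an output parameter, so a NULL *kdev could still mean a
valid CPU-only node rather than end of list. A minimal caller sketch of both
shapes, assuming the respective prototypes shown in the hunk:

	/* Shape restored by this revert. NULL ends the walk, but a CPU-only
	 * node (top_dev->gpu == NULL) is indistinguishable from walking past
	 * the end of the list. */
	struct kfd_dev *dev;
	uint8_t idx;

	for (idx = 0; (dev = kfd_topology_enum_kfd_devices(idx)) != NULL; idx++)
		use_gpu(dev);

	/* Shape removed by this revert. A return of -1 ends the walk, while
	 * *kdev == NULL merely marks a non-GPU node the caller can skip. */
	struct kfd_dev *kdev;
	uint8_t i;

	for (i = 0; kfd_topology_enum_kfd_devices(i, &kdev) == 0; i++)
		if (kdev)
			use_gpu(kdev);

The two loops target the two different versions of the interface and would not
coexist in one build; they are shown side by side only to make the calling
convention of each version concrete.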