Diffstat (limited to 'meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1368-drm-amdkfd-revert-kfd-part-to-a-previous-state.patch')
-rw-r--r-- | meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1368-drm-amdkfd-revert-kfd-part-to-a-previous-state.patch | 31349
1 file changed, 0 insertions, 31349 deletions
diff --git a/meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1368-drm-amdkfd-revert-kfd-part-to-a-previous-state.patch b/meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1368-drm-amdkfd-revert-kfd-part-to-a-previous-state.patch deleted file mode 100644 index f6937453..00000000 --- a/meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1368-drm-amdkfd-revert-kfd-part-to-a-previous-state.patch +++ /dev/null @@ -1,31349 +0,0 @@ -From 3538bdf4c8b2d8f1f93fd806656bad0c82c6e60f Mon Sep 17 00:00:00 2001 -From: Chaudhary Amit Kumar <chaudharyamit.kumar@amd.com> -Date: Thu, 18 Oct 2018 18:06:09 +0530 -Subject: [PATCH 1368/4131] drm/amdkfd: revert kfd part to a previous state - -Revert following files to "2ba6b00 drm/amd/powerplay: add profile mode for vega10.": - - - drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd* - - drivers/gpu/drm/amd/amdkfd/* - - drivers/gpu/drm/amd/include/kgd_kfd_interface.h - - include/uapi/linux/kfd_ioctl.h - - drivers/gpu/drm/radeon/radeon_kfd* - -Due to upstream, porting kfd patches to 4.13 all-open has many conflicts. -It's hard to elegantly fix these conflicts. So we revert the kfd part to a -previous commit, where we began to first port dkms patches in 4.12 hybrid. -Then sequentially port all kfd patches. - -Change-Id: I75eda45f41ced2f4c444ded126e2b80b53d15f2a -Signed-off-by: Le.Ma <Le.Ma@amd.com> -Acked-by: Junwei Zhang <Jerry.Zhang@amd.com> -Signed-off-by: kalyan.alle <kalyan.alle@amd.com> -Signed-off-by: Chaudhary Amit Kumar <chaudharyamit.kumar@amd.com> ---- - drivers/gpu/drm/amd/amdgpu/Makefile | 4 +- - drivers/gpu/drm/amd/amdgpu/amdgpu.h | 97 +- - drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 354 +-- - drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 184 +- - drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c | 196 -- - drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 542 +--- - drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 642 +---- - drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 1227 ---------- - drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 2578 -------------------- - drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 246 +- - drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 50 +- - drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h | 1 + - drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 13 +- - drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 82 + - drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 133 +- - drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 106 +- - drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 22 +- - drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 5 +- - drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 - - drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c | 21 +- - drivers/gpu/drm/amd/amdgpu/vid.h | 6 - - drivers/gpu/drm/amd/amdkfd/Kconfig | 3 +- - drivers/gpu/drm/amd/amdkfd/Makefile | 23 +- - drivers/gpu/drm/amd/amdkfd/backport/Makefile | 7 - - drivers/gpu/drm/amd/amdkfd/backport/backport.h | 6 - - drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 73 +- - drivers/gpu/drm/amd/amdkfd/cik_int.h | 24 +- - drivers/gpu/drm/amd/amdkfd/cik_regs.h | 3 +- - .../gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h | 1384 ----------- - .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 1388 ----------- - drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1361 +---------- - drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 1304 ---------- - drivers/gpu/drm/amd/amdkfd/kfd_crat.h | 42 +- - drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c | 219 +- - drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h | 32 - - drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c | 24 +- - drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h | 27 +- - drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c | 75 - - 
drivers/gpu/drm/amd/amdkfd/kfd_device.c | 890 +------ - .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 1222 +++------- - .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 56 +- - .../drm/amd/amdkfd/kfd_device_queue_manager_cik.c | 80 +- - .../drm/amd/amdkfd/kfd_device_queue_manager_v9.c | 90 - - .../drm/amd/amdkfd/kfd_device_queue_manager_vi.c | 123 +- - drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 106 +- - drivers/gpu/drm/amd/amdkfd/kfd_events.c | 253 +- - drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 124 +- - drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 133 - - drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 91 +- - drivers/gpu/drm/amd/amdkfd/kfd_ipc.c | 275 --- - drivers/gpu/drm/amd/amdkfd/kfd_ipc.h | 51 - - drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 149 +- - drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h | 17 +- - drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c | 128 - - drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c | 377 --- - drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c | 361 --- - drivers/gpu/drm/amd/amdkfd/kfd_module.c | 61 +- - drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 54 - - drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h | 18 +- - drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 240 +- - drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 528 ---- - drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 329 +-- - drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 556 +++-- - drivers/gpu/drm/amd/amdkfd/kfd_pasid.c | 7 +- - drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c | 513 ---- - drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h | 330 ++- - drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h | 583 ----- - drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h | 97 + - drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h | 140 +- - drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 546 +---- - drivers/gpu/drm/amd/amdkfd/kfd_process.c | 919 +------ - .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 290 +-- - drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 12 +- - drivers/gpu/drm/amd/amdkfd/kfd_rdma.c | 294 --- - drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 1177 ++++----- - drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 41 +- - drivers/gpu/drm/amd/amdkfd/soc15_int.h | 84 - - drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 231 +- - drivers/gpu/drm/drm_pci.c | 1 + - drivers/gpu/drm/radeon/radeon_kfd.c | 19 +- - include/drm/drm_drv.h | 2 + - include/drm/drm_pci.h | 7 + - 82 files changed, 3407 insertions(+), 20703 deletions(-) - delete mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c - mode change 100755 => 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c - mode change 100755 => 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c - delete mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c - delete mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c - mode change 100755 => 100644 drivers/gpu/drm/amd/amdkfd/Makefile - delete mode 100644 drivers/gpu/drm/amd/amdkfd/backport/Makefile - delete mode 100644 drivers/gpu/drm/amd/amdkfd/backport/backport.h - delete mode 100644 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h - delete mode 100644 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm - delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_crat.c - delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c - delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c - delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c - delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_ipc.c - delete mode 100644 
drivers/gpu/drm/amd/amdkfd/kfd_ipc.h - delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c - delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c - delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c - delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h - mode change 100755 => 100644 drivers/gpu/drm/amd/amdkfd/kfd_priv.h - delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_rdma.c - delete mode 100644 drivers/gpu/drm/amd/amdkfd/soc15_int.h - mode change 100755 => 100644 drivers/gpu/drm/amd/include/kgd_kfd_interface.h - -diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile -index 6b373d0..bc6f49e 100755 ---- a/drivers/gpu/drm/amd/amdgpu/Makefile -+++ b/drivers/gpu/drm/amd/amdgpu/Makefile -@@ -32,7 +32,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \ - amdgpu_prime.o amdgpu_vm.o amdgpu_ib.o amdgpu_pll.o \ - amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \ - amdgpu_gtt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o amdgpu_atomfirmware.o \ -- amdgpu_queue_mgr.o amdgpu_vf_error.o amdgpu_sem.o amdgpu_amdkfd_fence.o -+ amdgpu_queue_mgr.o amdgpu_vf_error.o amdgpu_sem.o - - # add asic specific block - amdgpu-$(CONFIG_DRM_AMDGPU_CIK)+= cik.o cik_ih.o kv_smc.o kv_dpm.o \ -@@ -110,8 +110,6 @@ amdgpu-y += \ - amdgpu_amdkfd.o \ - amdgpu_amdkfd_gfx_v7.o \ - amdgpu_amdkfd_gfx_v8.o \ -- amdgpu_amdkfd_gfx_v9.o \ -- amdgpu_amdkfd_gpuvm.o - - # add cgs - amdgpu-y += amdgpu_cgs.o -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h -index bcf95e7..b07c90e 100755 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h -@@ -67,7 +67,6 @@ - #include "amdgpu_vce.h" - #include "amdgpu_vcn.h" - #include "amdgpu_dm.h" --#include "amdgpu_mn.h" - - #include "gpu_scheduler.h" - #include "amdgpu_virt.h" -@@ -125,7 +124,6 @@ extern int amdgpu_cntl_sb_buf_per_se; - extern int amdgpu_param_buf_per_se; - extern int amdgpu_job_hang_limit; - extern int amdgpu_lbpw; --extern int amdgpu_compute_multipipe; - - #ifdef CONFIG_DRM_AMDGPU_SI - extern int amdgpu_si_support; -@@ -184,8 +182,7 @@ struct amdgpu_cs_parser; - struct amdgpu_job; - struct amdgpu_irq_src; - struct amdgpu_fpriv; --struct kfd_vm_fault_info; --struct amdgpu_bo_va_mapping; -+struct kfd_process_device; - - enum amdgpu_cp_irq { - AMDGPU_CP_IRQ_GFX_EOP = 0, -@@ -300,25 +297,14 @@ struct amdgpu_buffer_funcs { - - /* provided by hw blocks that can write ptes, e.g., sdma */ - struct amdgpu_vm_pte_funcs { -- /* number of dw to reserve per operation */ -- unsigned copy_pte_num_dw; -- - /* copy pte entries from GART */ - void (*copy_pte)(struct amdgpu_ib *ib, - uint64_t pe, uint64_t src, - unsigned count); -- - /* write pte one entry at a time with addr mapping */ - void (*write_pte)(struct amdgpu_ib *ib, uint64_t pe, - uint64_t value, unsigned count, - uint32_t incr); -- -- /* maximum nums of PTEs/PDEs in a single operation */ -- uint32_t set_max_nums_pte_pde; -- -- /* number of dw to reserve per operation */ -- unsigned set_pte_pde_num_dw; -- - /* for linear pte/pde updates without addr mapping */ - void (*set_pte_pde)(struct amdgpu_ib *ib, - uint64_t pe, -@@ -397,15 +383,7 @@ struct amdgpu_clock { - */ - - #define AMDGPU_GEM_DOMAIN_MAX 0x3 -- --struct amdgpu_gem_object { -- struct drm_gem_object base; -- struct list_head list; -- struct amdgpu_bo *bo; --}; -- --struct kgd_mem; --#define gem_to_amdgpu_bo(gobj) container_of((gobj), struct amdgpu_gem_object, base)->bo -+#define gem_to_amdgpu_bo(gobj) 
container_of((gobj), struct amdgpu_bo, gem_base) - - void amdgpu_gem_object_free(struct drm_gem_object *obj); - int amdgpu_gem_object_open(struct drm_gem_object *obj, -@@ -421,8 +399,6 @@ amdgpu_gem_prime_import_sg_table(struct drm_device *dev, - struct dma_buf *amdgpu_gem_prime_export(struct drm_device *dev, - struct drm_gem_object *gobj, - int flags); --struct drm_gem_object * --amdgpu_gem_prime_foreign_bo(struct amdgpu_device *adev, struct amdgpu_bo *bo); - int amdgpu_gem_prime_pin(struct drm_gem_object *obj); - void amdgpu_gem_prime_unpin(struct drm_gem_object *obj); - struct reservation_object *amdgpu_gem_prime_res_obj(struct drm_gem_object *); -@@ -484,10 +460,9 @@ struct amdgpu_sa_bo { - */ - void amdgpu_gem_force_release(struct amdgpu_device *adev); - int amdgpu_gem_object_create(struct amdgpu_device *adev, unsigned long size, -- int alignment, u32 initial_domain, -- u64 flags, bool kernel, -- struct reservation_object *resv, -- struct drm_gem_object **obj); -+ int alignment, u32 initial_domain, -+ u64 flags, bool kernel, -+ struct drm_gem_object **obj); - - int amdgpu_mode_dumb_create(struct drm_file *file_priv, - struct drm_device *dev, -@@ -545,9 +520,6 @@ struct amdgpu_mc { - u64 private_aperture_end; - /* protects concurrent invalidation */ - spinlock_t invalidate_lock; -- -- struct kfd_vm_fault_info *vm_fault_info; -- atomic_t vm_fault_info_updated; - }; - - /* -@@ -730,7 +702,7 @@ int amdgpu_queue_mgr_fini(struct amdgpu_device *adev, - struct amdgpu_queue_mgr *mgr); - int amdgpu_queue_mgr_map(struct amdgpu_device *adev, - struct amdgpu_queue_mgr *mgr, -- u32 hw_ip, u32 instance, u32 ring, -+ int hw_ip, int instance, int ring, - struct amdgpu_ring **out_ring); - - /* -@@ -966,7 +938,6 @@ struct amdgpu_gfx_config { - }; - - struct amdgpu_cu_info { -- uint32_t simd_per_cu; - uint32_t max_waves_per_simd; - uint32_t wave_front_size; - uint32_t max_scratch_slots_per_cu; -@@ -1094,7 +1065,6 @@ struct amdgpu_cs_parser { - /* buffer objects */ - struct ww_acquire_ctx ticket; - struct amdgpu_bo_list *bo_list; -- struct amdgpu_mn *mn; - struct amdgpu_bo_list_entry vm_pd; - struct list_head validated; - struct dma_fence *fence; -@@ -1236,6 +1206,20 @@ void amdgpu_benchmark(struct amdgpu_device *adev, int test_number); - void amdgpu_test_moves(struct amdgpu_device *adev); - - /* -+ * MMU Notifier -+ */ -+#if defined(CONFIG_MMU_NOTIFIER) -+int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr); -+void amdgpu_mn_unregister(struct amdgpu_bo *bo); -+#else -+static inline int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr) -+{ -+ return -ENODEV; -+} -+static inline void amdgpu_mn_unregister(struct amdgpu_bo *bo) {} -+#endif -+ -+/* - * Debugfs - */ - struct amdgpu_debugfs { -@@ -1435,7 +1419,10 @@ struct amdgpu_direct_gma { - }; - - #if defined(CONFIG_ZONE_DEVICE) && \ -- (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) || defined(OS_NAME_RHEL_7_3) || defined(OS_NAME_SLE)) -+ (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) || \ -+ defined(OS_NAME_RHEL_7_3) || \ -+ defined(OS_NAME_RHEL_7_4) || \ -+ defined(OS_NAME_SLE)) - #define CONFIG_ENABLE_SSG - #endif - -@@ -1603,14 +1590,18 @@ struct amdgpu_device { - /* sdma */ - struct amdgpu_sdma sdma; - -- /* uvd */ -- struct amdgpu_uvd uvd; -+ union { -+ struct { -+ /* uvd */ -+ struct amdgpu_uvd uvd; - -- /* vce */ -- struct amdgpu_vce vce; -+ /* vce */ -+ struct amdgpu_vce vce; -+ }; - -- /* vcn */ -- struct amdgpu_vcn vcn; -+ /* vcn */ -+ struct amdgpu_vcn vcn; -+ }; - - /* firmwares */ - struct amdgpu_firmware firmware; -@@ 
-1655,7 +1646,6 @@ struct amdgpu_device { - /* record hw reset is performed */ - bool has_hw_reset; - u8 reset_magic[AMDGPU_RESET_MAGIC_NUM]; -- spinlock_t tlb_invalidation_lock; - - /* record last mm index being written through WREG32*/ - unsigned long last_mm_index; -@@ -1861,6 +1851,18 @@ void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes, - u64 num_vis_bytes); - void amdgpu_ttm_placement_from_domain(struct amdgpu_bo *abo, u32 domain); - bool amdgpu_ttm_bo_is_amdgpu_bo(struct ttm_buffer_object *bo); -+int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages); -+int amdgpu_ttm_tt_set_userptr(struct ttm_tt *ttm, uint64_t addr, -+ uint32_t flags); -+bool amdgpu_ttm_tt_has_userptr(struct ttm_tt *ttm); -+struct mm_struct *amdgpu_ttm_tt_get_usermm(struct ttm_tt *ttm); -+bool amdgpu_ttm_tt_affect_userptr(struct ttm_tt *ttm, unsigned long start, -+ unsigned long end); -+bool amdgpu_ttm_tt_userptr_invalidated(struct ttm_tt *ttm, -+ int *last_invalidated); -+bool amdgpu_ttm_tt_is_readonly(struct ttm_tt *ttm); -+uint64_t amdgpu_ttm_tt_pte_flags(struct amdgpu_device *adev, struct ttm_tt *ttm, -+ struct ttm_mem_reg *mem); - void amdgpu_vram_location(struct amdgpu_device *adev, struct amdgpu_mc *mc, u64 base); - void amdgpu_gart_location(struct amdgpu_device *adev, struct amdgpu_mc *mc); - void amdgpu_ttm_set_active_vram_size(struct amdgpu_device *adev, u64 size); -@@ -1943,9 +1945,10 @@ static inline int amdgpu_acpi_init(struct amdgpu_device *adev) { return 0; } - static inline void amdgpu_acpi_fini(struct amdgpu_device *adev) { } - #endif - --int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser, -- uint64_t addr, struct amdgpu_bo **bo, -- struct amdgpu_bo_va_mapping **mapping); -+struct amdgpu_bo_va_mapping * -+amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser, -+ uint64_t addr, struct amdgpu_bo **bo); -+int amdgpu_cs_sysvm_access_required(struct amdgpu_cs_parser *parser); - - #if defined(CONFIG_DRM_AMD_DC) - int amdgpu_dm_display_resume(struct amdgpu_device *adev ); -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c -index ec8141f..ef56352 100755 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c -@@ -20,29 +20,23 @@ - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - --#undef pr_fmt --#define pr_fmt(fmt) "kfd2kgd: " fmt -- - #include "amdgpu_amdkfd.h" --#include <linux/dma-buf.h> -+#include "amd_shared.h" - #include <drm/drmP.h> - #include "amdgpu.h" - #include "amdgpu_gfx.h" - #include <linux/module.h> - --#define AMDKFD_SKIP_UNCOMPILED_CODE 1 -- -+const struct kfd2kgd_calls *kfd2kgd; - const struct kgd2kfd_calls *kgd2kfd; --bool (*kgd2kfd_init_p)(unsigned int, const struct kgd2kfd_calls**); -- --unsigned int global_compute_vmid_bitmap = 0xFF00; -+bool (*kgd2kfd_init_p)(unsigned, const struct kgd2kfd_calls**); - - int amdgpu_amdkfd_init(void) - { - int ret; - - #if defined(CONFIG_HSA_AMD_MODULE) -- int (*kgd2kfd_init_p)(unsigned int, const struct kgd2kfd_calls**); -+ int (*kgd2kfd_init_p)(unsigned, const struct kgd2kfd_calls**); - - kgd2kfd_init_p = symbol_request(kgd2kfd_init); - -@@ -63,68 +57,56 @@ int amdgpu_amdkfd_init(void) - #else - ret = -ENOENT; - #endif -- amdgpu_amdkfd_gpuvm_init_mem_limits(); -- return ret; --} - --void amdgpu_amdkfd_fini(void) --{ -- if (kgd2kfd) { -- kgd2kfd->exit(); -- symbol_put(kgd2kfd_init); -- } -+ return ret; - } - --void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev) -+bool amdgpu_amdkfd_load_interface(struct amdgpu_device *adev) - { -- const struct kfd2kgd_calls *kfd2kgd; -- -- if (!kgd2kfd) -- return; -- - switch (adev->asic_type) { - #ifdef CONFIG_DRM_AMDGPU_CIK - case CHIP_KAVERI: -- case CHIP_HAWAII: - kfd2kgd = amdgpu_amdkfd_gfx_7_get_functions(); - break; - #endif - case CHIP_CARRIZO: -- case CHIP_TONGA: -- case CHIP_FIJI: -- case CHIP_POLARIS10: -- case CHIP_POLARIS11: - kfd2kgd = amdgpu_amdkfd_gfx_8_0_get_functions(); - break; -- case CHIP_VEGA10: -- case CHIP_RAVEN: -- kfd2kgd = amdgpu_amdkfd_gfx_9_0_get_functions(); -- break; - default: -- dev_info(adev->dev, "kfd not supported on this ASIC\n"); -- return; -+ return false; -+ } -+ -+ return true; -+} -+ -+void amdgpu_amdkfd_fini(void) -+{ -+ if (kgd2kfd) { -+ kgd2kfd->exit(); -+ symbol_put(kgd2kfd_init); - } -+} - -- adev->kfd = kgd2kfd->probe((struct kgd_dev *)adev, -- adev->pdev, kfd2kgd); -+void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev) -+{ -+ if (kgd2kfd) -+ adev->kfd = kgd2kfd->probe((struct kgd_dev *)adev, -+ adev->pdev, kfd2kgd); - } - - void amdgpu_amdkfd_device_init(struct amdgpu_device *adev) - { - int i; - int last_valid_bit; -- - if (adev->kfd) { - struct kgd2kfd_shared_resources gpu_resources = { -- .compute_vmid_bitmap = global_compute_vmid_bitmap, -+ .compute_vmid_bitmap = 0xFF00, - .num_pipe_per_mec = adev->gfx.mec.num_pipe_per_mec, -- .num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe, -- .gpuvm_size = (uint64_t)amdgpu_vm_size << 30 -+ .num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe - }; - - /* this is going to have a few of the MSBs set that we need to -- * clear -- */ -+ * clear */ - bitmap_complement(gpu_resources.queue_bitmap, - adev->gfx.mec.queue_bitmap, - KGD_MAX_QUEUES); -@@ -138,8 +120,7 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev) - gpu_resources.queue_bitmap); - - /* According to linux/bitmap.h we shouldn't use bitmap_clear if -- * nbits is not compile time constant -- */ -+ * nbits is not compile time constant */ - last_valid_bit = 1 /* only first MEC can have compute queues */ - * adev->gfx.mec.num_pipe_per_mec - * adev->gfx.mec.num_queue_per_pipe; -@@ -150,28 +131,6 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev) - &gpu_resources.doorbell_physical_address, - &gpu_resources.doorbell_aperture_size, - &gpu_resources.doorbell_start_offset); -- if 
(adev->asic_type >= CHIP_VEGA10) { -- /* On SOC15 the BIF is involved in routing -- * doorbells using the low 12 bits of the -- * address. Communicate the assignments to -- * KFD. KFD uses two doorbell pages per -- * process in case of 64-bit doorbells so we -- * can use each doorbell assignment twice. -- */ -- gpu_resources.sdma_doorbell[0][0] = -- AMDGPU_DOORBELL64_sDMA_ENGINE0; -- gpu_resources.sdma_doorbell[0][1] = -- AMDGPU_DOORBELL64_sDMA_ENGINE0 + 0x200; -- gpu_resources.sdma_doorbell[1][0] = -- AMDGPU_DOORBELL64_sDMA_ENGINE1; -- gpu_resources.sdma_doorbell[1][1] = -- AMDGPU_DOORBELL64_sDMA_ENGINE1 + 0x200; -- /* Doorbells 0x0f0-0ff and 0x2f0-2ff are reserved for -- * SDMA, IH and VCN. So don't use them for the CP. -- */ -- gpu_resources.reserved_doorbell_mask = 0x1f0; -- gpu_resources.reserved_doorbell_val = 0x0f0; -- } - - kgd2kfd->device_init(adev->kfd, &gpu_resources); - } -@@ -208,81 +167,24 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev) - return r; - } - --int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine, -- uint32_t vmid, uint64_t gpu_addr, -- uint32_t *ib_cmd, uint32_t ib_len) --{ -- struct amdgpu_device *adev = (struct amdgpu_device *)kgd; -- struct amdgpu_job *job; -- struct amdgpu_ib *ib; -- struct amdgpu_ring *ring; -- struct dma_fence *f = NULL; -- int ret; -- -- switch (engine) { -- case KGD_ENGINE_MEC1: -- ring = &adev->gfx.compute_ring[0]; -- break; -- case KGD_ENGINE_SDMA1: -- ring = &adev->sdma.instance[0].ring; -- break; -- case KGD_ENGINE_SDMA2: -- ring = &adev->sdma.instance[1].ring; -- break; -- default: -- pr_err("Invalid engine in IB submission: %d\n", engine); -- ret = -EINVAL; -- goto err; -- } -- -- ret = amdgpu_job_alloc(adev, 1, &job, NULL); -- if (ret) -- goto err; -- -- ib = &job->ibs[0]; -- memset(ib, 0, sizeof(struct amdgpu_ib)); -- -- ib->gpu_addr = gpu_addr; -- ib->ptr = ib_cmd; -- ib->length_dw = ib_len; -- /* This works for NO_HWS. 
TODO: need to handle without knowing VMID */ -- job->vm_id = vmid; -- -- ret = amdgpu_ib_schedule(ring, 1, ib, job, &f); -- if (ret) { -- DRM_ERROR("amdgpu: failed to schedule IB.\n"); -- goto err_ib_sched; -- } -- -- ret = dma_fence_wait(f, false); -- --err_ib_sched: -- dma_fence_put(f); -- amdgpu_job_free(job); --err: -- return ret; --} -- --u32 pool_to_domain(enum kgd_memory_pool p) --{ -- switch (p) { -- case KGD_POOL_FRAMEBUFFER: return AMDGPU_GEM_DOMAIN_VRAM; -- default: return AMDGPU_GEM_DOMAIN_GTT; -- } --} -- - int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, - void **mem_obj, uint64_t *gpu_addr, - void **cpu_ptr) - { - struct amdgpu_device *adev = (struct amdgpu_device *)kgd; -- struct amdgpu_bo *bo = NULL; -+ struct kgd_mem **mem = (struct kgd_mem **) mem_obj; - int r; -- uint64_t gpu_addr_tmp = 0; -- void *cpu_ptr_tmp = NULL; -+ -+ BUG_ON(kgd == NULL); -+ BUG_ON(gpu_addr == NULL); -+ BUG_ON(cpu_ptr == NULL); -+ -+ *mem = kmalloc(sizeof(struct kgd_mem), GFP_KERNEL); -+ if ((*mem) == NULL) -+ return -ENOMEM; - - r = amdgpu_bo_create(adev, size, PAGE_SIZE, true, AMDGPU_GEM_DOMAIN_GTT, -- AMDGPU_GEM_CREATE_CPU_GTT_USWC, NULL, NULL, 0, &bo); -+ AMDGPU_GEM_CREATE_CPU_GTT_USWC, NULL, NULL, 0, &(*mem)->bo); - if (r) { - dev_err(adev->dev, - "failed to allocate BO for amdkfd (%d)\n", r); -@@ -290,87 +192,64 @@ int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, - } - - /* map the buffer */ -- r = amdgpu_bo_reserve(bo, true); -+ r = amdgpu_bo_reserve((*mem)->bo, true); - if (r) { - dev_err(adev->dev, "(%d) failed to reserve bo for amdkfd\n", r); - goto allocate_mem_reserve_bo_failed; - } - -- r = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT, -- &gpu_addr_tmp); -+ r = amdgpu_bo_pin((*mem)->bo, AMDGPU_GEM_DOMAIN_GTT, -+ &(*mem)->gpu_addr); - if (r) { - dev_err(adev->dev, "(%d) failed to pin bo for amdkfd\n", r); - goto allocate_mem_pin_bo_failed; - } -+ *gpu_addr = (*mem)->gpu_addr; - -- r = amdgpu_bo_kmap(bo, &cpu_ptr_tmp); -+ r = amdgpu_bo_kmap((*mem)->bo, &(*mem)->cpu_ptr); - if (r) { - dev_err(adev->dev, - "(%d) failed to map bo to kernel for amdkfd\n", r); - goto allocate_mem_kmap_bo_failed; - } -+ *cpu_ptr = (*mem)->cpu_ptr; - -- *mem_obj = bo; -- *gpu_addr = gpu_addr_tmp; -- *cpu_ptr = cpu_ptr_tmp; -- -- amdgpu_bo_unreserve(bo); -+ amdgpu_bo_unreserve((*mem)->bo); - - return 0; - - allocate_mem_kmap_bo_failed: -- amdgpu_bo_unpin(bo); -+ amdgpu_bo_unpin((*mem)->bo); - allocate_mem_pin_bo_failed: -- amdgpu_bo_unreserve(bo); -+ amdgpu_bo_unreserve((*mem)->bo); - allocate_mem_reserve_bo_failed: -- amdgpu_bo_unref(&bo); -+ amdgpu_bo_unref(&(*mem)->bo); - - return r; - } - - void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj) - { -- struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj; -+ struct kgd_mem *mem = (struct kgd_mem *) mem_obj; - -- amdgpu_bo_reserve(bo, true); -- amdgpu_bo_kunmap(bo); -- amdgpu_bo_unpin(bo); -- amdgpu_bo_unreserve(bo); -- amdgpu_bo_unref(&(bo)); -+ BUG_ON(mem == NULL); -+ -+ amdgpu_bo_reserve(mem->bo, true); -+ amdgpu_bo_kunmap(mem->bo); -+ amdgpu_bo_unpin(mem->bo); -+ amdgpu_bo_unreserve(mem->bo); -+ amdgpu_bo_unref(&(mem->bo)); -+ kfree(mem); - } - --void get_local_mem_info(struct kgd_dev *kgd, -- struct kfd_local_mem_info *mem_info) -+uint64_t get_vmem_size(struct kgd_dev *kgd) - { -- uint64_t address_mask; -- resource_size_t aper_limit; -- struct amdgpu_device *adev = (struct amdgpu_device *)kgd; -+ struct amdgpu_device *adev = -+ (struct amdgpu_device *)kgd; - -- address_mask = adev->dev->dma_mask ? 
~*adev->dev->dma_mask : -- ~((1ULL << 32) - 1); -- aper_limit = adev->mc.aper_base + adev->mc.aper_size; -- -- memset(mem_info, 0, sizeof(*mem_info)); -- if (!(adev->mc.aper_base & address_mask || -- aper_limit & address_mask)) { -- mem_info->local_mem_size_public = adev->mc.visible_vram_size; -- mem_info->local_mem_size_private = adev->mc.real_vram_size - -- adev->mc.visible_vram_size; -- } else { -- mem_info->local_mem_size_public = 0; -- mem_info->local_mem_size_private = adev->mc.real_vram_size; -- } -- mem_info->vram_width = adev->mc.vram_width; -+ BUG_ON(kgd == NULL); - -- pr_debug("Address base: 0x%llx limit 0x%llx public 0x%llx private 0x%llx\n", -- adev->mc.aper_base, aper_limit, -- mem_info->local_mem_size_public, -- mem_info->local_mem_size_private); -- -- if (amdgpu_sriov_vf(adev)) -- mem_info->mem_clk_max = adev->clock.default_mclk / 100; -- else -- mem_info->mem_clk_max = amdgpu_dpm_get_mclk(adev, false) / 100; -+ return adev->mc.real_vram_size; - } - - uint64_t get_gpu_clock_counter(struct kgd_dev *kgd) -@@ -385,113 +264,6 @@ uint64_t get_gpu_clock_counter(struct kgd_dev *kgd) - uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd) - { - struct amdgpu_device *adev = (struct amdgpu_device *)kgd; -- -- /* the sclk is in quantas of 10kHz */ -- if (amdgpu_sriov_vf(adev)) -- return adev->clock.default_sclk / 100; -- -- return amdgpu_dpm_get_sclk(adev, false) / 100; --} -- --void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info) --{ -- struct amdgpu_device *adev = (struct amdgpu_device *)kgd; -- struct amdgpu_cu_info acu_info = adev->gfx.cu_info; -- -- memset(cu_info, 0, sizeof(*cu_info)); -- if (sizeof(cu_info->cu_bitmap) != sizeof(acu_info.bitmap)) -- return; -- -- cu_info->cu_active_number = acu_info.number; -- cu_info->cu_ao_mask = acu_info.ao_cu_mask; -- memcpy(&cu_info->cu_bitmap[0], &acu_info.bitmap[0], -- sizeof(acu_info.bitmap)); -- cu_info->num_shader_engines = adev->gfx.config.max_shader_engines; -- cu_info->num_shader_arrays_per_engine = adev->gfx.config.max_sh_per_se; -- cu_info->num_cu_per_sh = adev->gfx.config.max_cu_per_sh; -- cu_info->simd_per_cu = acu_info.simd_per_cu; -- cu_info->max_waves_per_simd = acu_info.max_waves_per_simd; -- cu_info->wave_front_size = acu_info.wave_front_size; -- cu_info->max_scratch_slots_per_cu = acu_info.max_scratch_slots_per_cu; -- cu_info->lds_size = acu_info.lds_size; --} -- --int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd, -- struct kgd_dev **dma_buf_kgd, -- uint64_t *bo_size, void *metadata_buffer, -- size_t buffer_size, uint32_t *metadata_size, -- uint32_t *flags) --{ -- struct amdgpu_device *adev = (struct amdgpu_device *)kgd; -- struct dma_buf *dma_buf; -- struct drm_gem_object *obj; -- struct amdgpu_bo *bo; -- uint64_t metadata_flags; -- int r = -EINVAL; -- -- dma_buf = dma_buf_get(dma_buf_fd); -- if (IS_ERR(dma_buf)) -- return PTR_ERR(dma_buf); -- -- if (dma_buf->ops != &drm_gem_prime_dmabuf_ops) -- /* Can't handle non-graphics buffers */ -- goto out_put; -- -- obj = dma_buf->priv; -- if (obj->dev->driver != adev->ddev->driver) -- /* Can't handle buffers from different drivers */ -- goto out_put; -- -- adev = obj->dev->dev_private; -- bo = gem_to_amdgpu_bo(obj); -- if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | -- AMDGPU_GEM_DOMAIN_GTT | -- AMDGPU_GEM_DOMAIN_DGMA))) -- /* Only VRAM, GTT and DGMA BOs are supported */ -- goto out_put; -- -- r = 0; -- if (dma_buf_kgd) -- *dma_buf_kgd = (struct kgd_dev *)adev; -- if (bo_size) -- *bo_size = amdgpu_bo_size(bo); -- if (metadata_size) 
-- *metadata_size = bo->metadata_size; -- if (metadata_buffer) -- r = amdgpu_bo_get_metadata(bo, metadata_buffer, buffer_size, -- metadata_size, &metadata_flags); -- if (flags) { -- /* If the preferred domain is DGMA, set flags to VRAM because -- * KFD doesn't support allocating DGMA memory -- */ -- *flags = (bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | -- AMDGPU_GEM_DOMAIN_DGMA)) ? -- ALLOC_MEM_FLAGS_VRAM : ALLOC_MEM_FLAGS_GTT; -- -- if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) -- *flags |= ALLOC_MEM_FLAGS_PUBLIC; -- } -- --out_put: -- dma_buf_put(dma_buf); -- return r; --} -- --uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd) --{ -- struct amdgpu_device *adev = (struct amdgpu_device *)kgd; -- uint64_t usage = -- amdgpu_vram_mgr_usage(&adev->mman.bdev.man[TTM_PL_VRAM]); -- return usage; --} -- --bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, -- u32 vmid) --{ -- if (adev->kfd) { -- if ((1 << vmid) & global_compute_vmid_bitmap) -- return true; -- } -- -- return false; -+ /* The sclk is in quantas of 10kHz */ -+ return adev->pm.dpm.dyn_state.max_clock_voltage_on_ac.sclk / 100; - } -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h -index b259ba7..8e8c10e 100755 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h -@@ -27,109 +27,21 @@ - - #include <linux/types.h> - #include <linux/mm.h> --#include <linux/workqueue.h> --#include <linux/mmu_context.h> - #include <kgd_kfd_interface.h> --#include "amdgpu.h" -- --extern const struct kgd2kfd_calls *kgd2kfd; - - struct amdgpu_device; - --struct kfd_bo_va_list { -- struct list_head bo_list; -- struct amdgpu_bo_va *bo_va; -- void *kgd_dev; -- bool is_mapped; -- bool map_fail; -- uint64_t va; -- uint64_t pte_flags; --}; -- - struct kgd_mem { -- struct mutex lock; - struct amdgpu_bo *bo; -- struct list_head bo_va_list; -- /* protected by amdkfd_process_info.lock */ -- struct ttm_validate_buffer validate_list; -- struct ttm_validate_buffer resv_list; -- uint32_t domain; -- unsigned int mapped_to_gpu_memory; -- void *kptr; -- uint64_t va; -- -- uint32_t mapping_flags; -- -- atomic_t invalid; -- struct amdkfd_process_info *process_info; -- struct page **user_pages; -- -- struct amdgpu_sync sync; -- -- /* flags bitfield */ -- bool coherent : 1; -- bool no_substitute : 1; -- bool aql_queue : 1; --}; -- --/* KFD Memory Eviction */ --struct amdgpu_amdkfd_fence { -- struct dma_fence base; -- void *mm; -- spinlock_t lock; -- char timeline_name[TASK_COMM_LEN]; --}; -- --struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context, -- void *mm); --bool amd_kfd_fence_check_mm(struct dma_fence *f, void *mm); --struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f); -- --struct amdkfd_process_info { -- /* List head of all VMs that belong to a KFD process */ -- struct list_head vm_list_head; -- /* List head for all KFD BOs that belong to a KFD process. 
*/ -- struct list_head kfd_bo_list; -- /* List of userptr BOs that are valid or invalid */ -- struct list_head userptr_valid_list; -- struct list_head userptr_inval_list; -- /* Lock to protect kfd_bo_list */ -- struct mutex lock; -- -- /* Number of VMs */ -- unsigned int n_vms; -- /* Eviction Fence */ -- struct amdgpu_amdkfd_fence *eviction_fence; -- -- /* MMU-notifier related fields */ -- atomic_t evicted_bos; -- struct delayed_work work; -- struct pid *pid; --}; -- --/* struct amdkfd_vm - -- * For Memory Eviction KGD requires a mechanism to keep track of all KFD BOs -- * belonging to a KFD process. All the VMs belonging to the same process point -- * to the same amdkfd_process_info. -- */ --struct amdkfd_vm { -- /* Keep base as the first parameter for pointer compatibility between -- * amdkfd_vm and amdgpu_vm. -- */ -- struct amdgpu_vm base; -- -- /* List node in amdkfd_process_info.vm_list_head*/ -- struct list_head vm_list_node; -- -- struct amdgpu_device *adev; -- /* Points to the KFD process VM info*/ -- struct amdkfd_process_info *process_info; -+ uint64_t gpu_addr; -+ void *cpu_ptr; - }; - - - int amdgpu_amdkfd_init(void); - void amdgpu_amdkfd_fini(void); - -+bool amdgpu_amdkfd_load_interface(struct amdgpu_device *adev); - - void amdgpu_amdkfd_suspend(struct amdgpu_device *adev); - int amdgpu_amdkfd_resume(struct amdgpu_device *adev); -@@ -139,105 +51,17 @@ void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev); - void amdgpu_amdkfd_device_init(struct amdgpu_device *adev); - void amdgpu_amdkfd_device_fini(struct amdgpu_device *adev); - --int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, struct mm_struct *mm); --int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine, -- uint32_t vmid, uint64_t gpu_addr, -- uint32_t *ib_cmd, uint32_t ib_len); --int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info, -- struct dma_fence **ef); - struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void); - struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void); --struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void); --int amdgpu_amdkfd_copy_mem_to_mem(struct kgd_dev *kgd, struct kgd_mem *src_mem, -- uint64_t src_offset, struct kgd_mem *dst_mem, -- uint64_t dest_offset, uint64_t size, struct dma_fence **f, -- uint64_t *actual_size); -- --bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, -- u32 vmid); - - /* Shared API */ --int map_bo(struct amdgpu_device *rdev, uint64_t va, void *vm, -- struct amdgpu_bo *bo, struct amdgpu_bo_va **bo_va); - int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, - void **mem_obj, uint64_t *gpu_addr, - void **cpu_ptr); - void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj); --void get_local_mem_info(struct kgd_dev *kgd, -- struct kfd_local_mem_info *mem_info); -+uint64_t get_vmem_size(struct kgd_dev *kgd); - uint64_t get_gpu_clock_counter(struct kgd_dev *kgd); - - uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd); --void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info); --int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd, -- struct kgd_dev **dmabuf_kgd, -- uint64_t *bo_size, void *metadata_buffer, -- size_t buffer_size, uint32_t *metadata_size, -- uint32_t *flags); --uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd); -- --#define read_user_wptr(mmptr, wptr, dst) \ -- ({ \ -- bool valid = false; \ -- if ((mmptr) && (wptr)) { \ -- if ((mmptr) == current->mm) { \ -- valid = !get_user((dst), (wptr)); \ -- } else if (current->mm == NULL) { \ -- use_mm(mmptr); \ 
-- valid = !get_user((dst), (wptr)); \ -- unuse_mm(mmptr); \ -- } \ -- } \ -- valid; \ -- }) -- --/* GPUVM API */ --int amdgpu_amdkfd_gpuvm_sync_memory( -- struct kgd_dev *kgd, struct kgd_mem *mem, bool intr); --int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( -- struct kgd_dev *kgd, uint64_t va, uint64_t size, -- void *vm, struct kgd_mem **mem, -- uint64_t *offset, uint32_t flags); --int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( -- struct kgd_dev *kgd, struct kgd_mem *mem, void *vm); --int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( -- struct kgd_dev *kgd, struct kgd_mem *mem, void *vm); --int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( -- struct kgd_dev *kgd, struct kgd_mem *mem, void *vm); - --int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm, -- void **process_info, -- struct dma_fence **ef); --void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm); -- --uint32_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm); -- --int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd, -- struct kfd_vm_fault_info *info); -- --int amdgpu_amdkfd_gpuvm_mmap_bo( -- struct kgd_dev *kgd, struct vm_area_struct *vma); -- --int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd, -- struct kgd_mem *mem, void **kptr); -- --int amdgpu_amdkfd_gpuvm_pin_get_sg_table(struct kgd_dev *kgd, -- struct kgd_mem *mem, uint64_t offset, -- uint64_t size, struct sg_table **ret_sg); --void amdgpu_amdkfd_gpuvm_unpin_put_sg_table( -- struct kgd_mem *mem, struct sg_table *sg); --int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd, -- struct dma_buf *dmabuf, -- uint64_t va, void *vm, -- struct kgd_mem **mem, uint64_t *size, -- uint64_t *mmap_offset); --int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_dev *kgd, void *vm, -- struct kgd_mem *mem, -- struct dma_buf **dmabuf); --int amdgpu_amdkfd_gpuvm_evict_mem(struct kgd_mem *mem, struct mm_struct *mm); --int amdgpu_amdkfd_gpuvm_restore_mem(struct kgd_mem *mem, struct mm_struct *mm); -- --void amdgpu_amdkfd_gpuvm_init_mem_limits(void); --void amdgpu_amdkfd_unreserve_system_memory_limit(struct amdgpu_bo *bo); - #endif /* AMDGPU_AMDKFD_H_INCLUDED */ -- -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c -deleted file mode 100644 -index 3961937..0000000 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c -+++ /dev/null -@@ -1,196 +0,0 @@ --/* -- * Copyright 2016 Advanced Micro Devices, Inc. -- * -- * Permission is hereby granted, free of charge, to any person obtaining a -- * copy of this software and associated documentation files (the "Software"), -- * to deal in the Software without restriction, including without limitation -- * the rights to use, copy, modify, merge, publish, distribute, sublicense, -- * and/or sell copies of the Software, and to permit persons to whom the -- * Software is furnished to do so, subject to the following conditions: -- * -- * The above copyright notice and this permission notice shall be included in -- * all copies or substantial portions of the Software. -- * -- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -- * OTHER DEALINGS IN THE SOFTWARE. -- */ -- --#include <linux/spinlock.h> --#include <linux/atomic.h> --#include <linux/stacktrace.h> --#include <linux/sched.h> --#include <linux/slab.h> --#include "amdgpu_amdkfd.h" -- --const struct dma_fence_ops amd_kfd_fence_ops; --static atomic_t fence_seq = ATOMIC_INIT(0); -- --static int amd_kfd_fence_signal(struct dma_fence *f); -- --/* Eviction Fence -- * Fence helper functions to deal with KFD memory eviction. -- * Big Idea - Since KFD submissions are done by user queues, a BO cannot be -- * evicted unless all the user queues for that process are evicted. -- * -- * All the BOs in a process share an eviction fence. When process X wants -- * to map VRAM memory but TTM can't find enough space, TTM will attempt to -- * evict BOs from its LRU list. TTM checks if the BO is valuable to evict -- * by calling ttm_bo_driver->eviction_valuable(). -- * -- * ttm_bo_driver->eviction_valuable() - will return false if the BO belongs -- * to process X. Otherwise, it will return true to indicate BO can be -- * evicted by TTM. -- * -- * If ttm_bo_driver->eviction_valuable returns true, then TTM will continue -- * the evcition process for that BO by calling ttm_bo_evict --> amdgpu_bo_move -- * --> amdgpu_copy_buffer(). This sets up job in GPU scheduler. -- * -- * GPU Scheduler (amd_sched_main) - sets up a cb (fence_add_callback) to -- * nofity when the BO is free to move. fence_add_callback --> enable_signaling -- * --> amdgpu_amdkfd_fence.enable_signaling -- * -- * amdgpu_amdkfd_fence.enable_signaling - Start a work item that will quiesce -- * user queues and signal fence. The work item will also start another delayed -- * work item to restore BOs -- */ -- --struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context, -- void *mm) --{ -- struct amdgpu_amdkfd_fence *fence = NULL; -- -- fence = kzalloc(sizeof(*fence), GFP_KERNEL); -- if (fence == NULL) -- return NULL; -- -- /* mm_struct mm is used as void pointer to identify the parent -- * KFD process. Don't dereference it. Fence and any threads using -- * mm is guranteed to be released before process termination. -- */ -- fence->mm = mm; -- get_task_comm(fence->timeline_name, current); -- spin_lock_init(&fence->lock); -- -- dma_fence_init(&fence->base, &amd_kfd_fence_ops, &fence->lock, -- context, atomic_inc_return(&fence_seq)); -- -- return fence; --} -- --struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f) --{ -- struct amdgpu_amdkfd_fence *fence; -- -- if (!f) -- return NULL; -- -- fence = container_of(f, struct amdgpu_amdkfd_fence, base); -- if (fence && f->ops == &amd_kfd_fence_ops) -- return fence; -- -- return NULL; --} -- --static const char *amd_kfd_fence_get_driver_name(struct dma_fence *f) --{ -- return "amdgpu_amdkfd_fence"; --} -- --static const char *amd_kfd_fence_get_timeline_name(struct dma_fence *f) --{ -- struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f); -- -- return fence->timeline_name; --} -- --/** -- * amd_kfd_fence_enable_signaling - This gets called when TTM wants to evict -- * a KFD BO and schedules a job to move the BO. -- * If fence is already signaled return true. -- * If fence is not signaled schedule a evict KFD process work item. 
-- */ --static bool amd_kfd_fence_enable_signaling(struct dma_fence *f) --{ -- struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f); -- -- if (!fence) -- return false; -- -- if (dma_fence_is_signaled(f)) -- return true; -- -- if (!kgd2kfd->schedule_evict_and_restore_process( -- (struct mm_struct *)fence->mm, f)) -- return true; -- -- return false; --} -- --static int amd_kfd_fence_signal(struct dma_fence *f) --{ -- unsigned long flags; -- int ret; -- -- spin_lock_irqsave(f->lock, flags); -- /* Set enabled bit so cb will called */ -- set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &f->flags); -- ret = dma_fence_signal_locked(f); -- spin_unlock_irqrestore(f->lock, flags); -- -- return ret; --} -- --/** -- * amd_kfd_fence_release - callback that fence can be freed -- * -- * @fence: fence -- * -- * This function is called when the reference count becomes zero. -- * It just RCU schedules freeing up the fence. --*/ --static void amd_kfd_fence_release(struct dma_fence *f) --{ -- struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f); -- /* Unconditionally signal the fence. The process is getting -- * terminated. -- */ -- if (WARN_ON(!fence)) -- return; /* Not an amdgpu_amdkfd_fence */ -- -- amd_kfd_fence_signal(f); -- kfree_rcu(f, rcu); --} -- --/** -- * amd_kfd_fence_check_mm - Check if @mm is same as that of the fence @f -- * if same return TRUE else return FALSE. -- * -- * @f: [IN] fence -- * @mm: [IN] mm that needs to be verified --*/ --bool amd_kfd_fence_check_mm(struct dma_fence *f, void *mm) --{ -- struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f); -- -- if (!fence) -- return false; -- else if (fence->mm == mm) -- return true; -- -- return false; --} -- --const struct dma_fence_ops amd_kfd_fence_ops = { -- .get_driver_name = amd_kfd_fence_get_driver_name, -- .get_timeline_name = amd_kfd_fence_get_timeline_name, -- .enable_signaling = amd_kfd_fence_enable_signaling, -- .signaled = NULL, -- .wait = dma_fence_default_wait, -- .release = amd_kfd_fence_release, --}; -- -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c -old mode 100755 -new mode 100644 -index 6964ece..f6acf48 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c -@@ -20,9 +20,6 @@ - * OTHER DEALINGS IN THE SOFTWARE. - */ - --#undef pr_fmt --#define pr_fmt(fmt) "kfd2kgd: " fmt -- - #include <linux/fdtable.h> - #include <linux/uaccess.h> - #include <linux/firmware.h> -@@ -42,14 +39,6 @@ - #include "gmc/gmc_7_1_sh_mask.h" - #include "cik_structs.h" - --#define AMDKFD_SKIP_UNCOMPILED_CODE 1 -- --enum hqd_dequeue_request_type { -- NO_ACTION = 0, -- DRAIN_PIPE, -- RESET_WAVES --}; -- - enum { - MAX_TRAPID = 8, /* 3 bits in the bitfield. 
*/ - MAX_WATCH_ADDRESSES = 4 -@@ -66,8 +55,8 @@ enum { - enum { - ADDRESS_WATCH_REG_CNTL_ATC_BIT = 0x10000000UL, - ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK = 0x00FFFFFF, -- ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENTION = 0x03000000, -- /* extend the mask to 26 bits in order to match the low address field */ -+ ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENSION = 0x03000000, -+ /* extend the mask to 26 bits to match the low address field */ - ADDRESS_WATCH_REG_ADDLOW_SHIFT = 6, - ADDRESS_WATCH_REG_ADDHIGH_MASK = 0xFFFF - }; -@@ -92,42 +81,30 @@ union TCP_WATCH_CNTL_BITS { - float f32All; - }; - --static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, -- int fd, uint32_t handle, struct kgd_mem **mem); -- --static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); -- - /* - * Register access functions - */ - - static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, -- uint32_t sh_mem_config, uint32_t sh_mem_ape1_base, -- uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases); -+ uint32_t sh_mem_config, uint32_t sh_mem_ape1_base, -+ uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases); -+ - static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, -- unsigned int vmid); -+ unsigned int vmid); -+ - static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, -- uint32_t hpd_size, uint64_t hpd_gpu_addr); -+ uint32_t hpd_size, uint64_t hpd_gpu_addr); - static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); - static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, -- uint32_t queue_id, uint32_t __user *wptr, -- uint32_t wptr_shift, uint32_t wptr_mask, -- struct mm_struct *mm); --static int kgd_hqd_dump(struct kgd_dev *kgd, -- uint32_t pipe_id, uint32_t queue_id, -- uint32_t (**dump)[2], uint32_t *n_regs); --static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, -- uint32_t __user *wptr, struct mm_struct *mm); --static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, -- uint32_t engine_id, uint32_t queue_id, -- uint32_t (**dump)[2], uint32_t *n_regs); -+ uint32_t queue_id, uint32_t __user *wptr); -+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd); - static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, -- uint32_t pipe_id, uint32_t queue_id); --static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); --static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, -- enum kfd_preempt_type reset_type, -+ uint32_t pipe_id, uint32_t queue_id); -+ -+static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, - unsigned int utimeout, uint32_t pipe_id, - uint32_t queue_id); -+static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); - static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - unsigned int utimeout); - static int kgd_address_watch_disable(struct kgd_dev *kgd); -@@ -147,60 +124,21 @@ static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, uint8_t vmid); - static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, - uint8_t vmid); - static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); --static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid); --static void set_num_of_requests(struct kgd_dev *dev, uint8_t num_of_req); --static int alloc_memory_of_scratch(struct kgd_dev *kgd, -- uint64_t va, uint32_t vmid); --static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, -- uint8_t element_size, uint8_t index_stride, uint8_t mtype); --static void 
set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, -- uint32_t page_table_base); --static uint32_t read_vmid_from_vmfault_reg(struct kgd_dev *kgd); -- --/* Because of REG_GET_FIELD() being used, we put this function in the -- * asic specific file. -- */ --static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, -- struct tile_config *config) --{ -- struct amdgpu_device *adev = (struct amdgpu_device *)kgd; -- -- config->gb_addr_config = adev->gfx.config.gb_addr_config; -- config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, -- MC_ARB_RAMCFG, NOOFBANK); -- config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, -- MC_ARB_RAMCFG, NOOFRANKS); - -- config->tile_config_ptr = adev->gfx.config.tile_mode_array; -- config->num_tile_configs = -- ARRAY_SIZE(adev->gfx.config.tile_mode_array); -- config->macro_tile_config_ptr = -- adev->gfx.config.macrotile_mode_array; -- config->num_macro_tile_configs = -- ARRAY_SIZE(adev->gfx.config.macrotile_mode_array); -- -- -- return 0; --} -+static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); - - static const struct kfd2kgd_calls kfd2kgd = { - .init_gtt_mem_allocation = alloc_gtt_mem, - .free_gtt_mem = free_gtt_mem, -- .get_local_mem_info = get_local_mem_info, -+ .get_vmem_size = get_vmem_size, - .get_gpu_clock_counter = get_gpu_clock_counter, - .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, -- .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm, -- .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm, -- .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir, -- .open_graphic_handle = open_graphic_handle, - .program_sh_mem_settings = kgd_program_sh_mem_settings, - .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, - .init_pipeline = kgd_init_pipeline, - .init_interrupts = kgd_init_interrupts, - .hqd_load = kgd_hqd_load, - .hqd_sdma_load = kgd_hqd_sdma_load, -- .hqd_dump = kgd_hqd_dump, -- .hqd_sdma_dump = kgd_hqd_sdma_dump, - .hqd_is_occupied = kgd_hqd_is_occupied, - .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, - .hqd_destroy = kgd_hqd_destroy, -@@ -209,50 +147,17 @@ static const struct kfd2kgd_calls kfd2kgd = { - .address_watch_execute = kgd_address_watch_execute, - .wave_control_execute = kgd_wave_control_execute, - .address_watch_get_offset = kgd_address_watch_get_offset, -- .get_atc_vmid_pasid_mapping_pasid = -- get_atc_vmid_pasid_mapping_pasid, -- .get_atc_vmid_pasid_mapping_valid = -- get_atc_vmid_pasid_mapping_valid, -- .read_vmid_from_vmfault_reg = read_vmid_from_vmfault_reg, -+ .get_atc_vmid_pasid_mapping_pasid = get_atc_vmid_pasid_mapping_pasid, -+ .get_atc_vmid_pasid_mapping_valid = get_atc_vmid_pasid_mapping_valid, - .write_vmid_invalidate_request = write_vmid_invalidate_request, -- .invalidate_tlbs = invalidate_tlbs, -- .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory, -- .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, -- .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, -- .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, -- .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu, -- .get_fw_version = get_fw_version, -- .set_num_of_requests = set_num_of_requests, -- .get_cu_info = get_cu_info, -- .alloc_memory_of_scratch = alloc_memory_of_scratch, -- .write_config_static_mem = write_config_static_mem, -- .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo, -- .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel, -- .set_vm_context_page_table_base = set_vm_context_page_table_base, -- 
.pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table, -- .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table, -- .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info, -- .import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf, -- .export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf, -- .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info, -- .submit_ib = amdgpu_amdkfd_submit_ib, -- .get_tile_config = amdgpu_amdkfd_get_tile_config, -- .restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos, -- .copy_mem_to_mem = amdgpu_amdkfd_copy_mem_to_mem, -- .get_vram_usage = amdgpu_amdkfd_get_vram_usage -+ .get_fw_version = get_fw_version - }; - --struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions() -+struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void) - { - return (struct kfd2kgd_calls *)&kfd2kgd; - } - --static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, -- int fd, uint32_t handle, struct kgd_mem **mem) --{ -- return 0; --} -- - static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) - { - return (struct amdgpu_device *)kgd; -@@ -281,7 +186,7 @@ static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, - { - struct amdgpu_device *adev = get_amdgpu_device(kgd); - -- uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; -+ uint32_t mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; - uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); - - lock_srbm(kgd, mec, pipe, queue_id, 0); -@@ -317,12 +222,12 @@ static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, - - /* - * We have to assume that there is no outstanding mapping. -- * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because a -- * mapping is in progress or because a mapping finished and the SW -- * cleared it. So the protocol is to always wait & clear. -+ * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because -+ * a mapping is in progress or because a mapping finished and the -+ * SW cleared it. So the protocol is to always wait & clear. - */ -- uint32_t pasid_mapping = (pasid == 0) ? 0 : -- (uint32_t)pasid | ATC_VMID0_PASID_MAPPING__VALID_MASK; -+ uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | -+ ATC_VMID0_PASID_MAPPING__VALID_MASK; - - WREG32(mmATC_VMID0_PASID_MAPPING + vmid, pasid_mapping); - -@@ -368,7 +273,8 @@ static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m) - - retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET + - m->sdma_queue_id * KFD_CIK_SDMA_QUEUE_OFFSET; -- pr_debug("sdma base address: 0x%x\n", retval); -+ -+ pr_debug("kfd: sdma base address: 0x%x\n", retval); - - return retval; - } -@@ -384,138 +290,42 @@ static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) - } - - static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, -- uint32_t queue_id, uint32_t __user *wptr, -- uint32_t wptr_shift, uint32_t wptr_mask, -- struct mm_struct *mm) -+ uint32_t queue_id, uint32_t __user *wptr) - { - struct amdgpu_device *adev = get_amdgpu_device(kgd); -+ uint32_t wptr_shadow, is_wptr_shadow_valid; - struct cik_mqd *m; -- uint32_t *mqd_hqd; -- uint32_t reg, wptr_val, data; -- bool valid_wptr = false; - - m = get_mqd(mqd); - -- acquire_queue(kgd, pipe_id, queue_id); -- -- /* HQD registers extend from CP_MQD_BASE_ADDR to CP_MQD_CONTROL. 
*/ -- mqd_hqd = &m->cp_mqd_base_addr_lo; -- -- for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_MQD_CONTROL; reg++) -- WREG32(reg, mqd_hqd[reg - mmCP_MQD_BASE_ADDR]); -- -- /* Copy userspace write pointer value to register. -- * Activate doorbell logic to monitor subsequent changes. -- */ -- data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control, -- CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); -- WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, data); -- -- /* read_user_ptr may take the mm->mmap_sem. -- * release srbm_mutex to avoid circular dependency between -- * srbm_mutex->mm_sem->reservation_ww_class_mutex->srbm_mutex. -- */ -- release_queue(kgd); -- valid_wptr = read_user_wptr(mm, wptr, wptr_val); -+ is_wptr_shadow_valid = !get_user(wptr_shadow, wptr); -+ if (is_wptr_shadow_valid) -+ m->cp_hqd_pq_wptr = wptr_shadow; - - acquire_queue(kgd, pipe_id, queue_id); -- if (valid_wptr) -- WREG32(mmCP_HQD_PQ_WPTR, (wptr_val << wptr_shift) & wptr_mask); -- -- data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); -- WREG32(mmCP_HQD_ACTIVE, data); -- - -+ gfx_v7_0_mqd_commit(adev, m); - release_queue(kgd); - - return 0; - } - --static int kgd_hqd_dump(struct kgd_dev *kgd, -- uint32_t pipe_id, uint32_t queue_id, -- uint32_t (**dump)[2], uint32_t *n_regs) --{ -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- uint32_t i = 0, reg; --#define HQD_N_REGS (35+4) --#define DUMP_REG(addr) do { \ -- if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ -- break; \ -- (*dump)[i][0] = (addr) << 2; \ -- (*dump)[i++][1] = RREG32(addr); \ -- } while (0) -- -- *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); -- if (*dump == NULL) -- return -ENOMEM; -- -- acquire_queue(kgd, pipe_id, queue_id); -- -- DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE0); -- DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE1); -- DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE2); -- DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE3); -- -- for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_MQD_CONTROL; reg++) -- DUMP_REG(reg); -- -- release_queue(kgd); -- -- WARN_ON_ONCE(i != HQD_N_REGS); -- *n_regs = i; -- -- return 0; --} -- --static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, -- uint32_t __user *wptr, struct mm_struct *mm) -+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) - { - struct amdgpu_device *adev = get_amdgpu_device(kgd); - struct cik_sdma_rlc_registers *m; - unsigned long end_jiffies; - uint32_t sdma_base_addr; -- uint32_t data; - - m = get_sdma_mqd(mqd); - sdma_base_addr = get_sdma_base_addr(m); - -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, -- m->sdma_rlc_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); -- -- while (true) { -- temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); -- if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) -- break; -- if (timeout == 0) -- return -ETIME; -- msleep(10); -- timeout -= 10; -- } -- if (m->sdma_engine_id) { -- data = RREG32(mmSDMA1_GFX_CONTEXT_CNTL); -- data = REG_SET_FIELD(data, SDMA1_GFX_CONTEXT_CNTL, -- RESUME_CTX, 0); -- WREG32(mmSDMA1_GFX_CONTEXT_CNTL, data); -- } else { -- data = RREG32(mmSDMA0_GFX_CONTEXT_CNTL); -- data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, -- RESUME_CTX, 0); -- WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data); -- } -- -- data = REG_SET_FIELD(m->sdma_rlc_doorbell, SDMA0_RLC0_DOORBELL, -- ENABLE, 1); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdma_rlc_rb_rptr); -- if (read_user_wptr(mm, wptr, data)) -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, data); -- else -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 
-- m->sdma_rlc_rb_rptr); -- -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, -+ m->sdma_rlc_virtual_addr); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, -+ m->sdma_rlc_rb_base); - WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, - m->sdma_rlc_virtual_addr); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdma_rlc_rb_base); - - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, - m->sdma_rlc_rb_base_hi); -@@ -523,35 +333,11 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, - m->sdma_rlc_rb_rptr_addr_lo); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, - m->sdma_rlc_rb_rptr_addr_hi); -- data = REG_SET_FIELD(m->sdma_rlc_rb_cntl, SDMA0_RLC0_RB_CNTL, -- RB_ENABLE, 1); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data); -- return 0; --} -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, -+ m->sdma_rlc_doorbell); - --static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, -- uint32_t engine_id, uint32_t queue_id, -- uint32_t (**dump)[2], uint32_t *n_regs) --{ -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- uint32_t sdma_offset = engine_id * SDMA1_REGISTER_OFFSET + -- queue_id * KFD_CIK_SDMA_QUEUE_OFFSET; -- uint32_t i = 0, reg; --#undef HQD_N_REGS --#define HQD_N_REGS (19+4) -- -- *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); -- if (*dump == NULL) -- return -ENOMEM; -- -- for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) -- DUMP_REG(sdma_offset + reg); -- for (reg = mmSDMA0_RLC0_VIRTUAL_ADDR; reg <= mmSDMA0_RLC0_WATERMARK; -- reg++) -- DUMP_REG(sdma_offset + reg); -- -- WARN_ON_ONCE(i != HQD_N_REGS); -- *n_regs = i; -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, -+ m->sdma_rlc_rb_cntl); - - return 0; - } -@@ -596,99 +382,30 @@ static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) - return false; - } - --static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, -- enum kfd_preempt_type reset_type, -+static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, - unsigned int utimeout, uint32_t pipe_id, - uint32_t queue_id) - { - struct amdgpu_device *adev = get_amdgpu_device(kgd); - uint32_t temp; -- enum hqd_dequeue_request_type type; -- unsigned long flags, end_jiffies; -- int retry; -+ int timeout = utimeout; - - acquire_queue(kgd, pipe_id, queue_id); - WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, 0); - -- switch (reset_type) { -- case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: -- type = DRAIN_PIPE; -- break; -- case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: -- type = RESET_WAVES; -- break; -- default: -- type = DRAIN_PIPE; -- break; -- } -- -- /* Workaround: If IQ timer is active and the wait time is close to or -- * equal to 0, dequeueing is not safe. Wait until either the wait time -- * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is -- * cleared before continuing. Also, ensure wait times are set to at -- * least 0x3. 
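/* The revert also swaps the newer jiffies-deadline polling style
 * (end_jiffies = (utimeout * HZ / 1000) + jiffies; time_after();
 * usleep_range(500, 1000)) for a plain millisecond countdown with
 * msleep(20). A user-space model of the countdown form, with hw_idle()
 * standing in for the CONTEXT_STATUS / CP_HQD_ACTIVE read: */
#define _POSIX_C_SOURCE 199309L
#include <stdbool.h>
#include <time.h>

static bool hw_idle(void)
{
	static int polls;
	return ++polls > 3;	/* model: engine idles after three polls */
}

static int wait_idle(int timeout_ms)
{
	const struct timespec step = { .tv_sec = 0, .tv_nsec = 20000000L };

	while (!hw_idle()) {
		if (timeout_ms <= 0)
			return -1;	/* -ETIME in the driver */
		nanosleep(&step, NULL);	/* msleep(20) stand-in */
		timeout_ms -= 20;
	}
	return 0;
}

int main(void)
{
	return wait_idle(100);
}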
-- */ -- local_irq_save(flags); -- preempt_disable(); -- retry = 5000; /* wait for 500 usecs at maximum */ -- while (true) { -- temp = RREG32(mmCP_HQD_IQ_TIMER); -- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) { -- pr_debug("HW is processing IQ\n"); -- goto loop; -- } -- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) { -- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE) -- == 3) /* SEM-rearm is safe */ -- break; -- /* Wait time 3 is safe for CP, but our MMIO read/write -- * time is close to 1 microsecond, so check for 10 to -- * leave more buffer room -- */ -- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME) -- >= 10) -- break; -- pr_debug("IQ timer is active\n"); -- } else -- break; --loop: -- if (!retry) { -- pr_err("CP HQD IQ timer status time out\n"); -- break; -- } -- ndelay(100); -- --retry; -- } -- retry = 1000; -- while (true) { -- temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST); -- if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK)) -- break; -- pr_debug("Dequeue request is pending\n"); -- -- if (!retry) { -- pr_err("CP HQD dequeue request time out\n"); -- break; -- } -- ndelay(100); -- --retry; -- } -- local_irq_restore(flags); -- preempt_enable(); -- -- WREG32(mmCP_HQD_DEQUEUE_REQUEST, type); -+ WREG32(mmCP_HQD_DEQUEUE_REQUEST, reset_type); - -- end_jiffies = (utimeout * HZ / 1000) + jiffies; - while (true) { - temp = RREG32(mmCP_HQD_ACTIVE); -- if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) -+ if (temp & CP_HQD_ACTIVE__ACTIVE_MASK) - break; -- if (time_after(jiffies, end_jiffies)) { -- pr_err("cp queue preemption time out\n"); -+ if (timeout <= 0) { -+ pr_err("kfd: cp queue preemption time out.\n"); - release_queue(kgd); - return -ETIME; - } -- usleep_range(500, 1000); -+ msleep(20); -+ timeout -= 20; - } - - release_queue(kgd); -@@ -702,7 +419,7 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - struct cik_sdma_rlc_registers *m; - uint32_t sdma_base_addr; - uint32_t temp; -- unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; -+ int timeout = utimeout; - - m = get_sdma_mqd(mqd); - sdma_base_addr = get_sdma_base_addr(m); -@@ -713,19 +430,18 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - - while (true) { - temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); -- if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) -+ if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT) - break; -- if (time_after(jiffies, end_jiffies)) -+ if (timeout <= 0) - return -ETIME; -- usleep_range(500, 1000); -+ msleep(20); -+ timeout -= 20; - } - - WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, -- RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | -- SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); -- -- m->sdma_rlc_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, 0); - - return 0; - } -@@ -744,9 +460,8 @@ static int kgd_address_watch_disable(struct kgd_dev *kgd) - - /* Turning off this address until we set all the registers */ - for (i = 0; i < MAX_WATCH_ADDRESSES; i++) -- WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX -- + ADDRESS_WATCH_REG_CNTL], -- cntl.u32All); -+ WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX + -+ ADDRESS_WATCH_REG_CNTL], cntl.u32All); - - return 0; - } -@@ -764,24 +479,20 @@ static int kgd_address_watch_execute(struct kgd_dev *kgd, - - /* Turning off this watch point until we set all the registers */ - cntl.bitfields.valid = 0; -- 
WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX -- + ADDRESS_WATCH_REG_CNTL], -- cntl.u32All); -+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + -+ ADDRESS_WATCH_REG_CNTL], cntl.u32All); - -- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX -- + ADDRESS_WATCH_REG_ADDR_HI], -- addr_hi); -+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + -+ ADDRESS_WATCH_REG_ADDR_HI], addr_hi); - -- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX -- + ADDRESS_WATCH_REG_ADDR_LO], -- addr_lo); -+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + -+ ADDRESS_WATCH_REG_ADDR_LO], addr_lo); - - /* Enable the watch point */ - cntl.bitfields.valid = 1; - -- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX -- + ADDRESS_WATCH_REG_CNTL], -- cntl.u32All); -+ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + -+ ADDRESS_WATCH_REG_CNTL], cntl.u32All); - - return 0; - } -@@ -835,7 +546,7 @@ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - - reg = RREG32(mmATC_VMID0_PASID_MAPPING + vmid); -- return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; -+ return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; - } - - static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) -@@ -845,90 +556,52 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) - WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); - } - --static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid) --{ -- struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -- int vmid; -- -- for (vmid = 0; vmid < 16; vmid++) { -- if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) -- continue; -- if (RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & -- ATC_VMID0_PASID_MAPPING__VALID_MASK) { -- if ((RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & -- ATC_VMID0_PASID_MAPPING__PASID_MASK) == pasid) { -- WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); -- break; -- } -- } -- } -- -- return 0; --} -- --static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, -- uint8_t element_size, uint8_t index_stride, uint8_t mtype) --{ -- uint32_t reg; -- struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -- -- reg = swizzle_enable << SH_STATIC_MEM_CONFIG__SWIZZLE_ENABLE__SHIFT | -- element_size << SH_STATIC_MEM_CONFIG__ELEMENT_SIZE__SHIFT | -- index_stride << SH_STATIC_MEM_CONFIG__INDEX_STRIDE__SHIFT | -- mtype << SH_STATIC_MEM_CONFIG__PRIVATE_MTYPE__SHIFT; -- -- WREG32(mmSH_STATIC_MEM_CONFIG, reg); -- return 0; --} --static int alloc_memory_of_scratch(struct kgd_dev *kgd, -- uint64_t va, uint32_t vmid) --{ -- struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -- -- lock_srbm(kgd, 0, 0, 0, vmid); -- WREG32(mmSH_HIDDEN_PRIVATE_BASE_VMID, va); -- unlock_srbm(kgd); -- -- return 0; --} -- -- - static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) - { - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - const union amdgpu_firmware_header *hdr; - -+ BUG_ON(kgd == NULL); -+ - switch (type) { - case KGD_ENGINE_PFP: -- hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data; -+ hdr = (const union amdgpu_firmware_header *) -+ adev->gfx.pfp_fw->data; - break; - - case KGD_ENGINE_ME: -- hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data; -+ hdr = (const union amdgpu_firmware_header *) -+ adev->gfx.me_fw->data; - break; - - case KGD_ENGINE_CE: -- hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data; -+ hdr = (const union amdgpu_firmware_header 
*) -+ adev->gfx.ce_fw->data; - break; - - case KGD_ENGINE_MEC1: -- hdr = (const union amdgpu_firmware_header *)adev->gfx.mec_fw->data; -+ hdr = (const union amdgpu_firmware_header *) -+ adev->gfx.mec_fw->data; - break; - - case KGD_ENGINE_MEC2: -- hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data; -+ hdr = (const union amdgpu_firmware_header *) -+ adev->gfx.mec2_fw->data; - break; - - case KGD_ENGINE_RLC: -- hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data; -+ hdr = (const union amdgpu_firmware_header *) -+ adev->gfx.rlc_fw->data; - break; - - case KGD_ENGINE_SDMA1: -- hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data; -+ hdr = (const union amdgpu_firmware_header *) -+ adev->sdma.instance[0].fw->data; - break; - - case KGD_ENGINE_SDMA2: -- hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data; -+ hdr = (const union amdgpu_firmware_header *) -+ adev->sdma.instance[1].fw->data; - break; - - default: -@@ -942,42 +615,3 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) - return hdr->common.ucode_version; - } - --static void set_num_of_requests(struct kgd_dev *dev, uint8_t num_of_req) --{ -- uint32_t value; -- struct amdgpu_device *adev = get_amdgpu_device(dev); -- -- value = RREG32(mmATC_ATS_DEBUG); -- value &= ~ATC_ATS_DEBUG__NUM_REQUESTS_AT_ERR_MASK; -- value |= (num_of_req << ATC_ATS_DEBUG__NUM_REQUESTS_AT_ERR__SHIFT); -- -- WREG32(mmATC_ATS_DEBUG, value); --} -- --static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, -- uint32_t page_table_base) --{ -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- /* TODO: Don't use hardcoded VMIDs */ -- if (vmid < 8 || vmid > 15) { -- pr_err("trying to set page table base for wrong VMID\n"); -- return; -- } -- WREG32(mmVM_CONTEXT8_PAGE_TABLE_BASE_ADDR + vmid - 8, page_table_base); --} -- -- /** -- * read_vmid_from_vmfault_reg - read vmid from register -- * -- * adev: amdgpu_device pointer -- * @vmid: vmid pointer -- * read vmid from register (CIK). -- */ --static uint32_t read_vmid_from_vmfault_reg(struct kgd_dev *kgd) --{ -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- -- uint32_t status = RREG32(mmVM_CONTEXT1_PROTECTION_FAULT_STATUS); -- -- return REG_GET_FIELD(status, VM_CONTEXT1_PROTECTION_FAULT_STATUS, VMID); --} -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c -old mode 100755 -new mode 100644 -index 2ff10e9..133d066 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c -@@ -20,9 +20,6 @@ - * OTHER DEALINGS IN THE SOFTWARE. 
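/* get_fw_version() in both reverted files just picks a firmware image and
 * reads the version out of the common header at its start. A stand-alone
 * model of that lookup; the struct below is a simplified stand-in for the
 * headers in amdgpu_ucode.h, keeping only the field the function uses: */
#include <stdint.h>

struct common_firmware_header {
	uint32_t size_bytes;
	uint32_t header_size_bytes;
	uint32_t ucode_version;
};

static uint16_t fw_version(const void *fw_data)
{
	const struct common_firmware_header *hdr = fw_data;

	return (uint16_t)hdr->ucode_version;	/* "Only 12 bit in use" */
}

int main(void)
{
	struct common_firmware_header fake = { .ucode_version = 0x147 };

	return fw_version(&fake) == 0x147 ? 0 : 1;
}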
- */ - --#undef pr_fmt --#define pr_fmt(fmt) "kfd2kgd: " fmt -- - #include <linux/module.h> - #include <linux/fdtable.h> - #include <linux/uaccess.h> -@@ -31,7 +28,7 @@ - #include "amdgpu.h" - #include "amdgpu_amdkfd.h" - #include "amdgpu_ucode.h" --#include "amdgpu_amdkfd_gfx_v8.h" -+#include "gfx_v8_0.h" - #include "gca/gfx_8_0_sh_mask.h" - #include "gca/gfx_8_0_d.h" - #include "gca/gfx_8_0_enum.h" -@@ -42,31 +39,7 @@ - #include "vi_structs.h" - #include "vid.h" - --enum hqd_dequeue_request_type { -- NO_ACTION = 0, -- DRAIN_PIPE, -- RESET_WAVES, -- SAVE_WAVES --}; -- --static const uint32_t watchRegs[MAX_WATCH_ADDRESSES * ADDRESS_WATCH_REG_MAX] = { -- mmTCP_WATCH0_ADDR_H, mmTCP_WATCH0_ADDR_L, mmTCP_WATCH0_CNTL, -- mmTCP_WATCH1_ADDR_H, mmTCP_WATCH1_ADDR_L, mmTCP_WATCH1_CNTL, -- mmTCP_WATCH2_ADDR_H, mmTCP_WATCH2_ADDR_L, mmTCP_WATCH2_CNTL, -- mmTCP_WATCH3_ADDR_H, mmTCP_WATCH3_ADDR_L, mmTCP_WATCH3_CNTL --}; -- -- --struct vi_sdma_mqd; -- --static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, -- void *vm, struct kgd_mem **mem); --static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem); -- --static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, -- int fd, uint32_t handle, struct kgd_mem **mem); -- --static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); -+struct cik_sdma_rlc_registers; - - /* - * Register access functions -@@ -82,26 +55,17 @@ static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, - uint32_t hpd_size, uint64_t hpd_gpu_addr); - static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); - static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, -- uint32_t queue_id, uint32_t __user *wptr, -- uint32_t wptr_shift, uint32_t wptr_mask, -- struct mm_struct *mm); --static int kgd_hqd_dump(struct kgd_dev *kgd, -- uint32_t pipe_id, uint32_t queue_id, -- uint32_t (**dump)[2], uint32_t *n_regs); --static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, -- uint32_t __user *wptr, struct mm_struct *mm); --static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, -- uint32_t engine_id, uint32_t queue_id, -- uint32_t (**dump)[2], uint32_t *n_regs); -+ uint32_t queue_id, uint32_t __user *wptr); -+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd); - static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, - uint32_t pipe_id, uint32_t queue_id); - static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); --static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, -- enum kfd_preempt_type reset_type, -+static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, - unsigned int utimeout, uint32_t pipe_id, - uint32_t queue_id); - static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - unsigned int utimeout); -+static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); - static int kgd_address_watch_disable(struct kgd_dev *kgd); - static int kgd_address_watch_execute(struct kgd_dev *kgd, - unsigned int watch_point_id, -@@ -120,61 +84,20 @@ static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, - static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, - uint8_t vmid); - static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); --static void set_num_of_requests(struct kgd_dev *kgd, -- uint8_t num_of_requests); --static int alloc_memory_of_scratch(struct kgd_dev *kgd, -- uint64_t va, uint32_t vmid); --static int write_config_static_mem(struct kgd_dev 
*kgd, bool swizzle_enable, -- uint8_t element_size, uint8_t index_stride, uint8_t mtype); --static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, -- uint32_t page_table_base); --static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid); -- --/* Because of REG_GET_FIELD() being used, we put this function in the -- * asic specific file. -- */ --static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, -- struct tile_config *config) --{ -- struct amdgpu_device *adev = (struct amdgpu_device *)kgd; -- -- config->gb_addr_config = adev->gfx.config.gb_addr_config; -- config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, -- MC_ARB_RAMCFG, NOOFBANK); -- config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, -- MC_ARB_RAMCFG, NOOFRANKS); -- -- config->tile_config_ptr = adev->gfx.config.tile_mode_array; -- config->num_tile_configs = -- ARRAY_SIZE(adev->gfx.config.tile_mode_array); -- config->macro_tile_config_ptr = -- adev->gfx.config.macrotile_mode_array; -- config->num_macro_tile_configs = -- ARRAY_SIZE(adev->gfx.config.macrotile_mode_array); -- -- return 0; --} -+static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); - - static const struct kfd2kgd_calls kfd2kgd = { - .init_gtt_mem_allocation = alloc_gtt_mem, - .free_gtt_mem = free_gtt_mem, -- .get_local_mem_info = get_local_mem_info, -+ .get_vmem_size = get_vmem_size, - .get_gpu_clock_counter = get_gpu_clock_counter, - .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, -- .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm, -- .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm, -- .create_process_gpumem = create_process_gpumem, -- .destroy_process_gpumem = destroy_process_gpumem, -- .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir, -- .open_graphic_handle = open_graphic_handle, - .program_sh_mem_settings = kgd_program_sh_mem_settings, - .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, - .init_pipeline = kgd_init_pipeline, - .init_interrupts = kgd_init_interrupts, - .hqd_load = kgd_hqd_load, - .hqd_sdma_load = kgd_hqd_sdma_load, -- .hqd_dump = kgd_hqd_dump, -- .hqd_sdma_dump = kgd_hqd_sdma_dump, - .hqd_is_occupied = kgd_hqd_is_occupied, - .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, - .hqd_destroy = kgd_hqd_destroy, -@@ -188,56 +111,14 @@ static const struct kfd2kgd_calls kfd2kgd = { - .get_atc_vmid_pasid_mapping_valid = - get_atc_vmid_pasid_mapping_valid, - .write_vmid_invalidate_request = write_vmid_invalidate_request, -- .invalidate_tlbs = invalidate_tlbs, -- .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory, -- .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, -- .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, -- .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, -- .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu, -- .get_fw_version = get_fw_version, -- .set_num_of_requests = set_num_of_requests, -- .get_cu_info = get_cu_info, -- .alloc_memory_of_scratch = alloc_memory_of_scratch, -- .write_config_static_mem = write_config_static_mem, -- .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo, -- .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel, -- .set_vm_context_page_table_base = set_vm_context_page_table_base, -- .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table, -- .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table, -- .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info, -- .import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf, -- 
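/* The kfd2kgd_calls tables above are the usual kernel "ops struct" pattern:
 * KFD reaches the hardware only through per-ASIC function pointers, and this
 * revert shrinks each table back to the older, smaller interface. A minimal
 * stand-alone model of the pattern (all names below are illustrative): */
#include <stdint.h>
#include <stdio.h>

struct hw_ops {
	uint16_t (*get_fw_version)(void *dev, int engine);
	int (*hqd_load)(void *dev, void *mqd, uint32_t pipe, uint32_t queue);
};

static uint16_t v8_get_fw_version(void *dev, int engine)
{
	(void)dev; (void)engine;
	return 0x123;
}

static int v8_hqd_load(void *dev, void *mqd, uint32_t pipe, uint32_t queue)
{
	(void)dev; (void)mqd; (void)pipe; (void)queue;
	return 0;
}

static const struct hw_ops v8_ops = {
	.get_fw_version	= v8_get_fw_version,
	.hqd_load	= v8_hqd_load,
};

/* Mirrors amdgpu_amdkfd_gfx_8_0_get_functions(): one getter per ASIC file. */
static const struct hw_ops *get_v8_ops(void)
{
	return &v8_ops;
}

int main(void)
{
	printf("fw version: %#x\n", get_v8_ops()->get_fw_version(NULL, 0));
	return 0;
}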
.export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf, -- .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info, -- .submit_ib = amdgpu_amdkfd_submit_ib, -- .get_tile_config = amdgpu_amdkfd_get_tile_config, -- .restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos, -- .copy_mem_to_mem = amdgpu_amdkfd_copy_mem_to_mem, -- .get_vram_usage = amdgpu_amdkfd_get_vram_usage -+ .get_fw_version = get_fw_version - }; - --struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions() -+struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void) - { - return (struct kfd2kgd_calls *)&kfd2kgd; - } - --static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, -- void *vm, struct kgd_mem **mem) --{ -- return 0; --} -- --/* Destroys the GPU allocation and frees the kgd_mem structure */ --static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem) --{ -- --} -- --static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, -- int fd, uint32_t handle, struct kgd_mem **mem) --{ -- return 0; --} -- - static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) - { - return (struct amdgpu_device *)kgd; -@@ -266,7 +147,7 @@ static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, - { - struct amdgpu_device *adev = get_amdgpu_device(kgd); - -- uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; -+ uint32_t mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; - uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); - - lock_srbm(kgd, mec, pipe, queue_id, 0); -@@ -335,28 +216,21 @@ static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) - uint32_t mec; - uint32_t pipe; - -- mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; -+ mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; - pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); - - lock_srbm(kgd, mec, pipe, 0, 0); - -- WREG32(mmCPC_INT_CNTL, CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | -- CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); -+ WREG32(mmCPC_INT_CNTL, CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK); - - unlock_srbm(kgd); - - return 0; - } - --static inline uint32_t get_sdma_base_addr(struct vi_sdma_mqd *m) -+static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m) - { -- uint32_t retval; -- -- retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET + -- m->sdma_queue_id * KFD_VI_SDMA_QUEUE_OFFSET; -- pr_debug("sdma base address: 0x%x\n", retval); -- -- return retval; -+ return 0; - } - - static inline struct vi_mqd *get_mqd(void *mqd) -@@ -364,224 +238,33 @@ static inline struct vi_mqd *get_mqd(void *mqd) - return (struct vi_mqd *)mqd; - } - --static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) -+static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) - { -- return (struct vi_sdma_mqd *)mqd; -+ return (struct cik_sdma_rlc_registers *)mqd; - } - - static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, -- uint32_t queue_id, uint32_t __user *wptr, -- uint32_t wptr_shift, uint32_t wptr_mask, -- struct mm_struct *mm) -+ uint32_t queue_id, uint32_t __user *wptr) - { -- struct amdgpu_device *adev = get_amdgpu_device(kgd); - struct vi_mqd *m; -- uint32_t *mqd_hqd; -- uint32_t reg, wptr_val, data; -- bool valid_wptr = false; -+ uint32_t shadow_wptr, valid_wptr; -+ struct amdgpu_device *adev = get_amdgpu_device(kgd); - - m = get_mqd(mqd); - -- acquire_queue(kgd, pipe_id, queue_id); -- -- /* HIQ is set during driver init period with vmid set to 0. 
For SRIOV -- * world switching support let the RLC know about the HIQ. -- * -- * Workaround: This causes reboots on CZ. Disable this on CZ, which -- * doesn't support SRIOV anyway. -- */ -- if (m->cp_hqd_vmid == 0 && -- adev->asic_type != CHIP_CARRIZO) { -- uint32_t value, mec, pipe; -- -- mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; -- pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); -- -- pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n", -- mec, pipe, queue_id); -- value = RREG32(mmRLC_CP_SCHEDULERS); -- value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1, -- ((mec << 5) | (pipe << 3) | queue_id | 0x80)); -- WREG32(mmRLC_CP_SCHEDULERS, value); -- } -- -- /* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */ -- mqd_hqd = &m->cp_mqd_base_addr_lo; -+ valid_wptr = copy_from_user(&shadow_wptr, wptr, sizeof(shadow_wptr)); -+ if (valid_wptr == 0) -+ m->cp_hqd_pq_wptr = shadow_wptr; - -- for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_HQD_EOP_CONTROL; reg++) -- WREG32(reg, mqd_hqd[reg - mmCP_MQD_BASE_ADDR]); -- -- /* Tonga errata: EOP RPTR/WPTR should be left unmodified. -- * This is safe since EOP RPTR==WPTR for any inactive HQD -- * on ASICs that do not support context-save. -- * EOP writes/reads can start anywhere in the ring. -- */ -- if (get_amdgpu_device(kgd)->asic_type != CHIP_TONGA) { -- WREG32(mmCP_HQD_EOP_RPTR, m->cp_hqd_eop_rptr); -- WREG32(mmCP_HQD_EOP_WPTR, m->cp_hqd_eop_wptr); -- WREG32(mmCP_HQD_EOP_WPTR_MEM, m->cp_hqd_eop_wptr_mem); -- } -- -- for (reg = mmCP_HQD_EOP_EVENTS; reg <= mmCP_HQD_ERROR; reg++) -- WREG32(reg, mqd_hqd[reg - mmCP_MQD_BASE_ADDR]); -- -- /* Copy userspace write pointer value to register. -- * Activate doorbell logic to monitor subsequent changes. -- */ -- data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control, -- CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); -- WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, data); -- -- /* read_user_ptr may take the mm->mmap_sem. -- * release srbm_mutex to avoid circular dependency between -- * srbm_mutex->mm_sem->reservation_ww_class_mutex->srbm_mutex. 
-- */ -- release_queue(kgd); -- valid_wptr = read_user_wptr(mm, wptr, wptr_val); - acquire_queue(kgd, pipe_id, queue_id); -- if (valid_wptr) -- WREG32(mmCP_HQD_PQ_WPTR, (wptr_val << wptr_shift) & wptr_mask); -- -- data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); -- WREG32(mmCP_HQD_ACTIVE, data); -- -+ gfx_v8_0_mqd_commit(adev, mqd); - release_queue(kgd); - - return 0; - } - --static int kgd_hqd_dump(struct kgd_dev *kgd, -- uint32_t pipe_id, uint32_t queue_id, -- uint32_t (**dump)[2], uint32_t *n_regs) -+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) - { -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- uint32_t i = 0, reg; --#define HQD_N_REGS (54+4) --#define DUMP_REG(addr) do { \ -- if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ -- break; \ -- (*dump)[i][0] = (addr) << 2; \ -- (*dump)[i++][1] = RREG32(addr); \ -- } while (0) -- -- *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); -- if (*dump == NULL) -- return -ENOMEM; -- -- acquire_queue(kgd, pipe_id, queue_id); -- -- DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE0); -- DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE1); -- DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE2); -- DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE3); -- -- for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_HQD_EOP_DONES; reg++) -- DUMP_REG(reg); -- -- release_queue(kgd); -- -- WARN_ON_ONCE(i != HQD_N_REGS); -- *n_regs = i; -- -- return 0; --} -- --static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, -- uint32_t __user *wptr, struct mm_struct *mm) --{ -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- struct vi_sdma_mqd *m; -- uint32_t sdma_base_addr; -- uint32_t temp, timeout = 2000; -- uint32_t data; -- -- m = get_sdma_mqd(mqd); -- sdma_base_addr = get_sdma_base_addr(m); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, -- m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); -- -- while (true) { -- temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); -- if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) -- break; -- if (timeout == 0) -- return -ETIME; -- msleep(10); -- timeout -= 10; -- } -- if (m->sdma_engine_id) { -- data = RREG32(mmSDMA1_GFX_CONTEXT_CNTL); -- data = REG_SET_FIELD(data, SDMA1_GFX_CONTEXT_CNTL, -- RESUME_CTX, 0); -- WREG32(mmSDMA1_GFX_CONTEXT_CNTL, data); -- } else { -- data = RREG32(mmSDMA0_GFX_CONTEXT_CNTL); -- data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, -- RESUME_CTX, 0); -- WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data); -- } -- -- data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL, -- ENABLE, 1); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr); -- -- if (read_user_wptr(mm, wptr, data)) -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, data); -- else -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, -- m->sdmax_rlcx_rb_rptr); -- -- WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, -- m->sdmax_rlcx_virtual_addr); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, -- m->sdmax_rlcx_rb_base_hi); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, -- m->sdmax_rlcx_rb_rptr_addr_lo); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, -- m->sdmax_rlcx_rb_rptr_addr_hi); -- -- data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL, -- RB_ENABLE, 1); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data); -- -- return 0; --} -- --static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, -- uint32_t engine_id, uint32_t queue_id, -- 
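/* Both hqd_load variants reduce to the same idea: try to read the user-mode
 * write pointer, and if that fails fall back to the saved read pointer,
 * since WPTR == RPTR just makes the ring look empty, which is safe. A
 * user-space model of that decision; fake_copy_from_user() mimics the
 * kernel helper's convention of returning the number of bytes NOT copied: */
#include <stdint.h>
#include <string.h>

static unsigned long fake_copy_from_user(void *dst, const void *src,
					 unsigned long n)
{
	if (!src)
		return n;	/* model a faulting user pointer */
	memcpy(dst, src, n);
	return 0;
}

static uint32_t pick_wptr(const uint32_t *user_wptr, uint32_t saved_rptr)
{
	uint32_t wptr;

	if (fake_copy_from_user(&wptr, user_wptr, sizeof(wptr)) == 0)
		return wptr;	/* user value was readable */
	return saved_rptr;	/* safe fallback: ring appears empty */
}

int main(void)
{
	uint32_t user_value = 42;

	return pick_wptr(&user_value, 7) == 42 &&
	       pick_wptr(NULL, 7) == 7 ? 0 : 1;
}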
uint32_t (**dump)[2], uint32_t *n_regs) --{ -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- uint32_t sdma_offset = engine_id * SDMA1_REGISTER_OFFSET + -- queue_id * KFD_VI_SDMA_QUEUE_OFFSET; -- uint32_t i = 0, reg; --#undef HQD_N_REGS --#define HQD_N_REGS (19+4+2+3+7) -- -- *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); -- if (*dump == NULL) -- return -ENOMEM; -- -- for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) -- DUMP_REG(sdma_offset + reg); -- for (reg = mmSDMA0_RLC0_VIRTUAL_ADDR; reg <= mmSDMA0_RLC0_WATERMARK; -- reg++) -- DUMP_REG(sdma_offset + reg); -- for (reg = mmSDMA0_RLC0_CSA_ADDR_LO; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; -- reg++) -- DUMP_REG(sdma_offset + reg); -- for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; reg <= mmSDMA0_RLC0_DUMMY_REG; -- reg++) -- DUMP_REG(sdma_offset + reg); -- for (reg = mmSDMA0_RLC0_MIDCMD_DATA0; reg <= mmSDMA0_RLC0_MIDCMD_CNTL; -- reg++) -- DUMP_REG(sdma_offset + reg); -- -- WARN_ON_ONCE(i != HQD_N_REGS); -- *n_regs = i; -- - return 0; - } - -@@ -610,7 +293,7 @@ static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, - static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) - { - struct amdgpu_device *adev = get_amdgpu_device(kgd); -- struct vi_sdma_mqd *m; -+ struct cik_sdma_rlc_registers *m; - uint32_t sdma_base_addr; - uint32_t sdma_rlc_rb_cntl; - -@@ -625,102 +308,29 @@ static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) - return false; - } - --static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, -- enum kfd_preempt_type reset_type, -+static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, - unsigned int utimeout, uint32_t pipe_id, - uint32_t queue_id) - { - struct amdgpu_device *adev = get_amdgpu_device(kgd); - uint32_t temp; -- enum hqd_dequeue_request_type type; -- unsigned long flags, end_jiffies; -- int retry; -- struct vi_mqd *m = get_mqd(mqd); -+ int timeout = utimeout; - - acquire_queue(kgd, pipe_id, queue_id); - -- if (m->cp_hqd_vmid == 0) -- WREG32_FIELD(RLC_CP_SCHEDULERS, scheduler1, 0); -- -- switch (reset_type) { -- case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: -- type = DRAIN_PIPE; -- break; -- case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: -- type = RESET_WAVES; -- break; -- default: -- type = DRAIN_PIPE; -- break; -- } -+ WREG32(mmCP_HQD_DEQUEUE_REQUEST, reset_type); - -- /* Workaround: If IQ timer is active and the wait time is close to or -- * equal to 0, dequeueing is not safe. Wait until either the wait time -- * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is -- * cleared before continuing. Also, ensure wait times are set to at -- * least 0x3. 
-- */ -- local_irq_save(flags); -- preempt_disable(); -- retry = 5000; /* wait for 500 usecs at maximum */ -- while (true) { -- temp = RREG32(mmCP_HQD_IQ_TIMER); -- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) { -- pr_debug("HW is processing IQ\n"); -- goto loop; -- } -- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) { -- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE) -- == 3) /* SEM-rearm is safe */ -- break; -- /* Wait time 3 is safe for CP, but our MMIO read/write -- * time is close to 1 microsecond, so check for 10 to -- * leave more buffer room -- */ -- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME) -- >= 10) -- break; -- pr_debug("IQ timer is active\n"); -- } else -- break; --loop: -- if (!retry) { -- pr_err("CP HQD IQ timer status time out\n"); -- break; -- } -- ndelay(100); -- --retry; -- } -- retry = 1000; -- while (true) { -- temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST); -- if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK)) -- break; -- pr_debug("Dequeue request is pending\n"); -- -- if (!retry) { -- pr_err("CP HQD dequeue request time out\n"); -- break; -- } -- ndelay(100); -- --retry; -- } -- local_irq_restore(flags); -- preempt_enable(); -- -- WREG32(mmCP_HQD_DEQUEUE_REQUEST, type); -- -- end_jiffies = (utimeout * HZ / 1000) + jiffies; - while (true) { - temp = RREG32(mmCP_HQD_ACTIVE); -- if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) -+ if (temp & CP_HQD_ACTIVE__ACTIVE_MASK) - break; -- if (time_after(jiffies, end_jiffies)) { -- pr_err("cp queue preemption time out.\n"); -+ if (timeout <= 0) { -+ pr_err("kfd: cp queue preemption time out.\n"); - release_queue(kgd); - return -ETIME; - } -- usleep_range(500, 1000); -+ msleep(20); -+ timeout -= 20; - } - - release_queue(kgd); -@@ -731,10 +341,10 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - unsigned int utimeout) - { - struct amdgpu_device *adev = get_amdgpu_device(kgd); -- struct vi_sdma_mqd *m; -+ struct cik_sdma_rlc_registers *m; - uint32_t sdma_base_addr; - uint32_t temp; -- unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; -+ int timeout = utimeout; - - m = get_sdma_mqd(mqd); - sdma_base_addr = get_sdma_base_addr(m); -@@ -745,19 +355,18 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - - while (true) { - temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); -- if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) -+ if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT) - break; -- if (time_after(jiffies, end_jiffies)) -+ if (timeout <= 0) - return -ETIME; -- usleep_range(500, 1000); -+ msleep(20); -+ timeout -= 20; - } - - WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, -- RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | -- SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); -- -- m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0); -+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, 0); - - return 0; - } -@@ -779,7 +388,7 @@ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - - reg = RREG32(mmATC_VMID0_PASID_MAPPING + vmid); -- return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; -+ return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; - } - - static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) -@@ -789,83 +398,8 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) - 
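/* Sketch of the ordering both reverted kgd_hqd_sdma_destroy() variants keep:
 * stop the ring buffer, poll for idle, then clear the doorbell and ring
 * registers. Compressed into a user-space model with a fake register file;
 * the bit positions and the instant-idle behaviour are stand-ins: */
#include <stdint.h>

enum { RB_CNTL, CONTEXT_STATUS, DOORBELL, RB_RPTR, RB_WPTR, RB_BASE, NREGS };
#define RB_ENABLE	(1u << 0)	/* invented bit positions */
#define IDLE		(1u << 2)

static uint32_t regs[NREGS];

static int sdma_destroy(int timeout_ms)
{
	regs[RB_CNTL] &= ~RB_ENABLE;	/* stop command fetch first */
	regs[CONTEXT_STATUS] |= IDLE;	/* model: hardware idles at once */

	while (!(regs[CONTEXT_STATUS] & IDLE)) {
		if (timeout_ms <= 0)
			return -1;	/* -ETIME */
		timeout_ms -= 20;	/* msleep(20) elided in the model */
	}

	regs[DOORBELL] = 0;		/* only then tear the ring down */
	regs[RB_RPTR] = regs[RB_WPTR] = regs[RB_BASE] = 0;
	return 0;
}

int main(void)
{
	return sdma_destroy(100);
}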
WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); - } - --/* -- * FIXME: Poliars test failed with this package, FIJI works fine -- * From the CP spec it does not official support the invalidation -- * with the specified pasid in the package, so disable it for V8 -- * -- */ --#ifdef V8_SUPPORT_IT_OFFICIAL --static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid) --{ -- signed long r; -- struct dma_fence *f; -- struct amdgpu_ring *ring = &adev->gfx.kiq.ring; -- -- mutex_lock(&adev->gfx.kiq.ring_mutex); -- amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs package*/ -- amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0)); -- amdgpu_ring_write(ring, -- PACKET3_INVALIDATE_TLBS_DST_SEL(1) | -- PACKET3_INVALIDATE_TLBS_PASID(pasid)); -- amdgpu_fence_emit(ring, &f); -- amdgpu_ring_commit(ring); -- mutex_unlock(&adev->gfx.kiq.ring_mutex); -- -- r = dma_fence_wait(f, false); -- if (r) -- DRM_ERROR("wait for kiq fence error: %ld.\n", r); -- dma_fence_put(f); -- -- return r; --} --#endif --static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid) --{ -- struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -- int vmid; -- --#ifdef V8_SUPPORT_IT_OFFICIAL -- struct amdgpu_ring *ring = &adev->gfx.kiq.ring; -- -- if (ring->ready) -- return invalidate_tlbs_with_kiq(adev, pasid); --#endif -- -- for (vmid = 0; vmid < 16; vmid++) { -- if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) -- continue; -- if (RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & -- ATC_VMID0_PASID_MAPPING__VALID_MASK) { -- if ((RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & -- ATC_VMID0_PASID_MAPPING__PASID_MASK) == pasid) { -- WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); -- break; -- } -- } -- } -- -- return 0; --} -- - static int kgd_address_watch_disable(struct kgd_dev *kgd) - { -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- union TCP_WATCH_CNTL_BITS cntl; -- unsigned int i; -- -- cntl.u32All = 0; -- -- cntl.bitfields.valid = 0; -- cntl.bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK; -- cntl.bitfields.atc = 1; -- -- /* Turning off this address until we set all the registers */ -- for (i = 0; i < MAX_WATCH_ADDRESSES; i++) -- WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX -- + ADDRESS_WATCH_REG_CNTL], -- cntl.u32All); -- - return 0; - } - -@@ -875,32 +409,6 @@ static int kgd_address_watch_execute(struct kgd_dev *kgd, - uint32_t addr_hi, - uint32_t addr_lo) - { -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- union TCP_WATCH_CNTL_BITS cntl; -- -- cntl.u32All = cntl_val; -- -- /* Turning off this watch point until we set all the registers */ -- cntl.bitfields.valid = 0; -- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX -- + ADDRESS_WATCH_REG_CNTL], -- cntl.u32All); -- -- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX -- + ADDRESS_WATCH_REG_ADDR_HI], -- addr_hi); -- -- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX -- + ADDRESS_WATCH_REG_ADDR_LO], -- addr_lo); -- -- /* Enable the watch point */ -- cntl.bitfields.valid = 1; -- -- WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX -- + ADDRESS_WATCH_REG_CNTL], -- cntl.u32All); -- - return 0; - } - -@@ -933,32 +441,6 @@ static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, - unsigned int watch_point_id, - unsigned int reg_offset) - { -- return watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + reg_offset]; --} -- --static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, -- uint8_t element_size, uint8_t index_stride, uint8_t mtype) --{ -- uint32_t reg; -- struct amdgpu_device *adev 
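/* The removed invalidate_tlbs() shows the register-level fallback used when
 * the KIQ packet path is unavailable: scan the 16 VMID slots for a valid
 * mapping of the target PASID and write that VMID's bit to
 * VM_INVALIDATE_REQUEST. A self-contained model of the scan; the bit layout
 * (PASID in the low bits, valid in bit 31) is assumed, and the
 * amdgpu_amdkfd_is_kfd_vmid() ownership check from the real code is omitted: */
#include <stdint.h>

#define PASID_MASK	0xffffu		/* assumed layout */
#define VALID_MASK	(1u << 31)

static uint32_t vmid_pasid[16];		/* models ATC_VMIDn_PASID_MAPPING */
static uint32_t invalidate_request;	/* models VM_INVALIDATE_REQUEST */

static void invalidate_tlbs_for_pasid(uint16_t pasid)
{
	int vmid;

	for (vmid = 0; vmid < 16; vmid++) {
		uint32_t map = vmid_pasid[vmid];

		if ((map & VALID_MASK) && (map & PASID_MASK) == pasid) {
			invalidate_request = 1u << vmid;	/* flush it */
			break;
		}
	}
}

int main(void)
{
	vmid_pasid[9] = VALID_MASK | 0x42;
	invalidate_tlbs_for_pasid(0x42);
	return invalidate_request == (1u << 9) ? 0 : 1;
}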
= (struct amdgpu_device *) kgd; -- -- reg = swizzle_enable << SH_STATIC_MEM_CONFIG__SWIZZLE_ENABLE__SHIFT | -- element_size << SH_STATIC_MEM_CONFIG__ELEMENT_SIZE__SHIFT | -- index_stride << SH_STATIC_MEM_CONFIG__INDEX_STRIDE__SHIFT | -- mtype << SH_STATIC_MEM_CONFIG__PRIVATE_MTYPE__SHIFT; -- -- WREG32(mmSH_STATIC_MEM_CONFIG, reg); -- return 0; --} --static int alloc_memory_of_scratch(struct kgd_dev *kgd, -- uint64_t va, uint32_t vmid) --{ -- struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -- -- lock_srbm(kgd, 0, 0, 0, vmid); -- WREG32(mmSH_HIDDEN_PRIVATE_BASE_VMID, va); -- unlock_srbm(kgd); -- - return 0; - } - -@@ -967,45 +449,47 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - const union amdgpu_firmware_header *hdr; - -+ BUG_ON(kgd == NULL); -+ - switch (type) { - case KGD_ENGINE_PFP: - hdr = (const union amdgpu_firmware_header *) -- adev->gfx.pfp_fw->data; -+ adev->gfx.pfp_fw->data; - break; - - case KGD_ENGINE_ME: - hdr = (const union amdgpu_firmware_header *) -- adev->gfx.me_fw->data; -+ adev->gfx.me_fw->data; - break; - - case KGD_ENGINE_CE: - hdr = (const union amdgpu_firmware_header *) -- adev->gfx.ce_fw->data; -+ adev->gfx.ce_fw->data; - break; - - case KGD_ENGINE_MEC1: - hdr = (const union amdgpu_firmware_header *) -- adev->gfx.mec_fw->data; -+ adev->gfx.mec_fw->data; - break; - - case KGD_ENGINE_MEC2: - hdr = (const union amdgpu_firmware_header *) -- adev->gfx.mec2_fw->data; -+ adev->gfx.mec2_fw->data; - break; - - case KGD_ENGINE_RLC: - hdr = (const union amdgpu_firmware_header *) -- adev->gfx.rlc_fw->data; -+ adev->gfx.rlc_fw->data; - break; - - case KGD_ENGINE_SDMA1: - hdr = (const union amdgpu_firmware_header *) -- adev->sdma.instance[0].fw->data; -+ adev->sdma.instance[0].fw->data; - break; - - case KGD_ENGINE_SDMA2: - hdr = (const union amdgpu_firmware_header *) -- adev->sdma.instance[1].fw->data; -+ adev->sdma.instance[1].fw->data; - break; - - default: -@@ -1018,21 +502,3 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) - /* Only 12 bit in use*/ - return hdr->common.ucode_version; - } -- --static void set_num_of_requests(struct kgd_dev *kgd, -- uint8_t num_of_requests) --{ -- pr_debug("This is a stub\n"); --} -- --static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, -- uint32_t page_table_base) --{ -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- /* TODO: Don't use hardcoded VMIDs */ -- if (vmid < 8 || vmid > 15) { -- pr_err("trying to set page table base for wrong VMID\n"); -- return; -- } -- WREG32(mmVM_CONTEXT8_PAGE_TABLE_BASE_ADDR + vmid - 8, page_table_base); --} -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c -deleted file mode 100644 -index edbae19..0000000 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c -+++ /dev/null -@@ -1,1227 +0,0 @@ --/* -- * Copyright 2014 Advanced Micro Devices, Inc. 
-- * -- * Permission is hereby granted, free of charge, to any person obtaining a -- * copy of this software and associated documentation files (the "Software"), -- * to deal in the Software without restriction, including without limitation -- * the rights to use, copy, modify, merge, publish, distribute, sublicense, -- * and/or sell copies of the Software, and to permit persons to whom the -- * Software is furnished to do so, subject to the following conditions: -- * -- * The above copyright notice and this permission notice shall be included in -- * all copies or substantial portions of the Software. -- * -- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -- * OTHER DEALINGS IN THE SOFTWARE. -- */ --#undef pr_fmt --#define pr_fmt(fmt) "kfd2kgd: " fmt -- --#include <linux/module.h> --#include <linux/fdtable.h> --#include <linux/uaccess.h> --#include <linux/firmware.h> --#include <drm/drmP.h> --#include "amdgpu.h" --#include "amdgpu_amdkfd.h" --#include "amdgpu_ucode.h" --#include "amdgpu_amdkfd_gfx_v8.h" --#include "vega10/soc15ip.h" --#include "vega10/GC/gc_9_0_offset.h" --#include "vega10/GC/gc_9_0_sh_mask.h" --#include "vega10/vega10_enum.h" --#include "vega10/SDMA0/sdma0_4_0_offset.h" --#include "vega10/SDMA0/sdma0_4_0_sh_mask.h" --#include "vega10/SDMA1/sdma1_4_0_offset.h" --#include "vega10/SDMA1/sdma1_4_0_sh_mask.h" --#include "vega10/ATHUB/athub_1_0_offset.h" --#include "vega10/ATHUB/athub_1_0_sh_mask.h" --#include "vega10/OSSSYS/osssys_4_0_offset.h" --#include "vega10/OSSSYS/osssys_4_0_sh_mask.h" --#include "soc15_common.h" --#include "v9_structs.h" --#include "soc15.h" --#include "soc15d.h" -- --/* HACK: MMHUB and GC both have VM-related register with the same -- * names but different offsets. Define the MMHUB register we need here -- * with a prefix. A proper solution would be to move the functions -- * programming these registers into gfx_v9_0.c and mmhub_v1_0.c -- * respectively. 
-- */ --#define mmMMHUB_VM_INVALIDATE_ENG16_REQ 0x06f3 --#define mmMMHUB_VM_INVALIDATE_ENG16_REQ_BASE_IDX 0 -- --#define mmMMHUB_VM_INVALIDATE_ENG16_ACK 0x0705 --#define mmMMHUB_VM_INVALIDATE_ENG16_ACK_BASE_IDX 0 -- --#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32 0x072b --#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32_BASE_IDX 0 --#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32 0x072c --#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32_BASE_IDX 0 -- --#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32 0x074b --#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32_BASE_IDX 0 --#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32 0x074c --#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32_BASE_IDX 0 -- --#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32 0x076b --#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32_BASE_IDX 0 --#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32 0x076c --#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32_BASE_IDX 0 -- --#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32 0x0727 --#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32_BASE_IDX 0 --#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32 0x0728 --#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32_BASE_IDX 0 -- --enum hqd_dequeue_request_type { -- NO_ACTION = 0, -- DRAIN_PIPE, -- RESET_WAVES, -- SAVE_WAVES --}; -- --static const uint32_t watchRegs[MAX_WATCH_ADDRESSES * ADDRESS_WATCH_REG_MAX] = { -- mmTCP_WATCH0_ADDR_H, mmTCP_WATCH0_ADDR_L, mmTCP_WATCH0_CNTL, -- mmTCP_WATCH1_ADDR_H, mmTCP_WATCH1_ADDR_L, mmTCP_WATCH1_CNTL, -- mmTCP_WATCH2_ADDR_H, mmTCP_WATCH2_ADDR_L, mmTCP_WATCH2_CNTL, -- mmTCP_WATCH3_ADDR_H, mmTCP_WATCH3_ADDR_L, mmTCP_WATCH3_CNTL --}; -- -- --static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, -- void *vm, struct kgd_mem **mem); --static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem); -- --static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, -- int fd, uint32_t handle, struct kgd_mem **mem); -- --static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); -- --/* -- * Register access functions -- */ -- --static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, -- uint32_t sh_mem_config, -- uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit, -- uint32_t sh_mem_bases); --static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, -- unsigned int vmid); --static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, -- uint32_t hpd_size, uint64_t hpd_gpu_addr); --static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); --static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, -- uint32_t queue_id, uint32_t __user *wptr, -- uint32_t wptr_shift, uint32_t wptr_mask, -- struct mm_struct *mm); --static int kgd_hqd_dump(struct kgd_dev *kgd, -- uint32_t pipe_id, uint32_t queue_id, -- uint32_t (**dump)[2], uint32_t *n_regs); --static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, -- uint32_t __user *wptr, struct mm_struct *mm); --static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, -- uint32_t engine_id, uint32_t queue_id, -- uint32_t (**dump)[2], uint32_t *n_regs); --static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, -- uint32_t pipe_id, uint32_t queue_id); --static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); --static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, -- enum kfd_preempt_type reset_type, -- unsigned int utimeout, 
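/* As the deleted file's own comment says, GC and MMHUB expose VM registers
 * with identical names but different offsets, hence the mmMMHUB_ prefix
 * hack. Underneath, a SOC15 register address is a per-IP segment base plus
 * the offset, which is why names alone don't disambiguate. A toy model of
 * that addressing; the base values are invented and only the shape matches: */
#include <stdint.h>

#define mmVM_INVALIDATE_ENG16_REQ		0x06f3	/* per-IP offset */
#define mmVM_INVALIDATE_ENG16_REQ_BASE_IDX	0

static const uint32_t gc_base[]    = { 0x00002000 };	/* invented segments */
static const uint32_t mmhub_base[] = { 0x0001a000 };

static uint32_t soc15_reg(const uint32_t *base, uint32_t base_idx,
			  uint32_t reg)
{
	return base[base_idx] + reg;	/* same name, different segment */
}

int main(void)
{
	/* The same symbolic register yields two distinct MMIO addresses: */
	return soc15_reg(gc_base, mmVM_INVALIDATE_ENG16_REQ_BASE_IDX,
			 mmVM_INVALIDATE_ENG16_REQ) !=
	       soc15_reg(mmhub_base, mmVM_INVALIDATE_ENG16_REQ_BASE_IDX,
			 mmVM_INVALIDATE_ENG16_REQ) ? 0 : 1;
}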
uint32_t pipe_id, -- uint32_t queue_id); --static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, -- unsigned int utimeout); --static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); --static uint32_t get_watch_base_addr(void); --static int kgd_address_watch_disable(struct kgd_dev *kgd); --static int kgd_address_watch_execute(struct kgd_dev *kgd, -- unsigned int watch_point_id, -- uint32_t cntl_val, -- uint32_t addr_hi, -- uint32_t addr_lo); --static int kgd_wave_control_execute(struct kgd_dev *kgd, -- uint32_t gfx_index_val, -- uint32_t sq_cmd); --static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, -- unsigned int watch_point_id, -- unsigned int reg_offset); -- --static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, -- uint8_t vmid); --static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, -- uint8_t vmid); --static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); --static void set_num_of_requests(struct kgd_dev *kgd, -- uint8_t num_of_requests); --static int alloc_memory_of_scratch(struct kgd_dev *kgd, -- uint64_t va, uint32_t vmid); --static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, -- uint8_t element_size, uint8_t index_stride, uint8_t mtype); --static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, -- uint32_t page_table_base); --static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid); -- --/* Because of REG_GET_FIELD() being used, we put this function in the -- * asic specific file. -- */ --static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, -- struct tile_config *config) --{ -- struct amdgpu_device *adev = (struct amdgpu_device *)kgd; -- -- config->gb_addr_config = adev->gfx.config.gb_addr_config; --#if 0 --/* TODO - confirm REG_GET_FIELD x2, should be OK as is... 
but -- * MC_ARB_RAMCFG register doesn't exist on Vega10 - initial amdgpu -- * changes commented out related code, doing the same here for now but -- * need to sync with Ken et al -- */ -- config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, -- MC_ARB_RAMCFG, NOOFBANK); -- config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, -- MC_ARB_RAMCFG, NOOFRANKS); --#endif -- -- config->tile_config_ptr = adev->gfx.config.tile_mode_array; -- config->num_tile_configs = -- ARRAY_SIZE(adev->gfx.config.tile_mode_array); -- config->macro_tile_config_ptr = -- adev->gfx.config.macrotile_mode_array; -- config->num_macro_tile_configs = -- ARRAY_SIZE(adev->gfx.config.macrotile_mode_array); -- -- return 0; --} -- --static const struct kfd2kgd_calls kfd2kgd = { -- .init_gtt_mem_allocation = alloc_gtt_mem, -- .free_gtt_mem = free_gtt_mem, -- .get_local_mem_info = get_local_mem_info, -- .get_gpu_clock_counter = get_gpu_clock_counter, -- .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, -- .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm, -- .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm, -- .create_process_gpumem = create_process_gpumem, -- .destroy_process_gpumem = destroy_process_gpumem, -- .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir, -- .open_graphic_handle = open_graphic_handle, -- .program_sh_mem_settings = kgd_program_sh_mem_settings, -- .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, -- .init_pipeline = kgd_init_pipeline, -- .init_interrupts = kgd_init_interrupts, -- .hqd_load = kgd_hqd_load, -- .hqd_sdma_load = kgd_hqd_sdma_load, -- .hqd_dump = kgd_hqd_dump, -- .hqd_sdma_dump = kgd_hqd_sdma_dump, -- .hqd_is_occupied = kgd_hqd_is_occupied, -- .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, -- .hqd_destroy = kgd_hqd_destroy, -- .hqd_sdma_destroy = kgd_hqd_sdma_destroy, -- .address_watch_disable = kgd_address_watch_disable, -- .address_watch_execute = kgd_address_watch_execute, -- .wave_control_execute = kgd_wave_control_execute, -- .address_watch_get_offset = kgd_address_watch_get_offset, -- .get_atc_vmid_pasid_mapping_pasid = -- get_atc_vmid_pasid_mapping_pasid, -- .get_atc_vmid_pasid_mapping_valid = -- get_atc_vmid_pasid_mapping_valid, -- .write_vmid_invalidate_request = write_vmid_invalidate_request, -- .invalidate_tlbs = invalidate_tlbs, -- .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory, -- .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, -- .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, -- .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, -- .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu, -- .get_fw_version = get_fw_version, -- .set_num_of_requests = set_num_of_requests, -- .get_cu_info = get_cu_info, -- .alloc_memory_of_scratch = alloc_memory_of_scratch, -- .write_config_static_mem = write_config_static_mem, -- .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo, -- .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel, -- .set_vm_context_page_table_base = set_vm_context_page_table_base, -- .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table, -- .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table, -- .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info, -- .import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf, -- .export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf, -- .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info, -- .submit_ib = amdgpu_amdkfd_submit_ib, -- .get_tile_config = amdgpu_amdkfd_get_tile_config, -- .restore_process_bos = 
amdgpu_amdkfd_gpuvm_restore_process_bos, -- .copy_mem_to_mem = amdgpu_amdkfd_copy_mem_to_mem, -- .get_vram_usage = amdgpu_amdkfd_get_vram_usage --}; -- --struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions() --{ -- return (struct kfd2kgd_calls *)&kfd2kgd; --} -- --static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, -- void *vm, struct kgd_mem **mem) --{ -- return 0; --} -- --/* Destroys the GPU allocation and frees the kgd_mem structure */ --static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem) --{ -- --} -- --static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, -- int fd, uint32_t handle, struct kgd_mem **mem) --{ -- return 0; --} -- --static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) --{ -- return (struct amdgpu_device *)kgd; --} -- --static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe, -- uint32_t queue, uint32_t vmid) --{ -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- -- mutex_lock(&adev->srbm_mutex); -- soc15_grbm_select(adev, mec, pipe, queue, vmid); --} -- --static void unlock_srbm(struct kgd_dev *kgd) --{ -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- -- soc15_grbm_select(adev, 0, 0, 0, 0); -- mutex_unlock(&adev->srbm_mutex); --} -- --static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, -- uint32_t queue_id) --{ -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- -- uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; -- uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); -- -- lock_srbm(kgd, mec, pipe, queue_id, 0); --} -- --static uint32_t get_queue_mask(struct amdgpu_device *adev, -- uint32_t pipe_id, uint32_t queue_id) --{ -- unsigned int bit = (pipe_id * adev->gfx.mec.num_pipe_per_mec + -- queue_id) & 31; -- -- return ((uint32_t)1) << bit; --} -- --static void release_queue(struct kgd_dev *kgd) --{ -- unlock_srbm(kgd); --} -- --static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, -- uint32_t sh_mem_config, -- uint32_t sh_mem_ape1_base, -- uint32_t sh_mem_ape1_limit, -- uint32_t sh_mem_bases) --{ -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- -- lock_srbm(kgd, 0, 0, 0, vmid); -- -- WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config); -- WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases); -- /* APE1 no longer exists on GFX9 */ -- -- unlock_srbm(kgd); --} -- --static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, -- unsigned int vmid) --{ -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- -- /* -- * We have to assume that there is no outstanding mapping. -- * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because -- * a mapping is in progress or because a mapping finished -- * and the SW cleared it. -- * So the protocol is to always wait & clear. -- */ -- uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | -- ATC_VMID0_PASID_MAPPING__VALID_MASK; -- -- /* -- * need to do this twice, once for gfx and once for mmhub -- * for ATC add 16 to VMID for mmhub, for IH different registers. -- * ATC_VMID0..15 registers are separate from ATC_VMID16..31. 
-- */ -- -- WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid, -- pasid_mapping); -- -- while (!(RREG32(SOC15_REG_OFFSET( -- ATHUB, 0, -- mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & -- (1U << vmid))) -- cpu_relax(); -- -- WREG32(SOC15_REG_OFFSET(ATHUB, 0, -- mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), -- 1U << vmid); -- -- /* Mapping vmid to pasid also for IH block */ -- WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid, -- pasid_mapping); -- -- WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid, -- pasid_mapping); -- -- while (!(RREG32(SOC15_REG_OFFSET( -- ATHUB, 0, -- mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & -- (1U << (vmid + 16)))) -- cpu_relax(); -- -- WREG32(SOC15_REG_OFFSET(ATHUB, 0, -- mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), -- 1U << (vmid + 16)); -- -- /* Mapping vmid to pasid also for IH block */ -- WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid, -- pasid_mapping); -- return 0; --} -- --static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, -- uint32_t hpd_size, uint64_t hpd_gpu_addr) --{ -- /* amdgpu owns the per-pipe state */ -- return 0; --} -- --/* TODO - RING0 form of field is obsolete, seems to date back to SI -- * but still works -- */ -- --static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) --{ -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- uint32_t mec; -- uint32_t pipe; -- -- mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; -- pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); -- -- lock_srbm(kgd, mec, pipe, 0, 0); -- -- WREG32(SOC15_REG_OFFSET(GC, 0, mmCPC_INT_CNTL), -- CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | -- CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); -- -- unlock_srbm(kgd); -- -- return 0; --} -- --static uint32_t get_sdma_base_addr(unsigned int engine_id, -- unsigned int queue_id) --{ -- static const uint32_t base[2] = { -- SOC15_REG_OFFSET(SDMA0, 0, -- mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL, -- SOC15_REG_OFFSET(SDMA1, 0, -- mmSDMA1_RLC0_RB_CNTL) - mmSDMA1_RLC0_RB_CNTL -- }; -- uint32_t retval; -- -- retval = base[engine_id] + queue_id * (mmSDMA0_RLC1_RB_CNTL - -- mmSDMA0_RLC0_RB_CNTL); -- -- pr_debug("sdma base address: 0x%x\n", retval); -- -- return retval; --} -- --static uint32_t get_watch_base_addr(void) --{ -- uint32_t retval = SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) - -- mmTCP_WATCH0_ADDR_H; -- -- pr_debug("kfd: reg watch base address: 0x%x\n", retval); -- -- return retval; --} -- --static inline struct v9_mqd *get_mqd(void *mqd) --{ -- return (struct v9_mqd *)mqd; --} -- --static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) --{ -- return (struct v9_sdma_mqd *)mqd; --} -- --static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, -- uint32_t queue_id, uint32_t __user *wptr, -- uint32_t wptr_shift, uint32_t wptr_mask, -- struct mm_struct *mm) --{ -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- struct v9_mqd *m; -- uint32_t *mqd_hqd; -- uint32_t reg, hqd_base, data; -- -- m = get_mqd(mqd); -- -- acquire_queue(kgd, pipe_id, queue_id); -- -- /* HIQ is set during driver init period with vmid set to 0*/ -- if (m->cp_hqd_vmid == 0) { -- uint32_t value, mec, pipe; -- -- mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; -- pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); -- -- pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n", -- mec, pipe, queue_id); -- value = RREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS)); -- value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1, -- ((mec << 5) | 
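/* The deleted GFX9 set_pasid_vmid_mapping() runs a write / wait / clear
 * handshake, and runs it twice (ATC VMIDs 0..15 for GC, 16..31 for MMHUB).
 * A user-space model of one round; the "registers" are plain variables, the
 * status bit is set immediately instead of by hardware, and the clear mimics
 * the driver's write-one-to-clear of the ..._UPDATE_STATUS register: */
#include <stdint.h>

static uint32_t pasid_mapping_reg[16];	/* models ATC_VMIDn_PASID_MAPPING */
static uint32_t update_status;		/* models ..._MAPPING_UPDATE_STATUS */

static void map_pasid(unsigned int vmid, uint16_t pasid)
{
	uint32_t mapping = pasid ? ((uint32_t)pasid | (1u << 31)) : 0;

	pasid_mapping_reg[vmid] = mapping;
	update_status |= 1u << vmid;		/* model: hw acks at once */

	while (!(update_status & (1u << vmid)))
		;				/* cpu_relax() in the driver */

	update_status &= ~(1u << vmid);		/* "always wait & clear" */
}

int main(void)
{
	map_pasid(8, 0x42);
	return pasid_mapping_reg[8] == ((1u << 31) | 0x42) ? 0 : 1;
}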
(pipe << 3) | queue_id | 0x80)); -- WREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS), value); -- } -- -- /* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */ -- mqd_hqd = &m->cp_mqd_base_addr_lo; -- hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); -- -- for (reg = hqd_base; -- reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) -- WREG32(reg, mqd_hqd[reg - hqd_base]); -- -- -- /* Activate doorbell logic before triggering WPTR poll. */ -- data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control, -- CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); -- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data); -- -- if (wptr) { -- /* Don't read wptr with get_user because the user -- * context may not be accessible (if this function -- * runs in a work queue). Instead trigger a one-shot -- * polling read from memory in the CP. This assumes -- * that wptr is GPU-accessible in the queue's VMID via -- * ATC or SVM. WPTR==RPTR before starting the poll so -- * the CP starts fetching new commands from the right -- * place. -- * -- * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit -- * tricky. Assume that the queue didn't overflow. The -- * number of valid bits in the 32-bit RPTR depends on -- * the queue size. The remaining bits are taken from -- * the saved 64-bit WPTR. If the WPTR wrapped, add the -- * queue size. -- */ -- uint32_t queue_size = -- 2 << REG_GET_FIELD(m->cp_hqd_pq_control, -- CP_HQD_PQ_CONTROL, QUEUE_SIZE); -- uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1); -- -- if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr) -- guessed_wptr += queue_size; -- guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1); -- guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32; -- -- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO), -- lower_32_bits(guessed_wptr)); -- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI), -- upper_32_bits(guessed_wptr)); -- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR), -- lower_32_bits((uint64_t)wptr)); -- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI), -- upper_32_bits((uint64_t)wptr)); -- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1), -- get_queue_mask(adev, pipe_id, queue_id)); -- } -- -- /* Start the EOP fetcher */ -- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR), -- REG_SET_FIELD(m->cp_hqd_eop_rptr, -- CP_HQD_EOP_RPTR, INIT_FETCHER, 1)); -- -- data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); -- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data); -- -- release_queue(kgd); -- -- return 0; --} -- --static int kgd_hqd_dump(struct kgd_dev *kgd, -- uint32_t pipe_id, uint32_t queue_id, -- uint32_t (**dump)[2], uint32_t *n_regs) --{ -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- uint32_t i = 0, reg; --#define HQD_N_REGS 56 --#define DUMP_REG(addr) do { \ -- if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ -- break; \ -- (*dump)[i][0] = (addr) << 2; \ -- (*dump)[i++][1] = RREG32(addr); \ -- } while (0) -- -- *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); -- if (*dump == NULL) -- return -ENOMEM; -- -- acquire_queue(kgd, pipe_id, queue_id); -- -- for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); -- reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) -- DUMP_REG(reg); -- -- release_queue(kgd); -- -- WARN_ON_ONCE(i != HQD_N_REGS); -- *n_regs = i; -- -- return 0; --} -- --static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, -- uint32_t __user *wptr, struct mm_struct *mm) --{ -- struct amdgpu_device *adev = 
get_amdgpu_device(kgd); -- struct v9_sdma_mqd *m; -- uint32_t sdma_base_addr, sdmax_gfx_context_cntl; -- uint32_t temp, timeout = 2000; -- uint32_t data; -- uint64_t data64; -- uint64_t __user *wptr64 = (uint64_t __user *)wptr; -- -- m = get_sdma_mqd(mqd); -- sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id, -- m->sdma_queue_id); -- sdmax_gfx_context_cntl = m->sdma_engine_id ? -- SOC15_REG_OFFSET(SDMA1, 0, mmSDMA1_GFX_CONTEXT_CNTL) : -- SOC15_REG_OFFSET(SDMA0, 0, mmSDMA0_GFX_CONTEXT_CNTL); -- -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, -- m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); -- -- while (true) { -- temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); -- if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) -- break; -- if (timeout == 0) -- return -ETIME; -- msleep(10); -- timeout -= 10; -- } -- data = RREG32(sdmax_gfx_context_cntl); -- data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, -- RESUME_CTX, 0); -- WREG32(sdmax_gfx_context_cntl, data); -- -- WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL_OFFSET, -- m->sdmax_rlcx_doorbell_offset); -- -- data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL, -- ENABLE, 1); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI, -- m->sdmax_rlcx_rb_rptr_hi); -- -- WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1); -- if (read_user_wptr(mm, wptr64, data64)) { -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, -- lower_32_bits(data64)); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI, -- upper_32_bits(data64)); -- } else { -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, -- m->sdmax_rlcx_rb_rptr); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI, -- m->sdmax_rlcx_rb_rptr_hi); -- } -- WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0); -- -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, -- m->sdmax_rlcx_rb_base_hi); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, -- m->sdmax_rlcx_rb_rptr_addr_lo); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, -- m->sdmax_rlcx_rb_rptr_addr_hi); -- -- data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL, -- RB_ENABLE, 1); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data); -- -- return 0; --} -- --static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, -- uint32_t engine_id, uint32_t queue_id, -- uint32_t (**dump)[2], uint32_t *n_regs) --{ -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- uint32_t sdma_base_addr = get_sdma_base_addr(engine_id, queue_id); -- uint32_t i = 0, reg; --#undef HQD_N_REGS --#define HQD_N_REGS (19+6+7+10) -- -- *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); -- if (*dump == NULL) -- return -ENOMEM; -- -- for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) -- DUMP_REG(sdma_base_addr + reg); -- for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++) -- DUMP_REG(sdma_base_addr + reg); -- for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; -- reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++) -- DUMP_REG(sdma_base_addr + reg); -- for (reg = mmSDMA0_RLC0_MIDCMD_DATA0; -- reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++) -- DUMP_REG(sdma_base_addr + reg); -- -- WARN_ON_ONCE(i != HQD_N_REGS); -- *n_regs = i; -- -- return 0; --} -- --static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, -- uint32_t pipe_id, uint32_t queue_id) --{ -- struct amdgpu_device *adev = 
get_amdgpu_device(kgd); -- uint32_t act; -- bool retval = false; -- uint32_t low, high; -- -- acquire_queue(kgd, pipe_id, queue_id); -- act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); -- if (act) { -- low = lower_32_bits(queue_address >> 8); -- high = upper_32_bits(queue_address >> 8); -- -- if (low == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE)) && -- high == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE_HI))) -- retval = true; -- } -- release_queue(kgd); -- return retval; --} -- --static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) --{ -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- struct v9_sdma_mqd *m; -- uint32_t sdma_base_addr; -- uint32_t sdma_rlc_rb_cntl; -- -- m = get_sdma_mqd(mqd); -- sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id, -- m->sdma_queue_id); -- -- sdma_rlc_rb_cntl = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL); -- -- if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK) -- return true; -- -- return false; --} -- --static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, -- enum kfd_preempt_type reset_type, -- unsigned int utimeout, uint32_t pipe_id, -- uint32_t queue_id) --{ -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- enum hqd_dequeue_request_type type; -- unsigned long end_jiffies; -- uint32_t temp; -- struct v9_mqd *m = get_mqd(mqd); -- --#if 0 -- unsigned long flags; -- int retry; --#endif -- -- acquire_queue(kgd, pipe_id, queue_id); -- -- if (m->cp_hqd_vmid == 0) -- WREG32_FIELD15(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0); -- -- switch (reset_type) { -- case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: -- type = DRAIN_PIPE; -- break; -- case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: -- type = RESET_WAVES; -- break; -- default: -- type = DRAIN_PIPE; -- break; -- } -- --#if 0 /* Is this still needed? */ -- /* Workaround: If IQ timer is active and the wait time is close to or -- * equal to 0, dequeueing is not safe. Wait until either the wait time -- * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is -- * cleared before continuing. Also, ensure wait times are set to at -- * least 0x3. 
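For orientation: kgd_hqd_destroy() here and kgd_hqd_sdma_destroy() further below share one idiom: write a dequeue/disable request, then poll a status register until the hardware reports idle or a millisecond budget expires. A minimal standalone sketch of that idiom; the helper name and its read_status callback are illustrative, not part of the patch:

#include <linux/types.h>
#include <linux/jiffies.h>
#include <linux/delay.h>
#include <linux/errno.h>

/* Sketch: poll until the bits in @mask read back as clear, or give up
 * with -ETIME after @timeout_ms. The patch open-codes the same loop,
 * computing the deadline as (utimeout * HZ / 1000) + jiffies, which is
 * what msecs_to_jiffies() expresses here.
 */
static int poll_status_clear(u32 (*read_status)(void *ctx), void *ctx,
			     u32 mask, unsigned int timeout_ms)
{
	unsigned long end_jiffies = jiffies + msecs_to_jiffies(timeout_ms);

	while (read_status(ctx) & mask) {
		if (time_after(jiffies, end_jiffies))
			return -ETIME;
		usleep_range(500, 1000);
	}
	return 0;
}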
-- */ -- local_irq_save(flags); -- preempt_disable(); -- retry = 5000; /* wait for 500 usecs at maximum */ -- while (true) { -- temp = RREG32(mmCP_HQD_IQ_TIMER); -- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) { -- pr_debug("HW is processing IQ\n"); -- goto loop; -- } -- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) { -- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE) -- == 3) /* SEM-rearm is safe */ -- break; -- /* Wait time 3 is safe for CP, but our MMIO read/write -- * time is close to 1 microsecond, so check for 10 to -- * leave more buffer room -- */ -- if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME) -- >= 10) -- break; -- pr_debug("IQ timer is active\n"); -- } else -- break; --loop: -- if (!retry) { -- pr_err("CP HQD IQ timer status time out\n"); -- break; -- } -- ndelay(100); -- --retry; -- } -- retry = 1000; -- while (true) { -- temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST); -- if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK)) -- break; -- pr_debug("Dequeue request is pending\n"); -- -- if (!retry) { -- pr_err("CP HQD dequeue request time out\n"); -- break; -- } -- ndelay(100); -- --retry; -- } -- local_irq_restore(flags); -- preempt_enable(); --#endif -- -- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type); -- -- end_jiffies = (utimeout * HZ / 1000) + jiffies; -- while (true) { -- temp = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); -- if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) -- break; -- if (time_after(jiffies, end_jiffies)) { -- pr_err("cp queue preemption time out.\n"); -- release_queue(kgd); -- return -ETIME; -- } -- usleep_range(500, 1000); -- } -- -- release_queue(kgd); -- return 0; --} -- --static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, -- unsigned int utimeout) --{ -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- struct v9_sdma_mqd *m; -- uint32_t sdma_base_addr; -- uint32_t temp; -- unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; -- -- m = get_sdma_mqd(mqd); -- sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id, -- m->sdma_queue_id); -- -- temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL); -- temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK; -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, temp); -- -- while (true) { -- temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); -- if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) -- break; -- if (time_after(jiffies, end_jiffies)) -- return -ETIME; -- usleep_range(500, 1000); -- } -- -- WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); -- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, -- RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | -- SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); -- -- m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); -- m->sdmax_rlcx_rb_rptr_hi = -- RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI); -- -- return 0; --} -- --static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, -- uint8_t vmid) --{ -- uint32_t reg; -- struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -- -- reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) -- + vmid); -- return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; --} -- --static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, -- uint8_t vmid) --{ -- uint32_t reg; -- struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -- -- reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) -- + vmid); -- return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; --} -- --static void write_vmid_invalidate_request(struct kgd_dev 
*kgd, uint8_t vmid) --{ -- struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -- uint32_t req = (1 << vmid) | -- (1 << VM_INVALIDATE_ENG16_REQ__FLUSH_TYPE__SHIFT) | /* light */ -- VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PTES_MASK | -- VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE0_MASK | -- VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE1_MASK | -- VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE2_MASK | -- VM_INVALIDATE_ENG16_REQ__INVALIDATE_L1_PTES_MASK; -- -- spin_lock(&adev->tlb_invalidation_lock); -- -- /* Use lightweight invalidation. -- * -- * TODO 1: agree on the right set of invalidation registers for -- * KFD use. Use the last one for now. Invalidate both GC and -- * MMHUB. -- * -- * TODO 2: support range-based invalidation, requires kfd2kgd -- * interface change -- */ -- WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_LO32), -- 0xffffffff); -- WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_HI32), -- 0x0000001f); -- -- WREG32(SOC15_REG_OFFSET(MMHUB, 0, -- mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32), -- 0xffffffff); -- WREG32(SOC15_REG_OFFSET(MMHUB, 0, -- mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32), -- 0x0000001f); -- -- WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_REQ), req); -- -- WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_INVALIDATE_ENG16_REQ), -- req); -- -- while (!(RREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ACK)) & -- (1 << vmid))) -- cpu_relax(); -- -- while (!(RREG32(SOC15_REG_OFFSET(MMHUB, 0, -- mmMMHUB_VM_INVALIDATE_ENG16_ACK)) & -- (1 << vmid))) -- cpu_relax(); -- -- spin_unlock(&adev->tlb_invalidation_lock); -- --} -- --static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid) --{ -- signed long r; -- struct dma_fence *f; -- struct amdgpu_ring *ring = &adev->gfx.kiq.ring; -- -- mutex_lock(&adev->gfx.kiq.ring_mutex); -- amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs package */ -- amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0)); -- amdgpu_ring_write(ring, -- PACKET3_INVALIDATE_TLBS_DST_SEL(1) | -- PACKET3_INVALIDATE_TLBS_ALL_HUB(1) | -- PACKET3_INVALIDATE_TLBS_PASID(pasid) | -- PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(2)); -- amdgpu_fence_emit(ring, &f); -- amdgpu_ring_commit(ring); -- mutex_unlock(&adev->gfx.kiq.ring_mutex); -- -- r = dma_fence_wait(f, false); -- if (r) -- DRM_ERROR("wait for kiq fence error: %ld.\n", r); -- dma_fence_put(f); -- -- return r; --} -- --static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid) --{ -- struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -- int vmid; -- struct amdgpu_ring *ring = &adev->gfx.kiq.ring; -- -- if (ring->ready) -- return invalidate_tlbs_with_kiq(adev, pasid); -- -- for (vmid = 0; vmid < 16; vmid++) { -- if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) -- continue; -- if (get_atc_vmid_pasid_mapping_valid(kgd, vmid)) { -- if (get_atc_vmid_pasid_mapping_pasid(kgd, vmid) -- == pasid) { -- write_vmid_invalidate_request(kgd, vmid); -- break; -- } -- } -- } -- -- return 0; --} -- --static int kgd_address_watch_disable(struct kgd_dev *kgd) --{ -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- union TCP_WATCH_CNTL_BITS cntl; -- unsigned int i; -- uint32_t watch_base_addr; -- -- cntl.u32All = 0; -- -- cntl.bitfields.valid = 0; -- cntl.bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK; -- cntl.bitfields.atc = 1; -- -- watch_base_addr = get_watch_base_addr(); -- /* Turning off this address until we set all the registers */ -- for (i = 0; i < MAX_WATCH_ADDRESSES; i++) -- WREG32(watch_base_addr + -- watchRegs[i *
ADDRESS_WATCH_REG_MAX + -- ADDRESS_WATCH_REG_CNTL], -- cntl.u32All); -- -- return 0; --} -- --static int kgd_address_watch_execute(struct kgd_dev *kgd, -- unsigned int watch_point_id, -- uint32_t cntl_val, -- uint32_t addr_hi, -- uint32_t addr_lo) --{ -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- union TCP_WATCH_CNTL_BITS cntl; -- uint32_t watch_base_addr; -- -- watch_base_addr = get_watch_base_addr(); -- cntl.u32All = cntl_val; -- -- /* Turning off this watch point until we set all the registers */ -- cntl.bitfields.valid = 0; -- WREG32(watch_base_addr + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], -- cntl.u32All); -- -- WREG32(watch_base_addr + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_HI], -- addr_hi); -- -- WREG32(watch_base_addr + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_LO], -- addr_lo); -- -- /* Enable the watch point */ -- cntl.bitfields.valid = 1; -- -- WREG32(watch_base_addr + -- watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + -- ADDRESS_WATCH_REG_CNTL], -- cntl.u32All); -- -- return 0; --} -- --static int kgd_wave_control_execute(struct kgd_dev *kgd, -- uint32_t gfx_index_val, -- uint32_t sq_cmd) --{ -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- uint32_t data = 0; -- -- mutex_lock(&adev->grbm_idx_mutex); -- -- WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), gfx_index_val); -- WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CMD), sq_cmd); -- -- data = REG_SET_FIELD(data, GRBM_GFX_INDEX, -- INSTANCE_BROADCAST_WRITES, 1); -- data = REG_SET_FIELD(data, GRBM_GFX_INDEX, -- SH_BROADCAST_WRITES, 1); -- data = REG_SET_FIELD(data, GRBM_GFX_INDEX, -- SE_BROADCAST_WRITES, 1); -- -- WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data); -- mutex_unlock(&adev->grbm_idx_mutex); -- -- return 0; --} -- --static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, -- unsigned int watch_point_id, -- unsigned int reg_offset) --{ -- return get_watch_base_addr() + -- watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + reg_offset]; --} -- --static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, -- uint8_t element_size, uint8_t index_stride, uint8_t mtype) --{ -- /* No longer needed on GFXv9. These values are now hard-coded, -- * except for the MTYPE which comes from the page table. -- */ -- -- return 0; --} --static int alloc_memory_of_scratch(struct kgd_dev *kgd, -- uint64_t va, uint32_t vmid) --{ -- /* No longer needed on GFXv9. The scratch base address is -- * passed to the shader by the CP. It's the user mode driver's -- * responsibility. -- */ -- -- return 0; --} -- --/* FIXME: Does this need to be ASIC-specific code? 
*/ --static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) --{ -- struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -- const union amdgpu_firmware_header *hdr; -- -- switch (type) { -- case KGD_ENGINE_PFP: -- hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data; -- break; -- -- case KGD_ENGINE_ME: -- hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data; -- break; -- -- case KGD_ENGINE_CE: -- hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data; -- break; -- -- case KGD_ENGINE_MEC1: -- hdr = (const union amdgpu_firmware_header *)adev->gfx.mec_fw->data; -- break; -- -- case KGD_ENGINE_MEC2: -- hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data; -- break; -- -- case KGD_ENGINE_RLC: -- hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data; -- break; -- -- case KGD_ENGINE_SDMA1: -- hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data; -- break; -- -- case KGD_ENGINE_SDMA2: -- hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data; -- break; -- -- default: -- return 0; -- } -- -- if (hdr == NULL) -- return 0; -- -- /* Only 12 bit in use*/ -- return hdr->common.ucode_version; --} -- --static void set_num_of_requests(struct kgd_dev *kgd, -- uint8_t num_of_requests) --{ -- pr_debug("This is a stub\n"); --} -- --static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, -- uint32_t page_table_base) --{ -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- uint64_t base = (uint64_t)page_table_base << PAGE_SHIFT | -- AMDGPU_PTE_VALID; -- -- /* TODO: Don't use hardcoded VMIDs */ -- if (vmid < 8 || vmid > 15) { -- pr_err("trying to set page table base for wrong VMID %u\n", -- vmid); -- return; -- } -- -- /* TODO: take advantage of per-process address space size. For -- * now, all processes share the same address space size, like -- * on GFX8 and older. 
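The block of WREG32 calls that follows programs one register pair per VMID, strided by vmid * 2 because each VM context owns consecutive LO32/HI32 registers. A compact sketch of that addressing scheme; set_pt_base() and its reg_write callback are illustrative stand-ins for the patch's direct WREG32 calls:

#include <linux/types.h>
#include <linux/kernel.h>	/* lower_32_bits() / upper_32_bits() */

/* Sketch: split a 64-bit page-table base (already composed as
 * ((u64)page_table_base << PAGE_SHIFT) | AMDGPU_PTE_VALID, exactly as
 * in the function above) across a per-VMID LO32/HI32 register pair.
 */
static void set_pt_base(void (*reg_write)(u32 reg, u32 val),
			u32 lo_reg, u32 hi_reg, u32 vmid, u64 base)
{
	/* Each VMID context owns its own pair, hence the vmid * 2 stride. */
	reg_write(lo_reg + vmid * 2, lower_32_bits(base));
	reg_write(hi_reg + vmid * 2, upper_32_bits(base));
}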
-- */ -- WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0); -- WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0); -- -- WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2), -- lower_32_bits(adev->vm_manager.max_pfn - 1)); -- WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2), -- upper_32_bits(adev->vm_manager.max_pfn - 1)); -- -- WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base)); -- WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base)); -- -- WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0); -- WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0); -- -- WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2), -- lower_32_bits(adev->vm_manager.max_pfn - 1)); -- WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2), -- upper_32_bits(adev->vm_manager.max_pfn - 1)); -- -- WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base)); -- WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base)); --} -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c -deleted file mode 100644 -index 7df892d..0000000 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c -+++ /dev/null -@@ -1,2578 +0,0 @@ --/* -- * Copyright 2014 Advanced Micro Devices, Inc. -- * -- * Permission is hereby granted, free of charge, to any person obtaining a -- * copy of this software and associated documentation files (the "Software"), -- * to deal in the Software without restriction, including without limitation -- * the rights to use, copy, modify, merge, publish, distribute, sublicense, -- * and/or sell copies of the Software, and to permit persons to whom the -- * Software is furnished to do so, subject to the following conditions: -- * -- * The above copyright notice and this permission notice shall be included in -- * all copies or substantial portions of the Software. -- * -- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -- * OTHER DEALINGS IN THE SOFTWARE. -- */ -- --#undef pr_fmt --#define pr_fmt(fmt) "kfd2kgd: " fmt -- --#include <linux/module.h> --#include <linux/fdtable.h> --#include <linux/uaccess.h> --#include <linux/firmware.h> --#include <linux/list.h> --#include <linux/sched/mm.h> --#include <drm/drmP.h> --#include <linux/dma-buf.h> --#include <linux/pagemap.h> --#include "amdgpu_amdkfd.h" --#include "amdgpu_ucode.h" --#include "gca/gfx_8_0_sh_mask.h" --#include "gca/gfx_8_0_d.h" --#include "gca/gfx_8_0_enum.h" --#include "oss/oss_3_0_sh_mask.h" --#include "oss/oss_3_0_d.h" --#include "gmc/gmc_8_1_sh_mask.h" --#include "gmc/gmc_8_1_d.h" -- --/* Special VM and GART address alignment needed for VI pre-Fiji due to -- * a HW bug. 
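One numeric aside before the definitions below: amdgpu_amdkfd_gpuvm_init_mem_limits() computes its caps as mem - (mem >> 4), which yields 15/16 of system RAM without the 64-bit overflow a literal mem * 15 / 16 could hit on very large systems. A quick standalone check; the helper name is illustrative, not from the patch:

#include <linux/types.h>

/* mem - (mem >> 4) keeps 15/16 of mem (exact when mem is a multiple
 * of 16) and never overflows, whereas the intermediate mem * 15 can
 * exceed 64 bits once mem goes above roughly 2^60 bytes.
 */
static inline u64 fifteen_sixteenths(u64 mem)
{
	return mem - (mem >> 4);
}

/* e.g. fifteen_sixteenths(16ULL << 30) == 15ULL << 30: a 16 GiB system
 * is capped at 15 GiB, matching the "15/16th System RAM" comment below.
 */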
-- */ --#define VI_BO_SIZE_ALIGN (0x8000) -- --/* BO flag to indicate a KFD userptr BO */ --#define AMDGPU_AMDKFD_USERPTR_BO (1ULL << 63) -- --/* Impose limit on how much memory KFD can use */ --struct kfd_mem_usage_limit { -- uint64_t max_system_mem_limit; -- uint64_t max_userptr_mem_limit; -- int64_t system_mem_used; -- int64_t userptr_mem_used; -- spinlock_t mem_limit_lock; --}; -- --static struct kfd_mem_usage_limit kfd_mem_limit; -- --/* Struct used for amdgpu_amdkfd_bo_validate */ --struct amdgpu_vm_parser { -- uint32_t domain; -- bool wait; --}; -- --static const char * const domain_bit_to_string[] = { -- "CPU", -- "GTT", -- "VRAM", -- "GDS", -- "GWS", -- "OA" --}; -- --#define domain_string(domain) domain_bit_to_string[ffs(domain)-1] -- --static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work); -- -- --static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) --{ -- return (struct amdgpu_device *)kgd; --} -- --static bool check_if_add_bo_to_vm(struct amdgpu_vm *avm, -- struct kgd_mem *mem) --{ -- struct kfd_bo_va_list *entry; -- -- list_for_each_entry(entry, &mem->bo_va_list, bo_list) -- if (entry->bo_va->base.vm == avm) -- return false; -- -- return true; --} -- --/* Set memory usage limits. Currently, the limits are -- * System (kernel) memory - 15/16th System RAM -- * Userptr memory - 15/16th System RAM -- */ --void amdgpu_amdkfd_gpuvm_init_mem_limits(void) --{ -- struct sysinfo si; -- uint64_t mem; -- -- si_meminfo(&si); -- mem = si.totalram - si.totalhigh; -- mem *= si.mem_unit; -- -- spin_lock_init(&kfd_mem_limit.mem_limit_lock); -- kfd_mem_limit.max_system_mem_limit = mem - (mem >> 4); /* 15/16 */ -- kfd_mem_limit.max_userptr_mem_limit = mem - (mem >> 4); /* 15/16 */ -- pr_debug("Kernel memory limit %lluM, userptr limit %lluM\n", -- (kfd_mem_limit.max_system_mem_limit >> 20), -- (kfd_mem_limit.max_userptr_mem_limit >> 20)); --} -- --static int amdgpu_amdkfd_reserve_system_mem_limit(struct amdgpu_device *adev, -- uint64_t size, u32 domain) --{ -- size_t acc_size; -- int ret = 0; -- -- acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size, -- sizeof(struct amdgpu_bo)); -- -- spin_lock(&kfd_mem_limit.mem_limit_lock); -- if (domain == AMDGPU_GEM_DOMAIN_GTT) { -- if (kfd_mem_limit.system_mem_used + (acc_size + size) > -- kfd_mem_limit.max_system_mem_limit) { -- ret = -ENOMEM; -- goto err_no_mem; -- } -- kfd_mem_limit.system_mem_used += (acc_size + size); -- } else if (domain == AMDGPU_GEM_DOMAIN_CPU) { -- if ((kfd_mem_limit.system_mem_used + acc_size > -- kfd_mem_limit.max_system_mem_limit) || -- (kfd_mem_limit.userptr_mem_used + (size + acc_size) > -- kfd_mem_limit.max_userptr_mem_limit)) { -- ret = -ENOMEM; -- goto err_no_mem; -- } -- kfd_mem_limit.system_mem_used += acc_size; -- kfd_mem_limit.userptr_mem_used += size; -- } --err_no_mem: -- spin_unlock(&kfd_mem_limit.mem_limit_lock); -- return ret; --} -- --static void unreserve_system_mem_limit(struct amdgpu_device *adev, -- uint64_t size, u32 domain) --{ -- size_t acc_size; -- -- acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size, -- sizeof(struct amdgpu_bo)); -- -- spin_lock(&kfd_mem_limit.mem_limit_lock); -- if (domain == AMDGPU_GEM_DOMAIN_GTT) { -- kfd_mem_limit.system_mem_used -= (acc_size + size); -- } else if (domain == AMDGPU_GEM_DOMAIN_CPU) { -- kfd_mem_limit.system_mem_used -= acc_size; -- kfd_mem_limit.userptr_mem_used -= size; -- } -- WARN_ONCE(kfd_mem_limit.system_mem_used < 0, -- "kfd system memory accounting unbalanced"); -- WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0, -- "kfd
userptr memory accounting unbalanced"); -- -- spin_unlock(&kfd_mem_limit.mem_limit_lock); --} -- --void amdgpu_amdkfd_unreserve_system_memory_limit(struct amdgpu_bo *bo) --{ -- spin_lock(&kfd_mem_limit.mem_limit_lock); -- -- if (bo->flags & AMDGPU_AMDKFD_USERPTR_BO) { -- kfd_mem_limit.system_mem_used -= bo->tbo.acc_size; -- kfd_mem_limit.userptr_mem_used -= amdgpu_bo_size(bo); -- } else if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT) { -- kfd_mem_limit.system_mem_used -= -- (bo->tbo.acc_size + amdgpu_bo_size(bo)); -- } -- WARN_ONCE(kfd_mem_limit.system_mem_used < 0, -- "kfd system memory accounting unbalanced"); -- WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0, -- "kfd userptr memory accounting unbalanced"); -- -- spin_unlock(&kfd_mem_limit.mem_limit_lock); --} -- -- --/* amdgpu_amdkfd_remove_eviction_fence - Removes eviction fence(s) from BO's -- * reservation object. -- * -- * @bo: [IN] Remove eviction fence(s) from this BO -- * @ef: [IN] If ef is specified, then this eviction fence is removed if it -- * is present in the shared list. -- * @ef_list: [OUT] Returns list of eviction fences. These fences are removed -- * from BO's reservation object shared list. -- * @ef_count: [OUT] Number of fences in ef_list. -- * -- * NOTE: If called with ef_list, then amdgpu_amdkfd_add_eviction_fence must be -- * called to restore the eviction fences and to avoid a memory leak. This is -- * useful for shared BOs. -- * NOTE: Must be called with BO reserved i.e. bo->tbo.resv->lock held. -- */ --static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo, -- struct amdgpu_amdkfd_fence *ef, -- struct amdgpu_amdkfd_fence ***ef_list, -- unsigned int *ef_count) --{ -- struct reservation_object_list *fobj; -- struct reservation_object *resv; -- unsigned int i = 0, j = 0, k = 0, shared_count; -- unsigned int count = 0; -- struct amdgpu_amdkfd_fence **fence_list; -- -- if (!ef && !ef_list) -- return -EINVAL; -- -- if (ef_list) { -- *ef_list = NULL; -- *ef_count = 0; -- } -- -- resv = bo->tbo.resv; -- fobj = reservation_object_get_list(resv); -- -- if (!fobj) -- return 0; -- -- preempt_disable(); -- write_seqcount_begin(&resv->seq); -- -- /* Go through all the shared fences in the reservation object. If -- * ef is specified and it exists in the list, remove it and reduce the -- * count. If ef is not specified, then get the count of eviction fences -- * present. -- */ -- shared_count = fobj->shared_count; -- for (i = 0; i < shared_count; ++i) { -- struct dma_fence *f; -- -- f = rcu_dereference_protected(fobj->shared[i], -- reservation_object_held(resv)); -- -- if (ef) { -- if (f->context == ef->base.context) { -- dma_fence_put(f); -- fobj->shared_count--; -- } else -- RCU_INIT_POINTER(fobj->shared[j++], f); -- -- } else if (to_amdgpu_amdkfd_fence(f)) -- count++; -- } -- write_seqcount_end(&resv->seq); -- preempt_enable(); -- -- if (ef || !count) -- return 0; -- -- /* Alloc memory for count number of eviction fence pointers.
Fill the -- * ef_list array and ef_count -- */ -- -- fence_list = kcalloc(count, sizeof(struct amdgpu_amdkfd_fence *), -- GFP_KERNEL); -- if (!fence_list) -- return -ENOMEM; -- -- preempt_disable(); -- write_seqcount_begin(&resv->seq); -- -- j = 0; -- for (i = 0; i < shared_count; ++i) { -- struct dma_fence *f; -- struct amdgpu_amdkfd_fence *efence; -- -- f = rcu_dereference_protected(fobj->shared[i], -- reservation_object_held(resv)); -- -- efence = to_amdgpu_amdkfd_fence(f); -- if (efence) { -- fence_list[k++] = efence; -- fobj->shared_count--; -- } else -- RCU_INIT_POINTER(fobj->shared[j++], f); -- } -- -- write_seqcount_end(&resv->seq); -- preempt_enable(); -- -- *ef_list = fence_list; -- *ef_count = k; -- -- return 0; --} -- --/* amdgpu_amdkfd_add_eviction_fence - Adds eviction fence(s) back into BO's -- * reservation object. -- * -- * @bo: [IN] Add eviction fences to this BO -- * @ef_list: [IN] List of eviction fences to be added -- * @ef_count: [IN] Number of fences in ef_list. -- * -- * NOTE: Must call amdgpu_amdkfd_remove_eviction_fence before calling this -- * function. -- */ --static void amdgpu_amdkfd_add_eviction_fence(struct amdgpu_bo *bo, -- struct amdgpu_amdkfd_fence **ef_list, -- unsigned int ef_count) --{ -- int i; -- -- if (!ef_list || !ef_count) -- return; -- -- for (i = 0; i < ef_count; i++) { -- amdgpu_bo_fence(bo, &ef_list[i]->base, true); -- /* Re-adding the fence takes an additional reference. Drop that -- * reference. -- */ -- dma_fence_put(&ef_list[i]->base); -- } -- -- kfree(ef_list); --} -- --static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain, -- bool wait) --{ -- int ret; -- -- if (WARN(amdgpu_ttm_tt_get_usermm(bo->tbo.ttm), -- "Called with userptr BO")) -- return -EINVAL; -- -- amdgpu_ttm_placement_from_domain(bo, domain); -- -- ret = ttm_bo_validate(&bo->tbo, &bo->placement, false, false); -- if (ret) -- goto validate_fail; -- if (wait) { -- struct amdgpu_amdkfd_fence **ef_list; -- unsigned int ef_count; -- -- ret = amdgpu_amdkfd_remove_eviction_fence(bo, NULL, &ef_list, -- &ef_count); -- if (ret) -- goto validate_fail; -- -- ttm_bo_wait(&bo->tbo, false, false); -- amdgpu_amdkfd_add_eviction_fence(bo, ef_list, ef_count); -- } -- --validate_fail: -- return ret; --} -- --static int amdgpu_amdkfd_validate(void *param, struct amdgpu_bo *bo) --{ -- struct amdgpu_vm_parser *p = param; -- -- return amdgpu_amdkfd_bo_validate(bo, p->domain, p->wait); --} -- --/* vm_validate_pt_pd_bos - Validate page table and directory BOs -- * -- * Also updates page directory entries so we don't need to do this -- * again later until the page directory is validated again (e.g. after -- * an eviction or allocating new page tables). -- */ --static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm) --{ -- struct amdgpu_bo *pd = vm->root.base.bo; -- struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev); -- struct amdgpu_vm_parser param; -- int ret; -- -- param.domain = AMDGPU_GEM_DOMAIN_VRAM; -- param.wait = false; -- -- ret = amdgpu_vm_validate_pt_bos(adev, vm, amdgpu_amdkfd_validate, -- &param); -- if (ret) { -- pr_err("amdgpu: failed to validate PT BOs\n"); -- return ret; -- } -- -- ret = amdgpu_amdkfd_validate(&param, pd); -- if (ret) { -- pr_err("amdgpu: failed to validate PD\n"); -- return ret; -- } -- -- ret = amdgpu_vm_update_directories(adev, vm); -- if (ret != 0) -- return ret; -- -- return 0; --} -- --/* add_bo_to_vm - Add a BO to a VM -- * -- * Everything that needs to be done only once when a BO is first added -- * to a VM.
It can later be mapped and unmapped many times without -- * repeating these steps. -- * -- * 1. Allocate and initialize BO VA entry data structure -- * 2. Add BO to the VM -- * 3. Determine ASIC-specific PTE flags -- * 4. Alloc page tables and directories if needed -- * 4a. Validate new page tables and directories and update directories -- */ --static int add_bo_to_vm(struct amdgpu_device *adev, struct kgd_mem *mem, -- struct amdgpu_vm *avm, bool is_aql, -- struct kfd_bo_va_list **p_bo_va_entry) --{ -- int ret; -- struct kfd_bo_va_list *bo_va_entry; -- struct amdkfd_vm *kvm = container_of(avm, -- struct amdkfd_vm, base); -- struct amdgpu_bo *pd = avm->root.base.bo; -- struct amdgpu_bo *bo = mem->bo; -- uint64_t va = mem->va; -- struct list_head *list_bo_va = &mem->bo_va_list; -- unsigned long bo_size = bo->tbo.mem.size; -- -- if (!va) { -- pr_err("Invalid VA when adding BO to VM\n"); -- return -EINVAL; -- } -- -- if (is_aql) -- va += bo_size; -- -- bo_va_entry = kzalloc(sizeof(*bo_va_entry), GFP_KERNEL); -- if (!bo_va_entry) -- return -ENOMEM; -- -- pr_debug("\t add VA 0x%llx - 0x%llx to vm %p\n", va, -- va + bo_size, avm); -- -- /* Add BO to VM internal data structures */ -- bo_va_entry->bo_va = amdgpu_vm_bo_add(adev, avm, bo); -- if (bo_va_entry->bo_va == NULL) { -- ret = -EINVAL; -- pr_err("Failed to add BO object to VM. ret == %d\n", -- ret); -- goto err_vmadd; -- } -- -- bo_va_entry->va = va; -- bo_va_entry->pte_flags = amdgpu_vm_get_pte_flags(adev, -- mem->mapping_flags); -- bo_va_entry->kgd_dev = (void *)adev; -- list_add(&bo_va_entry->bo_list, list_bo_va); -- -- if (p_bo_va_entry) -- *p_bo_va_entry = bo_va_entry; -- -- /* Allocate new page tables if needed and validate -- * them. Clearing and validating new page tables needs to wait -- * on move fences. We don't want that to trigger the eviction -- * fence, so remove it temporarily. -- */ -- amdgpu_amdkfd_remove_eviction_fence(pd, -- kvm->process_info->eviction_fence, -- NULL, NULL); -- -- ret = amdgpu_vm_alloc_pts(adev, avm, va, amdgpu_bo_size(bo)); -- if (ret) { -- pr_err("Failed to allocate pts, err=%d\n", ret); -- goto err_alloc_pts; -- } -- -- ret = vm_validate_pt_pd_bos(avm); -- if (ret != 0) { -- pr_err("validate_pt_pd_bos() failed\n"); -- goto err_alloc_pts; -- } -- -- /* Add the eviction fence back */ -- amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true); -- -- return 0; -- --err_alloc_pts: -- amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true); -- amdgpu_vm_bo_rmv(adev, bo_va_entry->bo_va); -- list_del(&bo_va_entry->bo_list); --err_vmadd: -- kfree(bo_va_entry); -- return ret; --} -- --static void remove_bo_from_vm(struct amdgpu_device *adev, -- struct kfd_bo_va_list *entry, unsigned long size) --{ -- pr_debug("\t remove VA 0x%llx - 0x%llx in entry %p\n", -- entry->va, -- entry->va + size, entry); -- amdgpu_vm_bo_rmv(adev, entry->bo_va); -- list_del(&entry->bo_list); -- kfree(entry); --} -- --static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem, -- struct amdkfd_process_info *process_info, -- bool userptr) --{ -- struct ttm_validate_buffer *entry = &mem->validate_list; -- struct amdgpu_bo *bo = mem->bo; -- -- INIT_LIST_HEAD(&entry->head); -- entry->shared = true; -- entry->bo = &bo->tbo; -- mutex_lock(&process_info->lock); -- if (userptr) -- list_add_tail(&entry->head, &process_info->userptr_valid_list); -- else -- list_add_tail(&entry->head, &process_info->kfd_bo_list); -- mutex_unlock(&process_info->lock); --} -- --/* Initializes user pages.
It registers the MMU notifier and validates -- * the userptr BO in the GTT domain. -- * -- * The BO must already be on the userptr_valid_list. Otherwise an -- * eviction and restore may happen that leaves the new BO unmapped -- * with the user mode queues running. -- * -- * Takes the process_info->lock to protect against concurrent restore -- * workers. -- * -- * Returns 0 for success, negative errno for errors. -- */ --static int init_user_pages(struct kgd_mem *mem, struct mm_struct *mm, -- uint64_t user_addr) --{ -- struct amdkfd_process_info *process_info = mem->process_info; -- struct amdgpu_bo *bo = mem->bo; -- int ret = 0; -- -- mutex_lock(&process_info->lock); -- -- ret = amdgpu_ttm_tt_set_userptr(bo->tbo.ttm, user_addr, 0); -- if (ret) { -- pr_err("%s: Failed to set userptr: %d\n", __func__, ret); -- goto out; -- } -- -- ret = amdgpu_mn_register(bo, user_addr); -- if (ret) { -- pr_err("%s: Failed to register MMU notifier: %d\n", -- __func__, ret); -- goto out; -- } -- -- /* If no restore worker is running concurrently, user_pages -- * should not be allocated -- */ -- WARN(mem->user_pages, "Leaking user_pages array"); -- --#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) -- mem->user_pages = drm_calloc_large(bo->tbo.ttm->num_pages, -- sizeof(struct page *)); --#else -- mem->user_pages = kvmalloc_array(bo->tbo.ttm->num_pages, -- sizeof(struct page *), -- GFP_KERNEL | __GFP_ZERO); --#endif -- if (!mem->user_pages) { -- pr_err("%s: Failed to allocate pages array\n", __func__); -- ret = -ENOMEM; -- goto unregister_out; -- } -- -- ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, mem->user_pages); -- if (ret) { -- pr_err("%s: Failed to get user pages: %d\n", __func__, ret); -- goto free_out; -- } -- -- amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, mem->user_pages); -- -- ret = amdgpu_bo_reserve(bo, true); -- if (ret) { -- pr_err("%s: Failed to reserve BO\n", __func__); -- goto release_out; -- } -- amdgpu_ttm_placement_from_domain(bo, mem->domain); -- ret = ttm_bo_validate(&bo->tbo, &bo->placement, -- true, false); -- if (ret) -- pr_err("%s: failed to validate BO\n", __func__); -- amdgpu_bo_unreserve(bo); -- --release_out: -- if (ret) -- release_pages(mem->user_pages, bo->tbo.ttm->num_pages, 0); --free_out: --#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) -- drm_free_large(mem->user_pages); --#else -- kvfree(mem->user_pages); --#endif -- mem->user_pages = NULL; --unregister_out: -- if (ret) -- amdgpu_mn_unregister(bo); --out: -- mutex_unlock(&process_info->lock); -- return ret; --} -- --static int __map_bo_to_kernel(struct amdgpu_bo *bo, u32 domain, void **kptr) --{ -- int ret; -- -- ret = amdgpu_bo_reserve(bo, true); -- if (ret) { -- pr_err("Failed to reserve bo. ret %d\n", ret); -- return ret; -- } -- -- ret = amdgpu_bo_pin(bo, domain, NULL); -- if (ret) { -- pr_err("Failed to pin bo. ret %d\n", ret); -- goto pin_failed; -- } -- -- ret = amdgpu_bo_kmap(bo, kptr); -- if (ret) { -- pr_err("Failed to map bo to kernel. 
ret %d\n", ret); -- goto kmap_failed; -- } -- -- amdgpu_bo_unreserve(bo); -- -- return ret; -- --kmap_failed: -- amdgpu_bo_unpin(bo); --pin_failed: -- amdgpu_bo_unreserve(bo); -- -- return ret; --} -- --static int __alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va, -- uint64_t size, void *vm, struct kgd_mem **mem, -- uint64_t *offset, u32 domain, u64 flags, -- struct sg_table *sg, bool aql_queue, -- bool readonly, bool execute, bool coherent, bool no_sub, -- bool userptr) --{ -- struct amdgpu_device *adev; -- int ret; -- struct amdgpu_bo *bo; -- uint64_t user_addr = 0; -- int byte_align; -- u32 alloc_domain; -- uint32_t mapping_flags; -- struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm; -- -- if (aql_queue) -- size = size >> 1; -- if (userptr) { -- if (!offset || !*offset) -- return -EINVAL; -- user_addr = *offset; -- } -- -- adev = get_amdgpu_device(kgd); -- byte_align = (adev->family == AMDGPU_FAMILY_VI && -- adev->asic_type != CHIP_FIJI && -- adev->asic_type != CHIP_POLARIS10 && -- adev->asic_type != CHIP_POLARIS11) ? -- VI_BO_SIZE_ALIGN : 1; -- -- *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); -- if (*mem == NULL) { -- ret = -ENOMEM; -- goto err; -- } -- INIT_LIST_HEAD(&(*mem)->bo_va_list); -- mutex_init(&(*mem)->lock); -- (*mem)->coherent = coherent; -- (*mem)->no_substitute = no_sub; -- (*mem)->aql_queue = aql_queue; -- -- mapping_flags = AMDGPU_VM_PAGE_READABLE; -- if (!readonly) -- mapping_flags |= AMDGPU_VM_PAGE_WRITEABLE; -- if (execute) -- mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE; -- if (coherent) -- mapping_flags |= AMDGPU_VM_MTYPE_UC; -- else -- mapping_flags |= AMDGPU_VM_MTYPE_NC; -- -- (*mem)->mapping_flags = mapping_flags; -- -- alloc_domain = userptr ? AMDGPU_GEM_DOMAIN_CPU : domain; -- -- amdgpu_sync_create(&(*mem)->sync); -- -- ret = amdgpu_amdkfd_reserve_system_mem_limit(adev, size, alloc_domain); -- if (ret) { -- pr_err("Insufficient system memory\n"); -- goto err_bo_create; -- } -- -- pr_debug("\t create BO VA 0x%llx size 0x%llx domain %s\n", -- va, size, domain_string(alloc_domain)); -- -- /* Allocate buffer object. Userptr objects need to start out -- * in the CPU domain, get moved to GTT when pinned. -- */ -- ret = amdgpu_bo_create(adev, size, byte_align, false, -- alloc_domain, -- flags, sg, NULL, 0, &bo); -- if (ret != 0) { -- pr_err("Failed to create BO on domain %s. ret %d\n", -- domain_string(alloc_domain), ret); -- unreserve_system_mem_limit(adev, size, alloc_domain); -- goto err_bo_create; -- } -- bo->kfd_bo = *mem; -- (*mem)->bo = bo; -- if (userptr) -- bo->flags |= AMDGPU_AMDKFD_USERPTR_BO; -- -- (*mem)->va = va; -- (*mem)->domain = domain; -- (*mem)->mapped_to_gpu_memory = 0; -- (*mem)->process_info = kfd_vm->process_info; -- add_kgd_mem_to_kfd_bo_list(*mem, kfd_vm->process_info, userptr); -- -- if (userptr) { -- ret = init_user_pages(*mem, current->mm, user_addr); -- if (ret) { -- mutex_lock(&kfd_vm->process_info->lock); -- list_del(&(*mem)->validate_list.head); -- mutex_unlock(&kfd_vm->process_info->lock); -- goto allocate_init_user_pages_failed; -- } -- } -- -- if (offset) -- *offset = amdgpu_bo_mmap_offset(bo); -- -- return 0; -- --allocate_init_user_pages_failed: -- amdgpu_bo_unref(&bo); --err_bo_create: -- kfree(*mem); --err: -- return ret; --} -- --/* Reserving a BO and its page table BOs must happen atomically to -- * avoid deadlocks. When updating userptrs we need to temporarily -- * back-off the reservation and then reacquire it. Track all the -- * reservation info in a context structure. 
Buffers can be mapped to -- * multiple VMs simultaneously (buffers being restored on multiple -- * GPUs). -- */ --struct bo_vm_reservation_context { -- struct amdgpu_bo_list_entry kfd_bo; -- unsigned int n_vms; -- struct amdgpu_bo_list_entry *vm_pd; -- struct ww_acquire_ctx ticket; -- struct list_head list, duplicates; -- struct amdgpu_sync *sync; -- bool reserved; --}; -- --/** -- * reserve_bo_and_vm - reserve a BO and a VM unconditionally. -- * @mem: KFD BO structure. -- * @vm: the VM to reserve. -- * @ctx: the struct that will be used in unreserve_bo_and_vms(). -- */ --static int reserve_bo_and_vm(struct kgd_mem *mem, -- struct amdgpu_vm *vm, -- struct bo_vm_reservation_context *ctx) --{ -- struct amdgpu_bo *bo = mem->bo; -- int ret; -- -- WARN_ON(!vm); -- -- ctx->reserved = false; -- ctx->n_vms = 1; -- ctx->sync = &mem->sync; -- -- INIT_LIST_HEAD(&ctx->list); -- INIT_LIST_HEAD(&ctx->duplicates); -- -- ctx->vm_pd = kzalloc(sizeof(struct amdgpu_bo_list_entry) -- * ctx->n_vms, GFP_KERNEL); -- if (ctx->vm_pd == NULL) -- return -ENOMEM; -- -- ctx->kfd_bo.robj = bo; -- ctx->kfd_bo.priority = 0; -- ctx->kfd_bo.tv.bo = &bo->tbo; -- ctx->kfd_bo.tv.shared = true; -- ctx->kfd_bo.user_pages = NULL; -- list_add(&ctx->kfd_bo.tv.head, &ctx->list); -- -- amdgpu_vm_get_pd_bo(vm, &ctx->list, &ctx->vm_pd[0]); -- -- ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list, -- false, &ctx->duplicates); -- if (!ret) -- ctx->reserved = true; -- else -- pr_err("Failed to reserve buffers in ttm\n"); -- -- if (ret) { -- kfree(ctx->vm_pd); -- ctx->vm_pd = NULL; -- } -- -- return ret; --} -- --enum VA_TYPE { -- VA_NOT_MAPPED = 0, -- VA_MAPPED, -- VA_DO_NOT_CARE, --}; -- --/** -- * reserve_bo_and_vm - reserve a BO and some VMs that the BO has been added -- * to, conditionally based on map_type. -- * @mem: KFD BO structure. -- * @vm: the VM to reserve. If NULL, then all VMs associated with the BO -- * is used. Otherwise, a single VM associated with the BO. -- * @map_type: the mapping status that will be used to filter the VMs. -- * @ctx: the struct that will be used in unreserve_bo_and_vms(). 
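Read together with unreserve_bo_and_vms() below, the intended call shape is a bracket around whatever needs the BO and its page directories reserved. A hedged usage sketch, mirroring how amdgpu_amdkfd_gpuvm_free_memory_of_gpu() later in this patch drives these helpers (error handling trimmed):

struct bo_vm_reservation_context ctx;
int ret;

/* Reserve the BO plus every VM it was added to; VA_DO_NOT_CARE skips
 * the mapped/unmapped filter, exactly as the free path below does.
 */
ret = reserve_bo_and_cond_vms(mem, NULL, VA_DO_NOT_CARE, &ctx);
if (ret)
	return ret;

/* ... operate on mem->bo and the reserved page directories ... */

ret = unreserve_bo_and_vms(&ctx, false /* wait */, false /* intr */);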
-- */ --static int reserve_bo_and_cond_vms(struct kgd_mem *mem, -- struct amdgpu_vm *vm, enum VA_TYPE map_type, -- struct bo_vm_reservation_context *ctx) --{ -- struct amdgpu_bo *bo = mem->bo; -- struct kfd_bo_va_list *entry; -- unsigned int i; -- int ret; -- -- ctx->reserved = false; -- ctx->n_vms = 0; -- ctx->vm_pd = NULL; -- ctx->sync = &mem->sync; -- -- INIT_LIST_HEAD(&ctx->list); -- INIT_LIST_HEAD(&ctx->duplicates); -- -- list_for_each_entry(entry, &mem->bo_va_list, bo_list) { -- if ((vm && vm != entry->bo_va->base.vm) || -- (entry->is_mapped != map_type -- && map_type != VA_DO_NOT_CARE)) -- continue; -- -- ctx->n_vms++; -- } -- -- if (ctx->n_vms != 0) { -- ctx->vm_pd = kzalloc(sizeof(struct amdgpu_bo_list_entry) -- * ctx->n_vms, GFP_KERNEL); -- if (ctx->vm_pd == NULL) -- return -ENOMEM; -- } -- -- ctx->kfd_bo.robj = bo; -- ctx->kfd_bo.priority = 0; -- ctx->kfd_bo.tv.bo = &bo->tbo; -- ctx->kfd_bo.tv.shared = true; -- ctx->kfd_bo.user_pages = NULL; -- list_add(&ctx->kfd_bo.tv.head, &ctx->list); -- -- i = 0; -- list_for_each_entry(entry, &mem->bo_va_list, bo_list) { -- if ((vm && vm != entry->bo_va->base.vm) || -- (entry->is_mapped != map_type -- && map_type != VA_DO_NOT_CARE)) -- continue; -- -- amdgpu_vm_get_pd_bo(entry->bo_va->base.vm, &ctx->list, -- &ctx->vm_pd[i]); -- i++; -- } -- -- ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list, -- false, &ctx->duplicates); -- if (!ret) -- ctx->reserved = true; -- else -- pr_err("Failed to reserve buffers in ttm.\n"); -- -- if (ret) { -- kfree(ctx->vm_pd); -- ctx->vm_pd = NULL; -- } -- -- return ret; --} -- --static int unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx, -- bool wait, bool intr) --{ -- int ret = 0; -- -- if (wait) -- ret = amdgpu_sync_wait(ctx->sync, intr); -- -- if (ctx->reserved) -- ttm_eu_backoff_reservation(&ctx->ticket, &ctx->list); -- kfree(ctx->vm_pd); -- -- ctx->sync = NULL; -- -- ctx->reserved = false; -- ctx->vm_pd = NULL; -- -- return ret; --} -- --static int unmap_bo_from_gpuvm(struct amdgpu_device *adev, -- struct kfd_bo_va_list *entry, -- struct amdgpu_sync *sync) --{ -- struct amdgpu_bo_va *bo_va = entry->bo_va; -- struct amdgpu_vm *vm = bo_va->base.vm; -- struct amdkfd_vm *kvm = container_of(vm, struct amdkfd_vm, base); -- struct amdgpu_bo *pd = vm->root.base.bo; -- -- /* Remove eviction fence from PD (and thereby from PTs too as they -- * share the resv. object. Otherwise during PT update job (see -- * amdgpu_vm_bo_update_mapping), eviction fence will get added to -- * job->sync object -- */ -- amdgpu_amdkfd_remove_eviction_fence(pd, -- kvm->process_info->eviction_fence, -- NULL, NULL); -- amdgpu_vm_bo_unmap(adev, bo_va, entry->va); -- -- amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update); -- -- /* Add the eviction fence back */ -- amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true); -- -- amdgpu_sync_fence(adev, sync, bo_va->last_pt_update); -- -- /* Sync objects can't handle multiple GPUs (contexts) updating -- * sync->last_vm_update. Fortunately we don't need it for -- * KFD's purposes, so we can just drop that fence. 
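The remove/re-add dance around the eviction fence, used in unmap_bo_from_gpuvm() here and again in add_bo_to_vm() and the free path, amounts to a small bracket. A hypothetical helper capturing it; the amdgpu_amdkfd_remove_eviction_fence() and amdgpu_bo_fence() calls are the patch's own, with_eviction_fence_removed() is not:

/* Sketch: detach the KFD eviction fence from the page directory so the
 * enclosed work cannot be mistaken for an eviction trigger, then
 * re-attach the fence whatever the outcome.
 */
static int with_eviction_fence_removed(struct amdgpu_bo *pd,
				       struct amdgpu_amdkfd_fence *ef,
				       int (*work)(void *), void *arg)
{
	int ret;

	amdgpu_amdkfd_remove_eviction_fence(pd, ef, NULL, NULL);
	ret = work(arg);
	amdgpu_bo_fence(pd, &ef->base, true);

	return ret;
}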
-- */ -- if (sync->last_vm_update) { -- dma_fence_put(sync->last_vm_update); -- sync->last_vm_update = NULL; -- } -- -- return 0; --} -- --static int update_gpuvm_pte(struct amdgpu_device *adev, -- struct kfd_bo_va_list *entry, -- struct amdgpu_sync *sync) --{ -- int ret; -- struct amdgpu_vm *vm; -- struct amdgpu_bo_va *bo_va; -- struct amdgpu_bo *bo; -- -- bo_va = entry->bo_va; -- vm = bo_va->base.vm; -- bo = bo_va->base.bo; -- -- /* Update the page tables */ -- ret = amdgpu_vm_bo_update(adev, bo_va, false); -- if (ret != 0) { -- pr_err("amdgpu_vm_bo_update failed\n"); -- return ret; -- } -- -- amdgpu_sync_fence(adev, sync, bo_va->last_pt_update); -- -- /* Sync objects can't handle multiple GPUs (contexts) updating -- * sync->last_vm_update. Fortunately we don't need it for -- * KFD's purposes, so we can just drop that fence. -- */ -- if (sync->last_vm_update) { -- dma_fence_put(sync->last_vm_update); -- sync->last_vm_update = NULL; -- } -- -- return 0; --} -- --static int map_bo_to_gpuvm(struct amdgpu_device *adev, -- struct kfd_bo_va_list *entry, struct amdgpu_sync *sync, -- bool no_update_pte) --{ -- int ret; -- -- /* Set virtual address for the allocation */ -- ret = amdgpu_vm_bo_map(adev, entry->bo_va, entry->va, 0, -- amdgpu_bo_size(entry->bo_va->base.bo), entry->pte_flags); -- if (ret != 0) { -- pr_err("Failed to map VA 0x%llx in vm. ret %d\n", -- entry->va, ret); -- return ret; -- } -- -- if (no_update_pte) -- return 0; -- -- ret = update_gpuvm_pte(adev, entry, sync); -- if (ret != 0) { -- pr_err("update_gpuvm_pte() failed\n"); -- goto update_gpuvm_pte_failed; -- } -- -- return 0; -- --update_gpuvm_pte_failed: -- unmap_bo_from_gpuvm(adev, entry, sync); -- return ret; --} -- --static struct sg_table *create_doorbell_sg(uint64_t addr, uint32_t size) --{ -- struct sg_table *sg = kmalloc(sizeof(*sg), GFP_KERNEL); -- -- if (!sg) -- return NULL; -- if (sg_alloc_table(sg, 1, GFP_KERNEL)) { -- kfree(sg); -- return NULL; -- } -- sg->sgl->dma_address = addr; -- sg->sgl->length = size; --#ifdef CONFIG_NEED_SG_DMA_LENGTH -- sg->sgl->dma_length = size; --#endif -- return sg; --} -- --int amdgpu_amdkfd_gpuvm_sync_memory( -- struct kgd_dev *kgd, struct kgd_mem *mem, bool intr) --{ -- int ret = 0; -- struct amdgpu_sync sync; -- struct amdgpu_device *adev; -- -- adev = get_amdgpu_device(kgd); -- amdgpu_sync_create(&sync); -- -- mutex_lock(&mem->lock); -- amdgpu_sync_clone(adev, &mem->sync, &sync); -- mutex_unlock(&mem->lock); -- -- ret = amdgpu_sync_wait(&sync, intr); -- amdgpu_sync_free(&sync); -- return ret; --} -- --#define BOOL_TO_STR(b) (b == true) ? "true" : "false" -- --int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( -- struct kgd_dev *kgd, uint64_t va, uint64_t size, -- void *vm, struct kgd_mem **mem, -- uint64_t *offset, uint32_t flags) --{ -- bool aql_queue, public, readonly, execute, coherent, no_sub, userptr; -- u64 alloc_flag; -- uint32_t domain; -- uint64_t *temp_offset; -- struct sg_table *sg = NULL; -- -- if (!(flags & ALLOC_MEM_FLAGS_NONPAGED)) { -- pr_err("current hw doesn't support paged memory\n"); -- return -EINVAL; -- } -- -- domain = 0; -- alloc_flag = 0; -- temp_offset = NULL; -- -- aql_queue = (flags & ALLOC_MEM_FLAGS_AQL_QUEUE_MEM) ? true : false; -- public = (flags & ALLOC_MEM_FLAGS_PUBLIC) ? true : false; -- readonly = (flags & ALLOC_MEM_FLAGS_READONLY) ? true : false; -- execute = (flags & ALLOC_MEM_FLAGS_EXECUTE_ACCESS) ? true : false; -- coherent = (flags & ALLOC_MEM_FLAGS_COHERENT) ? true : false; -- no_sub = (flags & ALLOC_MEM_FLAGS_NO_SUBSTITUTE) ? 
true : false; -- userptr = (flags & ALLOC_MEM_FLAGS_USERPTR) ? true : false; -- -- /* -- * Check on which domain to allocate BO -- */ -- if (flags & ALLOC_MEM_FLAGS_VRAM) { -- domain = AMDGPU_GEM_DOMAIN_VRAM; -- alloc_flag = AMDGPU_GEM_CREATE_NO_CPU_ACCESS; -- if (public) { -- alloc_flag = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; -- temp_offset = offset; -- } -- alloc_flag |= AMDGPU_GEM_CREATE_VRAM_CLEARED; -- } else if (flags & (ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_USERPTR)) { -- domain = AMDGPU_GEM_DOMAIN_GTT; -- alloc_flag = 0; -- temp_offset = offset; -- } else if (flags & ALLOC_MEM_FLAGS_DOORBELL) { -- domain = AMDGPU_GEM_DOMAIN_GTT; -- alloc_flag = 0; -- temp_offset = offset; -- if (size > UINT_MAX) -- return -EINVAL; -- sg = create_doorbell_sg(*offset, size); -- if (!sg) -- return -ENOMEM; -- } -- -- if (offset && !userptr) -- *offset = 0; -- -- pr_debug("Allocate VA 0x%llx - 0x%llx domain %s aql %s\n", -- va, va + size, domain_string(domain), -- BOOL_TO_STR(aql_queue)); -- -- pr_debug("\t alloc_flag 0x%llx public %s readonly %s execute %s coherent %s no_sub %s\n", -- alloc_flag, BOOL_TO_STR(public), -- BOOL_TO_STR(readonly), BOOL_TO_STR(execute), -- BOOL_TO_STR(coherent), BOOL_TO_STR(no_sub)); -- -- return __alloc_memory_of_gpu(kgd, va, size, vm, mem, -- temp_offset, domain, -- alloc_flag, sg, -- aql_queue, readonly, execute, -- coherent, no_sub, userptr); --} -- --int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( -- struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) --{ -- struct amdgpu_device *adev; -- struct kfd_bo_va_list *entry, *tmp; -- struct bo_vm_reservation_context ctx; -- int ret = 0; -- struct ttm_validate_buffer *bo_list_entry; -- struct amdkfd_process_info *process_info; -- unsigned long bo_size; -- -- adev = get_amdgpu_device(kgd); -- process_info = ((struct amdkfd_vm *)vm)->process_info; -- -- bo_size = mem->bo->tbo.mem.size; -- -- mutex_lock(&mem->lock); -- -- if (mem->mapped_to_gpu_memory > 0) { -- pr_err("BO VA 0x%llx size 0x%lx is already mapped to vm %p.\n", -- mem->va, bo_size, vm); -- mutex_unlock(&mem->lock); -- return -EBUSY; -- } -- -- mutex_unlock(&mem->lock); -- /* lock is not needed after this, since mem is unused and will -- * be freed anyway -- */ -- -- /* No more MMU notifiers */ -- amdgpu_mn_unregister(mem->bo); -- -- /* Make sure restore workers don't access the BO any more */ -- bo_list_entry = &mem->validate_list; -- mutex_lock(&process_info->lock); -- list_del(&bo_list_entry->head); -- mutex_unlock(&process_info->lock); -- -- /* Free user pages if necessary */ -- if (mem->user_pages) { -- pr_debug("%s: Freeing user_pages array\n", __func__); -- if (mem->user_pages[0]) -- release_pages(mem->user_pages, -- mem->bo->tbo.ttm->num_pages, 0); --#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) -- drm_free_large(mem->user_pages); --#else -- kvfree(mem->user_pages); --#endif -- } -- -- ret = reserve_bo_and_cond_vms(mem, NULL, VA_DO_NOT_CARE, &ctx); -- if (unlikely(ret != 0)) -- return ret; -- -- /* The eviction fence should be removed by the last unmap. 
-- * TODO: Log an error condition if the bo still has the eviction fence -- * attached -- */ -- amdgpu_amdkfd_remove_eviction_fence(mem->bo, -- process_info->eviction_fence, -- NULL, NULL); -- pr_debug("Release VA 0x%llx - 0x%llx\n", mem->va, -- mem->va + bo_size * (1 + mem->aql_queue)); -- -- /* Remove from VM internal data structures */ -- list_for_each_entry_safe(entry, tmp, &mem->bo_va_list, bo_list) { -- remove_bo_from_vm((struct amdgpu_device *)entry->kgd_dev, -- entry, bo_size); -- } -- -- ret = unreserve_bo_and_vms(&ctx, false, false); -- -- /* Free the sync object */ -- amdgpu_sync_free(&mem->sync); -- -- /* If the SG is not NULL, it's one we created for a doorbell -- * BO. We need to free it. -- */ -- if (mem->bo->tbo.sg) { -- sg_free_table(mem->bo->tbo.sg); -- kfree(mem->bo->tbo.sg); -- } -- -- /* Free the BO */ -- amdgpu_bo_unref(&mem->bo); -- kfree(mem); -- -- return ret; --} -- --int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( -- struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) --{ -- struct amdgpu_device *adev; -- int ret; -- struct amdgpu_bo *bo; -- uint32_t domain; -- struct kfd_bo_va_list *entry; -- struct bo_vm_reservation_context ctx; -- struct kfd_bo_va_list *bo_va_entry = NULL; -- struct kfd_bo_va_list *bo_va_entry_aql = NULL; -- struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm; -- unsigned long bo_size; -- bool is_invalid_userptr; -- -- adev = get_amdgpu_device(kgd); -- -- /* Make sure restore is not running concurrently. Since we -- * don't map invalid userptr BOs, we rely on the next restore -- * worker to do the mapping -- */ -- mutex_lock(&mem->process_info->lock); -- -- /* Lock mmap-sem. If we find an invalid userptr BO, we can be -- * sure that the MMU notifier is no longer running -- * concurrently and the queues are actually stopped -- */ -- down_read(&current->mm->mmap_sem); -- is_invalid_userptr = atomic_read(&mem->invalid); -- up_read(&current->mm->mmap_sem); -- -- mutex_lock(&mem->lock); -- -- bo = mem->bo; -- -- if (!bo) { -- pr_err("Invalid BO when mapping memory to GPU\n"); -- return -EINVAL; -- } -- -- domain = mem->domain; -- bo_size = bo->tbo.mem.size; -- -- pr_debug("Map VA 0x%llx - 0x%llx to vm %p domain %s\n", -- mem->va, -- mem->va + bo_size * (1 + mem->aql_queue), -- vm, domain_string(domain)); -- -- ret = reserve_bo_and_vm(mem, vm, &ctx); -- if (unlikely(ret != 0)) -- goto bo_reserve_failed; -- -- /* Userptr can be marked as "not invalid", but not actually be -- * validated yet (still in the system domain). In that case -- * the queues are still stopped and we can leave mapping for -- * the next restore worker -- */ -- if (bo->tbo.mem.mem_type == TTM_PL_SYSTEM) -- is_invalid_userptr = true; -- -- if (check_if_add_bo_to_vm((struct amdgpu_vm *)vm, mem)) { -- ret = add_bo_to_vm(adev, mem, (struct amdgpu_vm *)vm, false, -- &bo_va_entry); -- if (ret != 0) -- goto add_bo_to_vm_failed; -- if (mem->aql_queue) { -- ret = add_bo_to_vm(adev, mem, (struct amdgpu_vm *)vm, -- true, &bo_va_entry_aql); -- if (ret != 0) -- goto add_bo_to_vm_failed_aql; -- } -- } -- -- if (mem->mapped_to_gpu_memory == 0 && -- !amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) { -- /* Validate BO only once. The eviction fence gets added to BO -- * the first time it is mapped. Validate will wait for all -- * background evictions to complete.
-- */ -- ret = amdgpu_amdkfd_bo_validate(bo, domain, true); -- if (ret) { -- pr_debug("Validate failed\n"); -- goto map_bo_to_gpuvm_failed; -- } -- } -- -- list_for_each_entry(entry, &mem->bo_va_list, bo_list) { -- if (entry->bo_va->base.vm == vm && !entry->is_mapped) { -- pr_debug("\t map VA 0x%llx - 0x%llx in entry %p\n", -- entry->va, entry->va + bo_size, -- entry); -- -- ret = map_bo_to_gpuvm(adev, entry, ctx.sync, -- is_invalid_userptr); -- if (ret != 0) { -- pr_err("Failed to map radeon bo to gpuvm\n"); -- goto map_bo_to_gpuvm_failed; -- } -- entry->is_mapped = true; -- mem->mapped_to_gpu_memory++; -- pr_debug("\t INC mapping count %d\n", -- mem->mapped_to_gpu_memory); -- } -- } -- -- if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm) == NULL) -- amdgpu_bo_fence(bo, -- &kfd_vm->process_info->eviction_fence->base, -- true); -- ret = unreserve_bo_and_vms(&ctx, false, false); -- -- mutex_unlock(&mem->process_info->lock); -- mutex_unlock(&mem->lock); -- return ret; -- --map_bo_to_gpuvm_failed: -- if (bo_va_entry_aql) -- remove_bo_from_vm(adev, bo_va_entry_aql, bo_size); --add_bo_to_vm_failed_aql: -- if (bo_va_entry) -- remove_bo_from_vm(adev, bo_va_entry, bo_size); --add_bo_to_vm_failed: -- unreserve_bo_and_vms(&ctx, false, false); --bo_reserve_failed: -- mutex_unlock(&mem->process_info->lock); -- mutex_unlock(&mem->lock); -- return ret; --} -- --static u64 get_vm_pd_gpu_offset(void *vm) --{ -- struct amdgpu_vm *avm = (struct amdgpu_vm *) vm; -- struct amdgpu_device *adev = -- amdgpu_ttm_adev(avm->root.base.bo->tbo.bdev); -- u64 offset; -- -- BUG_ON(avm == NULL); -- -- amdgpu_bo_reserve(avm->root.base.bo, false); -- -- offset = amdgpu_bo_gpu_offset(avm->root.base.bo); -- -- amdgpu_bo_unreserve(avm->root.base.bo); -- -- /* On some ASICs the FB doesn't start at 0. Adjust FB offset -- * to an actual MC address. 
-- */ -- if (adev->gart.gart_funcs->get_vm_pde) -- offset = amdgpu_gart_get_vm_pde(adev, offset); -- -- return offset; --} -- --int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm, -- void **process_info, -- struct dma_fence **ef) --{ -- int ret; -- struct amdkfd_vm *new_vm; -- struct amdkfd_process_info *info; -- struct amdgpu_device *adev = get_amdgpu_device(kgd); -- -- new_vm = kzalloc(sizeof(*new_vm), GFP_KERNEL); -- if (new_vm == NULL) -- return -ENOMEM; -- -- /* Initialize the VM context, allocate the page directory and zero it */ -- ret = amdgpu_vm_init(adev, &new_vm->base, AMDGPU_VM_CONTEXT_COMPUTE); -- if (ret != 0) { -- pr_err("Failed init vm ret %d\n", ret); -- /* Undo everything related to the new VM context */ -- goto vm_init_fail; -- } -- new_vm->adev = adev; -- -- if (!*process_info) { -- info = kzalloc(sizeof(*info), GFP_KERNEL); -- if (!info) { -- pr_err("Failed to create amdkfd_process_info"); -- ret = -ENOMEM; -- goto alloc_process_info_fail; -- } -- -- mutex_init(&info->lock); -- INIT_LIST_HEAD(&info->vm_list_head); -- INIT_LIST_HEAD(&info->kfd_bo_list); -- INIT_LIST_HEAD(&info->userptr_valid_list); -- INIT_LIST_HEAD(&info->userptr_inval_list); -- -- info->eviction_fence = -- amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1), -- current->mm); -- if (info->eviction_fence == NULL) { -- pr_err("Failed to create eviction fence\n"); -- goto create_evict_fence_fail; -- } -- -- info->pid = get_task_pid(current->group_leader, -- PIDTYPE_PID); -- atomic_set(&info->evicted_bos, 0); -- INIT_DELAYED_WORK(&info->work, -- amdgpu_amdkfd_restore_userptr_worker); -- -- *process_info = info; -- *ef = dma_fence_get(&info->eviction_fence->base); -- } -- -- new_vm->process_info = *process_info; -- -- mutex_lock(&new_vm->process_info->lock); -- list_add_tail(&new_vm->vm_list_node, -- &(new_vm->process_info->vm_list_head)); -- new_vm->process_info->n_vms++; -- mutex_unlock(&new_vm->process_info->lock); -- -- *vm = (void *) new_vm; -- -- pr_debug("Created process vm %p\n", *vm); -- -- return ret; -- --create_evict_fence_fail: -- kfree(info); --alloc_process_info_fail: -- amdgpu_vm_fini(adev, &new_vm->base); --vm_init_fail: -- kfree(new_vm); -- return ret; -- --} -- --void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm) --{ -- struct amdgpu_device *adev = (struct amdgpu_device *) kgd; -- struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *) vm; -- struct amdgpu_vm *avm = &kfd_vm->base; -- struct amdgpu_bo *pd; -- struct amdkfd_process_info *process_info; -- -- if (WARN_ON(!kgd || !vm)) -- return; -- -- pr_debug("Destroying process vm %p\n", vm); -- /* Release eviction fence from PD */ -- pd = avm->root.base.bo; -- amdgpu_bo_reserve(pd, false); -- amdgpu_bo_fence(pd, NULL, false); -- amdgpu_bo_unreserve(pd); -- -- process_info = kfd_vm->process_info; -- -- mutex_lock(&process_info->lock); -- process_info->n_vms--; -- list_del(&kfd_vm->vm_list_node); -- mutex_unlock(&process_info->lock); -- -- /* Release per-process resources */ -- if (!process_info->n_vms) { -- WARN_ON(!list_empty(&process_info->kfd_bo_list)); -- WARN_ON(!list_empty(&process_info->userptr_valid_list)); -- WARN_ON(!list_empty(&process_info->userptr_inval_list)); -- -- dma_fence_put(&process_info->eviction_fence->base); -- cancel_delayed_work_sync(&process_info->work); -- put_pid(process_info->pid); -- kfree(process_info); -- } -- -- /* Release the VM context */ -- amdgpu_vm_fini(adev, avm); -- kfree(vm); --} -- --uint32_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm) --{ -- return 
get_vm_pd_gpu_offset(vm) >> AMDGPU_GPU_PAGE_SHIFT; --} -- --int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd, -- struct kfd_vm_fault_info *mem) --{ -- struct amdgpu_device *adev; -- -- adev = (struct amdgpu_device *) kgd; -- if (atomic_read(&adev->mc.vm_fault_info_updated) == 1) { -- *mem = *adev->mc.vm_fault_info; -- mb(); -- atomic_set(&adev->mc.vm_fault_info_updated, 0); -- } -- return 0; --} -- --static bool is_mem_on_local_device(struct kgd_dev *kgd, -- struct list_head *bo_va_list, void *vm) --{ -- struct kfd_bo_va_list *entry; -- -- list_for_each_entry(entry, bo_va_list, bo_list) { -- if (entry->kgd_dev == kgd && entry->bo_va->base.vm == vm) -- return true; -- } -- -- return false; --} -- --int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( -- struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) --{ -- struct kfd_bo_va_list *entry; -- struct amdgpu_device *adev; -- unsigned int mapped_before; -- int ret = 0; -- struct bo_vm_reservation_context ctx; -- struct amdkfd_process_info *process_info; -- unsigned long bo_size; -- -- adev = (struct amdgpu_device *) kgd; -- process_info = ((struct amdkfd_vm *)vm)->process_info; -- -- bo_size = mem->bo->tbo.mem.size; -- -- mutex_lock(&mem->lock); -- -- /* -- * Make sure that this BO is mapped on KGD before unmapping it -- */ -- if (!is_mem_on_local_device(kgd, &mem->bo_va_list, vm)) { -- ret = -EINVAL; -- goto out; -- } -- -- if (mem->mapped_to_gpu_memory == 0) { -- pr_debug("BO VA 0x%llx size 0x%lx is not mapped to vm %p\n", -- mem->va, bo_size, vm); -- ret = -EINVAL; -- goto out; -- } -- mapped_before = mem->mapped_to_gpu_memory; -- -- ret = reserve_bo_and_cond_vms(mem, vm, VA_MAPPED, &ctx); -- if (unlikely(ret != 0)) -- goto out; -- -- pr_debug("Unmap VA 0x%llx - 0x%llx from vm %p\n", -- mem->va, -- mem->va + bo_size * (1 + mem->aql_queue), -- vm); -- -- list_for_each_entry(entry, &mem->bo_va_list, bo_list) { -- if (entry->bo_va->base.vm == vm && entry->is_mapped) { -- pr_debug("\t unmap VA 0x%llx - 0x%llx from entry %p\n", -- entry->va, -- entry->va + bo_size, -- entry); -- -- ret = unmap_bo_from_gpuvm(adev, entry, ctx.sync); -- if (ret == 0) { -- entry->is_mapped = false; -- } else { -- pr_err("failed to unmap VA 0x%llx\n", -- mem->va); -- goto unreserve_out; -- } -- -- mem->mapped_to_gpu_memory--; -- pr_debug("\t DEC mapping count %d\n", -- mem->mapped_to_gpu_memory); -- } -- } -- -- /* If BO is unmapped from all VMs, unfence it. It can be evicted if -- * required.
-- */ -- if (mem->mapped_to_gpu_memory == 0 && -- !amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm)) -- amdgpu_amdkfd_remove_eviction_fence(mem->bo, -- process_info->eviction_fence, -- NULL, NULL); -- -- if (mapped_before == mem->mapped_to_gpu_memory) { -- pr_debug("BO VA 0x%llx size 0x%lx is not mapped to vm %p\n", -- mem->va, bo_size, vm); -- ret = -EINVAL; -- } -- --unreserve_out: -- unreserve_bo_and_vms(&ctx, false, false); --out: -- mutex_unlock(&mem->lock); -- return ret; --} -- --int amdgpu_amdkfd_gpuvm_mmap_bo(struct kgd_dev *kgd, struct vm_area_struct *vma) --{ -- struct amdgpu_device *adev; -- -- adev = get_amdgpu_device(kgd); -- if (!adev) { -- pr_err("Could not get amdgpu device in %s\n", __func__); -- return -ENODEV; -- } -- -- return amdgpu_bo_mmap(NULL, vma, &adev->mman.bdev); --} -- --int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd, -- struct kgd_mem *mem, void **kptr) --{ -- int ret; -- struct amdgpu_bo *bo = mem->bo; -- -- if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) { -- pr_err("userptr can't be mapped to kernel\n"); -- return -EINVAL; -- } -- -- /* delete kgd_mem from kfd_bo_list to avoid re-validating -- * this BO in BO's restoring after eviction. -- */ -- mutex_lock(&mem->process_info->lock); -- -- list_del_init(&mem->validate_list.head); -- -- ret = __map_bo_to_kernel(bo, AMDGPU_GEM_DOMAIN_GTT, kptr); -- if (!ret) -- mem->kptr = *kptr; -- -- mutex_unlock(&mem->process_info->lock); -- -- return ret; --} -- --static int pin_bo_wo_map(struct kgd_mem *mem) --{ -- struct amdgpu_bo *bo = mem->bo; -- int ret = 0; -- -- ret = amdgpu_bo_reserve(bo, false); -- if (unlikely(ret != 0)) -- return ret; -- -- ret = amdgpu_bo_pin(bo, mem->domain, NULL); -- amdgpu_bo_unreserve(bo); -- -- return ret; --} -- --static void unpin_bo_wo_map(struct kgd_mem *mem) --{ -- struct amdgpu_bo *bo = mem->bo; -- int ret = 0; -- -- ret = amdgpu_bo_reserve(bo, false); -- if (unlikely(ret != 0)) -- return; -- -- amdgpu_bo_unpin(bo); -- amdgpu_bo_unreserve(bo); --} -- --#define AMD_GPU_PAGE_SHIFT PAGE_SHIFT --#define AMD_GPU_PAGE_SIZE (_AC(1, UL) << AMD_GPU_PAGE_SHIFT) -- --static int get_sg_table(struct amdgpu_device *adev, -- struct kgd_mem *mem, uint64_t offset, -- uint64_t size, struct sg_table **ret_sg) --{ -- struct amdgpu_bo *bo = mem->bo; -- struct sg_table *sg = NULL; -- unsigned long bus_addr; -- unsigned int chunks; -- unsigned int i; -- struct scatterlist *s; -- uint64_t offset_in_page; -- unsigned int page_size; -- int ret; -- -- sg = kmalloc(sizeof(*sg), GFP_KERNEL); -- if (!sg) { -- ret = -ENOMEM; -- goto out; -- } -- -- if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM) -- page_size = AMD_GPU_PAGE_SIZE; -- else -- page_size = PAGE_SIZE; -- -- -- offset_in_page = offset & (page_size - 1); -- chunks = (size + offset_in_page + page_size - 1) -- / page_size; -- -- ret = sg_alloc_table(sg, chunks, GFP_KERNEL); -- if (unlikely(ret)) -- goto out; -- -- if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM) { -- bus_addr = bo->tbo.offset + adev->mc.aper_base + offset; -- -- for_each_sg(sg->sgl, s, sg->orig_nents, i) { -- uint64_t chunk_size, length; -- -- chunk_size = page_size - offset_in_page; -- length = min(size, chunk_size); -- -- sg_set_page(s, NULL, length, offset_in_page); -- s->dma_address = bus_addr; -- s->dma_length = length; -- -- size -= length; -- offset_in_page = 0; -- bus_addr += length; -- } -- } else { -- struct page **pages; -- unsigned int cur_page; -- -- pages = bo->tbo.ttm->pages; -- -- cur_page = offset / page_size; -- for_each_sg(sg->sgl, s, sg->orig_nents, i) { 
-- uint64_t chunk_size, length; -- -- chunk_size = page_size - offset_in_page; -- length = min(size, chunk_size); -- -- sg_set_page(s, pages[cur_page], length, offset_in_page); -- s->dma_address = page_to_phys(pages[cur_page]); -- s->dma_length = length; -- -- size -= length; -- offset_in_page = 0; -- cur_page++; -- } -- } -- -- *ret_sg = sg; -- return 0; --out: -- kfree(sg); -- *ret_sg = NULL; -- return ret; --} -- --int amdgpu_amdkfd_gpuvm_pin_get_sg_table(struct kgd_dev *kgd, -- struct kgd_mem *mem, uint64_t offset, -- uint64_t size, struct sg_table **ret_sg) --{ -- int ret; -- struct amdgpu_device *adev; -- -- ret = pin_bo_wo_map(mem); -- if (unlikely(ret != 0)) -- return ret; -- -- adev = get_amdgpu_device(kgd); -- -- ret = get_sg_table(adev, mem, offset, size, ret_sg); -- if (ret) -- unpin_bo_wo_map(mem); -- -- return ret; --} -- --void amdgpu_amdkfd_gpuvm_unpin_put_sg_table( -- struct kgd_mem *mem, struct sg_table *sg) --{ -- sg_free_table(sg); -- kfree(sg); -- -- unpin_bo_wo_map(mem); --} -- --int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd, -- struct dma_buf *dma_buf, -- uint64_t va, void *vm, -- struct kgd_mem **mem, uint64_t *size, -- uint64_t *mmap_offset) --{ -- struct amdgpu_device *adev = (struct amdgpu_device *)kgd; -- struct drm_gem_object *obj; -- struct amdgpu_bo *bo; -- struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm; -- -- if (dma_buf->ops != &drm_gem_prime_dmabuf_ops) -- /* Can't handle non-graphics buffers */ -- return -EINVAL; -- -- obj = dma_buf->priv; -- if (obj->dev->dev_private != adev) -- /* Can't handle buffers from other devices */ -- return -EINVAL; -- -- bo = gem_to_amdgpu_bo(obj); -- if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | -- AMDGPU_GEM_DOMAIN_GTT | -- AMDGPU_GEM_DOMAIN_DGMA))) -- /* Only VRAM and GTT BOs are supported */ -- return -EINVAL; -- -- *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); -- if (*mem == NULL) -- return -ENOMEM; -- -- if (size) -- *size = amdgpu_bo_size(bo); -- -- if (mmap_offset) -- *mmap_offset = amdgpu_bo_mmap_offset(bo); -- -- INIT_LIST_HEAD(&(*mem)->bo_va_list); -- mutex_init(&(*mem)->lock); -- (*mem)->mapping_flags = -- AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE | -- AMDGPU_VM_PAGE_EXECUTABLE | AMDGPU_VM_MTYPE_NC; -- -- (*mem)->bo = amdgpu_bo_ref(bo); -- (*mem)->va = va; -- if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) -- (*mem)->domain = AMDGPU_GEM_DOMAIN_VRAM; -- else if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_GTT) -- (*mem)->domain = AMDGPU_GEM_DOMAIN_GTT; -- else -- (*mem)->domain = AMDGPU_GEM_DOMAIN_DGMA; -- (*mem)->mapped_to_gpu_memory = 0; -- (*mem)->process_info = kfd_vm->process_info; -- add_kgd_mem_to_kfd_bo_list(*mem, kfd_vm->process_info, false); -- amdgpu_sync_create(&(*mem)->sync); -- -- return 0; --} -- --int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_dev *kgd, void *vm, -- struct kgd_mem *mem, -- struct dma_buf **dmabuf) --{ -- struct amdgpu_device *adev = NULL; -- struct amdgpu_bo *bo = NULL; -- struct drm_gem_object *gobj = NULL; -- -- if (!dmabuf || !kgd || !vm || !mem) -- return -EINVAL; -- -- adev = get_amdgpu_device(kgd); -- bo = mem->bo; -- -- gobj = amdgpu_gem_prime_foreign_bo(adev, bo); -- if (gobj == NULL) { -- pr_err("Export BO failed. 
Unable to find/create GEM object\n"); -- return -EINVAL; -- } -- -- *dmabuf = amdgpu_gem_prime_export(adev->ddev, gobj, 0); -- return 0; --} -- --static int process_validate_vms(struct amdkfd_process_info *process_info) --{ -- struct amdkfd_vm *peer_vm; -- int ret; -- -- list_for_each_entry(peer_vm, &process_info->vm_list_head, -- vm_list_node) { -- ret = vm_validate_pt_pd_bos(&peer_vm->base); -- if (ret) -- return ret; -- } -- -- return 0; --} -- --/* Evict a userptr BO by stopping the queues if necessary -- * -- * Runs in MMU notifier, may be in RECLAIM_FS context. This means it -- * cannot do any memory allocations, and cannot take any locks that -- * are held elsewhere while allocating memory. Therefore this is as -- * simple as possible, using atomic counters. -- * -- * It doesn't do anything to the BO itself. The real work happens in -- * restore, where we get updated page addresses. This function only -- * ensures that GPU access to the BO is stopped. -- */ --int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, -- struct mm_struct *mm) --{ -- struct amdkfd_process_info *process_info = mem->process_info; -- int invalid, evicted_bos; -- int r = 0; -- -- invalid = atomic_inc_return(&mem->invalid); -- evicted_bos = atomic_inc_return(&process_info->evicted_bos); -- if (evicted_bos == 1) { -- /* First eviction, stop the queues */ -- r = kgd2kfd->quiesce_mm(NULL, mm); -- if (r != 0) -- pr_err("Failed to quiesce KFD\n"); -- schedule_delayed_work(&process_info->work, 1); -- } -- -- return r; --} -- --/* Update invalid userptr BOs -- * -- * Moves invalidated (evicted) userptr BOs from userptr_valid_list to -- * userptr_inval_list and updates user pages for all BOs that have -- * been invalidated since their last update. -- */ --static int update_invalid_user_pages(struct amdkfd_process_info *process_info, -- struct mm_struct *mm) --{ -- struct kgd_mem *mem, *tmp_mem; -- struct amdgpu_bo *bo; -- int invalid, ret; -- -- /* Move all invalidated BOs to the userptr_inval_list and -- * release their user pages by migration to the CPU domain -- */ -- list_for_each_entry_safe(mem, tmp_mem, -- &process_info->userptr_valid_list, -- validate_list.head) { -- if (!atomic_read(&mem->invalid)) -- continue; /* BO is still valid */ -- -- bo = mem->bo; -- -- if (amdgpu_bo_reserve(bo, true)) -- return -EAGAIN; -- amdgpu_ttm_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU); -- ret = ttm_bo_validate(&bo->tbo, &bo->placement, false, false); -- amdgpu_bo_unreserve(bo); -- if (ret) { -- pr_err("%s: Failed to invalidate userptr BO\n", -- __func__); -- return -EAGAIN; -- } -- -- list_move_tail(&mem->validate_list.head, -- &process_info->userptr_inval_list); -- } -- -- if (list_empty(&process_info->userptr_inval_list)) -- return 0; /* All evicted userptr BOs were freed */ -- -- /* Go through userptr_inval_list and update any invalid user_pages */ -- list_for_each_entry(mem, &process_info->userptr_inval_list, -- validate_list.head) { -- invalid = atomic_read(&mem->invalid); -- if (!invalid) -- /* BO hasn't been invalidated since the last -- * revalidation attempt. Keep its BO list. 
-- */ -- continue; -- -- bo = mem->bo; -- -- if (!mem->user_pages) { --#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) -- mem->user_pages = -- drm_calloc_large(bo->tbo.ttm->num_pages, -- sizeof(struct page *)); --#else -- mem->user_pages = -- kvmalloc_array(bo->tbo.ttm->num_pages, -- sizeof(struct page *), -- GFP_KERNEL | __GFP_ZERO); --#endif -- if (!mem->user_pages) { -- pr_err("%s: Failed to allocate pages array\n", -- __func__); -- return -ENOMEM; -- } -- } else if (mem->user_pages[0]) { -- release_pages(mem->user_pages, -- bo->tbo.ttm->num_pages, 0); -- } -- -- /* Get updated user pages */ -- ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, -- mem->user_pages); -- if (ret) { -- mem->user_pages[0] = NULL; -- pr_info("%s: Failed to get user pages: %d\n", -- __func__, ret); -- /* Pretend it succeeded. It will fail later -- * with a VM fault if the GPU tries to access -- * it. Better than hanging indefinitely with -- * stalled user mode queues. -- */ -- } -- -- /* Mark the BO as valid unless it was invalidated -- * again concurrently -- */ -- if (atomic_cmpxchg(&mem->invalid, invalid, 0) != invalid) -- return -EAGAIN; -- } -- return 0; --} -- --/* Validate invalid userptr BOs -- * -- * Validates BOs on the userptr_inval_list, and moves them back to the -- * userptr_valid_list. Also updates GPUVM page tables with new page -- * addresses and waits for the page table updates to complete. -- */ --static int validate_invalid_user_pages(struct amdkfd_process_info *process_info) --{ -- struct amdgpu_bo_list_entry *pd_bo_list_entries; -- struct list_head resv_list, duplicates; -- struct ww_acquire_ctx ticket; -- struct amdgpu_sync sync; -- -- struct amdkfd_vm *peer_vm; -- struct kgd_mem *mem, *tmp_mem; -- struct amdgpu_bo *bo; -- int i, ret; -- -- pd_bo_list_entries = kcalloc(process_info->n_vms, -- sizeof(struct amdgpu_bo_list_entry), -- GFP_KERNEL); -- if (!pd_bo_list_entries) { -- pr_err("%s: Failed to allocate PD BO list entries\n", __func__); -- return -ENOMEM; -- } -- -- INIT_LIST_HEAD(&resv_list); -- INIT_LIST_HEAD(&duplicates); -- -- /* Get all the page directory BOs that need to be reserved */ -- i = 0; -- list_for_each_entry(peer_vm, &process_info->vm_list_head, -- vm_list_node) -- amdgpu_vm_get_pd_bo(&peer_vm->base, &resv_list, -- &pd_bo_list_entries[i++]); -- /* Add the userptr_inval_list entries to resv_list */ -- list_for_each_entry(mem, &process_info->userptr_inval_list, -- validate_list.head) { -- list_add_tail(&mem->resv_list.head, &resv_list); -- mem->resv_list.bo = mem->validate_list.bo; -- mem->resv_list.shared = mem->validate_list.shared; -- } -- -- /* Reserve all BOs and page tables for validation */ -- ret = ttm_eu_reserve_buffers(&ticket, &resv_list, false, &duplicates); -- WARN(!list_empty(&duplicates), "Duplicates should be empty"); -- if (ret) -- goto out; -- -- amdgpu_sync_create(&sync); -- -- /* Avoid triggering eviction fences when unmapping invalid -- * userptr BOs (waits for all fences, doesn't use -- * FENCE_OWNER_VM) -- */ -- list_for_each_entry(peer_vm, &process_info->vm_list_head, -- vm_list_node) -- amdgpu_amdkfd_remove_eviction_fence(peer_vm->base.root.base.bo, -- process_info->eviction_fence, -- NULL, NULL); -- -- ret = process_validate_vms(process_info); -- if (ret) -- goto unreserve_out; -- -- /* Validate BOs and update GPUVM page tables */ -- list_for_each_entry_safe(mem, tmp_mem, -- &process_info->userptr_inval_list, -- validate_list.head) { -- struct kfd_bo_va_list *bo_va_entry; -- -- bo = mem->bo; -- -- /* Copy pages array and validate the BO if we 
got user pages */ -- if (mem->user_pages[0]) { -- amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, -- mem->user_pages); -- amdgpu_ttm_placement_from_domain(bo, mem->domain); -- ret = ttm_bo_validate(&bo->tbo, &bo->placement, -- false, false); -- if (ret) { -- pr_err("%s: failed to validate BO\n", __func__); -- goto unreserve_out; -- } -- } -- -- /* Validate succeeded, now the BO owns the pages, free -- * our copy of the pointer array. Put this BO back on -- * the userptr_valid_list. If we need to revalidate -- * it, we need to start from scratch. -- */ --#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) -- drm_free_large(mem->user_pages); --#else -- kvfree(mem->user_pages); --#endif -- mem->user_pages = NULL; -- list_move_tail(&mem->validate_list.head, -- &process_info->userptr_valid_list); -- -- /* Update mapping. If the BO was not validated -- * (because we couldn't get user pages), this will -- * clear the page table entries, which will result in -- * VM faults if the GPU tries to access the invalid -- * memory. -- */ -- list_for_each_entry(bo_va_entry, &mem->bo_va_list, bo_list) { -- if (!bo_va_entry->is_mapped) -- continue; -- -- ret = update_gpuvm_pte((struct amdgpu_device *) -- bo_va_entry->kgd_dev, -- bo_va_entry, &sync); -- if (ret) { -- pr_err("%s: update PTE failed\n", __func__); -- /* make sure this gets validated again */ -- atomic_inc(&mem->invalid); -- goto unreserve_out; -- } -- } -- } --unreserve_out: -- list_for_each_entry(peer_vm, &process_info->vm_list_head, -- vm_list_node) -- amdgpu_bo_fence(peer_vm->base.root.base.bo, -- &process_info->eviction_fence->base, true); -- ttm_eu_backoff_reservation(&ticket, &resv_list); -- amdgpu_sync_wait(&sync, false); -- amdgpu_sync_free(&sync); --out: -- kfree(pd_bo_list_entries); -- -- return ret; --} -- --/* Worker callback to restore evicted userptr BOs -- * -- * Tries to update and validate all userptr BOs. If successful and no -- * concurrent evictions happened, the queues are restarted. Otherwise, -- * reschedule for another attempt later. -- */ --static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) --{ -- struct delayed_work *dwork = to_delayed_work(work); -- struct amdkfd_process_info *process_info = -- container_of(dwork, struct amdkfd_process_info, work); -- struct task_struct *usertask; -- struct mm_struct *mm; -- int evicted_bos; -- -- evicted_bos = atomic_read(&process_info->evicted_bos); -- if (!evicted_bos) -- return; -- -- /* Reference task and mm in case of concurrent process termination */ -- usertask = get_pid_task(process_info->pid, PIDTYPE_PID); -- if (!usertask) -- return; -- mm = get_task_mm(usertask); -- if (!mm) { -- put_task_struct(usertask); -- return; -- } -- -- mutex_lock(&process_info->lock); -- -- if (update_invalid_user_pages(process_info, mm)) -- goto unlock_out; -- /* userptr_inval_list can be empty if all evicted userptr BOs -- * have been freed. In that case there is nothing to validate -- * and we can just restart the queues. -- */ -- if (!list_empty(&process_info->userptr_inval_list)) { -- if (atomic_read(&process_info->evicted_bos) != evicted_bos) -- goto unlock_out; /* Concurrent eviction, try again */ -- -- if (validate_invalid_user_pages(process_info)) -- goto unlock_out; -- } -- /* Final check for concurrent eviction and atomic update. If -- * another eviction happens after successful update, it will -- * be a first eviction that calls quiesce_mm. The eviction -- * reference counting inside KFD will handle this case.
*/ -- if (atomic_cmpxchg(&process_info->evicted_bos, evicted_bos, 0) != -- evicted_bos) -- goto unlock_out; -- evicted_bos = 0; -- if (kgd2kfd->resume_mm(NULL, mm)) { -- pr_err("%s: Failed to resume KFD\n", __func__); -- /* No recovery from this failure. Probably the CP is -- * hanging. No point trying again. -- */ -- } --unlock_out: -- mutex_unlock(&process_info->lock); -- mmput(mm); -- put_task_struct(usertask); -- -- /* If validation failed, reschedule another attempt */ -- if (evicted_bos) -- schedule_delayed_work(&process_info->work, 1); --} -- --/** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given -- * KFD process identified by process_info -- * -- * @process_info: amdkfd_process_info of the KFD process -- * -- * After memory eviction, restore thread calls this function. The function -- * should be called when the Process is still valid. BO restore involves - -- * -- * 1. Release old eviction fence and create new one -- * 2. Get two copies of PD BO list from all the VMs. Keep one copy as pd_list. -- * 3. Use the second PD list and kfd_bo_list to create a list (ctx.list) of -- * BOs that need to be reserved. -- * 4. Reserve all the BOs -- * 5. Validate PD and PT BOs. -- * 6. Validate all KFD BOs using kfd_bo_list and map them and add new fence -- * 7. Add fence to all PD and PT BOs. -- * 8. Unreserve all BOs -- */ -- --int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef) --{ -- struct amdgpu_bo_list_entry *pd_bo_list; -- struct amdkfd_process_info *process_info = info; -- struct amdkfd_vm *peer_vm; -- struct kgd_mem *mem; -- struct bo_vm_reservation_context ctx; -- struct amdgpu_amdkfd_fence *new_fence; -- int ret = 0, i; -- struct list_head duplicate_save; -- struct amdgpu_sync sync_obj; -- -- INIT_LIST_HEAD(&duplicate_save); -- INIT_LIST_HEAD(&ctx.list); -- INIT_LIST_HEAD(&ctx.duplicates); -- -- pd_bo_list = kcalloc(process_info->n_vms, -- sizeof(struct amdgpu_bo_list_entry), -- GFP_KERNEL); -- if (pd_bo_list == NULL) -- return -ENOMEM; -- -- i = 0; -- mutex_lock(&process_info->lock); -- list_for_each_entry(peer_vm, &process_info->vm_list_head, -- vm_list_node) -- amdgpu_vm_get_pd_bo(&peer_vm->base, &ctx.list, -- &pd_bo_list[i++]); -- -- /* Reserve all BOs and page tables/directory. Add all BOs from -- * kfd_bo_list to ctx.list -- */ -- list_for_each_entry(mem, &process_info->kfd_bo_list, -- validate_list.head) { -- -- list_add_tail(&mem->resv_list.head, &ctx.list); -- mem->resv_list.bo = mem->validate_list.bo; -- mem->resv_list.shared = mem->validate_list.shared; -- } -- -- ret = ttm_eu_reserve_buffers(&ctx.ticket, &ctx.list, -- false, &duplicate_save); -- if (ret) { -- pr_debug("Memory eviction: TTM Reserve Failed. Try again\n"); -- goto ttm_reserve_fail; -- } -- -- amdgpu_sync_create(&sync_obj); -- ctx.sync = &sync_obj; -- -- /* Validate PDs and PTs */ -- ret = process_validate_vms(process_info); -- if (ret) -- goto validate_map_fail; -- -- /* Wait for PD/PTs validate to finish */ -- /* FIXME: I think this isn't needed */ -- list_for_each_entry(peer_vm, &process_info->vm_list_head, -- vm_list_node) { -- struct amdgpu_bo *bo = peer_vm->base.root.base.bo; -- -- ttm_bo_wait(&bo->tbo, false, false); -- } -- -- /* Validate BOs and map them to GPUVM (update VM page tables).
*/ -- list_for_each_entry(mem, &process_info->kfd_bo_list, -- validate_list.head) { -- -- struct amdgpu_bo *bo = mem->bo; -- uint32_t domain = mem->domain; -- struct kfd_bo_va_list *bo_va_entry; -- -- ret = amdgpu_amdkfd_bo_validate(bo, domain, false); -- if (ret) { -- pr_debug("Memory eviction: Validate BOs failed. Try again\n"); -- goto validate_map_fail; -- } -- -- list_for_each_entry(bo_va_entry, &mem->bo_va_list, -- bo_list) { -- ret = update_gpuvm_pte((struct amdgpu_device *) -- bo_va_entry->kgd_dev, -- bo_va_entry, -- ctx.sync); -- if (ret) { -- pr_debug("Memory eviction: update PTE failed. Try again\n"); -- goto validate_map_fail; -- } -- } -- } -- -- amdgpu_sync_wait(ctx.sync, false); -- -- /* Release old eviction fence and create new one, because fence only -- * goes from unsignaled to signaled, fence cannot be reused. -- * Use context and mm from the old fence. -- */ -- new_fence = amdgpu_amdkfd_fence_create( -- process_info->eviction_fence->base.context, -- process_info->eviction_fence->mm); -- if (!new_fence) { -- pr_err("Failed to create eviction fence\n"); -- ret = -ENOMEM; -- goto validate_map_fail; -- } -- dma_fence_put(&process_info->eviction_fence->base); -- process_info->eviction_fence = new_fence; -- *ef = dma_fence_get(&new_fence->base); -- -- /* Wait for validate to finish and attach new eviction fence */ -- list_for_each_entry(mem, &process_info->kfd_bo_list, -- validate_list.head) -- ttm_bo_wait(&mem->bo->tbo, false, false); -- list_for_each_entry(mem, &process_info->kfd_bo_list, -- validate_list.head) -- amdgpu_bo_fence(mem->bo, -- &process_info->eviction_fence->base, true); -- -- /* Attach eviction fence to PD / PT BOs */ -- list_for_each_entry(peer_vm, &process_info->vm_list_head, -- vm_list_node) { -- struct amdgpu_bo *bo = peer_vm->base.root.base.bo; -- -- amdgpu_bo_fence(bo, &process_info->eviction_fence->base, true); -- } --validate_map_fail: -- ttm_eu_backoff_reservation(&ctx.ticket, &ctx.list); -- amdgpu_sync_free(&sync_obj); --ttm_reserve_fail: -- mutex_unlock(&process_info->lock); --evict_fence_fail: -- kfree(pd_bo_list); -- return ret; --} -- --int amdgpu_amdkfd_copy_mem_to_mem(struct kgd_dev *kgd, struct kgd_mem *src_mem, -- uint64_t src_offset, struct kgd_mem *dst_mem, -- uint64_t dst_offset, uint64_t size, -- struct dma_fence **f, uint64_t *actual_size) --{ -- struct amdgpu_device *adev = NULL; -- struct ttm_mem_reg *src = NULL, *dst = NULL; -- struct ttm_buffer_object *src_ttm_bo, *dst_ttm_bo; -- struct drm_mm_node *src_mm, *dst_mm; -- struct amdgpu_ring *ring; -- struct ww_acquire_ctx ticket; -- struct list_head list; -- struct ttm_validate_buffer resv_list[2]; -- uint64_t src_start, dst_start; -- uint64_t src_left, dst_left, cur_copy_size, total_copy_size = 0; -- struct dma_fence *fence = NULL; -- int r; -- -- if (!kgd || !src_mem || !dst_mem) -- return -EINVAL; -- -- if (actual_size) -- *actual_size = 0; -- -- adev = get_amdgpu_device(kgd); -- src_ttm_bo = &src_mem->bo->tbo; -- dst_ttm_bo = &dst_mem->bo->tbo; -- src = &src_ttm_bo->mem; -- dst = &dst_ttm_bo->mem; -- src_mm = (struct drm_mm_node *)src->mm_node; -- dst_mm = (struct drm_mm_node *)dst->mm_node; -- -- ring = adev->mman.buffer_funcs_ring; -- -- INIT_LIST_HEAD(&list); -- -- resv_list[0].bo = src_ttm_bo; -- resv_list[0].shared = true; -- resv_list[1].bo = dst_ttm_bo; -- resv_list[1].shared = true; -- -- list_add_tail(&resv_list[0].head, &list); -- list_add_tail(&resv_list[1].head, &list); -- -- if (!ring->ready) { -- pr_err("Trying to move memory with ring turned off.\n"); -- return 
-EINVAL; -- } -- -- r = ttm_eu_reserve_buffers(&ticket, &list, false, NULL); -- if (r) { -- pr_err("Copy buffer failed. Unable to reserve bo (%d)\n", r); -- return r; -- } -- -- switch (src->mem_type) { -- case TTM_PL_TT: -- r = amdgpu_ttm_bind(src_ttm_bo, src); -- if (r) { -- DRM_ERROR("Copy failed. Cannot bind to gart\n"); -- goto copy_fail; -- } -- break; -- case TTM_PL_VRAM: -- /* VRAM could be scattered. Find the node in which the offset -- * belongs to -- */ -- while (src_offset >= (src_mm->size << PAGE_SHIFT)) { -- src_offset -= (src_mm->size << PAGE_SHIFT); -- ++src_mm; -- } -- break; -- default: -- DRM_ERROR("Unknown placement %d\n", src->mem_type); -- r = -EINVAL; -- goto copy_fail; -- } -- src_start = src_mm->start << PAGE_SHIFT; -- src_start += src_ttm_bo->bdev->man[src->mem_type].gpu_offset; -- src_start += src_offset; -- src_left = (src_mm->size << PAGE_SHIFT) - src_offset; -- -- switch (dst->mem_type) { -- case TTM_PL_TT: -- r = amdgpu_ttm_bind(dst_ttm_bo, dst); -- if (r) { -- DRM_ERROR("Copy failed. Cannot bind to gart\n"); -- goto copy_fail; -- } -- break; -- case TTM_PL_VRAM: -- while (dst_offset >= (dst_mm->size << PAGE_SHIFT)) { -- dst_offset -= (dst_mm->size << PAGE_SHIFT); -- ++dst_mm; -- } -- break; -- default: -- DRM_ERROR("Unknown placement %d\n", dst->mem_type); -- r = -EINVAL; -- goto copy_fail; -- } -- dst_start = dst_mm->start << PAGE_SHIFT; -- dst_start += dst_ttm_bo->bdev->man[dst->mem_type].gpu_offset; -- dst_start += dst_offset; -- dst_left = (dst_mm->size << PAGE_SHIFT) - dst_offset; -- -- do { -- struct dma_fence *next; -- -- /* src_left/dst_left: amount of space left in the current node -- * Copy minimum of (src_left, dst_left, amount of bytes left to -- * copy) -- */ -- cur_copy_size = min3(src_left, dst_left, -- (size - total_copy_size)); -- -- r = amdgpu_copy_buffer(ring, src_start, dst_start, -- cur_copy_size, NULL, &next, false, false); -- if (r) -- break; -- -- /* Just keep the last fence */ -- dma_fence_put(fence); -- fence = next; -- -- total_copy_size += cur_copy_size; -- /* Required amount of bytes copied. Done. */ -- if (total_copy_size >= size) -- break; -- -- /* If end of src or dst node is reached, move to next node */ -- src_left -= cur_copy_size; -- if (!src_left) { -- ++src_mm; -- src_start = src_mm->start << PAGE_SHIFT; -- src_start += -- src_ttm_bo->bdev->man[src->mem_type].gpu_offset; -- src_left = src_mm->size << PAGE_SHIFT; -- } else -- src_start += cur_copy_size; -- -- dst_left -= cur_copy_size; -- if (!dst_left) { -- ++dst_mm; -- dst_start = dst_mm->start << PAGE_SHIFT; -- dst_start += -- dst_ttm_bo->bdev->man[dst->mem_type].gpu_offset; -- dst_left = dst_mm->size << PAGE_SHIFT; -- } else -- dst_start += cur_copy_size; -- -- } while (total_copy_size < size); -- -- /* Failure could occur after partial copy. 
So fill in amount copied -- * and fence, still fill-in -- */ -- if (actual_size) -- *actual_size = total_copy_size; -- -- if (fence) { -- amdgpu_bo_fence(src_mem->bo, fence, true); -- amdgpu_bo_fence(dst_mem->bo, fence, true); -- } -- -- if (f) -- *f = fence; -- --copy_fail: -- ttm_eu_backoff_reservation(&ticket, &list); -- return r; --} -- -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c -index ff6f90a..5ad0580 100644 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c -@@ -27,7 +27,9 @@ - #include <linux/pagemap.h> - #include <drm/drmP.h> - #include <drm/amdgpu_drm.h> -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) - #include <drm/drm_syncobj.h> -+#endif - #include "amdgpu.h" - #include "amdgpu_trace.h" - -@@ -38,7 +40,7 @@ static int amdgpu_cs_user_fence_chunk(struct amdgpu_cs_parser *p, - struct drm_gem_object *gobj; - unsigned long size; - -- gobj = drm_gem_object_lookup(p->filp, data->handle); -+ gobj = kcl_drm_gem_object_lookup(p->adev->ddev, p->filp, data->handle); - if (gobj == NULL) - return -EINVAL; - -@@ -54,7 +56,7 @@ static int amdgpu_cs_user_fence_chunk(struct amdgpu_cs_parser *p, - - *offset = data->offset; - -- drm_gem_object_put_unlocked(gobj); -+ kcl_drm_gem_object_put_unlocked(gobj); - - if (amdgpu_ttm_tt_get_usermm(p->uf_entry.robj->tbo.ttm)) { - amdgpu_bo_unref(&p->uf_entry.robj); -@@ -90,7 +92,7 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data) - } - - /* get chunks */ -- chunk_array_user = u64_to_user_ptr(cs->in.chunks); -+ chunk_array_user = kcl_u64_to_user_ptr(cs->in.chunks); - if (copy_from_user(chunk_array, chunk_array_user, - sizeof(uint64_t)*cs->in.num_chunks)) { - ret = -EFAULT; -@@ -110,7 +112,7 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data) - struct drm_amdgpu_cs_chunk user_chunk; - uint32_t __user *cdata; - -- chunk_ptr = u64_to_user_ptr(chunk_array[i]); -+ chunk_ptr = kcl_u64_to_user_ptr(chunk_array[i]); - if (copy_from_user(&user_chunk, chunk_ptr, - sizeof(struct drm_amdgpu_cs_chunk))) { - ret = -EFAULT; -@@ -121,9 +123,13 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data) - p->chunks[i].length_dw = user_chunk.length_dw; - - size = p->chunks[i].length_dw; -- cdata = u64_to_user_ptr(user_chunk.chunk_data); -+ cdata = kcl_u64_to_user_ptr(user_chunk.chunk_data); - -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) -+ p->chunks[i].kdata = drm_malloc_ab(size, sizeof(uint32_t)); -+#else - p->chunks[i].kdata = kvmalloc_array(size, sizeof(uint32_t), GFP_KERNEL); -+#endif - if (p->chunks[i].kdata == NULL) { - ret = -ENOMEM; - i--; -@@ -155,8 +161,10 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data) - break; - - case AMDGPU_CHUNK_ID_DEPENDENCIES: -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) - case AMDGPU_CHUNK_ID_SYNCOBJ_IN: - case AMDGPU_CHUNK_ID_SYNCOBJ_OUT: -+#endif - break; - - default: -@@ -178,7 +186,11 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data) - i = p->nchunks - 1; - free_partial_kdata: - for (; i >= 0; i--) -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) -+ drm_free_large(p->chunks[i].kdata); -+#else - kvfree(p->chunks[i].kdata); -+#endif - kfree(p->chunks); - p->chunks = NULL; - p->nchunks = 0; -@@ -477,16 +489,11 @@ static int amdgpu_cs_list_validate(struct amdgpu_cs_parser *p, - return -EPERM; - - /* Check if we have user pages and nobody bound the BO already */ -- if (amdgpu_ttm_tt_userptr_needs_pages(bo->tbo.ttm) 
&& -- lobj->user_pages) { -- amdgpu_ttm_placement_from_domain(bo, -- AMDGPU_GEM_DOMAIN_CPU); -- r = ttm_bo_validate(&bo->tbo, &bo->placement, true, -- false); -- if (r) -- return r; -- amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, -- lobj->user_pages); -+ if (lobj->user_pages && bo->tbo.ttm->state != tt_bound) { -+ size_t size = sizeof(struct page *); -+ -+ size *= bo->tbo.ttm->num_pages; -+ memcpy(bo->tbo.ttm->pages, lobj->user_pages, size); - binding_userptr = true; - } - -@@ -498,7 +505,11 @@ static int amdgpu_cs_list_validate(struct amdgpu_cs_parser *p, - return r; - - if (binding_userptr) { -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) -+ drm_free_large(lobj->user_pages); -+#else - kvfree(lobj->user_pages); -+#endif - lobj->user_pages = NULL; - } - } -@@ -511,6 +522,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, - struct amdgpu_fpriv *fpriv = p->filp->driver_priv; - struct amdgpu_bo_list_entry *e; - struct list_head duplicates; -+ bool need_mmap_lock = false; - unsigned i, tries = 10; - int r; - -@@ -518,9 +530,9 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, - - p->bo_list = amdgpu_bo_list_get(fpriv, cs->in.bo_list_handle); - if (p->bo_list) { -+ need_mmap_lock = p->bo_list->first_userptr != -+ p->bo_list->num_entries; - amdgpu_bo_list_get_list(p->bo_list, &p->validated); -- if (p->bo_list->first_userptr != p->bo_list->num_entries) -- p->mn = amdgpu_mn_get(p->adev); - } - - INIT_LIST_HEAD(&duplicates); -@@ -529,6 +541,9 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, - if (p->uf_entry.robj && !p->uf_entry.robj->parent) - list_add(&p->uf_entry.tv.head, &p->validated); - -+ if (need_mmap_lock) -+ down_read(&current->mm->mmap_sem); -+ - while (1) { - struct list_head need_pages; - unsigned i; -@@ -548,25 +563,27 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, - INIT_LIST_HEAD(&need_pages); - for (i = p->bo_list->first_userptr; - i < p->bo_list->num_entries; ++i) { -- struct amdgpu_bo *bo; - - e = &p->bo_list->array[i]; -- bo = e->robj; -- -- if (amdgpu_ttm_tt_userptr_invalidated(bo->tbo.ttm, -+ -+ if (amdgpu_ttm_tt_userptr_invalidated(e->robj->tbo.ttm, - &e->user_invalidated) && e->user_pages) { - - /* We acquired a page array, but somebody - * invalidated it.
Free it and try again - */ - release_pages(e->user_pages, -- bo->tbo.ttm->num_pages, -+ e->robj->tbo.ttm->num_pages, - false); -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) -+ drm_free_large(e->user_pages); -+#else - kvfree(e->user_pages); -+#endif - e->user_pages = NULL; - } - -- if (amdgpu_ttm_tt_userptr_needs_pages(bo->tbo.ttm) && -+ if (e->robj->tbo.ttm->state != tt_bound && - !e->user_pages) { - list_del(&e->tv.head); - list_add(&e->tv.head, &need_pages); -@@ -592,9 +609,14 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, - list_for_each_entry(e, &need_pages, tv.head) { - struct ttm_tt *ttm = e->robj->tbo.ttm; - -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) -+ e->user_pages = drm_calloc_large(ttm->num_pages, -+ sizeof(struct page*)); -+#else - e->user_pages = kvmalloc_array(ttm->num_pages, - sizeof(struct page*), - GFP_KERNEL | __GFP_ZERO); -+#endif - if (!e->user_pages) { - r = -ENOMEM; - DRM_ERROR("calloc failure in %s\n", __func__); -@@ -604,7 +626,11 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, - r = amdgpu_ttm_tt_get_user_pages(ttm, e->user_pages); - if (r) { - DRM_ERROR("amdgpu_ttm_tt_get_user_pages failed.\n"); -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) -+ drm_free_large(e->user_pages); -+#else - kvfree(e->user_pages); -+#endif - e->user_pages = NULL; - goto error_free_pages; - } -@@ -643,6 +669,9 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, - - amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved, - p->bytes_moved_vis); -+ fpriv->vm.last_eviction_counter = -+ atomic64_read(&p->adev->num_evictions); -+ - if (p->bo_list) { - struct amdgpu_bo *gds = p->bo_list->gds_obj; - struct amdgpu_bo *gws = p->bo_list->gws_obj; -@@ -683,6 +712,9 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, - - error_free_pages: - -+ if (need_mmap_lock) -+ up_read(&current->mm->mmap_sem); -+ - if (p->bo_list) { - for (i = p->bo_list->first_userptr; - i < p->bo_list->num_entries; ++i) { -@@ -694,7 +726,11 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, - release_pages(e->user_pages, - e->robj->tbo.ttm->num_pages, - false); -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) -+ drm_free_large(e->user_pages); -+#else - kvfree(e->user_pages); -+#endif - } - } - -@@ -729,13 +765,19 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error, - { - unsigned i; - -- if (error && backoff) -+ if (!error) -+ ttm_eu_fence_buffer_objects(&parser->ticket, -+ &parser->validated, -+ parser->fence); -+ else if (backoff) - ttm_eu_backoff_reservation(&parser->ticket, - &parser->validated); - -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) - for (i = 0; i < parser->num_post_dep_syncobjs; i++) - drm_syncobj_put(parser->post_dep_syncobjs[i]); - kfree(parser->post_dep_syncobjs); -+#endif - - dma_fence_put(parser->fence); - -@@ -745,7 +787,11 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error, - amdgpu_bo_list_put(parser->bo_list); - - for (i = 0; i < parser->nchunks; i++) -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) -+ drm_free_large(parser->chunks[i].kdata); -+#else - kvfree(parser->chunks[i].kdata); -+#endif - kfree(parser->chunks); - if (parser->job) - amdgpu_job_free(parser->job); -@@ -765,6 +811,10 @@ static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p) - if (r) - return r; - -+ r = amdgpu_sync_fence(adev, &p->job->sync, vm->last_dir_update); -+ if (r) -+ return r; -+ - r = amdgpu_vm_clear_freed(adev, vm, NULL); - if (r) - return r; -@@ -818,13 +868,7 @@
static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p) - - } - -- r = amdgpu_vm_handle_moved(adev, vm); -- if (r) -- return r; -- -- r = amdgpu_sync_fence(adev, &p->job->sync, vm->last_update); -- if (r) -- return r; -+ r = amdgpu_vm_clear_moved(adev, vm, &p->job->sync); - - if (amdgpu_vm_debug && p->bo_list) { - /* Invalidate all BOs to test for userspace bugs */ -@@ -834,7 +878,7 @@ static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p) - if (!bo) - continue; - -- amdgpu_vm_bo_invalidate(adev, bo, false); -+ amdgpu_vm_bo_invalidate(adev, bo); - } - } - -@@ -859,7 +903,7 @@ static int amdgpu_cs_ib_vm_chunk(struct amdgpu_device *adev, - } - - if (p->job->vm) { -- p->job->vm_pd_addr = amdgpu_bo_gpu_offset(vm->root.base.bo); -+ p->job->vm_pd_addr = amdgpu_bo_gpu_offset(vm->root.bo); - - r = amdgpu_bo_vm_update_pte(p); - if (r) -@@ -927,11 +971,11 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev, - uint64_t offset; - uint8_t *kptr; - -- r = amdgpu_cs_find_mapping(parser, chunk_ib->va_start, -- &aobj, &m); -- if (r) { -+ m = amdgpu_cs_find_mapping(parser, chunk_ib->va_start, -+ &aobj); -+ if (!aobj) { - DRM_ERROR("IB va_start is invalid\n"); -- return r; -+ return -EINVAL; - } - - if ((chunk_ib->va_start + chunk_ib->ib_bytes) > -@@ -1029,12 +1073,13 @@ static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p, - return 0; - } - -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) - static int amdgpu_syncobj_lookup_and_add_to_sync(struct amdgpu_cs_parser *p, - uint32_t handle) - { - int r; - struct dma_fence *fence; -- r = drm_syncobj_find_fence(p->filp, handle, &fence); -+ r = drm_syncobj_fence_get(p->filp, handle, &fence); - if (r) - return r; - -@@ -1089,6 +1134,7 @@ static int amdgpu_cs_process_syncobj_out_dep(struct amdgpu_cs_parser *p, - } - return 0; - } -+#endif - - static int amdgpu_cs_dependencies(struct amdgpu_device *adev, - struct amdgpu_cs_parser *p) -@@ -1104,6 +1150,7 @@ static int amdgpu_cs_dependencies(struct amdgpu_device *adev, - r = amdgpu_cs_process_fence_dep(p, chunk); - if (r) - return r; -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) - } else if (chunk->chunk_id == AMDGPU_CHUNK_ID_SYNCOBJ_IN) { - r = amdgpu_cs_process_syncobj_in_dep(p, chunk); - if (r) -@@ -1112,12 +1159,14 @@ static int amdgpu_cs_dependencies(struct amdgpu_device *adev, - r = amdgpu_cs_process_syncobj_out_dep(p, chunk); - if (r) - return r; -+#endif - } - } - - return amdgpu_sem_add_cs(p->ctx, p->job->ring, &p->job->dep_sync); - } - -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) - static void amdgpu_cs_post_dependencies(struct amdgpu_cs_parser *p) - { - int i; -@@ -1125,6 +1174,7 @@ static void amdgpu_cs_post_dependencies(struct amdgpu_cs_parser *p) - for (i = 0; i < p->num_post_dep_syncobjs; ++i) - drm_syncobj_replace_fence(p->post_dep_syncobjs[i], p->fence); - } -+#endif - - static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, - union drm_amdgpu_cs *cs) -@@ -1132,29 +1182,14 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, - struct amdgpu_ring *ring = p->job->ring; - struct amd_sched_entity *entity = &p->ctx->rings[ring->idx].entity; - struct amdgpu_job *job; -- unsigned i; - int r; - -- amdgpu_mn_lock(p->mn); -- if (p->bo_list) { -- for (i = p->bo_list->first_userptr; -- i < p->bo_list->num_entries; ++i) { -- struct amdgpu_bo *bo = p->bo_list->array[i].robj; -- -- if (amdgpu_ttm_tt_userptr_needs_pages(bo->tbo.ttm)) { -- amdgpu_mn_unlock(p->mn); -- return -ERESTARTSYS; -- } -- } -- } -- - job = p->job; - p->job = NULL; - - r = 
amd_sched_job_init(&job->base, &ring->sched, entity, p->filp); - if (r) { - amdgpu_job_free(job); -- amdgpu_mn_unlock(p->mn); - return r; - } - -@@ -1162,18 +1197,17 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, - job->fence_ctx = entity->fence_context; - p->fence = dma_fence_get(&job->base.s_fence->finished); - -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) - amdgpu_cs_post_dependencies(p); -+#endif - - cs->out.handle = amdgpu_ctx_add_fence(p->ctx, ring, p->fence); - job->uf_sequence = cs->out.handle; - amdgpu_job_free_resources(job); -+ amdgpu_cs_parser_fini(p, 0, true); - - trace_amdgpu_cs_ioctl(job); - amd_sched_entity_push_job(&job->base); -- -- ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence); -- amdgpu_mn_unlock(p->mn); -- - return 0; - } - -@@ -1228,7 +1262,10 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) - goto out; - - r = amdgpu_cs_submit(&parser, cs); -+ if (r) -+ goto out; - -+ return 0; - out: - amdgpu_cs_parser_fini(&parser, r, reserved_buffers); - return r; -@@ -1274,7 +1311,7 @@ int amdgpu_cs_wait_ioctl(struct drm_device *dev, void *data, - if (IS_ERR(fence)) - r = PTR_ERR(fence); - else if (fence) { -- r = dma_fence_wait_timeout(fence, true, timeout); -+ r = kcl_fence_wait_timeout(fence, true, timeout); - dma_fence_put(fence); - } else - r = 1; -@@ -1349,7 +1386,7 @@ static int amdgpu_cs_wait_all_fences(struct amdgpu_device *adev, - else if (!fence) - continue; - -- r = dma_fence_wait_timeout(fence, true, timeout); -+ r = kcl_fence_wait_timeout(fence, true, timeout); - dma_fence_put(fence); - if (r < 0) - return r; -@@ -1401,13 +1438,12 @@ static int amdgpu_cs_wait_any_fence(struct amdgpu_device *adev, - array[i] = fence; - } else { /* NULL, the fence has been already signaled */ - r = 1; -- first = i; - goto out; - } - } - -- r = dma_fence_wait_any_timeout(array, fence_count, true, timeout, -- &first); -+ r = kcl_fence_wait_any_timeout(array, fence_count, true, timeout, -+ &first); - if (r < 0) - goto err_free_fence_array; - -@@ -1452,7 +1488,7 @@ int amdgpu_cs_wait_fences_ioctl(struct drm_device *dev, void *data, - if (fences == NULL) - return -ENOMEM; - -- fences_user = u64_to_user_ptr(wait->in.fences); -+ fences_user = kcl_u64_to_user_ptr(wait->in.fences); - if (copy_from_user(fences, fences_user, - sizeof(struct drm_amdgpu_fence) * fence_count)) { - r = -EFAULT; -@@ -1481,36 +1517,78 @@ int amdgpu_cs_wait_fences_ioctl(struct drm_device *dev, void *data, - * virtual memory address. Returns allocation structure when found, NULL - * otherwise. 
- */ --int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser, -- uint64_t addr, struct amdgpu_bo **bo, -- struct amdgpu_bo_va_mapping **map) -+struct amdgpu_bo_va_mapping * -+amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser, -+ uint64_t addr, struct amdgpu_bo **bo) - { -- struct amdgpu_fpriv *fpriv = parser->filp->driver_priv; -- struct amdgpu_vm *vm = &fpriv->vm; - struct amdgpu_bo_va_mapping *mapping; -- int r; -+ unsigned i; -+ -+ if (!parser->bo_list) -+ return NULL; - - addr /= AMDGPU_GPU_PAGE_SIZE; - -- mapping = amdgpu_vm_bo_lookup_mapping(vm, addr); -- if (!mapping || !mapping->bo_va || !mapping->bo_va->base.bo) -- return -EINVAL; -+ for (i = 0; i < parser->bo_list->num_entries; i++) { -+ struct amdgpu_bo_list_entry *lobj; - -- *bo = mapping->bo_va->base.bo; -- *map = mapping; -+ lobj = &parser->bo_list->array[i]; -+ if (!lobj->bo_va) -+ continue; - -- /* Double check that the BO is reserved by this CS */ -- if (READ_ONCE((*bo)->tbo.resv->lock.ctx) != &parser->ticket) -- return -EINVAL; -+ list_for_each_entry(mapping, &lobj->bo_va->valids, list) { -+ if (mapping->start > addr || -+ addr > mapping->last) -+ continue; - -- r = amdgpu_ttm_bind(&(*bo)->tbo, &(*bo)->tbo.mem); -- if (unlikely(r)) -- return r; -+ *bo = lobj->bo_va->base.bo; -+ return mapping; -+ } -+ -+ list_for_each_entry(mapping, &lobj->bo_va->invalids, list) { -+ if (mapping->start > addr || -+ addr > mapping->last) -+ continue; - -- if ((*bo)->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) -+ *bo = lobj->bo_va->base.bo; -+ return mapping; -+ } -+ } -+ -+ return NULL; -+} -+ -+/** -+ * amdgpu_cs_sysvm_access_required - make BOs accessible by the system VM -+ * -+ * @parser: command submission parser context -+ * -+ * Helper for UVD/VCE VM emulation, make sure BOs are accessible by the system VM. 
-+ */ -+int amdgpu_cs_sysvm_access_required(struct amdgpu_cs_parser *parser) -+{ -+ unsigned i; -+ int r; -+ -+ if (!parser->bo_list) - return 0; - -- (*bo)->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; -- amdgpu_ttm_placement_from_domain(*bo, (*bo)->allowed_domains); -- return ttm_bo_validate(&(*bo)->tbo, &(*bo)->placement, false, false); -+ for (i = 0; i < parser->bo_list->num_entries; i++) { -+ struct amdgpu_bo *bo = parser->bo_list->array[i].robj; -+ -+ r = amdgpu_ttm_bind(&bo->tbo, &bo->tbo.mem); -+ if (unlikely(r)) -+ return r; -+ -+ if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) -+ continue; -+ -+ bo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; -+ amdgpu_ttm_placement_from_domain(bo, bo->allowed_domains); -+ r = ttm_bo_validate(&bo->tbo, &bo->placement, false, false); -+ if (unlikely(r)) -+ return r; -+ } -+ -+ return 0; - } -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c -index f032e87..37398e3 100644 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c -@@ -404,15 +404,6 @@ void amdgpu_pci_config_reset(struct amdgpu_device *adev) - */ - static int amdgpu_doorbell_init(struct amdgpu_device *adev) - { -- /* No doorbell on SI hardware generation */ -- if (adev->asic_type < CHIP_BONAIRE) { -- adev->doorbell.base = 0; -- adev->doorbell.size = 0; -- adev->doorbell.num_doorbells = 0; -- adev->doorbell.ptr = NULL; -- return 0; -- } -- - /* doorbell bar mapping */ - adev->doorbell.base = pci_resource_start(adev->pdev, 2); - adev->doorbell.size = pci_resource_len(adev->pdev, 2); -@@ -2130,8 +2121,9 @@ int amdgpu_device_init(struct amdgpu_device *adev, - DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); - DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); - -- /* doorbell bar mapping */ -- amdgpu_doorbell_init(adev); -+ if (adev->asic_type >= CHIP_BONAIRE) -+ /* doorbell bar mapping */ -+ amdgpu_doorbell_init(adev); - - /* io port mapping */ - for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { -@@ -2348,7 +2340,9 @@ void amdgpu_device_fini(struct amdgpu_device *adev) - amdgpu_atombios_fini(adev); - kfree(adev->bios); - adev->bios = NULL; -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 12, 0) - if (!pci_is_thunderbolt_attached(adev->pdev)) -+#endif - vga_switcheroo_unregister_client(adev->pdev); - if (adev->flags & AMD_IS_PX) - vga_switcheroo_fini_domain_pm_ops(adev->dev); -@@ -2358,7 +2352,8 @@ void amdgpu_device_fini(struct amdgpu_device *adev) - adev->rio_mem = NULL; - iounmap(adev->rmmio); - adev->rmmio = NULL; -- amdgpu_doorbell_fini(adev); -+ if (adev->asic_type >= CHIP_BONAIRE) -+ amdgpu_doorbell_fini(adev); - amdgpu_debugfs_regs_cleanup(adev); - } - -@@ -3159,6 +3154,27 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev, - return 0; - } - -+#if defined(BUILD_AS_DKMS) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0) -+void amdgpu_debugfs_cleanup(struct drm_minor *minor) -+{ -+ struct drm_info_node *node, *tmp; -+ -+ if (!&minor->debugfs_root) -+ return 0; -+ -+ mutex_lock(&minor->debugfs_lock); -+ list_for_each_entry_safe(node, tmp, -+ &minor->debugfs_list, list) { -+ debugfs_remove(node->dent); -+ list_del(&node->list); -+ kfree(node); -+ } -+ mutex_unlock(&minor->debugfs_lock); -+ -+ return 0; -+} -+#endif -+ - #if defined(CONFIG_DEBUG_FS) - - static ssize_t amdgpu_debugfs_regs_read(struct file *f, char __user *buf, -@@ -3570,7 +3586,10 @@ static ssize_t amdgpu_debugfs_sensor_read(struct file *f, char __user *buf, - - valuesize = sizeof(values); 
- if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->read_sensor) -- r = amdgpu_dpm_read_sensor(adev, idx, &values[0], &valuesize); -+ r = adev->powerplay.pp_funcs->read_sensor(adev->powerplay.pp_handle, idx, &values[0], &valuesize); -+ else if (adev->pm.funcs && adev->pm.funcs->read_sensor) -+ r = adev->pm.funcs->read_sensor(adev, idx, &values[0], -+ &valuesize); - else - return -EINVAL; - -@@ -3594,7 +3613,7 @@ static ssize_t amdgpu_debugfs_sensor_read(struct file *f, char __user *buf, - static ssize_t amdgpu_debugfs_wave_read(struct file *f, char __user *buf, - size_t size, loff_t *pos) - { -- struct amdgpu_device *adev = f->f_inode->i_private; -+ struct amdgpu_device *adev = file_inode(f)->i_private; - int r, x; - ssize_t result=0; - uint32_t offset, se, sh, cu, wave, simd, data[32]; -@@ -3644,7 +3663,8 @@ static ssize_t amdgpu_debugfs_wave_read(struct file *f, char __user *buf, - static ssize_t amdgpu_debugfs_gpr_read(struct file *f, char __user *buf, - size_t size, loff_t *pos) - { -- struct amdgpu_device *adev = f->f_inode->i_private; -+ struct amdgpu_device *adev = file_inode(f)->i_private; -+ - int r; - ssize_t result = 0; - uint32_t offset, se, sh, cu, wave, simd, thread, bank, *data; -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h -index 0d22259..12a4a78 100644 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h -@@ -427,6 +427,7 @@ struct amdgpu_pm { - struct amdgpu_dpm dpm; - const struct firmware *fw; /* SMC firmware */ - uint32_t fw_version; -+ const struct amdgpu_dpm_funcs *funcs; - uint32_t pcie_gen_mask; - uint32_t pcie_mlw_mask; - struct amd_pp_display_configuration pm_display_cfg;/* set by dc */ -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c -index 2be2e05..0720358 100644 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c -@@ -69,10 +69,9 @@ - * - 3.17.0 - Add AMDGPU_NUM_VRAM_CPU_PAGE_FAULTS. 
- * - 3.18.0 - Export gpu always on cu bitmap - * - 3.19.0 - Add support for UVD MJPEG decode -- * - 3.20.0 - Add support for local BOs - */ - #define KMS_DRIVER_MAJOR 3 --#define KMS_DRIVER_MINOR 20 -+#define KMS_DRIVER_MINOR 19 - #define KMS_DRIVER_PATCHLEVEL 0 - - int amdgpu_vram_limit = 0; -@@ -124,7 +123,6 @@ int amdgpu_cntl_sb_buf_per_se = 0; - int amdgpu_param_buf_per_se = 0; - int amdgpu_job_hang_limit = 0; - int amdgpu_lbpw = -1; --int amdgpu_compute_multipipe = -1; - - MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes"); - module_param_named(vramlimit, amdgpu_vram_limit, int, 0600); -@@ -274,9 +272,6 @@ module_param_named(job_hang_limit, amdgpu_job_hang_limit, int ,0444); - MODULE_PARM_DESC(lbpw, "Load Balancing Per Watt (LBPW) support (1 = enable, 0 = disable, -1 = auto)"); - module_param_named(lbpw, amdgpu_lbpw, int, 0444); - --MODULE_PARM_DESC(compute_multipipe, "Force compute queues to be spread across pipes (1 = enable, 0 = disable, -1 = auto)"); --module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444); -- - #ifdef CONFIG_DRM_AMDGPU_SI - - int amdgpu_si_support = 1; -@@ -822,10 +817,8 @@ static struct drm_driver kms_driver = { - .open = amdgpu_driver_open_kms, - .postclose = amdgpu_driver_postclose_kms, - .lastclose = amdgpu_driver_lastclose_kms, -+ .set_busid = drm_pci_set_busid, - .unload = amdgpu_driver_unload_kms, -- .get_vblank_counter = amdgpu_get_vblank_counter_kms, -- .enable_vblank = amdgpu_enable_vblank_kms, -- .disable_vblank = amdgpu_disable_vblank_kms, - .get_vblank_timestamp = drm_calc_vbltimestamp_from_scanoutpos, - .get_scanout_position = amdgpu_get_crtc_scanout_position, - #if defined(CONFIG_DEBUG_FS) -@@ -841,6 +834,7 @@ static struct drm_driver kms_driver = { - .gem_close_object = amdgpu_gem_object_close, - .dumb_create = amdgpu_mode_dumb_create, - .dumb_map_offset = amdgpu_mode_dumb_mmap, -+ .dumb_destroy = drm_gem_dumb_destroy, - .fops = &amdgpu_driver_kms_fops, - - .prime_handle_to_fd = drm_gem_prime_handle_to_fd, -@@ -931,4 +925,3 @@ module_exit(amdgpu_exit); - MODULE_AUTHOR(DRIVER_AUTHOR); - MODULE_DESCRIPTION(DRIVER_DESC); - MODULE_LICENSE("GPL and additional rights"); --MODULE_VERSION("17.50.2.13"); -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c -index 3d08c6f..fdb9d85 100644 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c -@@ -44,12 +44,20 @@ - * This is the main unload function for KMS (all asics). - * Returns 0 on success. 
- */
-+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0)
-+int amdgpu_driver_unload_kms(struct drm_device *dev)
-+#else
- void amdgpu_driver_unload_kms(struct drm_device *dev)
-+#endif
- {
- struct amdgpu_device *adev = dev->dev_private;
-
- if (adev == NULL)
-+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0)
-+ return 0;
-+#else
- return;
-+#endif
-
- if (adev->rmmio == NULL)
- goto done_free;
-@@ -71,6 +79,9 @@ void amdgpu_driver_unload_kms(struct drm_device *dev)
- done_free:
- kfree(adev);
- dev->dev_private = NULL;
-+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0)
-+ return 0;
-+#endif
- }
-
- /**
-@@ -129,8 +140,12 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
- amdgpu_has_atpx() &&
- (amdgpu_is_atpx_hybrid() ||
- amdgpu_has_atpx_dgpu_power_cntl()) &&
-+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0)
-+ ((flags & AMD_IS_APU) == 0))
-+#else
- ((flags & AMD_IS_APU) == 0) &&
- !pci_is_thunderbolt_attached(dev->pdev))
-+#endif
- flags |= AMD_IS_PX;
-
- /* amdgpu_device_init should report only fatal error
-@@ -1052,6 +1067,72 @@ void amdgpu_disable_vblank_kms(struct drm_device *dev, unsigned int pipe)
- amdgpu_irq_put(adev, &adev->crtc_irq, idx);
- }
-
-+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)
-+/**
-+ * amdgpu_get_vblank_timestamp_kms - get vblank timestamp
-+ *
-+ * @dev: drm dev pointer
-+ * @crtc: crtc to get the timestamp for
-+ * @max_error: max error
-+ * @vblank_time: time value
-+ * @flags: flags passed to the driver
-+ *
-+ * Gets the timestamp on the requested crtc based on the
-+ * scanout position. (all asics).
-+ * Returns positive status flags on success, negative error on failure.
-+ */
-+int amdgpu_get_vblank_timestamp_kms(struct drm_device *dev, unsigned int pipe,
-+ int *max_error,
-+ struct timeval *vblank_time,
-+ unsigned flags)
-+{
-+ struct drm_crtc *crtc;
-+ struct amdgpu_device *adev = dev->dev_private;
-+
-+ if (pipe >= dev->num_crtcs) {
-+ DRM_ERROR("Invalid crtc %u\n", pipe);
-+ return -EINVAL;
-+ }
-+
-+ /* Get associated drm_crtc: */
-+ crtc = &adev->mode_info.crtcs[pipe]->base;
-+ if (!crtc) {
-+ /* This can occur on driver load if some component fails to
-+ * initialize completely and driver is unloaded */
-+ DRM_ERROR("Uninitialized crtc %d\n", pipe);
-+ return -EINVAL;
-+ }
-+
-+ /* Helper routine in DRM core does all the work: */
-+ return kcl_drm_calc_vbltimestamp_from_scanoutpos(dev, pipe, max_error,
-+ vblank_time, flags,
-+ crtc, &crtc->hwmode);
-+}
-+#endif
-+
-+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0)
-+const struct drm_ioctl_desc amdgpu_ioctls_kms[] = {
-+ DRM_IOCTL_DEF_DRV(AMDGPU_GEM_CREATE, amdgpu_gem_create_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
-+ DRM_IOCTL_DEF_DRV(AMDGPU_CTX, amdgpu_ctx_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
-+ DRM_IOCTL_DEF_DRV(AMDGPU_BO_LIST, amdgpu_bo_list_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
-+ /* KMS */
-+ DRM_IOCTL_DEF_DRV(AMDGPU_GEM_MMAP, amdgpu_gem_mmap_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
-+ DRM_IOCTL_DEF_DRV(AMDGPU_GEM_WAIT_IDLE, amdgpu_gem_wait_idle_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
-+ DRM_IOCTL_DEF_DRV(AMDGPU_CS, amdgpu_cs_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
-+ DRM_IOCTL_DEF_DRV(AMDGPU_INFO, amdgpu_info_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
-+ DRM_IOCTL_DEF_DRV(AMDGPU_WAIT_CS, amdgpu_cs_wait_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
-+ DRM_IOCTL_DEF_DRV(AMDGPU_WAIT_FENCES, amdgpu_cs_wait_fences_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
-+ DRM_IOCTL_DEF_DRV(AMDGPU_GEM_METADATA,
amdgpu_gem_metadata_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), -+ DRM_IOCTL_DEF_DRV(AMDGPU_GEM_VA, amdgpu_gem_va_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), -+ DRM_IOCTL_DEF_DRV(AMDGPU_GEM_OP, amdgpu_gem_op_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), -+ DRM_IOCTL_DEF_DRV(AMDGPU_GEM_USERPTR, amdgpu_gem_userptr_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), -+ DRM_IOCTL_DEF_DRV(AMDGPU_GEM_FIND_BO, amdgpu_gem_find_bo_by_cpu_mapping_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), -+ DRM_IOCTL_DEF_DRV(AMDGPU_FREESYNC, amdgpu_freesync_ioctl, DRM_MASTER|DRM_UNLOCKED), -+ DRM_IOCTL_DEF_DRV(AMDGPU_GEM_FIND_BO, amdgpu_gem_find_bo_by_cpu_mapping_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), -+ DRM_IOCTL_DEF_DRV(AMDGPU_GEM_DGMA, amdgpu_gem_dgma_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), -+ DRM_IOCTL_DEF_DRV(AMDGPU_SEM, amdgpu_sem_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), -+}; -+#else - const struct drm_ioctl_desc amdgpu_ioctls_kms[] = { - DRM_IOCTL_DEF_DRV(AMDGPU_GEM_CREATE, amdgpu_gem_create_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), - DRM_IOCTL_DEF_DRV(AMDGPU_CTX, amdgpu_ctx_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), -@@ -1073,6 +1154,7 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = { - DRM_IOCTL_DEF_DRV(AMDGPU_GEM_DGMA, amdgpu_gem_dgma_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), - DRM_IOCTL_DEF_DRV(AMDGPU_SEM, amdgpu_sem_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), - }; -+#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) */ - const int amdgpu_max_kms_ioctl = ARRAY_SIZE(amdgpu_ioctls_kms); - - /* -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c -index d25ec38..430c622 100644 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c -@@ -50,10 +50,8 @@ struct amdgpu_mn { - struct hlist_node node; - - /* objects protected by lock */ -- struct rw_semaphore lock; -- struct rb_root_cached objects; -- struct mutex read_lock; -- atomic_t recursion; -+ struct mutex lock; -+ struct rb_root objects; - }; - - struct amdgpu_mn_node { -@@ -76,17 +74,17 @@ static void amdgpu_mn_destroy(struct work_struct *work) - struct amdgpu_bo *bo, *next_bo; - - mutex_lock(&adev->mn_lock); -- down_write(&rmn->lock); -+ mutex_lock(&rmn->lock); - hash_del(&rmn->node); -- rbtree_postorder_for_each_entry_safe(node, next_node, -- &rmn->objects.rb_root, it.rb) { -+ rbtree_postorder_for_each_entry_safe(node, next_node, &rmn->objects, -+ it.rb) { - list_for_each_entry_safe(bo, next_bo, &node->bos, mn_list) { - bo->mn = NULL; - list_del_init(&bo->mn_list); - } - kfree(node); - } -- up_write(&rmn->lock); -+ mutex_unlock(&rmn->lock); - mutex_unlock(&adev->mn_lock); - mmu_notifier_unregister_no_release(&rmn->mn, rmn->mm); - kfree(rmn); -@@ -108,53 +106,6 @@ static void amdgpu_mn_release(struct mmu_notifier *mn, - schedule_work(&rmn->work); - } - -- --/** -- * amdgpu_mn_lock - take the write side lock for this mn -- */ --void amdgpu_mn_lock(struct amdgpu_mn *mn) --{ -- if (mn) -- down_write(&mn->lock); --} -- --/** -- * amdgpu_mn_unlock - drop the write side lock for this mn -- */ --void amdgpu_mn_unlock(struct amdgpu_mn *mn) --{ -- if (mn) -- up_write(&mn->lock); --} -- --/** -- * amdgpu_mn_read_lock - take the rmn read lock -- * -- * @rmn: our notifier -- * -- * Take the rmn read side lock. 
-- */ --static void amdgpu_mn_read_lock(struct amdgpu_mn *rmn) --{ -- mutex_lock(&rmn->read_lock); -- if (atomic_inc_return(&rmn->recursion) == 1) -- down_read_non_owner(&rmn->lock); -- mutex_unlock(&rmn->read_lock); --} -- --/** -- * amdgpu_mn_read_unlock - drop the rmn read lock -- * -- * @rmn: our notifier -- * -- * Drop the rmn read side lock. -- */ --static void amdgpu_mn_read_unlock(struct amdgpu_mn *rmn) --{ -- if (atomic_dec_return(&rmn->recursion) == 0) -- up_read_non_owner(&rmn->lock); --} -- - /** - * amdgpu_mn_invalidate_node - unmap all BOs of a node - * -@@ -175,12 +126,23 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node, - if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, start, end)) - continue; - -- r = reservation_object_wait_timeout_rcu(bo->tbo.resv, -+ r = amdgpu_bo_reserve(bo, true); -+ if (r) { -+ DRM_ERROR("(%ld) failed to reserve user bo\n", r); -+ continue; -+ } -+ -+ r = kcl_reservation_object_wait_timeout_rcu(bo->tbo.resv, - true, false, MAX_SCHEDULE_TIMEOUT); - if (r <= 0) - DRM_ERROR("(%ld) failed to wait for user bo\n", r); - -- amdgpu_ttm_tt_mark_user_pages(bo->tbo.ttm); -+ amdgpu_ttm_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU); -+ r = ttm_bo_validate(&bo->tbo, &bo->placement, false, false); -+ if (r) -+ DRM_ERROR("(%ld) failed to validate user bo\n", r); -+ -+ amdgpu_bo_unreserve(bo); - } - } - -@@ -206,7 +168,7 @@ static void amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn, - /* notification is exclusive, but interval is inclusive */ - end -= 1; - -- amdgpu_mn_read_lock(rmn); -+ mutex_lock(&rmn->lock); - - it = interval_tree_iter_first(&rmn->objects, start, end); - while (it) { -@@ -218,33 +180,12 @@ static void amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn, - amdgpu_mn_invalidate_node(node, start, end); - } - -- up_read(&rmn->lock); --} -- --/** -- * amdgpu_mn_invalidate_range_end - callback to notify about mm change -- * -- * @mn: our notifier -- * @mn: the mm this callback is about -- * @start: start of updated range -- * @end: end of updated range -- * -- * Release the lock again to allow new command submissions. -- */ --static void amdgpu_mn_invalidate_range_end(struct mmu_notifier *mn, -- struct mm_struct *mm, -- unsigned long start, -- unsigned long end) --{ -- struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn); -- -- amdgpu_mn_read_unlock(rmn); -+ mutex_unlock(&rmn->lock); - } - - static const struct mmu_notifier_ops amdgpu_mn_ops = { - .release = amdgpu_mn_release, - .invalidate_range_start = amdgpu_mn_invalidate_range_start, -- .invalidate_range_end = amdgpu_mn_invalidate_range_end, - }; - - /** -@@ -254,19 +195,30 @@ static const struct mmu_notifier_ops amdgpu_mn_ops = { - * - * Creates a notifier context for current->mm. 
- */ --struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) -+static struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) - { - struct mm_struct *mm = current->mm; - struct amdgpu_mn *rmn; - int r; -+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -+ struct hlist_node *node; -+#endif - - mutex_lock(&adev->mn_lock); -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 7, 0) -+ down_write(&mm->mmap_sem); -+#else - if (down_write_killable(&mm->mmap_sem)) { - mutex_unlock(&adev->mn_lock); - return ERR_PTR(-EINTR); - } -+#endif - -+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) -+ hash_for_each_possible(adev->mn_hash, rmn, node, node, (unsigned long)mm) -+#else - hash_for_each_possible(adev->mn_hash, rmn, node, (unsigned long)mm) -+#endif - if (rmn->mm == mm) - goto release_locks; - -@@ -279,10 +231,8 @@ struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) - rmn->adev = adev; - rmn->mm = mm; - rmn->mn.ops = &amdgpu_mn_ops; -- init_rwsem(&rmn->lock); -- rmn->objects = RB_ROOT_CACHED; -- mutex_init(&rmn->read_lock); -- atomic_set(&rmn->recursion, 0); -+ mutex_init(&rmn->lock); -+ rmn->objects = RB_ROOT; - - r = __mmu_notifier_register(&rmn->mn, mm); - if (r) -@@ -328,7 +278,7 @@ int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr) - - INIT_LIST_HEAD(&bos); - -- down_write(&rmn->lock); -+ mutex_lock(&rmn->lock); - - while ((it = interval_tree_iter_first(&rmn->objects, addr, end))) { - kfree(node); -@@ -340,9 +290,9 @@ int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr) - } - - if (!node) { -- node = kmalloc(sizeof(struct amdgpu_mn_node), GFP_NOIO); -+ node = kmalloc(sizeof(struct amdgpu_mn_node), GFP_KERNEL); - if (!node) { -- up_write(&rmn->lock); -+ mutex_unlock(&rmn->lock); - return -ENOMEM; - } - } -@@ -357,7 +307,7 @@ int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr) - - interval_tree_insert(&node->it, &rmn->objects); - -- up_write(&rmn->lock); -+ mutex_unlock(&rmn->lock); - - return 0; - } -@@ -383,7 +333,7 @@ void amdgpu_mn_unregister(struct amdgpu_bo *bo) - return; - } - -- down_write(&rmn->lock); -+ mutex_lock(&rmn->lock); - - /* save the next list entry for later */ - head = bo->mn_list.next; -@@ -398,7 +348,6 @@ void amdgpu_mn_unregister(struct amdgpu_bo *bo) - kfree(node); - } - -- up_write(&rmn->lock); -+ mutex_unlock(&rmn->lock); - mutex_unlock(&adev->mn_lock); - } -- -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c -index f421505..fb6c3d6 100755 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c -@@ -36,7 +36,6 @@ - #include <drm/drm_cache.h> - #include "amdgpu.h" - #include "amdgpu_trace.h" --#include "amdgpu_amdkfd.h" - - static void amdgpu_ttm_bo_destroy(struct ttm_buffer_object *tbo) - { -@@ -47,9 +46,8 @@ static void amdgpu_ttm_bo_destroy(struct ttm_buffer_object *tbo) - - if (bo->tbo.mem.mem_type == AMDGPU_PL_DGMA_IMPORT) - kfree(tbo->mem.bus.addr); -- if (bo->kfd_bo) -- amdgpu_amdkfd_unreserve_system_memory_limit(bo); - amdgpu_bo_kunmap(bo); -+ drm_gem_object_release(&bo->gem_base); - - if (bo->gem_base.import_attach) - drm_prime_gem_destroy(&bo->gem_base, bo->tbo.sg); -@@ -70,12 +68,11 @@ bool amdgpu_ttm_bo_is_amdgpu_bo(struct ttm_buffer_object *bo) - return false; - } - --void amdgpu_ttm_placement_from_domain(struct amdgpu_bo *abo, u32 domain) -+static void amdgpu_ttm_placement_init(struct amdgpu_device *adev, -+ struct ttm_placement *placement, -+ struct ttm_place *places, -+ u32 domain, u64 flags) - { -- struct 
amdgpu_device *adev = amdgpu_ttm_adev(abo->tbo.bdev); -- struct ttm_placement *placement = &abo->placement; -- struct ttm_place *places = abo->placements; -- u64 flags = abo->flags; - u32 c = 0, i; - - if ((domain & AMDGPU_GEM_DOMAIN_DGMA) && amdgpu_direct_gma_size) { -@@ -178,6 +175,27 @@ void amdgpu_ttm_placement_from_domain(struct amdgpu_bo *abo, u32 domain) - placement->busy_placement = places; - } - -+void amdgpu_ttm_placement_from_domain(struct amdgpu_bo *abo, u32 domain) -+{ -+ struct amdgpu_device *adev = amdgpu_ttm_adev(abo->tbo.bdev); -+ -+ amdgpu_ttm_placement_init(adev, &abo->placement, abo->placements, -+ domain, abo->flags); -+} -+ -+static void amdgpu_fill_placement_to_bo(struct amdgpu_bo *bo, -+ struct ttm_placement *placement) -+{ -+ BUG_ON(placement->num_placement > (AMDGPU_GEM_DOMAIN_MAX + 1)); -+ -+ memcpy(bo->placements, placement->placement, -+ placement->num_placement * sizeof(struct ttm_place)); -+ bo->placement.num_placement = placement->num_placement; -+ bo->placement.num_busy_placement = placement->num_busy_placement; -+ bo->placement.placement = bo->placements; -+ bo->placement.busy_placement = bo->placements; -+} -+ - /** - * amdgpu_bo_create_reserved - create reserved BO for kernel use - * -@@ -309,13 +327,14 @@ void amdgpu_bo_free_kernel(struct amdgpu_bo **bo, u64 *gpu_addr, - *cpu_addr = NULL; - } - --static int amdgpu_bo_do_create(struct amdgpu_device *adev, -- unsigned long size, int byte_align, -- bool kernel, u32 domain, u64 flags, -- struct sg_table *sg, -- struct reservation_object *resv, -- uint64_t init_value, -- struct amdgpu_bo **bo_ptr) -+int amdgpu_bo_create_restricted(struct amdgpu_device *adev, -+ unsigned long size, int byte_align, -+ bool kernel, u32 domain, u64 flags, -+ struct sg_table *sg, -+ struct ttm_placement *placement, -+ struct reservation_object *resv, -+ uint64_t init_value, -+ struct amdgpu_bo **bo_ptr) - { - struct amdgpu_bo *bo; - enum ttm_bo_type type; -@@ -342,10 +361,13 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev, - bo = kzalloc(sizeof(struct amdgpu_bo), GFP_KERNEL); - if (bo == NULL) - return -ENOMEM; -- -+ r = drm_gem_object_init(adev->ddev, &bo->gem_base, size); -+ if (unlikely(r)) { -+ kfree(bo); -+ return r; -+ } - INIT_LIST_HEAD(&bo->shadow_list); - INIT_LIST_HEAD(&bo->va); -- INIT_LIST_HEAD(&bo->gem_objects); - bo->preferred_domains = domain & (AMDGPU_GEM_DOMAIN_VRAM | - AMDGPU_GEM_DOMAIN_GTT | - AMDGPU_GEM_DOMAIN_CPU | -@@ -388,17 +410,13 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev, - bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC; - #endif - -- bo->tbo.bdev = &adev->mman.bdev; -- amdgpu_ttm_placement_from_domain(bo, domain); -+ amdgpu_fill_placement_to_bo(bo, placement); -+ /* Kernel allocation are uninterruptible */ - - initial_bytes_moved = atomic64_read(&adev->num_bytes_moved); -- /* Kernel allocation are uninterruptible */ - r = ttm_bo_init_reserved(&adev->mman.bdev, &bo->tbo, size, type, - &bo->placement, page_align, !kernel, NULL, - acc_size, sg, resv, &amdgpu_ttm_bo_destroy); -- if (unlikely(r != 0)) -- return r; -- - bytes_moved = atomic64_read(&adev->num_bytes_moved) - - initial_bytes_moved; - if (adev->mc.visible_vram_size < adev->mc.real_vram_size && -@@ -408,6 +426,9 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev, - else - amdgpu_cs_report_moved_bytes(adev, bytes_moved, 0); - -+ if (unlikely(r != 0)) -+ return r; -+ - if (domain & AMDGPU_GEM_DOMAIN_DGMA && adev->ssg.enabled) - bo->tbo.ssg_can_map = true; - -@@ -422,9 +443,13 @@ static int 
amdgpu_bo_do_create(struct amdgpu_device *adev, - if (unlikely(r)) - goto fail_unreserve; - -+#if defined(BUILD_AS_DKMS) -+ dma_fence_wait(fence, false); -+#else - amdgpu_bo_fence(bo, fence, false); - dma_fence_put(bo->tbo.moving); - bo->tbo.moving = dma_fence_get(fence); -+#endif - dma_fence_put(fence); - } - if (!resv) -@@ -459,17 +484,27 @@ static int amdgpu_bo_create_shadow(struct amdgpu_device *adev, - unsigned long size, int byte_align, - struct amdgpu_bo *bo) - { -+ struct ttm_placement placement = {0}; -+ struct ttm_place placements[AMDGPU_GEM_DOMAIN_MAX + 1]; - int r; - - if (bo->shadow) - return 0; - -- r = amdgpu_bo_do_create(adev, size, byte_align, true, -- AMDGPU_GEM_DOMAIN_GTT, -- AMDGPU_GEM_CREATE_CPU_GTT_USWC | -- AMDGPU_GEM_CREATE_SHADOW, -- NULL, bo->tbo.resv, 0, -- &bo->shadow); -+ memset(&placements, 0, sizeof(placements)); -+ amdgpu_ttm_placement_init(adev, &placement, placements, -+ AMDGPU_GEM_DOMAIN_GTT, -+ AMDGPU_GEM_CREATE_CPU_GTT_USWC | -+ AMDGPU_GEM_CREATE_SHADOW); -+ -+ r = amdgpu_bo_create_restricted(adev, size, byte_align, true, -+ AMDGPU_GEM_DOMAIN_GTT, -+ AMDGPU_GEM_CREATE_CPU_GTT_USWC | -+ AMDGPU_GEM_CREATE_SHADOW, -+ NULL, &placement, -+ bo->tbo.resv, -+ 0, -+ &bo->shadow); - if (!r) { - bo->shadow->parent = amdgpu_bo_ref(bo); - mutex_lock(&adev->shadow_list_lock); -@@ -491,11 +526,18 @@ int amdgpu_bo_create(struct amdgpu_device *adev, - uint64_t init_value, - struct amdgpu_bo **bo_ptr) - { -+ struct ttm_placement placement = {0}; -+ struct ttm_place placements[AMDGPU_GEM_DOMAIN_MAX + 1]; - uint64_t parent_flags = flags & ~AMDGPU_GEM_CREATE_SHADOW; - int r; - -- r = amdgpu_bo_do_create(adev, size, byte_align, kernel, domain, -- parent_flags, sg, resv, init_value, bo_ptr); -+ memset(&placements, 0, sizeof(placements)); -+ amdgpu_ttm_placement_init(adev, &placement, placements, -+ domain, parent_flags); -+ -+ r = amdgpu_bo_create_restricted(adev, size, byte_align, kernel, domain, -+ parent_flags, sg, &placement, resv, -+ init_value, bo_ptr); - if (r) - return r; - -@@ -931,7 +973,7 @@ void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, - return; - - abo = container_of(bo, struct amdgpu_bo, tbo); -- amdgpu_vm_bo_invalidate(adev, abo, evict); -+ amdgpu_vm_bo_invalidate(adev, abo); - - amdgpu_bo_kunmap(abo); - -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h -index f73dba5..024e5cb 100755 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h -@@ -35,7 +35,6 @@ - - /* bo virtual addresses in a vm */ - struct amdgpu_bo_va_mapping { -- struct amdgpu_bo_va *bo_va; - struct list_head list; - struct rb_node rb; - uint64_t start; -@@ -50,17 +49,12 @@ struct amdgpu_bo_va { - struct amdgpu_vm_bo_base base; - - /* protected by bo being reserved */ -- unsigned ref_count; -- -- /* all other members protected by the VM PD being reserved */ - struct dma_fence *last_pt_update; -+ unsigned ref_count; - - /* mappings for this bo_va */ - struct list_head invalids; - struct list_head valids; -- -- /* If the mappings are cleared or filled */ -- bool cleared; - }; - - struct amdgpu_bo { -@@ -78,18 +72,16 @@ struct amdgpu_bo { - void *metadata; - u32 metadata_size; - unsigned prime_shared_count; -- /* GEM objects refereing to this BO */ -- struct list_head gem_objects; -- - /* list of all virtual address to which this bo is associated to */ - struct list_head va; - /* Constant after initialization */ -+ struct drm_gem_object gem_base; - struct amdgpu_bo *parent; - struct amdgpu_bo 
*shadow; - - struct ttm_bo_kmap_obj dma_buf_vmap; - struct amdgpu_mn *mn; -- struct kgd_mem *kfd_bo; -+ struct kfd_process_device *pdd; - - union { - struct list_head mn_list; -@@ -207,6 +199,14 @@ int amdgpu_bo_create(struct amdgpu_device *adev, - struct reservation_object *resv, - uint64_t init_value, - struct amdgpu_bo **bo_ptr); -+int amdgpu_bo_create_restricted(struct amdgpu_device *adev, -+ unsigned long size, int byte_align, -+ bool kernel, u32 domain, u64 flags, -+ struct sg_table *sg, -+ struct ttm_placement *placement, -+ struct reservation_object *resv, -+ uint64_t init_value, -+ struct amdgpu_bo **bo_ptr); - int amdgpu_bo_create_reserved(struct amdgpu_device *adev, - unsigned long size, int align, - u32 domain, struct amdgpu_bo **bo_ptr, -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c -index 90adff8..06b824c 100644 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c -@@ -136,8 +136,7 @@ void amdgpu_ring_commit(struct amdgpu_ring *ring) - if (ring->funcs->end_use) - ring->funcs->end_use(ring); - -- if (ring->funcs->type != AMDGPU_RING_TYPE_KIQ) -- amdgpu_ring_lru_touch(ring->adev, ring); -+ amdgpu_ring_lru_touch(ring->adev, ring); - } - - /** -@@ -382,7 +381,7 @@ void amdgpu_ring_lru_touch(struct amdgpu_device *adev, struct amdgpu_ring *ring) - static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf, - size_t size, loff_t *pos) - { -- struct amdgpu_ring *ring = file_inode(f)->i_private; -+ struct amdgpu_ring *ring = (struct amdgpu_ring*)kcl_file_private(f); - int r, i; - uint32_t value, result, early[3]; - -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h -index af8e544..322d2529 100755 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h -@@ -36,7 +36,6 @@ - /* some special values for the owner field */ - #define AMDGPU_FENCE_OWNER_UNDEFINED ((void*)0ul) - #define AMDGPU_FENCE_OWNER_VM ((void*)1ul) --#define AMDGPU_FENCE_OWNER_KFD ((void *)2ul) - - #define AMDGPU_FENCE_FLAG_64BIT (1 << 0) - #define AMDGPU_FENCE_FLAG_INT (1 << 1) -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c -index 7ee8247..8492a26 100755 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c -@@ -31,7 +31,6 @@ - #include <drm/drmP.h> - #include "amdgpu.h" - #include "amdgpu_trace.h" --#include "amdgpu_amdkfd.h" - - struct amdgpu_sync_entry { - struct hlist_node node; -@@ -85,20 +84,11 @@ static bool amdgpu_sync_same_dev(struct amdgpu_device *adev, - */ - static void *amdgpu_sync_get_owner(struct dma_fence *f) - { -- struct amd_sched_fence *s_fence; -- struct amdgpu_amdkfd_fence *kfd_fence; -- -- if (f == NULL) -- return AMDGPU_FENCE_OWNER_UNDEFINED; -+ struct amd_sched_fence *s_fence = to_amd_sched_fence(f); - -- s_fence = to_amd_sched_fence(f); - if (s_fence) - return s_fence->owner; - -- kfd_fence = to_amdgpu_amdkfd_fence(f); -- if (kfd_fence) -- return AMDGPU_FENCE_OWNER_KFD; -- - return AMDGPU_FENCE_OWNER_UNDEFINED; - } - -@@ -180,9 +170,7 @@ int amdgpu_sync_fence(struct amdgpu_device *adev, struct amdgpu_sync *sync, - * @sync: sync object to add fences from reservation object to - * @resv: reservation object with embedded fence - * @shared: true if we should only sync to the exclusive fence -- * -- * Sync to the fence except if it is KFD eviction fence and owner is -- * AMDGPU_FENCE_OWNER_VM. 
-+ * Sync to the fence - */ - int amdgpu_sync_resv(struct amdgpu_device *adev, - struct amdgpu_sync *sync, -@@ -209,15 +197,12 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, - for (i = 0; i < flist->shared_count; ++i) { - f = rcu_dereference_protected(flist->shared[i], - reservation_object_held(resv)); -- fence_owner = amdgpu_sync_get_owner(f); -- if (fence_owner == AMDGPU_FENCE_OWNER_KFD && -- owner != AMDGPU_FENCE_OWNER_UNDEFINED) -- continue; - - if (amdgpu_sync_same_dev(adev, f)) { - /* VM updates are only interesting - * for other VM updates and moves. - */ -+ fence_owner = amdgpu_sync_get_owner(f); - if ((owner != AMDGPU_FENCE_OWNER_UNDEFINED) && - (fence_owner != AMDGPU_FENCE_OWNER_UNDEFINED) && - ((owner == AMDGPU_FENCE_OWNER_VM) != -diff --git a/drivers/gpu/drm/amd/amdgpu/vid.h b/drivers/gpu/drm/amd/amdgpu/vid.h -index d09592a..a648525 100755 ---- a/drivers/gpu/drm/amd/amdgpu/vid.h -+++ b/drivers/gpu/drm/amd/amdgpu/vid.h -@@ -27,8 +27,6 @@ - #define SDMA1_REGISTER_OFFSET 0x200 /* not a register */ - #define SDMA_MAX_INSTANCE 2 - --#define KFD_VI_SDMA_QUEUE_OFFSET 0x80 /* not a register */ -- - /* crtc instance offsets */ - #define CRTC0_REGISTER_OFFSET (0x1b9c - 0x1b9c) - #define CRTC1_REGISTER_OFFSET (0x1d9c - 0x1b9c) -@@ -369,10 +367,6 @@ - * x=0: tmz_begin - * x=1: tmz_end - */ --#define PACKET3_INVALIDATE_TLBS 0x98 --# define PACKET3_INVALIDATE_TLBS_DST_SEL(x) ((x) << 0) --# define PACKET3_INVALIDATE_TLBS_PASID(x) ((x) << 5) -- - #define PACKET3_SET_RESOURCES 0xA0 - /* 1. header - * 2. CONTROL -diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig -index 95be0dd..e13c67c 100644 ---- a/drivers/gpu/drm/amd/amdkfd/Kconfig -+++ b/drivers/gpu/drm/amd/amdkfd/Kconfig -@@ -4,7 +4,6 @@ - - config HSA_AMD - tristate "HSA kernel driver for AMD GPU devices" -- depends on (DRM_RADEON || DRM_AMDGPU) && (X86_64 || PPC64 || ARM64) -- select DRM_AMDGPU_USERPTR -+ depends on (DRM_RADEON || DRM_AMDGPU) && AMD_IOMMU_V2 && X86_64 - help - Enable this if you want to use HSA features on AMD GPU devices. -diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile -old mode 100755 -new mode 100644 -index dba08ec..b400d56 ---- a/drivers/gpu/drm/amd/amdkfd/Makefile -+++ b/drivers/gpu/drm/amd/amdkfd/Makefile -@@ -1,28 +1,19 @@ --# SPDX-License-Identifier: GPL-2.0 - # - # Makefile for Heterogenous System Architecture support for AMD GPU devices - # - --FULL_AMD_PATH=$(src)/.. 
-- --ccflags-y := -I$(FULL_AMD_PATH)/include/ \ -- -I$(FULL_AMD_PATH)/include/asic_reg -+ccflags-y := -Idrivers/gpu/drm/amd/include/ \ -+ -Idrivers/gpu/drm/amd/include/asic_reg - - amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \ - kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \ - kfd_process.o kfd_queue.o kfd_mqd_manager.o \ - kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \ -- kfd_mqd_manager_v9.o \ - kfd_kernel_queue.o kfd_kernel_queue_cik.o \ -- kfd_kernel_queue_vi.o kfd_kernel_queue_v9.o \ -- kfd_packet_manager.o kfd_process_queue_manager.o \ -- kfd_device_queue_manager.o kfd_device_queue_manager_cik.o \ -- kfd_device_queue_manager_vi.o kfd_device_queue_manager_v9.o \ -- kfd_interrupt.o kfd_events.o cik_event_interrupt.o kfd_int_process_v9.o \ -- kfd_dbgdev.o kfd_dbgmgr.o kfd_flat_memory.o kfd_crat.o kfd_rdma.o \ -- kfd_peerdirect.o kfd_ipc.o -- --amdkfd-$(CONFIG_DEBUG_FS) += kfd_debugfs.o -+ kfd_kernel_queue_vi.o kfd_packet_manager.o \ -+ kfd_process_queue_manager.o kfd_device_queue_manager.o \ -+ kfd_device_queue_manager_cik.o kfd_device_queue_manager_vi.o \ -+ kfd_interrupt.o kfd_events.o cik_event_interrupt.o \ -+ kfd_dbgdev.o kfd_dbgmgr.o - - obj-$(CONFIG_HSA_AMD) += amdkfd.o -- -diff --git a/drivers/gpu/drm/amd/amdkfd/backport/Makefile b/drivers/gpu/drm/amd/amdkfd/backport/Makefile -deleted file mode 100644 -index 6a3845e..0000000 ---- a/drivers/gpu/drm/amd/amdkfd/backport/Makefile -+++ /dev/null -@@ -1,7 +0,0 @@ -- -- --LINUXINCLUDE := $(DKMS_INCLUDE_PREFIX) $(LINUXINCLUDE) -- --ccflags-y += \ -- -I$(AMDKFD_FULL_PATH) \ -- -include backport/backport.h -diff --git a/drivers/gpu/drm/amd/amdkfd/backport/backport.h b/drivers/gpu/drm/amd/amdkfd/backport/backport.h -deleted file mode 100644 -index e1f8c1d..0000000 ---- a/drivers/gpu/drm/amd/amdkfd/backport/backport.h -+++ /dev/null -@@ -1,6 +0,0 @@ --#ifndef AMDKFD_BACKPORT_H --#define AMDKFD_BACKPORT_H -- --#include <linux/version.h> -- --#endif -diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c -index 00536a1..211fc48 100644 ---- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c -+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c -@@ -24,89 +24,40 @@ - #include "kfd_events.h" - #include "cik_int.h" - --static bool is_cpc_vm_fault(struct kfd_dev *dev, -- const uint32_t *ih_ring_entry) --{ -- const struct cik_ih_ring_entry *ihre = -- (const struct cik_ih_ring_entry *)ih_ring_entry; -- -- if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || -- ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) && -- ihre->vmid >= dev->vm_info.first_vmid_kfd && -- ihre->vmid <= dev->vm_info.last_vmid_kfd) -- return true; -- return false; --} -- - static bool cik_event_interrupt_isr(struct kfd_dev *dev, -- const uint32_t *ih_ring_entry, -- uint32_t *patched_ihre, -- bool *patched_flag) -+ const uint32_t *ih_ring_entry) - { -+ unsigned int pasid; - const struct cik_ih_ring_entry *ihre = - (const struct cik_ih_ring_entry *)ih_ring_entry; -- const struct kfd2kgd_calls *f2g = dev->kfd2kgd; -- struct cik_ih_ring_entry *tmp_ihre = -- (struct cik_ih_ring_entry *) patched_ihre; - -- /* This workaround is due to HW/FW limitation on Hawaii that -- * VMID and PASID are not written into ih_ring_entry -- */ -- if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || -- ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) && -- dev->device_info->asic_family == CHIP_HAWAII) { -- *patched_flag = true; -- *tmp_ihre = *ihre; -+ pasid = (ihre->ring_id & 0xffff0000) >> 16; - -- 
tmp_ihre->vmid = f2g->read_vmid_from_vmfault_reg(dev->kgd); -- tmp_ihre->pasid = f2g->get_atc_vmid_pasid_mapping_pasid( -- dev->kgd, tmp_ihre->vmid); -- return (tmp_ihre->pasid != 0) && -- tmp_ihre->vmid >= dev->vm_info.first_vmid_kfd && -- tmp_ihre->vmid <= dev->vm_info.last_vmid_kfd; -- } - /* Do not process in ISR, just request it to be forwarded to WQ. */ -- return (ihre->pasid != 0) && -+ return (pasid != 0) && - (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || -- ihre->source_id == CIK_INTSRC_SDMA_TRAP || - ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG || -- ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE || -- is_cpc_vm_fault(dev, ih_ring_entry)); -+ ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE); - } - - static void cik_event_interrupt_wq(struct kfd_dev *dev, - const uint32_t *ih_ring_entry) - { -+ unsigned int pasid; - const struct cik_ih_ring_entry *ihre = - (const struct cik_ih_ring_entry *)ih_ring_entry; - -- if (ihre->pasid == 0) -+ pasid = (ihre->ring_id & 0xffff0000) >> 16; -+ -+ if (pasid == 0) - return; - - if (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE) -- kfd_signal_event_interrupt(ihre->pasid, 0, 0); -- else if (ihre->source_id == CIK_INTSRC_SDMA_TRAP) -- kfd_signal_event_interrupt(ihre->pasid, 0, 0); -+ kfd_signal_event_interrupt(pasid, 0, 0); - else if (ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG) -- kfd_signal_event_interrupt(ihre->pasid, ihre->data & 0xFF, 8); -+ kfd_signal_event_interrupt(pasid, ihre->data & 0xFF, 8); - else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE) -- kfd_signal_hw_exception_event(ihre->pasid); -- else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || -- ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { -- struct kfd_vm_fault_info info; -- -- kfd_process_vm_fault(dev->dqm, ihre->pasid); -- -- memset(&info, 0, sizeof(info)); -- dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info); -- if (!info.page_addr && !info.status) -- return; -- -- if (info.vmid == ihre->vmid) -- kfd_signal_vm_fault_event(dev, ihre->pasid, &info); -- else -- kfd_signal_vm_fault_event(dev, ihre->pasid, NULL); -- } -+ kfd_signal_hw_exception_event(pasid); - } - - const struct kfd_event_interrupt_class event_interrupt_class_cik = { -diff --git a/drivers/gpu/drm/amd/amdkfd/cik_int.h b/drivers/gpu/drm/amd/amdkfd/cik_int.h -index ff8255d..79a16d2 100644 ---- a/drivers/gpu/drm/amd/amdkfd/cik_int.h -+++ b/drivers/gpu/drm/amd/amdkfd/cik_int.h -@@ -26,32 +26,16 @@ - #include <linux/types.h> - - struct cik_ih_ring_entry { -- uint32_t source_id:8; -- uint32_t reserved1:8; -- uint32_t reserved2:16; -- -- uint32_t data:28; -- uint32_t reserved3:4; -- -- /* pipeid, meid and unused3 are officially called RINGID, -- * but for our purposes, they always decode into pipe and ME. 
-- */ -- uint32_t pipeid:2; -- uint32_t meid:2; -- uint32_t reserved4:4; -- uint32_t vmid:8; -- uint32_t pasid:16; -- -- uint32_t reserved5; -+ uint32_t source_id; -+ uint32_t data; -+ uint32_t ring_id; -+ uint32_t reserved; - }; - - #define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6 - #define CIK_INTSRC_CP_END_OF_PIPE 0xB5 - #define CIK_INTSRC_CP_BAD_OPCODE 0xB7 - #define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF --#define CIK_INTSRC_GFX_PAGE_INV_FAULT 0x92 --#define CIK_INTSRC_GFX_MEM_PROT_FAULT 0x93 --#define CIK_INTSRC_SDMA_TRAP 0xE0 - - #endif - -diff --git a/drivers/gpu/drm/amd/amdkfd/cik_regs.h b/drivers/gpu/drm/amd/amdkfd/cik_regs.h -index 37ce6dd..48769d1 100644 ---- a/drivers/gpu/drm/amd/amdkfd/cik_regs.h -+++ b/drivers/gpu/drm/amd/amdkfd/cik_regs.h -@@ -33,8 +33,7 @@ - #define APE1_MTYPE(x) ((x) << 7) - - /* valid for both DEFAULT_MTYPE and APE1_MTYPE */ --#define MTYPE_CACHED_NV 0 --#define MTYPE_CACHED 1 -+#define MTYPE_CACHED 0 - #define MTYPE_NONCACHED 3 - - #define DEFAULT_CP_HQD_PERSISTENT_STATE (0x33U << 8) -diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h -deleted file mode 100644 -index d5d1331..0000000 ---- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h -+++ /dev/null -@@ -1,1384 +0,0 @@ --/* -- * Copyright 2015 Advanced Micro Devices, Inc. -- * -- * Permission is hereby granted, free of charge, to any person obtaining a -- * copy of this software and associated documentation files (the "Software"), -- * to deal in the Software without restriction, including without limitation -- * the rights to use, copy, modify, merge, publish, distribute, sublicense, -- * and/or sell copies of the Software, and to permit persons to whom the -- * Software is furnished to do so, subject to the following conditions: -- * -- * The above copyright notice and this permission notice shall be included in -- * all copies or substantial portions of the Software. -- * -- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -- * OTHER DEALINGS IN THE SOFTWARE. -- */ -- --#if 0 --HW (VI) source code for CWSR trap handler --#Version 18 + multiple trap handler -- --// this performance-optimal version was originally from Seven Xu at SRDC -- --// Revison #18 --... --/* Rev History --** #1. Branch from gc dv. //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) --** #4. SR Memory Layout: --** 1. VGPR-SGPR-HWREG-{LDS} --** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern.. --** #5. Update: 1. Accurate g8sr_ts_save_d timestamp --** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation) --** #7. Update: 1. don't barrier if noLDS --** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version --** 2. Fix SQ issue by s_sleep 2 --** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last --** 2. optimize s_buffer save by burst 16sgprs... 
--** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs. --** #11. Update 1. Add 2 more timestamp for debug version --** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance --** #13. Integ 1. Always use MUBUF for PV trap shader... --** #14. Update 1. s_buffer_store soft clause... --** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot. --** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree --** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part] --** 2. PERF - Save LDS before save VGPR to cover LDS save long latency... --** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32 --** 2. FUNC - Handle non-CWSR traps --*/ -- --var G8SR_WDMEM_HWREG_OFFSET = 0 --var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes -- --// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore. -- --var G8SR_DEBUG_TIMESTAMP = 0 --var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset --var s_g8sr_ts_save_s = s[34:35] // save start --var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi --var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ --var s_g8sr_ts_save_d = s[40:41] // save end --var s_g8sr_ts_restore_s = s[42:43] // restore start --var s_g8sr_ts_restore_d = s[44:45] // restore end -- --var G8SR_VGPR_SR_IN_DWX4 = 0 --var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes --var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 -- -- --/*************************************************************************/ --/* control on how to run the shader */ --/*************************************************************************/ --//any hack that needs to be made to run this code in EMU (either becasue various EMU code are not ready or no compute save & restore in EMU run) --var EMU_RUN_HACK = 0 --var EMU_RUN_HACK_RESTORE_NORMAL = 0 --var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 --var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 --var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK --var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK --var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK --var SAVE_LDS = 1 --var WG_BASE_ADDR_LO = 0x9000a000 --var WG_BASE_ADDR_HI = 0x0 --var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem --var CTX_SAVE_CONTROL = 0x0 --var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL --var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either becasue various RTL code are not ready or no compute save & restore in RTL run) --var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write --var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //becasue TC EMU curently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes --var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing -- --/**************************************************************************/ --/* variables */ --/**************************************************************************/ --var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 --var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 
--var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
--
--var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
--var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
--var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
--var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
--var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
--var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
--
--var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
--var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask
--var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
--var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
--var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
--var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
--var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
--var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
--var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
--var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
--var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
--
--var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME
--var SQ_WAVE_IB_STS_RCNT_SIZE = 4 //FIXME
--var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME
--var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1 //FIXME
--var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME
--
--var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
--var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27
--
--
--/* Save */
--var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes
--var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
--
--var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
--var S_SAVE_SPI_INIT_ATC_SHIFT = 27
--var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
--var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
--var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
--var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26
--
--var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
--var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME
--var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME
--var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME
--
--var s_save_spi_init_lo = exec_lo
--var s_save_spi_init_hi = exec_hi
--
-- //tba_lo and tba_hi need to be saved/restored
--var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
--var s_save_pc_hi = ttmp1
--var s_save_exec_lo = ttmp2
--var s_save_exec_hi = ttmp3
--var s_save_status = ttmp4
--var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine
--var s_save_xnack_mask_lo = ttmp6
--var s_save_xnack_mask_hi = ttmp7
--var s_save_buf_rsrc0 = ttmp8
--var s_save_buf_rsrc1 = ttmp9
--var s_save_buf_rsrc2 = ttmp10
--var s_save_buf_rsrc3 = ttmp11
--
--var s_save_mem_offset = tma_lo
--var s_save_alloc_size = s_save_trapsts //conflict
--var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time)
--var s_save_m0 = tma_hi
--
--/* Restore */
--var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
--var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC
--
--var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
--var S_RESTORE_SPI_INIT_ATC_SHIFT = 27
--var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
--var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28
--var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
--var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
--
--var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT
--var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK --var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT --var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK -- --var s_restore_spi_init_lo = exec_lo --var s_restore_spi_init_hi = exec_hi -- --var s_restore_mem_offset = ttmp2 --var s_restore_alloc_size = ttmp3 --var s_restore_tmp = ttmp6 //tba_lo/hi need to be restored --var s_restore_mem_offset_save = s_restore_tmp //no conflict -- --var s_restore_m0 = s_restore_alloc_size //no conflict -- --var s_restore_mode = ttmp7 -- --var s_restore_pc_lo = ttmp0 --var s_restore_pc_hi = ttmp1 --var s_restore_exec_lo = tma_lo //no conflict --var s_restore_exec_hi = tma_hi //no conflict --var s_restore_status = ttmp4 --var s_restore_trapsts = ttmp5 --var s_restore_xnack_mask_lo = xnack_mask_lo --var s_restore_xnack_mask_hi = xnack_mask_hi --var s_restore_buf_rsrc0 = ttmp8 --var s_restore_buf_rsrc1 = ttmp9 --var s_restore_buf_rsrc2 = ttmp10 --var s_restore_buf_rsrc3 = ttmp11 -- --/**************************************************************************/ --/* trap handler entry points */ --/**************************************************************************/ --/* Shader Main*/ -- --shader main -- asic(VI) -- type(CS) -- -- -- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore -- //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC -- s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC -- s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. -- s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE -- //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE -- s_branch L_SKIP_RESTORE //NOT restore, SAVE actually -- else -- s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save -- end -- --L_JUMP_TO_RESTORE: -- s_branch L_RESTORE //restore -- --L_SKIP_RESTORE: -- -- s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC -- s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save -- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) -- s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save -- s_cbranch_scc1 L_SAVE //this is the operation for save -- -- // ********* Handle non-CWSR traps ******************* --if (!EMU_RUN_HACK) -- /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */ -- s_load_dwordx4 [ttmp8,ttmp9,ttmp10, ttmp11], [tma_lo,tma_hi], 0 -- s_waitcnt lgkmcnt(0) -- s_or_b32 ttmp7, ttmp8, ttmp9 -- s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set -- s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) -- s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler -- --L_NO_NEXT_TRAP: -- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) -- s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception -- s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. 
-- s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 -- s_addc_u32 ttmp1, ttmp1, 0 --L_EXCP_CASE: -- s_and_b32 ttmp1, ttmp1, 0xFFFF -- s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) -- s_rfe_b64 [ttmp0, ttmp1] --end -- // ********* End handling of non-CWSR traps ******************* -- --/**************************************************************************/ --/* save routine */ --/**************************************************************************/ -- --L_SAVE: -- --if G8SR_DEBUG_TIMESTAMP -- s_memrealtime s_g8sr_ts_save_s -- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? --end -- -- //check whether there is mem_viol -- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) -- s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK -- s_cbranch_scc0 L_NO_PC_REWIND -- -- //if so, need rewind PC assuming GDS operation gets NACKed -- s_mov_b32 s_save_tmp, 0 //clear mem_viol bit -- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit -- s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] -- s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8 -- s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc -- --L_NO_PC_REWIND: -- s_mov_b32 s_save_tmp, 0 //clear saveCtx bit -- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit -- -- s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK -- s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //save XNACK must before any memory operation -- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT -- s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT -- s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp -- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY -- s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT -- s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp -- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS -- s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG -- -- s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp -- -- /* inform SPI the readiness and wait for SPI's go signal */ -- s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI -- s_mov_b32 s_save_exec_hi, exec_hi -- s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive -- --if G8SR_DEBUG_TIMESTAMP -- s_memrealtime s_g8sr_ts_sq_save_msg -- s_waitcnt lgkmcnt(0) --end -- -- if (EMU_RUN_HACK) -- -- else -- s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC -- end -- -- L_SLEEP: -- s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 -- -- if (EMU_RUN_HACK) -- -- else -- s_cbranch_execz L_SLEEP -- end -- --if G8SR_DEBUG_TIMESTAMP -- s_memrealtime s_g8sr_ts_spi_wrexec -- s_waitcnt lgkmcnt(0) --end -- -- /* setup Resource Contants */ -- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) -- //calculate wd_addr using absolute thread id -- v_readlane_b32 s_save_tmp, v9, 0 -- s_lshr_b32 s_save_tmp, s_save_tmp, 6 -- s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE -- s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO -- s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI -- s_and_b32 
s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL -- else -- end -- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) -- s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO -- s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI -- s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL -- else -- end -- -- -- s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo -- s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi -- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE -- s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited -- s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC -- s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK -- s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position -- s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC -- s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK -- s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position -- s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE -- -- //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?) -- s_mov_b32 s_save_m0, m0 //save M0 -- -- /* global mem offset */ -- s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 -- -- -- -- -- /* save HW registers */ -- ////////////////////////////// -- -- L_SAVE_HWREG: -- // HWREG SR memory offset : size(VGPR)+size(SGPR) -- get_vgpr_size_bytes(s_save_mem_offset) -- get_sgpr_size_bytes(s_save_tmp) -- s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp -- -- -- s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes -- if (SWIZZLE_EN) -- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
-- else -- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -- end -- -- -- write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0 -- -- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) -- s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 -- s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over -- s_mov_b32 tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO -- s_mov_b32 tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI -- end -- -- write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC -- write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) -- write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC -- write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) -- write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS -- -- //s_save_trapsts conflicts with s_save_alloc_size -- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) -- write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS -- -- write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO -- write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI -- -- //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 -- s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE -- write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) -- write_hwreg_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset) //TBA_LO -- write_hwreg_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset) //TBA_HI -- -- -- -- /* the first wave in the threadgroup */ -- // save fist_wave bits in tba_hi unused bit.26 -- s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract fisrt wave bit -- //s_or_b32 tba_hi, s_save_tmp, tba_hi // save first wave bit to tba_hi.bits[26] -- s_mov_b32 s_save_exec_hi, 0x0 -- s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26] -- -- -- /* save SGPRs */ -- // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... -- ////////////////////////////// -- -- // SGPR SR memory offset : size(VGPR) -- get_vgpr_size_bytes(s_save_mem_offset) -- // TODO, change RSRC word to rearrange memory layout for SGPRS -- -- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size -- s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 -- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) -- -- if (SGPR_SAVE_USE_SQC) -- s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes -- else -- s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) -- end -- -- if (SWIZZLE_EN) -- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
-- else -- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -- end -- -- -- // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 -- //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0 -- s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0 -- s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset -- s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0 -- -- s_mov_b32 m0, 0x0 //SGPR initial index value =0 -- L_SAVE_SGPR_LOOP: -- // SGPR is allocated in 16 SGPR granularity -- s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] -- s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] -- s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] -- s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] -- s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] -- s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] -- s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] -- s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] -- -- write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4 -- s_add_u32 m0, m0, 16 //next sgpr index -- s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 -- s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete? -- // restore s_save_buf_rsrc0,1 -- //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo -- s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo -- -- -- -- -- /* save first 4 VGPR, then LDS save could use */ -- // each wave will alloc 4 vgprs at least... -- ///////////////////////////////////////////////////////////////////////////////////// -- -- s_mov_b32 s_save_mem_offset, 0 -- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on -- s_mov_b32 exec_hi, 0xFFFFFFFF -- -- if (SWIZZLE_EN) -- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? -- else -- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -- end -- -- -- // VGPR Allocated in 4-GPR granularity -- --if G8SR_VGPR_SR_IN_DWX4 -- // the const stride for DWx4 is 4*4 bytes -- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 -- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes -- -- buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 -- -- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 -- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes --else -- buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 -- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 -- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 -- buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 --end -- -- -- -- /* save LDS */ -- ////////////////////////////// -- -- L_SAVE_LDS: -- -- // Change EXEC to all threads... -- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on -- s_mov_b32 exec_hi, 0xFFFFFFFF -- -- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size -- s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? -- s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE -- -- s_barrier //LDS is used? 
wait for other waves in the same TG
-- //s_and_b32 s_save_tmp, tba_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
-- s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
-- s_cbranch_scc0 L_SAVE_LDS_DONE
--
-- // first wave does LDS save;
--
-- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
-- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes
-- s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes
--
-- // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
-- //
-- get_vgpr_size_bytes(s_save_mem_offset)
-- get_sgpr_size_bytes(s_save_tmp)
-- s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
-- s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()
--
--
-- if (SWIZZLE_EN)
-- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
-- else
-- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-- end
--
-- s_mov_b32 m0, 0x0 //lds_offset initial value = 0
--
--
--var LDS_DMA_ENABLE = 0
--var UNROLL = 0
--if UNROLL==0 && LDS_DMA_ENABLE==1
-- s_mov_b32 s3, 256*2
-- s_nop 0
-- s_nop 0
-- s_nop 0
-- L_SAVE_LDS_LOOP:
-- //TODO: looks like the 2 buffer_store/load clauses for s/r will hurt performance???
-- if (SAVE_LDS) //SPI always allocates LDS space in 128DW granularity
-- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW
-- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
-- end
--
-- s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes
-- s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes
-- s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
-- s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete?
--
--elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROLL, has icache miss
-- // store from highest LDS address to lowest
-- s_mov_b32 s3, 256*2
-- s_sub_u32 m0, s_save_alloc_size, s3
-- s_add_u32 s_save_mem_offset, s_save_mem_offset, m0
-- s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128DW chunks...
-- s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from highest addr to lowest
-- s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block costs 6*4 Bytes of instructions
-- s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //plus the 2 insts below: s_addc and s_setpc
-- s_nop 0
-- s_nop 0
-- s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes
-- s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved
-- s_add_u32 s0, s0, s_save_alloc_size
-- s_addc_u32 s1, s1, 0
-- s_setpc_b64 s[0:1]
--
--
-- for var i = 0; i < 128; i++
-- // be careful to make here a 64Byte aligned address, which could improve performance...
-- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW
-- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
--
-- if i!=127
-- s_sub_u32 m0, m0, s3 // use an sgpr to shrink the 2DW inst to a 1DW inst to improve performance, i.e.
pack more LDS_DMA inst to one Cacheline -- s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3 -- end -- end -- --else // BUFFER_STORE -- v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 -- v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid -- v_mul_i32_i24 v2, v3, 8 // tid*8 -- v_mov_b32 v3, 256*2 -- s_mov_b32 m0, 0x10000 -- s_mov_b32 s0, s_save_buf_rsrc3 -- s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid -- s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT -- --L_SAVE_LDS_LOOP_VECTOR: -- ds_read_b64 v[0:1], v2 //x =LDS[a], byte address -- s_waitcnt lgkmcnt(0) -- buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1 --// s_waitcnt vmcnt(0) -- v_add_u32 v2, vcc[0:1], v2, v3 -- v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size -- s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR -- -- // restore rsrc3 -- s_mov_b32 s_save_buf_rsrc3, s0 -- --end -- --L_SAVE_LDS_DONE: -- -- -- /* save VGPRs - set the Rest VGPRs */ -- ////////////////////////////////////////////////////////////////////////////////////// -- L_SAVE_VGPR: -- // VGPR SR memory offset: 0 -- // TODO rearrange the RSRC words to use swizzle for VGPR save... -- -- s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs -- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on -- s_mov_b32 exec_hi, 0xFFFFFFFF -- -- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size -- s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 -- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible -- s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) -- if (SWIZZLE_EN) -- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? -- else -- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -- end -- -- -- // VGPR Allocated in 4-GPR granularity -- --if G8SR_VGPR_SR_IN_DWX4 -- // the const stride for DWx4 is 4*4 bytes -- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 -- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes -- -- s_mov_b32 m0, 4 // skip first 4 VGPRs -- s_cmp_lt_u32 m0, s_save_alloc_size -- s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs -- -- s_set_gpr_idx_on m0, 0x1 // This will change M0 -- s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 --L_SAVE_VGPR_LOOP: -- v_mov_b32 v0, v0 // v0 = v[0+m0] -- v_mov_b32 v1, v1 -- v_mov_b32 v2, v2 -- v_mov_b32 v3, v3 -- -- -- buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 -- s_add_u32 m0, m0, 4 -- s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 -- s_cmp_lt_u32 m0, s_save_alloc_size -- s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? 
-- s_set_gpr_idx_off --L_SAVE_VGPR_LOOP_END: -- -- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 -- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes --else -- // VGPR store using dw burst -- s_mov_b32 m0, 0x4 //VGPR initial index value =0 -- s_cmp_lt_u32 m0, s_save_alloc_size -- s_cbranch_scc0 L_SAVE_VGPR_END -- -- -- s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 -- s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later -- -- L_SAVE_VGPR_LOOP: -- v_mov_b32 v0, v0 //v0 = v[0+m0] -- v_mov_b32 v1, v1 //v0 = v[0+m0] -- v_mov_b32 v2, v2 //v0 = v[0+m0] -- v_mov_b32 v3, v3 //v0 = v[0+m0] -- -- if(USE_MTBUF_INSTEAD_OF_MUBUF) -- tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 -- else -- buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 -- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 -- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 -- buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 -- end -- -- s_add_u32 m0, m0, 4 //next vgpr index -- s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes -- s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 -- s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? -- s_set_gpr_idx_off --end -- --L_SAVE_VGPR_END: -- -- -- -- -- -- -- /* S_PGM_END_SAVED */ //FIXME graphics ONLY -- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) -- s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] -- s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 -- s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over -- s_rfe_b64 s_save_pc_lo //Return to the main shader program -- else -- end -- --// Save Done timestamp --if G8SR_DEBUG_TIMESTAMP -- s_memrealtime s_g8sr_ts_save_d -- // SGPR SR memory offset : size(VGPR) -- get_vgpr_size_bytes(s_save_mem_offset) -- s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET -- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? -- // Need reset rsrc2?? -- s_mov_b32 m0, s_save_mem_offset -- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -- s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1 --end -- -- -- s_branch L_END_PGM -- -- -- --/**************************************************************************/ --/* restore routine */ --/**************************************************************************/ -- --L_RESTORE: -- /* Setup Resource Contants */ -- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) -- //calculate wd_addr using absolute thread id -- v_readlane_b32 s_restore_tmp, v9, 0 -- s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 -- s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE -- s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO -- s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI -- s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL -- else -- end -- --if G8SR_DEBUG_TIMESTAMP -- s_memrealtime s_g8sr_ts_restore_s -- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? -- // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case... 
-- s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0] -- s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored.. --end -- -- -- -- s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo -- s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi -- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE -- s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) -- s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC -- s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK -- s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position -- s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC -- s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK -- s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position -- s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE -- -- /* global mem offset */ --// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0 -- -- /* the first wave in the threadgroup */ -- s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK -- s_cbranch_scc0 L_RESTORE_VGPR -- -- /* restore LDS */ -- ////////////////////////////// -- L_RESTORE_LDS: -- -- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead -- s_mov_b32 exec_hi, 0xFFFFFFFF -- -- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size -- s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? -- s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR -- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw -- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes -- s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes -- -- // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) -- // -- get_vgpr_size_bytes(s_restore_mem_offset) -- get_sgpr_size_bytes(s_restore_tmp) -- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp -- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? -- -- -- if (SWIZZLE_EN) -- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? -- else -- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -- end -- s_mov_b32 m0, 0x0 //lds_offset initial value = 0 -- -- L_RESTORE_LDS_LOOP: -- if (SAVE_LDS) -- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW -- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW -- end -- s_add_u32 m0, m0, 256*2 // 128 DW -- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW -- s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 -- s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? 
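For reference, the region offsets used above and throughout this handler follow directly from the allocation fields: the save area is laid out as VGPR | SGPR | HWREG | LDS, with VGPR bytes = (vgpr_size + 1) * 4 * 64 * 4, SGPR bytes = (sgpr_size + 1) * 16 * 4, and a fixed 128-byte HWREG block, matching the get_vgpr_size_bytes/get_sgpr_size_bytes/get_hwreg_size_bytes helpers further down in this file. A minimal C sketch of that arithmetic, assuming the raw field values have already been read out of HW_REG_GPR_ALLOC and HW_REG_LDS_ALLOC (the function names here are illustrative, not from the patch):

#include <stdint.h>
#include <stdio.h>

/* Save-area layout used by this handler: VGPR | SGPR | HWREG | LDS */

static uint32_t vgpr_size_bytes(uint32_t vgpr_size_field)
{
        /* Number of VGPRs = (vgpr_size + 1) * 4, times 64 lanes, 4 bytes each */
        return (vgpr_size_field + 1) << (2 + 8);
}

static uint32_t sgpr_size_bytes(uint32_t sgpr_size_field)
{
        /* Number of SGPRs = (sgpr_size + 1) * 16, 4 bytes each */
        return (sgpr_size_field + 1) << 6;
}

static uint32_t hwreg_size_bytes(void)
{
        return 128; /* fixed-size HWREG block */
}

static uint32_t lds_size_bytes(uint32_t lds_size_field)
{
        /* LDS size in bytes = lds_size * 64 dwords * 4 bytes */
        return lds_size_field << 8;
}

int main(void)
{
        uint32_t vgpr_f = 15, sgpr_f = 6, lds_f = 1; /* example field values */
        uint32_t sgpr_off  = vgpr_size_bytes(vgpr_f);
        uint32_t hwreg_off = sgpr_off + sgpr_size_bytes(sgpr_f);
        uint32_t lds_off   = hwreg_off + hwreg_size_bytes();

        printf("SGPR@%u HWREG@%u LDS@%u (+%u bytes)\n",
               sgpr_off, hwreg_off, lds_off, lds_size_bytes(lds_f));
        return 0;
}

With vgpr_size = 15 this gives a 16384-byte VGPR region (64 VGPRs * 64 lanes * 4 bytes), which is why the restore path can locate SGPRs, HWREGs and LDS by pure arithmetic with no stored metadata.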
-- -- -- /* restore VGPRs */ -- ////////////////////////////// -- L_RESTORE_VGPR: -- // VGPR SR memory offset : 0 -- s_mov_b32 s_restore_mem_offset, 0x0 -- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead -- s_mov_b32 exec_hi, 0xFFFFFFFF -- -- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size -- s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 -- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) -- s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) -- if (SWIZZLE_EN) -- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? -- else -- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -- end -- --if G8SR_VGPR_SR_IN_DWX4 -- get_vgpr_size_bytes(s_restore_mem_offset) -- s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 -- -- // the const stride for DWx4 is 4*4 bytes -- s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 -- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes -- -- s_mov_b32 m0, s_restore_alloc_size -- s_set_gpr_idx_on m0, 0x8 // Note.. This will change m0 -- --L_RESTORE_VGPR_LOOP: -- buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 -- s_waitcnt vmcnt(0) -- s_sub_u32 m0, m0, 4 -- v_mov_b32 v0, v0 // v[0+m0] = v0 -- v_mov_b32 v1, v1 -- v_mov_b32 v2, v2 -- v_mov_b32 v3, v3 -- s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 -- s_cmp_eq_u32 m0, 0x8000 -- s_cbranch_scc0 L_RESTORE_VGPR_LOOP -- s_set_gpr_idx_off -- -- s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 -- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes -- --else -- // VGPR load using dw burst -- s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last -- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 -- s_mov_b32 m0, 4 //VGPR initial index value = 1 -- s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 -- s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later -- -- L_RESTORE_VGPR_LOOP: -- if(USE_MTBUF_INSTEAD_OF_MUBUF) -- tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 -- else -- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 -- buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 -- buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 -- buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 -- end -- s_waitcnt vmcnt(0) //ensure data ready -- v_mov_b32 v0, v0 //v[0+m0] = v0 -- v_mov_b32 v1, v1 -- v_mov_b32 v2, v2 -- v_mov_b32 v3, v3 -- s_add_u32 m0, m0, 4 //next vgpr index -- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes -- s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 -- s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? 
-- s_set_gpr_idx_off -- /* VGPR restore on v0 */ -- if(USE_MTBUF_INSTEAD_OF_MUBUF) -- tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 -- else -- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 -- buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 -- buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 -- buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 -- end -- --end -- -- /* restore SGPRs */ -- ////////////////////////////// -- -- // SGPR SR memory offset : size(VGPR) -- get_vgpr_size_bytes(s_restore_mem_offset) -- get_sgpr_size_bytes(s_restore_tmp) -- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp -- s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group -- // TODO, change RSRC word to rearrange memory layout for SGPRS -- -- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size -- s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 -- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) -- -- if (SGPR_SAVE_USE_SQC) -- s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes -- else -- s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) -- end -- if (SWIZZLE_EN) -- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? -- else -- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -- end -- -- /* If 112 SGPRs ar allocated, 4 sgprs are not used TBA(108,109),TMA(110,111), -- However, we are safe to restore these 4 SGPRs anyway, since TBA,TMA will later be restored by HWREG -- */ -- s_mov_b32 m0, s_restore_alloc_size -- -- L_RESTORE_SGPR_LOOP: -- read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made -- s_waitcnt lgkmcnt(0) //ensure data ready -- -- s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] -- -- s_movreld_b64 s0, s0 //s[0+m0] = s0 -- s_movreld_b64 s2, s2 -- s_movreld_b64 s4, s4 -- s_movreld_b64 s6, s6 -- s_movreld_b64 s8, s8 -- s_movreld_b64 s10, s10 -- s_movreld_b64 s12, s12 -- s_movreld_b64 s14, s14 -- -- s_cmp_eq_u32 m0, 0 //scc = (m0 < s_restore_alloc_size) ? 1 : 0 -- s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete? -- -- /* restore HW registers */ -- ////////////////////////////// -- L_RESTORE_HWREG: -- -- --if G8SR_DEBUG_TIMESTAMP -- s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo -- s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi --end -- -- // HWREG SR memory offset : size(VGPR)+size(SGPR) -- get_vgpr_size_bytes(s_restore_mem_offset) -- get_sgpr_size_bytes(s_restore_tmp) -- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp -- -- -- s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes -- if (SWIZZLE_EN) -- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
-- else -- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -- end -- -- read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 -- read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC -- read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) -- read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC -- read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) -- read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS -- read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS -- read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO -- read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI -- read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE -- read_hwreg_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_LO -- read_hwreg_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_HI -- -- s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS -- -- s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS -- -- //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: -- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) -- s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) -- s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over -- end -- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) -- s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal -- s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over -- end -- -- s_mov_b32 m0, s_restore_m0 -- s_mov_b32 exec_lo, s_restore_exec_lo -- s_mov_b32 exec_hi, s_restore_exec_hi -- -- s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts -- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 -- s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts -- s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT -- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 -- //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore -- s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode -- //reuse s_restore_m0 as a temp register -- s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK -- s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT -- s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT -- s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero -- s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 -- s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK -- s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT -- s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT -- s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 -- s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK -- s_lshr_b32 s_restore_m0, s_restore_m0, 
SQ_WAVE_STATUS_INST_ATC_SHIFT -- s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp -- -- s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 -- s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 -- s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu -- -- s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time -- --if G8SR_DEBUG_TIMESTAMP -- s_memrealtime s_g8sr_ts_restore_d -- s_waitcnt lgkmcnt(0) --end -- --// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution -- s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc -- -- --/**************************************************************************/ --/* the END */ --/**************************************************************************/ --L_END_PGM: -- s_endpgm -- --end -- -- --/**************************************************************************/ --/* the helper functions */ --/**************************************************************************/ -- --//Only for save hwreg to mem --function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) -- s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on -- s_mov_b32 m0, s_mem_offset -- s_buffer_store_dword s, s_rsrc, m0 glc:1 -- s_add_u32 s_mem_offset, s_mem_offset, 4 -- s_mov_b32 m0, exec_lo --end -- -- --// HWREG are saved before SGPRs, so all HWREG could be use. --function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) -- -- s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 -- s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 -- s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 -- s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 -- s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 -- s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc --end -- -- --function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) -- s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 -- s_add_u32 s_mem_offset, s_mem_offset, 4 --end -- --function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) -- s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 -- s_sub_u32 s_mem_offset, s_mem_offset, 4*16 --end -- -- -- --function get_lds_size_bytes(s_lds_size_byte) -- // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW -- s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size -- s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW --end -- --function get_vgpr_size_bytes(s_vgpr_size_byte) -- s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size -- s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 -- s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible --end -- --function get_sgpr_size_bytes(s_sgpr_size_byte) -- s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size -- s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1 -- s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value) --end -- --function get_hwreg_size_bytes -- return 128 //HWREG size 128 bytes --end -- -- --#endif -- --static const 
uint32_t cwsr_trap_carrizo_hex[] = { -- 0xbf820001, 0xbf820123, -- 0xb8f4f802, 0x89748674, -- 0xb8f5f803, 0x8675ff75, -- 0x00000400, 0xbf850011, -- 0xc00a1e37, 0x00000000, -- 0xbf8c007f, 0x87777978, -- 0xbf840002, 0xb974f802, -- 0xbe801d78, 0xb8f5f803, -- 0x8675ff75, 0x000001ff, -- 0xbf850002, 0x80708470, -- 0x82718071, 0x8671ff71, -- 0x0000ffff, 0xb974f802, -- 0xbe801f70, 0xb8f5f803, -- 0x8675ff75, 0x00000100, -- 0xbf840006, 0xbefa0080, -- 0xb97a0203, 0x8671ff71, -- 0x0000ffff, 0x80f08870, -- 0x82f18071, 0xbefa0080, -- 0xb97a0283, 0xbef60068, -- 0xbef70069, 0xb8fa1c07, -- 0x8e7a9c7a, 0x87717a71, -- 0xb8fa03c7, 0x8e7a9b7a, -- 0x87717a71, 0xb8faf807, -- 0x867aff7a, 0x00007fff, -- 0xb97af807, 0xbef2007e, -- 0xbef3007f, 0xbefe0180, -- 0xbf900004, 0xbf8e0002, -- 0xbf88fffe, 0xbef8007e, -- 0x8679ff7f, 0x0000ffff, -- 0x8779ff79, 0x00040000, -- 0xbefa0080, 0xbefb00ff, -- 0x00807fac, 0x867aff7f, -- 0x08000000, 0x8f7a837a, -- 0x877b7a7b, 0x867aff7f, -- 0x70000000, 0x8f7a817a, -- 0x877b7a7b, 0xbeef007c, -- 0xbeee0080, 0xb8ee2a05, -- 0x806e816e, 0x8e6e8a6e, -- 0xb8fa1605, 0x807a817a, -- 0x8e7a867a, 0x806e7a6e, -- 0xbefa0084, 0xbefa00ff, -- 0x01000000, 0xbefe007c, -- 0xbefc006e, 0xc0611bfc, -- 0x0000007c, 0x806e846e, -- 0xbefc007e, 0xbefe007c, -- 0xbefc006e, 0xc0611c3c, -- 0x0000007c, 0x806e846e, -- 0xbefc007e, 0xbefe007c, -- 0xbefc006e, 0xc0611c7c, -- 0x0000007c, 0x806e846e, -- 0xbefc007e, 0xbefe007c, -- 0xbefc006e, 0xc0611cbc, -- 0x0000007c, 0x806e846e, -- 0xbefc007e, 0xbefe007c, -- 0xbefc006e, 0xc0611cfc, -- 0x0000007c, 0x806e846e, -- 0xbefc007e, 0xbefe007c, -- 0xbefc006e, 0xc0611d3c, -- 0x0000007c, 0x806e846e, -- 0xbefc007e, 0xb8f5f803, -- 0xbefe007c, 0xbefc006e, -- 0xc0611d7c, 0x0000007c, -- 0x806e846e, 0xbefc007e, -- 0xbefe007c, 0xbefc006e, -- 0xc0611dbc, 0x0000007c, -- 0x806e846e, 0xbefc007e, -- 0xbefe007c, 0xbefc006e, -- 0xc0611dfc, 0x0000007c, -- 0x806e846e, 0xbefc007e, -- 0xb8eff801, 0xbefe007c, -- 0xbefc006e, 0xc0611bfc, -- 0x0000007c, 0x806e846e, -- 0xbefc007e, 0xbefe007c, -- 0xbefc006e, 0xc0611b3c, -- 0x0000007c, 0x806e846e, -- 0xbefc007e, 0xbefe007c, -- 0xbefc006e, 0xc0611b7c, -- 0x0000007c, 0x806e846e, -- 0xbefc007e, 0x867aff7f, -- 0x04000000, 0xbef30080, -- 0x8773737a, 0xb8ee2a05, -- 0x806e816e, 0x8e6e8a6e, -- 0xb8f51605, 0x80758175, -- 0x8e758475, 0x8e7a8275, -- 0xbefa00ff, 0x01000000, -- 0xbef60178, 0x80786e78, -- 0x82798079, 0xbefc0080, -- 0xbe802b00, 0xbe822b02, -- 0xbe842b04, 0xbe862b06, -- 0xbe882b08, 0xbe8a2b0a, -- 0xbe8c2b0c, 0xbe8e2b0e, -- 0xc06b003c, 0x00000000, -- 0xc06b013c, 0x00000010, -- 0xc06b023c, 0x00000020, -- 0xc06b033c, 0x00000030, -- 0x8078c078, 0x82798079, -- 0x807c907c, 0xbf0a757c, -- 0xbf85ffeb, 0xbef80176, -- 0xbeee0080, 0xbefe00c1, -- 0xbeff00c1, 0xbefa00ff, -- 0x01000000, 0xe0724000, -- 0x6e1e0000, 0xe0724100, -- 0x6e1e0100, 0xe0724200, -- 0x6e1e0200, 0xe0724300, -- 0x6e1e0300, 0xbefe00c1, -- 0xbeff00c1, 0xb8f54306, -- 0x8675c175, 0xbf84002c, -- 0xbf8a0000, 0x867aff73, -- 0x04000000, 0xbf840028, -- 0x8e758675, 0x8e758275, -- 0xbefa0075, 0xb8ee2a05, -- 0x806e816e, 0x8e6e8a6e, -- 0xb8fa1605, 0x807a817a, -- 0x8e7a867a, 0x806e7a6e, -- 0x806eff6e, 0x00000080, -- 0xbefa00ff, 0x01000000, -- 0xbefc0080, 0xd28c0002, -- 0x000100c1, 0xd28d0003, -- 0x000204c1, 0xd1060002, -- 0x00011103, 0x7e0602ff, -- 0x00000200, 0xbefc00ff, -- 0x00010000, 0xbe80007b, -- 0x867bff7b, 0xff7fffff, -- 0x877bff7b, 0x00058000, -- 0xd8ec0000, 0x00000002, -- 0xbf8c007f, 0xe0765000, -- 0x6e1e0002, 0x32040702, -- 0xd0c9006a, 0x0000eb02, -- 0xbf87fff7, 0xbefb0000, -- 0xbeee00ff, 0x00000400, -- 
0xbefe00c1, 0xbeff00c1, -- 0xb8f52a05, 0x80758175, -- 0x8e758275, 0x8e7a8875, -- 0xbefa00ff, 0x01000000, -- 0xbefc0084, 0xbf0a757c, -- 0xbf840015, 0xbf11017c, -- 0x8075ff75, 0x00001000, -- 0x7e000300, 0x7e020301, -- 0x7e040302, 0x7e060303, -- 0xe0724000, 0x6e1e0000, -- 0xe0724100, 0x6e1e0100, -- 0xe0724200, 0x6e1e0200, -- 0xe0724300, 0x6e1e0300, -- 0x807c847c, 0x806eff6e, -- 0x00000400, 0xbf0a757c, -- 0xbf85ffef, 0xbf9c0000, -- 0xbf8200ca, 0xbef8007e, -- 0x8679ff7f, 0x0000ffff, -- 0x8779ff79, 0x00040000, -- 0xbefa0080, 0xbefb00ff, -- 0x00807fac, 0x8676ff7f, -- 0x08000000, 0x8f768376, -- 0x877b767b, 0x8676ff7f, -- 0x70000000, 0x8f768176, -- 0x877b767b, 0x8676ff7f, -- 0x04000000, 0xbf84001e, -- 0xbefe00c1, 0xbeff00c1, -- 0xb8f34306, 0x8673c173, -- 0xbf840019, 0x8e738673, -- 0x8e738273, 0xbefa0073, -- 0xb8f22a05, 0x80728172, -- 0x8e728a72, 0xb8f61605, -- 0x80768176, 0x8e768676, -- 0x80727672, 0x8072ff72, -- 0x00000080, 0xbefa00ff, -- 0x01000000, 0xbefc0080, -- 0xe0510000, 0x721e0000, -- 0xe0510100, 0x721e0000, -- 0x807cff7c, 0x00000200, -- 0x8072ff72, 0x00000200, -- 0xbf0a737c, 0xbf85fff6, -- 0xbef20080, 0xbefe00c1, -- 0xbeff00c1, 0xb8f32a05, -- 0x80738173, 0x8e738273, -- 0x8e7a8873, 0xbefa00ff, -- 0x01000000, 0xbef60072, -- 0x8072ff72, 0x00000400, -- 0xbefc0084, 0xbf11087c, -- 0x8073ff73, 0x00008000, -- 0xe0524000, 0x721e0000, -- 0xe0524100, 0x721e0100, -- 0xe0524200, 0x721e0200, -- 0xe0524300, 0x721e0300, -- 0xbf8c0f70, 0x7e000300, -- 0x7e020301, 0x7e040302, -- 0x7e060303, 0x807c847c, -- 0x8072ff72, 0x00000400, -- 0xbf0a737c, 0xbf85ffee, -- 0xbf9c0000, 0xe0524000, -- 0x761e0000, 0xe0524100, -- 0x761e0100, 0xe0524200, -- 0x761e0200, 0xe0524300, -- 0x761e0300, 0xb8f22a05, -- 0x80728172, 0x8e728a72, -- 0xb8f61605, 0x80768176, -- 0x8e768676, 0x80727672, -- 0x80f2c072, 0xb8f31605, -- 0x80738173, 0x8e738473, -- 0x8e7a8273, 0xbefa00ff, -- 0x01000000, 0xbefc0073, -- 0xc031003c, 0x00000072, -- 0x80f2c072, 0xbf8c007f, -- 0x80fc907c, 0xbe802d00, -- 0xbe822d02, 0xbe842d04, -- 0xbe862d06, 0xbe882d08, -- 0xbe8a2d0a, 0xbe8c2d0c, -- 0xbe8e2d0e, 0xbf06807c, -- 0xbf84fff1, 0xb8f22a05, -- 0x80728172, 0x8e728a72, -- 0xb8f61605, 0x80768176, -- 0x8e768676, 0x80727672, -- 0xbefa0084, 0xbefa00ff, -- 0x01000000, 0xc0211cfc, -- 0x00000072, 0x80728472, -- 0xc0211c3c, 0x00000072, -- 0x80728472, 0xc0211c7c, -- 0x00000072, 0x80728472, -- 0xc0211bbc, 0x00000072, -- 0x80728472, 0xc0211bfc, -- 0x00000072, 0x80728472, -- 0xc0211d3c, 0x00000072, -- 0x80728472, 0xc0211d7c, -- 0x00000072, 0x80728472, -- 0xc0211a3c, 0x00000072, -- 0x80728472, 0xc0211a7c, -- 0x00000072, 0x80728472, -- 0xc0211dfc, 0x00000072, -- 0x80728472, 0xc0211b3c, -- 0x00000072, 0x80728472, -- 0xc0211b7c, 0x00000072, -- 0x80728472, 0xbf8c007f, -- 0x8671ff71, 0x0000ffff, -- 0xbefc0073, 0xbefe006e, -- 0xbeff006f, 0x867375ff, -- 0x000003ff, 0xb9734803, -- 0x867375ff, 0xfffff800, -- 0x8f738b73, 0xb973a2c3, -- 0xb977f801, 0x8673ff71, -- 0xf0000000, 0x8f739c73, -- 0x8e739073, 0xbef60080, -- 0x87767376, 0x8673ff71, -- 0x08000000, 0x8f739b73, -- 0x8e738f73, 0x87767376, -- 0x8673ff74, 0x00800000, -- 0x8f739773, 0xb976f807, -- 0x86fe7e7e, 0x86ea6a6a, -- 0xb974f802, 0xbf8a0000, -- 0x95807370, 0xbf810000, --}; -- -diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm -deleted file mode 100644 -index ae2af3d..0000000 ---- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm -+++ /dev/null -@@ -1,1388 +0,0 @@ --/* -- * Copyright 2016 Advanced Micro Devices, Inc. 
-- *
-- * Permission is hereby granted, free of charge, to any person obtaining a
-- * copy of this software and associated documentation files (the "Software"),
-- * to deal in the Software without restriction, including without limitation
-- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
-- * and/or sell copies of the Software, and to permit persons to whom the
-- * Software is furnished to do so, subject to the following conditions:
-- *
-- * The above copyright notice and this permission notice shall be included in
-- * all copies or substantial portions of the Software.
-- *
-- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
-- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-- * OTHER DEALINGS IN THE SOFTWARE.
-- */
--
--#if 0
--HW (GFX9) source code for CWSR trap handler
--#Version 18 + multiple trap handler
--
--// this performance-optimal version was originally from Seven Xu at SRDC
--
--// Revision #18
--...
--/* Rev History
--** #1. Branch from gc dv. //gfxip/gfx9/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(merged, skipped - already fixed by PV)
--** #4. SR Memory Layout:
--** 1. VGPR-SGPR-HWREG-{LDS}
--** 2. tba_hi.bits.26 - reconfigured as the first-wave-in-tg bit, to defer the LDS save for a threadgroup.. performance concern..
--** #5. Update: 1. Accurate g8sr_ts_save_d timestamp
--** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer? (No need, already matched the swizzle pattern, more investigation)
--** #7. Update: 1. don't barrier if noLDS
--** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version
--** 2. Fix SQ issue by s_sleep 2
--** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last
--** 2. optimize s_buffer save by bursting 16 sgprs...
--** #10. Update 1. Optimize restore sgpr by bursting 16 sgprs.
--** #11. Update 1. Add 2 more timestamps for debug version
--** #12. Update 1. Add VGPR SR using DWx4, some cases improve and some cases drop performance
--** #13. Integ 1. Always use MUBUF for PV trap shader...
--** #14. Update 1. s_buffer_store soft clause...
--** #15. Update 1. PERF - scalar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot.
--** #16. Update 1. PERF - UNROLL LDS_DMA got 2500cycle save in IP tree
--** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part [TODO restore part]
--** 2. PERF - Save LDS before saving VGPRs to cover the long LDS save latency...
--** #18. Update 1. FUNC - Implicitly restore STATUS.VCCZ, which is not writable by s_setreg_b32
--** 2. FUNC - Handle non-CWSR traps
--*/
--
--var G8SR_WDMEM_HWREG_OFFSET = 0
--var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes
--
--// Keep these definitions the same as in the app shader. These 2 timestamps are part of the app shader... They should be placed before any save and after restore.
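For reference, the SPI_INIT value that SPI hands to the trap handler in exec_lo/exec_hi (aliased as s_save_spi_init_lo/hi below) packs an ATC bit, an MTYPE field, and a first-wave-in-threadgroup flag; both the save and restore setup paths extract them with the S_SAVE_SPI_INIT_* / S_RESTORE_SPI_INIT_* masks and shifts defined in the variable block that follows. A minimal C sketch of that decode, using the mask/shift constants from this file (the struct and function names are illustrative, not from the patch):

#include <stdint.h>

/* Fields packed into the high half of SPI_INIT, per the
 * S_SAVE_SPI_INIT_* definitions below. */
struct spi_init_hi_fields {
        uint32_t atc;        /* bit 27:     ATC bit for the buffer rsrc   */
        uint32_t mtype;      /* bits 30:28: MTYPE for the buffer rsrc     */
        uint32_t first_wave; /* bit 26:     first wave in the threadgroup */
};

static struct spi_init_hi_fields decode_spi_init_hi(uint32_t spi_init_hi)
{
        struct spi_init_hi_fields f;

        f.atc        = (spi_init_hi & 0x08000000u) >> 27; /* S_SAVE_SPI_INIT_ATC_MASK/SHIFT */
        f.mtype      = (spi_init_hi & 0x70000000u) >> 28; /* S_SAVE_SPI_INIT_MTYPE_MASK/SHIFT */
        f.first_wave = (spi_init_hi & 0x04000000u) >> 26; /* S_SAVE_SPI_INIT_FIRST_WAVE_MASK */
        return f;
}

The first_wave flag is what lets only one wave per threadgroup save and restore the shared LDS, while atc/mtype are shifted into the buffer resource descriptor words used for all the s/r memory traffic.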
--
--var G8SR_DEBUG_TIMESTAMP = 0
--var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset
--var s_g8sr_ts_save_s = s[34:35] // save start
--var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader sends the SAVEWAVE msg to spi
--var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI writes the sr address to SQ
--var s_g8sr_ts_save_d = s[40:41] // save end
--var s_g8sr_ts_restore_s = s[42:43] // restore start
--var s_g8sr_ts_restore_d = s[44:45] // restore end
--
--var G8SR_VGPR_SR_IN_DWX4 = 0
--var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes
--var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4
--
--
--/*************************************************************************/
--/* control on how to run the shader */
--/*************************************************************************/
--//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run)
--var EMU_RUN_HACK = 0
--var EMU_RUN_HACK_RESTORE_NORMAL = 0
--var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0
--var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0
--var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
--var SAVE_LDS = 1
--var WG_BASE_ADDR_LO = 0x9000a000
--var WG_BASE_ADDR_HI = 0x0
--var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem
--var CTX_SAVE_CONTROL = 0x0
--var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL
--var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run)
--var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write
--var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes
--var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing
--
--/**************************************************************************/
--/* variables */
--/**************************************************************************/
--var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
--var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
--var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
--var SQ_WAVE_STATUS_HALT_MASK = 0x2000
--
--var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
--var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
--var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
--var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
--var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
--var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
--
--var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
--var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask
--var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
--var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
--var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
--var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
--var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
--var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
--var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
--var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
--var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
--var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800
--
--var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME
--var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME
--var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME
--
--var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
--var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27
--
--
--/* Save */
--var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes
--var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
--
--var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
--var S_SAVE_SPI_INIT_ATC_SHIFT = 27
--var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
--var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
--var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
--var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26
--
--var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
--var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME
--var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME
--var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME
--
--var s_save_spi_init_lo = exec_lo
--var s_save_spi_init_hi = exec_hi
--
--var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0, pc_rewind[3:0], HT[0], trapID[7:0], PC[47:0]}
--var s_save_pc_hi = ttmp1
--var s_save_exec_lo = ttmp2
--var s_save_exec_hi = ttmp3
--var s_save_status = ttmp4
--var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine
--var s_save_xnack_mask_lo = ttmp6
--var s_save_xnack_mask_hi = ttmp7
--var s_save_buf_rsrc0 = ttmp8
--var s_save_buf_rsrc1 = ttmp9
--var s_save_buf_rsrc2 = ttmp10
--var s_save_buf_rsrc3 = ttmp11
--
--var s_save_mem_offset = ttmp14
--var s_save_alloc_size = s_save_trapsts //conflict
--var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time)
--var s_save_m0 = ttmp15
--
--/* Restore */
--var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
--var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC
--
--var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
--var S_RESTORE_SPI_INIT_ATC_SHIFT = 27
--var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
--var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28
--var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
--var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
--
--var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT
--var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK
--var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
--var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK
--
--var s_restore_spi_init_lo = exec_lo
--var s_restore_spi_init_hi = exec_hi
--
--var s_restore_mem_offset = ttmp12
--var s_restore_alloc_size = ttmp3
--var s_restore_tmp = ttmp6
--var s_restore_mem_offset_save = s_restore_tmp //no conflict
--
--var s_restore_m0 = s_restore_alloc_size //no conflict
--
--var s_restore_mode = ttmp7
--
--var s_restore_pc_lo = ttmp0
--var s_restore_pc_hi = ttmp1
--var s_restore_exec_lo = ttmp14
--var s_restore_exec_hi = ttmp15
--var s_restore_status = ttmp4
--var s_restore_trapsts = ttmp5
--var s_restore_xnack_mask_lo = xnack_mask_lo
--var s_restore_xnack_mask_hi = xnack_mask_hi
--var s_restore_buf_rsrc0 = ttmp8
--var s_restore_buf_rsrc1 = ttmp9
--var s_restore_buf_rsrc2 = ttmp10
--var s_restore_buf_rsrc3 = ttmp11
--
--/**************************************************************************/
--/* trap handler entry points */
--/**************************************************************************/
--/* Shader Main*/
--
--shader main
-- asic(GFX9)
-- type(CS)
--
--
-- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining
save/restore -- //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC -- s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC -- s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. -- s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE -- //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE -- s_branch L_SKIP_RESTORE //NOT restore, SAVE actually -- else -- s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save -- end -- --L_JUMP_TO_RESTORE: -- s_branch L_RESTORE //restore -- --L_SKIP_RESTORE: -- -- s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC -- s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save -- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) -- s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save -- s_cbranch_scc1 L_SAVE //this is the operation for save -- -- // ********* Handle non-CWSR traps ******************* --if (!EMU_RUN_HACK) -- // Illegal instruction is a non-maskable exception which blocks context save. -- // Halt the wavefront and return from the trap. -- s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK -- s_cbranch_scc1 L_HALT_WAVE -- -- // If STATUS.MEM_VIOL is asserted then we cannot fetch from the TMA. -- // Instead, halt the wavefront and return from the trap. -- s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK -- s_cbranch_scc0 L_NO_MEM_VIOL -- --L_HALT_WAVE: -- s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK -- s_branch L_EXCP_CASE -- --L_NO_MEM_VIOL: -- /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */ -- s_getreg_b32 ttmp14,hwreg(HW_REG_SQ_SHADER_TMA_LO) -- s_getreg_b32 ttmp15,hwreg(HW_REG_SQ_SHADER_TMA_HI) -- s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 -- s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [ttmp14, ttmp15], 0 -- s_waitcnt lgkmcnt(0) -- s_or_b32 ttmp7, ttmp8, ttmp9 -- s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set -- s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) -- s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler -- --L_NO_NEXT_TRAP: -- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) -- s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception -- s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. -- s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 -- s_addc_u32 ttmp1, ttmp1, 0 --L_EXCP_CASE: -- s_and_b32 ttmp1, ttmp1, 0xFFFF -- s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) -- s_rfe_b64 [ttmp0, ttmp1] --end -- // ********* End handling of non-CWSR traps ******************* -- --/**************************************************************************/ --/* save routine */ --/**************************************************************************/ -- --L_SAVE: -- --if G8SR_DEBUG_TIMESTAMP -- s_memrealtime s_g8sr_ts_save_s -- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? 
--end
--
-- //check whether there is mem_viol
-- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
-- s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
-- s_cbranch_scc0 L_NO_PC_REWIND
--
-- //if so, need to rewind PC assuming GDS operation gets NACKed
-- s_mov_b32 s_save_tmp, 0 //clear mem_viol bit
-- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit
-- s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
-- s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8
-- s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc
--
--L_NO_PC_REWIND:
-- s_mov_b32 s_save_tmp, 0 //clear saveCtx bit
-- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit
--
-- s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK
-- s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //saving XNACK must come before any memory operation
-- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT
-- s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
-- s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
-- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY
-- s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
-- s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
-- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS
-- s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
--
-- s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp
--
-- /* inform SPI the readiness and wait for SPI's go signal */
-- s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
-- s_mov_b32 s_save_exec_hi, exec_hi
-- s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive
--
--if G8SR_DEBUG_TIMESTAMP
-- s_memrealtime s_g8sr_ts_sq_save_msg
-- s_waitcnt lgkmcnt(0)
--end
--
-- if (EMU_RUN_HACK)
--
-- else
-- s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
-- end
--
-- L_SLEEP:
-- s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7th/8th wave could not get arbitration to execute an inst while the other waves are stuck in the sleep loop waiting for wrexec!=0
--
-- if (EMU_RUN_HACK)
--
-- else
-- s_cbranch_execz L_SLEEP
-- end
--
--if G8SR_DEBUG_TIMESTAMP
-- s_memrealtime s_g8sr_ts_spi_wrexec
-- s_waitcnt lgkmcnt(0)
--end
--
-- /* setup Resource Constants */
-- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
-- //calculate wd_addr using absolute thread id
-- v_readlane_b32 s_save_tmp, v9, 0
-- s_lshr_b32 s_save_tmp, s_save_tmp, 6
-- s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
-- s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
-- s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
-- s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
-- else
-- end
-- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
-- s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
-- s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
-- s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
-- else
-- end
--
--
-- s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
-- s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
-- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
-- s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not necessarily inited
-- s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
-- s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
-- s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
-- s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC
-- s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
-- s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
-- s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE
--
-- //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?)
-- s_mov_b32 s_save_m0, m0 //save M0
--
-- /* global mem offset */
-- s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0
--
--
--
--
-- /* save HW registers */
-- //////////////////////////////
--
-- L_SAVE_HWREG:
-- // HWREG SR memory offset : size(VGPR)+size(SGPR)
-- get_vgpr_size_bytes(s_save_mem_offset)
-- get_sgpr_size_bytes(s_save_tmp)
-- s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
--
--
-- s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
-- if (SWIZZLE_EN)
-- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
-- else
-- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-- end
--
--
-- write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0
--
-- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
-- s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
-- s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
-- end
--
-- write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC
-- write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
-- write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC
-- write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
-- write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS
--
-- //s_save_trapsts conflicts with s_save_alloc_size
-- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
-- write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS
--
-- write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO
-- write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI
--
-- //using s_save_tmp would introduce a conflict here between s_save_tmp and s_save_buf_rsrc2
-- s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE
-- write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
--
--
--
-- /* the first wave in the threadgroup */
-- s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract first wave bit
-- s_mov_b32 s_save_exec_hi, 0x0
-- s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26]
--
--
-- /* save SGPRs */
-- // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save...
-- ////////////////////////////// -- -- // SGPR SR memory offset : size(VGPR) -- get_vgpr_size_bytes(s_save_mem_offset) -- // TODO, change RSRC word to rearrange memory layout for SGPRS -- -- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size -- s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 -- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) -- -- if (SGPR_SAVE_USE_SQC) -- s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes -- else -- s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) -- end -- -- if (SWIZZLE_EN) -- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? -- else -- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -- end -- -- -- // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 -- //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0 -- s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0 -- s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset -- s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0 -- -- s_mov_b32 m0, 0x0 //SGPR initial index value =0 -- s_nop 0x0 //Manually inserted wait states -- L_SAVE_SGPR_LOOP: -- // SGPR is allocated in 16 SGPR granularity -- s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] -- s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] -- s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] -- s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] -- s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] -- s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] -- s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] -- s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] -- -- write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4 -- s_add_u32 m0, m0, 16 //next sgpr index -- s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 -- s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete? -- // restore s_save_buf_rsrc0,1 -- //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo -- s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo -- -- -- -- -- /* save first 4 VGPR, then LDS save could use */ -- // each wave will alloc 4 vgprs at least... -- ///////////////////////////////////////////////////////////////////////////////////// -- -- s_mov_b32 s_save_mem_offset, 0 -- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on -- s_mov_b32 exec_hi, 0xFFFFFFFF -- -- if (SWIZZLE_EN) -- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
-- else -- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -- end -- -- -- // VGPR Allocated in 4-GPR granularity -- --if G8SR_VGPR_SR_IN_DWX4 -- // the const stride for DWx4 is 4*4 bytes -- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 -- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes -- -- buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 -- -- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 -- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes --else -- buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 -- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 -- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 -- buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 --end -- -- -- -- /* save LDS */ -- ////////////////////////////// -- -- L_SAVE_LDS: -- -- // Change EXEC to all threads... -- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on -- s_mov_b32 exec_hi, 0xFFFFFFFF -- -- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size -- s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? -- s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE -- -- s_barrier //LDS is used? wait for other waves in the same TG -- s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here -- s_cbranch_scc0 L_SAVE_LDS_DONE -- -- // first wave do LDS save; -- -- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw -- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes -- s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes -- -- // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) -- // -- get_vgpr_size_bytes(s_save_mem_offset) -- get_sgpr_size_bytes(s_save_tmp) -- s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp -- s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() -- -- -- if (SWIZZLE_EN) -- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? -- else -- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -- end -- -- s_mov_b32 m0, 0x0 //lds_offset initial value = 0 -- -- --var LDS_DMA_ENABLE = 0 --var UNROLL = 0 --if UNROLL==0 && LDS_DMA_ENABLE==1 -- s_mov_b32 s3, 256*2 -- s_nop 0 -- s_nop 0 -- s_nop 0 -- L_SAVE_LDS_LOOP: -- //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.??? -- if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity -- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW -- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW -- end -- -- s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes -- s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes -- s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 -- s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete? 
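Both LDS store strategies (the simple loop above and the unrolled variant that follows) advance m0 and the memory offset by 256*2 bytes per step, because each buffer_store_lds_dword moves one dword for each of the 64 lanes. A rough host-side C model of that bookkeeping, assuming (as the comments above state) that SPI allocates LDS in 128-DW granules so the byte count is a multiple of 512:

    #include <stdint.h>
    #include <string.h>

    /* lds_size_granules is HW_REG_LDS_ALLOC.lds_size, in 64-DW granules. */
    static void save_lds(uint8_t *dst, const uint8_t *lds,
                         uint32_t lds_size_granules)
    {
            uint32_t bytes = (lds_size_granules << 6) << 2; /* DWs, then bytes */
            uint32_t step = 256 * 2;      /* two 64-DW bursts per pass (s3) */

            for (uint32_t m0 = 0; m0 < bytes; m0 += step)
                    memcpy(dst + m0, lds + m0, step);
    }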
--
--elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROLL; has icache misses
-- // store from highest LDS address to lowest
-- s_mov_b32 s3, 256*2
-- s_sub_u32 m0, s_save_alloc_size, s3
-- s_add_u32 s_save_mem_offset, s_save_mem_offset, m0
-- s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128-DW chunks...
-- s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from highest addr to lowest
-- s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block costs 6*4 bytes of instructions
-- s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //3*4 covers the 3 instructions below (s_add, s_addc, s_setpc)
-- s_nop 0
-- s_nop 0
-- s_nop 0 //pad 3 dw to align LDS_DMA with 64 bytes
-- s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved
-- s_add_u32 s0, s0, s_save_alloc_size
-- s_addc_u32 s1, s1, 0
-- s_setpc_b64 s[0:1]
--
--
-- for var i =0; i< 128; i++
-- // be careful to use a 64-byte-aligned address here, which improves performance...
-- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW
-- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
--
-- if i!=127
-- s_sub_u32 m0, m0, s3 // use an SGPR to shrink a 2-DW instruction to 1 DW, i.e. pack more LDS_DMA instructions into one cache line
-- s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3
-- end
-- end
--
--else // BUFFER_STORE
-- v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0
-- v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid
-- v_mul_i32_i24 v2, v3, 8 // tid*8
-- v_mov_b32 v3, 256*2
-- s_mov_b32 m0, 0x10000
-- s_mov_b32 s0, s_save_buf_rsrc3
-- s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid
-- s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT
--
--L_SAVE_LDS_LOOP_VECTOR:
-- ds_read_b64 v[0:1], v2 //x = LDS[a], byte address
-- s_waitcnt lgkmcnt(0)
-- buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1
--// s_waitcnt vmcnt(0)
--// v_add_u32 v2, vcc[0:1], v2, v3
-- v_add_u32 v2, v2, v3
-- v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
-- s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR
--
-- // restore rsrc3
-- s_mov_b32 s_save_buf_rsrc3, s0
--
--end
--
--L_SAVE_LDS_DONE:
--
--
-- /* save VGPRs - save the rest of the VGPRs */
-- //////////////////////////////////////////////////////////////////////////////////////
-- L_SAVE_VGPR:
-- // VGPR SR memory offset: 0
-- // TODO rearrange the RSRC words to use swizzle for VGPR save...
--
-- s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest of the VGPRs
-- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
-- s_mov_b32 exec_hi, 0xFFFFFFFF
--
-- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vgpr_size
-- s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
-- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible
-- s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
-- if (SWIZZLE_EN)
-- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
-- else -- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -- end -- -- -- // VGPR Allocated in 4-GPR granularity -- --if G8SR_VGPR_SR_IN_DWX4 -- // the const stride for DWx4 is 4*4 bytes -- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 -- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes -- -- s_mov_b32 m0, 4 // skip first 4 VGPRs -- s_cmp_lt_u32 m0, s_save_alloc_size -- s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs -- -- s_set_gpr_idx_on m0, 0x1 // This will change M0 -- s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 --L_SAVE_VGPR_LOOP: -- v_mov_b32 v0, v0 // v0 = v[0+m0] -- v_mov_b32 v1, v1 -- v_mov_b32 v2, v2 -- v_mov_b32 v3, v3 -- -- -- buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 -- s_add_u32 m0, m0, 4 -- s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 -- s_cmp_lt_u32 m0, s_save_alloc_size -- s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? -- s_set_gpr_idx_off --L_SAVE_VGPR_LOOP_END: -- -- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 -- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes --else -- // VGPR store using dw burst -- s_mov_b32 m0, 0x4 //VGPR initial index value =0 -- s_cmp_lt_u32 m0, s_save_alloc_size -- s_cbranch_scc0 L_SAVE_VGPR_END -- -- -- s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 -- s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later -- -- L_SAVE_VGPR_LOOP: -- v_mov_b32 v0, v0 //v0 = v[0+m0] -- v_mov_b32 v1, v1 //v0 = v[0+m0] -- v_mov_b32 v2, v2 //v0 = v[0+m0] -- v_mov_b32 v3, v3 //v0 = v[0+m0] -- -- if(USE_MTBUF_INSTEAD_OF_MUBUF) -- tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 -- else -- buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 -- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 -- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 -- buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 -- end -- -- s_add_u32 m0, m0, 4 //next vgpr index -- s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes -- s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 -- s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? -- s_set_gpr_idx_off --end -- --L_SAVE_VGPR_END: -- -- -- -- -- -- -- /* S_PGM_END_SAVED */ //FIXME graphics ONLY -- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) -- s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] -- s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 -- s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over -- s_rfe_b64 s_save_pc_lo //Return to the main shader program -- else -- end -- --// Save Done timestamp --if G8SR_DEBUG_TIMESTAMP -- s_memrealtime s_g8sr_ts_save_d -- // SGPR SR memory offset : size(VGPR) -- get_vgpr_size_bytes(s_save_mem_offset) -- s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET -- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? -- // Need reset rsrc2?? 
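Stepping back to the VGPR save loops above: after s_set_gpr_idx_on, an instruction like "v_mov_b32 v0, v0" is not a no-op, because the source operand is indexed by M0, so v0..v3 receive v[m0+0..3] before each dword burst goes out. A purely illustrative C analogue of that staging step, with the wave's register file modeled as a 2D array:

    #include <stdint.h>

    #define NUM_LANES 64                    /* one GFX9 wave */

    /* vgpr[reg][lane]: copy v[m0+0..3] into v0..v3, as the four
     * v_mov_b32 instructions do while M0-relative indexing is on. */
    static void stage_four_vgprs(uint32_t vgpr[][NUM_LANES], uint32_t m0)
    {
            for (uint32_t r = 0; r < 4; r++)
                    for (uint32_t lane = 0; lane < NUM_LANES; lane++)
                            vgpr[r][lane] = vgpr[m0 + r][lane];
    }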
-- s_mov_b32 m0, s_save_mem_offset
-- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-- s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1
--end
--
--
-- s_branch L_END_PGM
--
--
--
--/**************************************************************************/
--/* restore routine */
--/**************************************************************************/
--
--L_RESTORE:
-- /* Setup Resource Constants */
-- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
-- //calculate wd_addr using absolute thread id
-- v_readlane_b32 s_restore_tmp, v9, 0
-- s_lshr_b32 s_restore_tmp, s_restore_tmp, 6
-- s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
-- s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
-- s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
-- s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
-- else
-- end
--
--if G8SR_DEBUG_TIMESTAMP
-- s_memrealtime s_g8sr_ts_restore_s
-- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
-- // tma_lo/hi are sgprs 110 and 111, which are not used in the 112-SGPR allocation case...
-- s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0]
-- s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //back up the timestamp to ttmp0/1, since exec will be restored at the very end..
--end
--
--
--
-- s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
-- s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
-- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
-- s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
-- s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
-- s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
-- s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
-- s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC
-- s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
-- s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
-- s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE
--
-- /* global mem offset */
--// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0
--
-- /* the first wave in the threadgroup */
-- s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
-- s_cbranch_scc0 L_RESTORE_VGPR
--
-- /* restore LDS */
-- //////////////////////////////
-- L_RESTORE_LDS:
--
-- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
-- s_mov_b32 exec_hi, 0xFFFFFFFF
--
-- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
-- s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
jump to L_RESTORE_VGPR -- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw -- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes -- s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes -- -- // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) -- // -- get_vgpr_size_bytes(s_restore_mem_offset) -- get_sgpr_size_bytes(s_restore_tmp) -- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp -- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? -- -- -- if (SWIZZLE_EN) -- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? -- else -- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -- end -- s_mov_b32 m0, 0x0 //lds_offset initial value = 0 -- -- L_RESTORE_LDS_LOOP: -- if (SAVE_LDS) -- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW -- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW -- end -- s_add_u32 m0, m0, 256*2 // 128 DW -- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW -- s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 -- s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? -- -- -- /* restore VGPRs */ -- ////////////////////////////// -- L_RESTORE_VGPR: -- // VGPR SR memory offset : 0 -- s_mov_b32 s_restore_mem_offset, 0x0 -- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead -- s_mov_b32 exec_hi, 0xFFFFFFFF -- -- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size -- s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 -- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) -- s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) -- if (SWIZZLE_EN) -- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? -- else -- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -- end -- --if G8SR_VGPR_SR_IN_DWX4 -- get_vgpr_size_bytes(s_restore_mem_offset) -- s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 -- -- // the const stride for DWx4 is 4*4 bytes -- s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 -- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes -- -- s_mov_b32 m0, s_restore_alloc_size -- s_set_gpr_idx_on m0, 0x8 // Note.. 
This will change m0 -- --L_RESTORE_VGPR_LOOP: -- buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 -- s_waitcnt vmcnt(0) -- s_sub_u32 m0, m0, 4 -- v_mov_b32 v0, v0 // v[0+m0] = v0 -- v_mov_b32 v1, v1 -- v_mov_b32 v2, v2 -- v_mov_b32 v3, v3 -- s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 -- s_cmp_eq_u32 m0, 0x8000 -- s_cbranch_scc0 L_RESTORE_VGPR_LOOP -- s_set_gpr_idx_off -- -- s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 -- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes -- --else -- // VGPR load using dw burst -- s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last -- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 -- s_mov_b32 m0, 4 //VGPR initial index value = 1 -- s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 -- s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later -- -- L_RESTORE_VGPR_LOOP: -- if(USE_MTBUF_INSTEAD_OF_MUBUF) -- tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 -- else -- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 -- buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 -- buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 -- buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 -- end -- s_waitcnt vmcnt(0) //ensure data ready -- v_mov_b32 v0, v0 //v[0+m0] = v0 -- v_mov_b32 v1, v1 -- v_mov_b32 v2, v2 -- v_mov_b32 v3, v3 -- s_add_u32 m0, m0, 4 //next vgpr index -- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes -- s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 -- s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? 
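Note the ordering in the dw-burst restore loop above: v0 holds the buffer offset for every load, so v0..v3 themselves are only restored after the loop, from the starting offset stashed in s_restore_mem_offset_save (the tail just below). In host-C terms, with each VGPR occupying 256 bytes (64 lanes x 4 bytes) in the save area, a sketch under those assumed layout conventions:

    #include <stdint.h>
    #include <string.h>

    #define NUM_LANES 64

    static void restore_vgprs(uint32_t vgpr[][NUM_LANES], const uint8_t *src,
                              uint32_t num_vgprs)
    {
            const uint32_t block = 4 * NUM_LANES * 4; /* 4 VGPRs = 1024 B */
            uint32_t off = block;        /* v0..v3's block is skipped... */

            for (uint32_t m0 = 4; m0 < num_vgprs; m0 += 4, off += block)
                    memcpy(&vgpr[m0][0], src + off, block);

            memcpy(&vgpr[0][0], src, block); /* ...and restored last */
    }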
-- s_set_gpr_idx_off -- /* VGPR restore on v0 */ -- if(USE_MTBUF_INSTEAD_OF_MUBUF) -- tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 -- else -- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 -- buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 -- buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 -- buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 -- end -- --end -- -- /* restore SGPRs */ -- ////////////////////////////// -- -- // SGPR SR memory offset : size(VGPR) -- get_vgpr_size_bytes(s_restore_mem_offset) -- get_sgpr_size_bytes(s_restore_tmp) -- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp -- s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group -- // TODO, change RSRC word to rearrange memory layout for SGPRS -- -- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size -- s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 -- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) -- -- if (SGPR_SAVE_USE_SQC) -- s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes -- else -- s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) -- end -- if (SWIZZLE_EN) -- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? -- else -- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -- end -- -- s_mov_b32 m0, s_restore_alloc_size -- -- L_RESTORE_SGPR_LOOP: -- read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made -- s_waitcnt lgkmcnt(0) //ensure data ready -- -- s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] -- s_nop 0 // hazard SALU M0=> S_MOVREL -- -- s_movreld_b64 s0, s0 //s[0+m0] = s0 -- s_movreld_b64 s2, s2 -- s_movreld_b64 s4, s4 -- s_movreld_b64 s6, s6 -- s_movreld_b64 s8, s8 -- s_movreld_b64 s10, s10 -- s_movreld_b64 s12, s12 -- s_movreld_b64 s14, s14 -- -- s_cmp_eq_u32 m0, 0 //scc = (m0 < s_restore_alloc_size) ? 1 : 0 -- s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete? -- -- /* restore HW registers */ -- ////////////////////////////// -- L_RESTORE_HWREG: -- -- --if G8SR_DEBUG_TIMESTAMP -- s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo -- s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi --end -- -- // HWREG SR memory offset : size(VGPR)+size(SGPR) -- get_vgpr_size_bytes(s_restore_mem_offset) -- get_sgpr_size_bytes(s_restore_tmp) -- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp -- -- -- s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes -- if (SWIZZLE_EN) -- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
-- else -- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -- end -- -- read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 -- read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC -- read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) -- read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC -- read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) -- read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS -- read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS -- read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO -- read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI -- read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE -- -- s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS -- -- s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS -- -- //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: -- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) -- s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) -- s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over -- end -- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) -- s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal -- s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over -- end -- -- s_mov_b32 m0, s_restore_m0 -- s_mov_b32 exec_lo, s_restore_exec_lo -- s_mov_b32 exec_hi, s_restore_exec_hi -- -- s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts -- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 -- s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts -- s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT -- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 -- //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore -- s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode -- //reuse s_restore_m0 as a temp register -- s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK -- s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT -- s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT -- s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero -- s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 -- s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK -- s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT -- s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT -- s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 -- s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK -- s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT -- s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp -- -- s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 -- s_and_b64 vcc, 
vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
-- s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which was changed by the previous SALU
--
-- s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time
--
--if G8SR_DEBUG_TIMESTAMP
-- s_memrealtime s_g8sr_ts_restore_d
-- s_waitcnt lgkmcnt(0)
--end
--
--// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution
-- s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc
--
--
--/**************************************************************************/
--/* the END */
--/**************************************************************************/
--L_END_PGM:
-- s_endpgm
--
--end
--
--
--/**************************************************************************/
--/* the helper functions */
--/**************************************************************************/
--
--//Only for saving hwregs to mem
--function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
-- s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on
-- s_mov_b32 m0, s_mem_offset
-- s_buffer_store_dword s, s_rsrc, m0 glc:1
-- s_add_u32 s_mem_offset, s_mem_offset, 4
-- s_mov_b32 m0, exec_lo
--end
--
--
--// HWREGs are saved before SGPRs, so all HWREGs can be used.
--function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
--
-- s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1
-- s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1
-- s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1
-- s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1
-- s_add_u32 s_rsrc[0], s_rsrc[0], 4*16
-- s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc
--end
--
--
--function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
-- s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
-- s_add_u32 s_mem_offset, s_mem_offset, 4
--end
--
--function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
-- s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1
-- s_sub_u32 s_mem_offset, s_mem_offset, 4*16
--end
--
--
--
--function get_lds_size_bytes(s_lds_size_byte)
-- // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW
-- s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size
-- s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in bytes = lds_size * 64 DW * 4 bytes // granularity 64DW
--end
--
--function get_vgpr_size_bytes(s_vgpr_size_byte)
-- s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vgpr_size
-- s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1
-- s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //VGPR save bytes = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible
--end
--
--function get_sgpr_size_bytes(s_sgpr_size_byte)
-- s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //sgpr_size
-- s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1
-- s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //SGPR save bytes = (sgpr_size + 1) * 16 * 4 (non-zero value)
--end
--
--function get_hwreg_size_bytes
-- return 128 //HWREG size 128 bytes
--end
--
--
--
--#endif
--
--static const uint32_t cwsr_trap_gfx9_hex[] = {
-- 0xbf820001, 0xbf820124,
-- 0xb8f0f802, 0x89708670,
-- 0xb8f1f803, 0x8674ff71,
-- 0x00000400, 0xbf85001d,
-- 0x8674ff71, 0x00000800,
-- 0xbf850003,
0x8674ff71, -- 0x00000100, 0xbf840003, -- 0x8770ff70, 0x00002000, -- 0xbf820010, 0xb8faf812, -- 0xb8fbf813, 0x8efa887a, -- 0xc00a1d3d, 0x00000000, -- 0xbf8cc07f, 0x87737574, -- 0xbf840002, 0xb970f802, -- 0xbe801d74, 0xb8f1f803, -- 0x8671ff71, 0x000001ff, -- 0xbf850002, 0x806c846c, -- 0x826d806d, 0x866dff6d, -- 0x0000ffff, 0xb970f802, -- 0xbe801f6c, 0xb8f1f803, -- 0x8671ff71, 0x00000100, -- 0xbf840006, 0xbef60080, -- 0xb9760203, 0x866dff6d, -- 0x0000ffff, 0x80ec886c, -- 0x82ed806d, 0xbef60080, -- 0xb9760283, 0xbef20068, -- 0xbef30069, 0xb8f62407, -- 0x8e769c76, 0x876d766d, -- 0xb8f603c7, 0x8e769b76, -- 0x876d766d, 0xb8f6f807, -- 0x8676ff76, 0x00007fff, -- 0xb976f807, 0xbeee007e, -- 0xbeef007f, 0xbefe0180, -- 0xbf900004, 0xbf8e0002, -- 0xbf88fffe, 0xbef4007e, -- 0x8675ff7f, 0x0000ffff, -- 0x8775ff75, 0x00040000, -- 0xbef60080, 0xbef700ff, -- 0x00807fac, 0x8676ff7f, -- 0x08000000, 0x8f768376, -- 0x87777677, 0x8676ff7f, -- 0x70000000, 0x8f768176, -- 0x87777677, 0xbefb007c, -- 0xbefa0080, 0xb8fa2a05, -- 0x807a817a, 0x8e7a8a7a, -- 0xb8f61605, 0x80768176, -- 0x8e768676, 0x807a767a, -- 0xbef60084, 0xbef600ff, -- 0x01000000, 0xbefe007c, -- 0xbefc007a, 0xc0611efa, -- 0x0000007c, 0x807a847a, -- 0xbefc007e, 0xbefe007c, -- 0xbefc007a, 0xc0611b3a, -- 0x0000007c, 0x807a847a, -- 0xbefc007e, 0xbefe007c, -- 0xbefc007a, 0xc0611b7a, -- 0x0000007c, 0x807a847a, -- 0xbefc007e, 0xbefe007c, -- 0xbefc007a, 0xc0611bba, -- 0x0000007c, 0x807a847a, -- 0xbefc007e, 0xbefe007c, -- 0xbefc007a, 0xc0611bfa, -- 0x0000007c, 0x807a847a, -- 0xbefc007e, 0xbefe007c, -- 0xbefc007a, 0xc0611c3a, -- 0x0000007c, 0x807a847a, -- 0xbefc007e, 0xb8f1f803, -- 0xbefe007c, 0xbefc007a, -- 0xc0611c7a, 0x0000007c, -- 0x807a847a, 0xbefc007e, -- 0xbefe007c, 0xbefc007a, -- 0xc0611cba, 0x0000007c, -- 0x807a847a, 0xbefc007e, -- 0xbefe007c, 0xbefc007a, -- 0xc0611cfa, 0x0000007c, -- 0x807a847a, 0xbefc007e, -- 0xb8fbf801, 0xbefe007c, -- 0xbefc007a, 0xc0611efa, -- 0x0000007c, 0x807a847a, -- 0xbefc007e, 0x8676ff7f, -- 0x04000000, 0xbeef0080, -- 0x876f6f76, 0xb8fa2a05, -- 0x807a817a, 0x8e7a8a7a, -- 0xb8f11605, 0x80718171, -- 0x8e718471, 0x8e768271, -- 0xbef600ff, 0x01000000, -- 0xbef20174, 0x80747a74, -- 0x82758075, 0xbefc0080, -- 0xbf800000, 0xbe802b00, -- 0xbe822b02, 0xbe842b04, -- 0xbe862b06, 0xbe882b08, -- 0xbe8a2b0a, 0xbe8c2b0c, -- 0xbe8e2b0e, 0xc06b003a, -- 0x00000000, 0xc06b013a, -- 0x00000010, 0xc06b023a, -- 0x00000020, 0xc06b033a, -- 0x00000030, 0x8074c074, -- 0x82758075, 0x807c907c, -- 0xbf0a717c, 0xbf85ffeb, -- 0xbef40172, 0xbefa0080, -- 0xbefe00c1, 0xbeff00c1, -- 0xbef600ff, 0x01000000, -- 0xe0724000, 0x7a1d0000, -- 0xe0724100, 0x7a1d0100, -- 0xe0724200, 0x7a1d0200, -- 0xe0724300, 0x7a1d0300, -- 0xbefe00c1, 0xbeff00c1, -- 0xb8f14306, 0x8671c171, -- 0xbf84002c, 0xbf8a0000, -- 0x8676ff6f, 0x04000000, -- 0xbf840028, 0x8e718671, -- 0x8e718271, 0xbef60071, -- 0xb8fa2a05, 0x807a817a, -- 0x8e7a8a7a, 0xb8f61605, -- 0x80768176, 0x8e768676, -- 0x807a767a, 0x807aff7a, -- 0x00000080, 0xbef600ff, -- 0x01000000, 0xbefc0080, -- 0xd28c0002, 0x000100c1, -- 0xd28d0003, 0x000204c1, -- 0xd1060002, 0x00011103, -- 0x7e0602ff, 0x00000200, -- 0xbefc00ff, 0x00010000, -- 0xbe800077, 0x8677ff77, -- 0xff7fffff, 0x8777ff77, -- 0x00058000, 0xd8ec0000, -- 0x00000002, 0xbf8cc07f, -- 0xe0765000, 0x7a1d0002, -- 0x68040702, 0xd0c9006a, -- 0x0000e302, 0xbf87fff7, -- 0xbef70000, 0xbefa00ff, -- 0x00000400, 0xbefe00c1, -- 0xbeff00c1, 0xb8f12a05, -- 0x80718171, 0x8e718271, -- 0x8e768871, 0xbef600ff, -- 0x01000000, 0xbefc0084, -- 0xbf0a717c, 0xbf840015, -- 0xbf11017c, 0x8071ff71, -- 
0x00001000, 0x7e000300, -- 0x7e020301, 0x7e040302, -- 0x7e060303, 0xe0724000, -- 0x7a1d0000, 0xe0724100, -- 0x7a1d0100, 0xe0724200, -- 0x7a1d0200, 0xe0724300, -- 0x7a1d0300, 0x807c847c, -- 0x807aff7a, 0x00000400, -- 0xbf0a717c, 0xbf85ffef, -- 0xbf9c0000, 0xbf8200c5, -- 0xbef4007e, 0x8675ff7f, -- 0x0000ffff, 0x8775ff75, -- 0x00040000, 0xbef60080, -- 0xbef700ff, 0x00807fac, -- 0x8672ff7f, 0x08000000, -- 0x8f728372, 0x87777277, -- 0x8672ff7f, 0x70000000, -- 0x8f728172, 0x87777277, -- 0x8672ff7f, 0x04000000, -- 0xbf84001e, 0xbefe00c1, -- 0xbeff00c1, 0xb8ef4306, -- 0x866fc16f, 0xbf840019, -- 0x8e6f866f, 0x8e6f826f, -- 0xbef6006f, 0xb8f82a05, -- 0x80788178, 0x8e788a78, -- 0xb8f21605, 0x80728172, -- 0x8e728672, 0x80787278, -- 0x8078ff78, 0x00000080, -- 0xbef600ff, 0x01000000, -- 0xbefc0080, 0xe0510000, -- 0x781d0000, 0xe0510100, -- 0x781d0000, 0x807cff7c, -- 0x00000200, 0x8078ff78, -- 0x00000200, 0xbf0a6f7c, -- 0xbf85fff6, 0xbef80080, -- 0xbefe00c1, 0xbeff00c1, -- 0xb8ef2a05, 0x806f816f, -- 0x8e6f826f, 0x8e76886f, -- 0xbef600ff, 0x01000000, -- 0xbef20078, 0x8078ff78, -- 0x00000400, 0xbefc0084, -- 0xbf11087c, 0x806fff6f, -- 0x00008000, 0xe0524000, -- 0x781d0000, 0xe0524100, -- 0x781d0100, 0xe0524200, -- 0x781d0200, 0xe0524300, -- 0x781d0300, 0xbf8c0f70, -- 0x7e000300, 0x7e020301, -- 0x7e040302, 0x7e060303, -- 0x807c847c, 0x8078ff78, -- 0x00000400, 0xbf0a6f7c, -- 0xbf85ffee, 0xbf9c0000, -- 0xe0524000, 0x721d0000, -- 0xe0524100, 0x721d0100, -- 0xe0524200, 0x721d0200, -- 0xe0524300, 0x721d0300, -- 0xb8f82a05, 0x80788178, -- 0x8e788a78, 0xb8f21605, -- 0x80728172, 0x8e728672, -- 0x80787278, 0x80f8c078, -- 0xb8ef1605, 0x806f816f, -- 0x8e6f846f, 0x8e76826f, -- 0xbef600ff, 0x01000000, -- 0xbefc006f, 0xc031003a, -- 0x00000078, 0x80f8c078, -- 0xbf8cc07f, 0x80fc907c, -- 0xbf800000, 0xbe802d00, -- 0xbe822d02, 0xbe842d04, -- 0xbe862d06, 0xbe882d08, -- 0xbe8a2d0a, 0xbe8c2d0c, -- 0xbe8e2d0e, 0xbf06807c, -- 0xbf84fff0, 0xb8f82a05, -- 0x80788178, 0x8e788a78, -- 0xb8f21605, 0x80728172, -- 0x8e728672, 0x80787278, -- 0xbef60084, 0xbef600ff, -- 0x01000000, 0xc0211bfa, -- 0x00000078, 0x80788478, -- 0xc0211b3a, 0x00000078, -- 0x80788478, 0xc0211b7a, -- 0x00000078, 0x80788478, -- 0xc0211eba, 0x00000078, -- 0x80788478, 0xc0211efa, -- 0x00000078, 0x80788478, -- 0xc0211c3a, 0x00000078, -- 0x80788478, 0xc0211c7a, -- 0x00000078, 0x80788478, -- 0xc0211a3a, 0x00000078, -- 0x80788478, 0xc0211a7a, -- 0x00000078, 0x80788478, -- 0xc0211cfa, 0x00000078, -- 0x80788478, 0xbf8cc07f, -- 0x866dff6d, 0x0000ffff, -- 0xbefc006f, 0xbefe007a, -- 0xbeff007b, 0x866f71ff, -- 0x000003ff, 0xb96f4803, -- 0x866f71ff, 0xfffff800, -- 0x8f6f8b6f, 0xb96fa2c3, -- 0xb973f801, 0x866fff6d, -- 0xf0000000, 0x8f6f9c6f, -- 0x8e6f906f, 0xbef20080, -- 0x87726f72, 0x866fff6d, -- 0x08000000, 0x8f6f9b6f, -- 0x8e6f8f6f, 0x87726f72, -- 0x866fff70, 0x00800000, -- 0x8f6f976f, 0xb972f807, -- 0x86fe7e7e, 0x86ea6a6a, -- 0xb970f802, 0xbf8a0000, -- 0x95806f6c, 0xbf810000, --}; -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c -index 207a05e..6316aad 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c -@@ -25,7 +25,6 @@ - #include <linux/err.h> - #include <linux/fs.h> - #include <linux/sched.h> --#include <linux/sched/mm.h> - #include <linux/slab.h> - #include <linux/uaccess.h> - #include <linux/compat.h> -@@ -34,17 +33,13 @@ - #include <linux/mm.h> - #include <linux/mman.h> - #include <asm/processor.h> --#include <linux/ptrace.h> -- - #include "kfd_priv.h" - #include 
"kfd_device_queue_manager.h" - #include "kfd_dbgmgr.h" --#include "kfd_ipc.h" - - static long kfd_ioctl(struct file *, unsigned int, unsigned long); - static int kfd_open(struct inode *, struct file *); - static int kfd_mmap(struct file *, struct vm_area_struct *); --static bool kfd_is_large_bar(struct kfd_dev *dev); - - static const char kfd_dev_name[] = "kfd"; - -@@ -60,14 +55,6 @@ static int kfd_char_dev_major = -1; - static struct class *kfd_class; - struct device *kfd_device; - --static char *kfd_devnode(struct device *dev, umode_t *mode) --{ -- if (mode && dev->devt == MKDEV(kfd_char_dev_major, 0)) -- *mode = 0666; -- -- return NULL; --} -- - int kfd_chardev_init(void) - { - int err = 0; -@@ -82,8 +69,6 @@ int kfd_chardev_init(void) - if (IS_ERR(kfd_class)) - goto err_class_create; - -- kfd_class->devnode = kfd_devnode; -- - kfd_device = device_create(kfd_class, NULL, - MKDEV(kfd_char_dev_major, 0), - NULL, kfd_dev_name); -@@ -132,7 +117,7 @@ static int kfd_open(struct inode *inode, struct file *filep) - return -EPERM; - } - -- process = kfd_create_process(filep); -+ process = kfd_create_process(current); - if (IS_ERR(process)) - return PTR_ERR(process); - -@@ -157,12 +142,12 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, - struct kfd_ioctl_create_queue_args *args) - { - if (args->queue_percentage > KFD_MAX_QUEUE_PERCENTAGE) { -- pr_err("Queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n"); -+ pr_err("kfd: queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n"); - return -EINVAL; - } - - if (args->queue_priority > KFD_MAX_QUEUE_PRIORITY) { -- pr_err("Queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n"); -+ pr_err("kfd: queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n"); - return -EINVAL; - } - -@@ -170,26 +155,26 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, - (!access_ok(VERIFY_WRITE, - (const void __user *) args->ring_base_address, - sizeof(uint64_t)))) { -- pr_err("Can't access ring base address\n"); -+ pr_err("kfd: can't access ring base address\n"); - return -EFAULT; - } - - if (!is_power_of_2(args->ring_size) && (args->ring_size != 0)) { -- pr_err("Ring size must be a power of 2 or 0\n"); -+ pr_err("kfd: ring size must be a power of 2 or 0\n"); - return -EINVAL; - } - - if (!access_ok(VERIFY_WRITE, - (const void __user *) args->read_pointer_address, - sizeof(uint32_t))) { -- pr_err("Can't access read pointer\n"); -+ pr_err("kfd: can't access read pointer\n"); - return -EFAULT; - } - - if (!access_ok(VERIFY_WRITE, - (const void __user *) args->write_pointer_address, - sizeof(uint32_t))) { -- pr_err("Can't access write pointer\n"); -+ pr_err("kfd: can't access write pointer\n"); - return -EFAULT; - } - -@@ -197,7 +182,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, - !access_ok(VERIFY_WRITE, - (const void __user *) args->eop_buffer_address, - sizeof(uint32_t))) { -- pr_debug("Can't access eop buffer"); -+ pr_debug("kfd: can't access eop buffer"); - return -EFAULT; - } - -@@ -205,7 +190,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, - !access_ok(VERIFY_WRITE, - (const void __user *) args->ctx_save_restore_address, - sizeof(uint32_t))) { -- pr_debug("Can't access ctx save restore buffer"); -+ pr_debug("kfd: can't access ctx save restore buffer"); - return -EFAULT; - } - -@@ -221,7 +206,6 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, - 
q_properties->ctx_save_restore_area_address = - args->ctx_save_restore_address; - q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size; -- q_properties->ctl_stack_size = args->ctl_stack_size; - if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE || - args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) - q_properties->type = KFD_QUEUE_TYPE_COMPUTE; -@@ -235,27 +219,27 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, - else - q_properties->format = KFD_QUEUE_FORMAT_PM4; - -- pr_debug("Queue Percentage: %d, %d\n", -+ pr_debug("Queue Percentage (%d, %d)\n", - q_properties->queue_percent, args->queue_percentage); - -- pr_debug("Queue Priority: %d, %d\n", -+ pr_debug("Queue Priority (%d, %d)\n", - q_properties->priority, args->queue_priority); - -- pr_debug("Queue Address: 0x%llX, 0x%llX\n", -+ pr_debug("Queue Address (0x%llX, 0x%llX)\n", - q_properties->queue_address, args->ring_base_address); - -- pr_debug("Queue Size: 0x%llX, %u\n", -+ pr_debug("Queue Size (0x%llX, %u)\n", - q_properties->queue_size, args->ring_size); - -- pr_debug("Queue r/w Pointers: %p, %p\n", -- q_properties->read_ptr, -- q_properties->write_ptr); -+ pr_debug("Queue r/w Pointers (0x%llX, 0x%llX)\n", -+ (uint64_t) q_properties->read_ptr, -+ (uint64_t) q_properties->write_ptr); - -- pr_debug("Queue Format: %d\n", q_properties->format); -+ pr_debug("Queue Format (%d)\n", q_properties->format); - -- pr_debug("Queue EOP: 0x%llX\n", q_properties->eop_ring_buffer_address); -+ pr_debug("Queue EOP (0x%llX)\n", q_properties->eop_ring_buffer_address); - -- pr_debug("Queue CTX save area: 0x%llX\n", -+ pr_debug("Queue CTX save arex (0x%llX)\n", - q_properties->ctx_save_restore_area_address); - - return 0; -@@ -273,16 +257,16 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, - - memset(&q_properties, 0, sizeof(struct queue_properties)); - -- pr_debug("Creating queue ioctl\n"); -+ pr_debug("kfd: creating queue ioctl\n"); - - err = set_queue_properties_from_user(&q_properties, args); - if (err) - return err; - -- pr_debug("Looking for gpu id 0x%x\n", args->gpu_id); -+ pr_debug("kfd: looking for gpu id 0x%x\n", args->gpu_id); - dev = kfd_device_by_id(args->gpu_id); -- if (!dev) { -- pr_debug("Could not find gpu id 0x%x\n", args->gpu_id); -+ if (dev == NULL) { -+ pr_debug("kfd: gpu id 0x%x was not found\n", args->gpu_id); - return -EINVAL; - } - -@@ -294,11 +278,12 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, - goto err_bind_process; - } - -- pr_debug("Creating queue for PASID %d on gpu 0x%x\n", -+ pr_debug("kfd: creating queue for PASID %d on GPU 0x%x\n", - p->pasid, - dev->id); - -- err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id); -+ err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, -+ 0, q_properties.type, &queue_id); - if (err != 0) - goto err_create_queue; - -@@ -306,28 +291,20 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, - - - /* Return gpu_id as doorbell offset for mmap usage */ -- args->doorbell_offset = KFD_MMAP_TYPE_DOORBELL; -- args->doorbell_offset |= KFD_MMAP_GPU_ID(args->gpu_id); -+ args->doorbell_offset = (KFD_MMAP_DOORBELL_MASK | args->gpu_id); - args->doorbell_offset <<= PAGE_SHIFT; -- if (KFD_IS_SOC15(dev->device_info->asic_family)) -- /* On SOC15 ASICs, doorbell allocation must be -- * per-device, and independent from the per-process -- * queue_id. Return the doorbell offset within the -- * doorbell aperture to user mode. 
-- */ -- args->doorbell_offset |= q_properties.doorbell_off; - - mutex_unlock(&p->mutex); - -- pr_debug("Queue id %d was created successfully\n", args->queue_id); -+ pr_debug("kfd: queue id %d was created successfully\n", args->queue_id); - -- pr_debug("Ring buffer address == 0x%016llX\n", -+ pr_debug("ring buffer address == 0x%016llX\n", - args->ring_base_address); - -- pr_debug("Read ptr address == 0x%016llX\n", -+ pr_debug("read ptr address == 0x%016llX\n", - args->read_pointer_address); - -- pr_debug("Write ptr address == 0x%016llX\n", -+ pr_debug("write ptr address == 0x%016llX\n", - args->write_pointer_address); - - return 0; -@@ -344,7 +321,7 @@ static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p, - int retval; - struct kfd_ioctl_destroy_queue_args *args = data; - -- pr_debug("Destroying queue id %d for pasid %d\n", -+ pr_debug("kfd: destroying queue id %d for PASID %d\n", - args->queue_id, - p->pasid); - -@@ -364,12 +341,12 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, - struct queue_properties properties; - - if (args->queue_percentage > KFD_MAX_QUEUE_PERCENTAGE) { -- pr_err("Queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n"); -+ pr_err("kfd: queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n"); - return -EINVAL; - } - - if (args->queue_priority > KFD_MAX_QUEUE_PRIORITY) { -- pr_err("Queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n"); -+ pr_err("kfd: queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n"); - return -EINVAL; - } - -@@ -377,12 +354,12 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, - (!access_ok(VERIFY_WRITE, - (const void __user *) args->ring_base_address, - sizeof(uint64_t)))) { -- pr_err("Can't access ring base address\n"); -+ pr_err("kfd: can't access ring base address\n"); - return -EFAULT; - } - - if (!is_power_of_2(args->ring_size) && (args->ring_size != 0)) { -- pr_err("Ring size must be a power of 2 or 0\n"); -+ pr_err("kfd: ring size must be a power of 2 or 0\n"); - return -EINVAL; - } - -@@ -391,7 +368,7 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, - properties.queue_percent = args->queue_percentage; - properties.priority = args->queue_priority; - -- pr_debug("Updating queue id %d for pasid %d\n", -+ pr_debug("kfd: updating queue id %d for PASID %d\n", - args->queue_id, p->pasid); - - mutex_lock(&p->mutex); -@@ -403,58 +380,6 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, - return retval; - } - --static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p, -- void *data) --{ -- int retval; -- const int max_num_cus = 1024; -- struct kfd_ioctl_set_cu_mask_args *args = data; -- struct queue_properties properties; -- uint32_t __user *cu_mask_ptr = (uint32_t __user *)args->cu_mask_ptr; -- size_t cu_mask_size = sizeof(uint32_t) * (args->num_cu_mask / 32); -- -- if ((args->num_cu_mask % 32) != 0) { -- pr_debug("num_cu_mask 0x%x must be a multiple of 32", -- args->num_cu_mask); -- return -EINVAL; -- } -- -- properties.cu_mask_count = args->num_cu_mask; -- if (properties.cu_mask_count == 0) { -- pr_debug("CU mask cannot be 0"); -- return -EINVAL; -- } -- -- /* To prevent an unreasonably large CU mask size, set an arbitrary -- * limit of max_num_cus bits. We can then just drop any CU mask bits -- * past max_num_cus bits and just use the first max_num_cus bits. 
-- */ -- if (properties.cu_mask_count > max_num_cus) { -- pr_debug("CU mask cannot be greater than 1024 bits"); -- properties.cu_mask_count = max_num_cus; -- cu_mask_size = sizeof(uint32_t) * (max_num_cus/32); -- } -- -- properties.cu_mask = kzalloc(cu_mask_size, GFP_KERNEL); -- if (!properties.cu_mask) -- return -ENOMEM; -- -- retval = copy_from_user(properties.cu_mask, cu_mask_ptr, cu_mask_size); -- if (retval) { -- pr_debug("Could not copy CU mask from userspace"); -- kfree(properties.cu_mask); -- return -EFAULT; -- } -- -- mutex_lock(&p->mutex); -- -- retval = pqm_set_cu_mask(&p->pqm, args->queue_id, &properties); -- -- mutex_unlock(&p->mutex); -- -- return retval; --} -- - static int kfd_ioctl_set_memory_policy(struct file *filep, - struct kfd_process *p, void *data) - { -@@ -475,7 +400,7 @@ static int kfd_ioctl_set_memory_policy(struct file *filep, - } - - dev = kfd_device_by_id(args->gpu_id); -- if (!dev) -+ if (dev == NULL) - return -EINVAL; - - mutex_lock(&p->mutex); -@@ -507,38 +432,6 @@ static int kfd_ioctl_set_memory_policy(struct file *filep, - return err; - } - --static int kfd_ioctl_set_trap_handler(struct file *filep, -- struct kfd_process *p, void *data) --{ -- struct kfd_ioctl_set_trap_handler_args *args = data; -- struct kfd_dev *dev; -- int err = 0; -- struct kfd_process_device *pdd; -- -- dev = kfd_device_by_id(args->gpu_id); -- if (!dev) -- return -EINVAL; -- -- mutex_lock(&p->mutex); -- -- pdd = kfd_bind_process_to_device(dev, p); -- if (IS_ERR(pdd)) { -- err = -ESRCH; -- goto out; -- } -- -- if (dev->dqm->ops.set_trap_handler(dev->dqm, -- &pdd->qpd, -- args->tba_addr, -- args->tma_addr)) -- err = -EINVAL; -- --out: -- mutex_unlock(&p->mutex); -- -- return err; --} -- - static int kfd_ioctl_dbg_register(struct file *filep, - struct kfd_process *p, void *data) - { -@@ -550,11 +443,16 @@ static int kfd_ioctl_dbg_register(struct file *filep, - long status = 0; - - dev = kfd_device_by_id(args->gpu_id); -- if (!dev) -+ if (dev == NULL) - return -EINVAL; - -- mutex_lock(&p->mutex); -+ if (dev->device_info->asic_family == CHIP_CARRIZO) { -+ pr_debug("kfd_ioctl_dbg_register not supported on CZ\n"); -+ return -EINVAL; -+ } -+ - mutex_lock(kfd_get_dbgmgr_mutex()); -+ mutex_lock(&p->mutex); - - /* - * make sure that we have pdd, if this the first queue created for -@@ -562,11 +460,12 @@ static int kfd_ioctl_dbg_register(struct file *filep, - */ - pdd = kfd_bind_process_to_device(dev, p); - if (IS_ERR(pdd)) { -- status = PTR_ERR(pdd); -- goto out; -+ mutex_unlock(&p->mutex); -+ mutex_unlock(kfd_get_dbgmgr_mutex()); -+ return PTR_ERR(pdd); - } - -- if (!dev->dbgmgr) { -+ if (dev->dbgmgr == NULL) { - /* In case of a legal call, we have no dbgmgr yet */ - create_ok = kfd_dbgmgr_create(&dbgmgr_ptr, dev); - if (create_ok) { -@@ -581,9 +480,8 @@ static int kfd_ioctl_dbg_register(struct file *filep, - status = -EINVAL; - } - --out: -- mutex_unlock(kfd_get_dbgmgr_mutex()); - mutex_unlock(&p->mutex); -+ mutex_unlock(kfd_get_dbgmgr_mutex()); - - return status; - } -@@ -596,7 +494,7 @@ static int kfd_ioctl_dbg_unregister(struct file *filep, - long status; - - dev = kfd_device_by_id(args->gpu_id); -- if (!dev) -+ if (dev == NULL) - return -EINVAL; - - if (dev->device_info->asic_family == CHIP_CARRIZO) { -@@ -607,7 +505,7 @@ static int kfd_ioctl_dbg_unregister(struct file *filep, - mutex_lock(kfd_get_dbgmgr_mutex()); - - status = kfd_dbgmgr_unregister(dev->dbgmgr, p); -- if (!status) { -+ if (status == 0) { - kfd_dbgmgr_destroy(dev->dbgmgr); - dev->dbgmgr = NULL; - } -@@ -641,13 +539,21 @@ 
static int kfd_ioctl_dbg_address_watch(struct file *filep, - memset((void *) &aw_info, 0, sizeof(struct dbg_address_watch_info)); - - dev = kfd_device_by_id(args->gpu_id); -- if (!dev) -+ if (dev == NULL) -+ return -EINVAL; -+ -+ if (dev->device_info->asic_family == CHIP_CARRIZO) { -+ pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); - return -EINVAL; -+ } - - cmd_from_user = (void __user *) args->content_ptr; - -- if (args->buf_size_in_bytes > MAX_ALLOWED_AW_BUFF_SIZE || -- (args->buf_size_in_bytes <= sizeof(*args))) -+ /* Validate arguments */ -+ -+ if ((args->buf_size_in_bytes > MAX_ALLOWED_AW_BUFF_SIZE) || -+ (args->buf_size_in_bytes <= sizeof(*args) + sizeof(int) * 2) || -+ (cmd_from_user == NULL)) - return -EINVAL; - - /* this is the actual buffer to work with */ -@@ -673,9 +579,9 @@ static int kfd_ioctl_dbg_address_watch(struct file *filep, - /* skip over the addresses buffer */ - args_idx += sizeof(aw_info.watch_address) * aw_info.num_watch_points; - -- if (args_idx >= args->buf_size_in_bytes) { -- status = -EINVAL; -- goto out; -+ if (args_idx >= args->buf_size_in_bytes - sizeof(*args)) { -+ kfree(args_buff); -+ return -EINVAL; - } - - watch_mask_value = (uint64_t) args_buff[args_idx]; -@@ -697,9 +603,9 @@ static int kfd_ioctl_dbg_address_watch(struct file *filep, - args_idx += sizeof(aw_info.watch_mask); - } - -- if (args_idx > args->buf_size_in_bytes) { -- status = -EINVAL; -- goto out; -+ if (args_idx >= args->buf_size_in_bytes - sizeof(args)) { -+ kfree(args_buff); -+ return -EINVAL; - } - - /* Currently HSA Event is not supported for DBG */ -@@ -711,7 +617,6 @@ static int kfd_ioctl_dbg_address_watch(struct file *filep, - - mutex_unlock(kfd_get_dbgmgr_mutex()); - --out: - kfree(args_buff); - - return status; -@@ -741,9 +646,14 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep, - sizeof(wac_info.trapId); - - dev = kfd_device_by_id(args->gpu_id); -- if (!dev) -+ if (dev == NULL) - return -EINVAL; - -+ if (dev->device_info->asic_family == CHIP_CARRIZO) { -+ pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); -+ return -EINVAL; -+ } -+ - /* input size must match the computed "compact" size */ - if (args->buf_size_in_bytes != computed_buff_size) { - pr_debug("size mismatch, computed : actual %u : %u\n", -@@ -802,37 +712,22 @@ static int kfd_ioctl_get_clock_counters(struct file *filep, - { - struct kfd_ioctl_get_clock_counters_args *args = data; - struct kfd_dev *dev; --#if (defined OS_NAME_RHEL) && (OS_VERSION_MAJOR == 6) \ -- || (defined OS_NAME_RHEL_7_2) -- struct timespec time; --#else - struct timespec64 time; --#endif - - dev = kfd_device_by_id(args->gpu_id); -- if (dev) -- /* Reading GPU clock counter from KGD */ -- args->gpu_clock_counter = -- dev->kfd2kgd->get_gpu_clock_counter(dev->kgd); -- else -- /* Node without GPU resource */ -- args->gpu_clock_counter = 0; -+ if (dev == NULL) -+ return -EINVAL; -+ -+ /* Reading GPU clock counter from KGD */ -+ args->gpu_clock_counter = -+ dev->kfd2kgd->get_gpu_clock_counter(dev->kgd); - - /* No access to rdtsc. 
Using raw monotonic time */ --#if (defined OS_NAME_RHEL) && (OS_VERSION_MAJOR == 6) \ -- || (defined OS_NAME_RHEL_7_2) -- getrawmonotonic(&time); -- args->cpu_clock_counter = (uint64_t)timespec_to_ns(&time); -- -- get_monotonic_boottime(&time); -- args->system_clock_counter = (uint64_t)timespec_to_ns(&time); --#else - getrawmonotonic64(&time); - args->cpu_clock_counter = (uint64_t)timespec64_to_ns(&time); - - get_monotonic_boottime64(&time); - args->system_clock_counter = (uint64_t)timespec64_to_ns(&time); --#endif - - /* Since the counter is in nano-seconds we use 1GHz frequency */ - args->system_clock_freq = 1000000000; -@@ -887,104 +782,12 @@ static int kfd_ioctl_get_process_apertures(struct file *filp, - "scratch_limit %llX\n", pdd->scratch_limit); - - args->num_of_nodes++; -- -- pdd = kfd_get_next_process_device_data(p, pdd); -- } while (pdd && (args->num_of_nodes < NUM_OF_SUPPORTED_GPUS)); -- } -- -- mutex_unlock(&p->mutex); -- -- return 0; --} -- --static int kfd_ioctl_get_process_apertures_new(struct file *filp, -- struct kfd_process *p, void *data) --{ -- struct kfd_ioctl_get_process_apertures_new_args *args = data; -- struct kfd_process_device_apertures *pa; -- struct kfd_process_device *pdd; -- uint32_t nodes = 0; -- int ret; -- -- dev_dbg(kfd_device, "get apertures for PASID %d", p->pasid); -- -- if (args->num_of_nodes == 0) { -- /* Return number of nodes, so that user space can alloacate -- * sufficient memory -- */ -- mutex_lock(&p->mutex); -- -- if (!kfd_has_process_device_data(p)) -- goto out_upwrite; -- -- /* Run over all pdd of the process */ -- pdd = kfd_get_first_process_device_data(p); -- do { -- args->num_of_nodes++; -- pdd = kfd_get_next_process_device_data(p, pdd); -- } while (pdd); -- -- goto out_upwrite; -- } -- -- /* Fill in process-aperture information for all available -- * nodes, but not more than args->num_of_nodes as that is -- * the amount of memory allocated by user -- */ -- pa = kzalloc((sizeof(struct kfd_process_device_apertures) * -- args->num_of_nodes), GFP_KERNEL); -- if (!pa) -- return -ENOMEM; -- -- mutex_lock(&p->mutex); -- -- if (!kfd_has_process_device_data(p)) { -- args->num_of_nodes = 0; -- kfree(pa); -- goto out_upwrite; -+ } while ((pdd = kfd_get_next_process_device_data(p, pdd)) != NULL && -+ (args->num_of_nodes < NUM_OF_SUPPORTED_GPUS)); - } - -- /* Run over all pdd of the process */ -- pdd = kfd_get_first_process_device_data(p); -- do { -- pa[nodes].gpu_id = pdd->dev->id; -- pa[nodes].lds_base = pdd->lds_base; -- pa[nodes].lds_limit = pdd->lds_limit; -- pa[nodes].gpuvm_base = pdd->gpuvm_base; -- pa[nodes].gpuvm_limit = pdd->gpuvm_limit; -- pa[nodes].scratch_base = pdd->scratch_base; -- pa[nodes].scratch_limit = pdd->scratch_limit; -- -- dev_dbg(kfd_device, -- "gpu id %u\n", pdd->dev->id); -- dev_dbg(kfd_device, -- "lds_base %llX\n", pdd->lds_base); -- dev_dbg(kfd_device, -- "lds_limit %llX\n", pdd->lds_limit); -- dev_dbg(kfd_device, -- "gpuvm_base %llX\n", pdd->gpuvm_base); -- dev_dbg(kfd_device, -- "gpuvm_limit %llX\n", pdd->gpuvm_limit); -- dev_dbg(kfd_device, -- "scratch_base %llX\n", pdd->scratch_base); -- dev_dbg(kfd_device, -- "scratch_limit %llX\n", pdd->scratch_limit); -- nodes++; -- -- pdd = kfd_get_next_process_device_data(p, pdd); -- } while (pdd && (nodes < args->num_of_nodes)); - mutex_unlock(&p->mutex); - -- args->num_of_nodes = nodes; -- ret = copy_to_user( -- (void __user *)args->kfd_process_device_apertures_ptr, -- pa, -- (nodes * sizeof(struct kfd_process_device_apertures))); -- kfree(pa); -- return ret ? 
-EFAULT : 0; -- --out_upwrite: -- mutex_unlock(&p->mutex); - return 0; - } - -@@ -992,57 +795,15 @@ static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p, - void *data) - { - struct kfd_ioctl_create_event_args *args = data; -- struct kfd_dev *kfd; -- struct kfd_process_device *pdd; -- int err = -EINVAL; -- void *mem, *kern_addr = NULL; -- -- pr_debug("Event page offset 0x%llx\n", args->event_page_offset); -- -- if (args->event_page_offset) { -- kfd = kfd_device_by_id(GET_GPU_ID(args->event_page_offset)); -- if (!kfd) { -- pr_err("Getting device by id failed in %s\n", __func__); -- return -EFAULT; -- } -- if (!kfd->device_info->is_need_iommu_device) { -- mutex_lock(&p->mutex); -- pdd = kfd_bind_process_to_device(kfd, p); -- if (IS_ERR(pdd)) { -- err = PTR_ERR(pdd); -- goto out_upwrite; -- } -- mem = kfd_process_device_translate_handle(pdd, -- GET_IDR_HANDLE(args->event_page_offset)); -- if (!mem) { -- pr_err("Can't find BO, offset is 0x%llx\n", -- args->event_page_offset); -- err = -EFAULT; -- goto out_upwrite; -- } -- mutex_unlock(&p->mutex); -- -- /* Map dGPU gtt BO to kernel */ -- kfd->kfd2kgd->map_gtt_bo_to_kernel(kfd->kgd, -- mem, &kern_addr); -- } -- } -+ int err; - -- err = kfd_event_create(filp, p, -- args->event_type, -- args->auto_reset != 0, -- args->node_id, -- &args->event_id, -- &args->event_trigger_data, -- &args->event_page_offset, -- &args->event_slot_index, -- kern_addr); -+ err = kfd_event_create(filp, p, args->event_type, -+ args->auto_reset != 0, args->node_id, -+ &args->event_id, &args->event_trigger_data, -+ &args->event_page_offset, -+ &args->event_slot_index); - - return err; -- --out_upwrite: -- mutex_unlock(&p->mutex); -- return err; - } - - static int kfd_ioctl_destroy_event(struct file *filp, struct kfd_process *p, -@@ -1085,870 +846,9 @@ static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, - - return err; - } --static int kfd_ioctl_alloc_scratch_memory(struct file *filep, -- struct kfd_process *p, void *data) --{ -- struct kfd_ioctl_alloc_memory_of_scratch_args *args = data; -- struct kfd_process_device *pdd; -- struct kfd_dev *dev; -- long err; -- -- if (args->size == 0) -- return -EINVAL; -- -- dev = kfd_device_by_id(args->gpu_id); -- if (!dev) -- return -EINVAL; -- -- mutex_lock(&p->mutex); -- -- pdd = kfd_bind_process_to_device(dev, p); -- if (IS_ERR(pdd)) { -- err = PTR_ERR(pdd); -- goto bind_process_to_device_fail; -- } -- -- pdd->sh_hidden_private_base_vmid = args->va_addr; -- pdd->qpd.sh_hidden_private_base = args->va_addr; -- -- mutex_unlock(&p->mutex); -- -- if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS && -- pdd->qpd.vmid != 0) { -- err = dev->kfd2kgd->alloc_memory_of_scratch( -- dev->kgd, args->va_addr, pdd->qpd.vmid); -- if (err != 0) -- goto alloc_memory_of_scratch_failed; -- } -- -- return 0; -- --bind_process_to_device_fail: -- mutex_unlock(&p->mutex); --alloc_memory_of_scratch_failed: -- return -EFAULT; --} -- --bool kfd_is_large_bar(struct kfd_dev *dev) --{ -- struct kfd_local_mem_info mem_info; -- -- if (debug_largebar) { -- pr_debug("Simulate large-bar allocation on non large-bar machine\n"); -- return true; -- } -- -- if (dev->device_info->is_need_iommu_device) -- return false; -- -- dev->kfd2kgd->get_local_mem_info(dev->kgd, &mem_info); -- if (mem_info.local_mem_size_private == 0 && -- mem_info.local_mem_size_public > 0) -- return true; -- return false; --} -- --static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, -- struct kfd_process *p, void *data) --{ -- struct 
kfd_ioctl_alloc_memory_of_gpu_args *args = data; -- struct kfd_process_device *pdd; -- void *mem; -- struct kfd_dev *dev; -- int idr_handle; -- long err; -- uint64_t offset = args->mmap_offset; -- uint32_t flags = args->flags; -- struct vm_area_struct *vma; -- -- if (args->size == 0) -- return -EINVAL; -- -- dev = kfd_device_by_id(args->gpu_id); -- if (!dev) -- return -EINVAL; -- -- if (flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) { -- /* Check if the userptr corresponds to another (or third-party) -- * device local memory. If so treat is as a doorbell. User -- * space will be oblivious of this and will use this doorbell -- * BO as a regular userptr BO -- */ -- vma = find_vma(current->mm, args->mmap_offset); -- if (vma && (vma->vm_flags & VM_IO)) { -- unsigned long pfn; -- -- follow_pfn(vma, args->mmap_offset, &pfn); -- flags |= KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL; -- flags &= ~KFD_IOC_ALLOC_MEM_FLAGS_USERPTR; -- offset = (pfn << PAGE_SHIFT); -- } -- } else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) { -- if (args->size != kfd_doorbell_process_slice(dev)) -- return -EINVAL; -- offset = kfd_get_process_doorbells(dev, p); -- } -- -- mutex_lock(&p->mutex); -- -- pdd = kfd_bind_process_to_device(dev, p); -- if (IS_ERR(pdd)) { -- err = PTR_ERR(pdd); -- goto err_unlock; -- } -- -- err = dev->kfd2kgd->alloc_memory_of_gpu( -- dev->kgd, args->va_addr, args->size, -- pdd->vm, (struct kgd_mem **) &mem, &offset, -- flags); -- -- if (err) -- goto err_unlock; -- -- idr_handle = kfd_process_device_create_obj_handle(pdd, mem, -- args->va_addr, args->size, NULL); -- if (idr_handle < 0) { -- err = -EFAULT; -- goto err_free; -- } -- -- mutex_unlock(&p->mutex); -- -- args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); -- if ((args->flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) != 0 && -- !kfd_is_large_bar(dev)) { -- args->mmap_offset = 0; -- } else { -- args->mmap_offset = KFD_MMAP_TYPE_MAP_BO; -- args->mmap_offset |= KFD_MMAP_GPU_ID(args->gpu_id); -- args->mmap_offset <<= PAGE_SHIFT; -- args->mmap_offset |= offset; -- } -- -- return 0; -- --err_free: -- dev->kfd2kgd->free_memory_of_gpu(dev->kgd, -- (struct kgd_mem *) mem, -- pdd->vm); --err_unlock: -- mutex_unlock(&p->mutex); -- return err; --} -- --static int kfd_ioctl_free_memory_of_gpu(struct file *filep, -- struct kfd_process *p, void *data) --{ -- struct kfd_ioctl_free_memory_of_gpu_args *args = data; -- struct kfd_process_device *pdd; -- struct kfd_bo *buf_obj; -- struct kfd_dev *dev; -- int ret; -- -- dev = kfd_device_by_id(GET_GPU_ID(args->handle)); -- if (!dev) -- return -EINVAL; -- -- mutex_lock(&p->mutex); -- -- pdd = kfd_get_process_device_data(dev, p); -- if (!pdd) { -- pr_err("Process device data doesn't exist\n"); -- ret = -EINVAL; -- goto err_unlock; -- } -- -- buf_obj = kfd_process_device_find_bo(pdd, -- GET_IDR_HANDLE(args->handle)); -- if (!buf_obj) { -- ret = -EINVAL; -- goto err_unlock; -- } -- run_rdma_free_callback(buf_obj); -- -- ret = dev->kfd2kgd->free_memory_of_gpu(dev->kgd, buf_obj->mem, -- pdd->vm); -- -- /* If freeing the buffer failed, leave the handle in place for -- * clean-up during process tear-down. 
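
The handle juggling above (MAKE_HANDLE when allocating, GET_GPU_ID/GET_IDR_HANDLE when freeing) implies a simple 64-bit packing. A sketch of the presumed layout; the exact bit split is an assumption drawn from how the macros are used, not from their definitions:

#include <stdint.h>

/* Presumed layout: gpu_id in the upper 32 bits, the per-process IDR
 * handle in the lower 32. Assumption based on usage, not on the
 * macro definitions themselves.
 */
static inline uint64_t make_handle(uint32_t gpu_id, uint32_t idr_handle)
{
        return ((uint64_t)gpu_id << 32) | idr_handle;
}

static inline uint32_t get_gpu_id(uint64_t handle)
{
        return (uint32_t)(handle >> 32);
}

static inline uint32_t get_idr_handle(uint64_t handle)
{
        return (uint32_t)(handle & 0xffffffffULL);
}
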
-- */ -- if (ret == 0) -- kfd_process_device_remove_obj_handle( -- pdd, GET_IDR_HANDLE(args->handle)); -- --err_unlock: -- mutex_unlock(&p->mutex); -- return ret; --} -- --static int kfd_ioctl_map_memory_to_gpu(struct file *filep, -- struct kfd_process *p, void *data) --{ -- struct kfd_ioctl_map_memory_to_gpu_args *args = data; -- struct kfd_process_device *pdd, *peer_pdd; -- void *mem; -- struct kfd_dev *dev, *peer; -- long err = 0; -- int i, num_dev = 0; -- uint32_t *devices_arr = NULL; -- -- dev = kfd_device_by_id(GET_GPU_ID(args->handle)); -- if (!dev) -- return -EINVAL; -- -- if (args->device_ids_array_size > 0 && -- (args->device_ids_array_size < sizeof(uint32_t))) { -- pr_err("Node IDs array size %u\n", -- args->device_ids_array_size); -- return -EFAULT; -- } -- -- if (args->device_ids_array_size > 0) { -- devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL); -- if (!devices_arr) -- return -ENOMEM; -- -- err = copy_from_user(devices_arr, -- (void __user *)args->device_ids_array_ptr, -- args->device_ids_array_size); -- if (err != 0) { -- err = -EFAULT; -- goto copy_from_user_failed; -- } -- } -- -- mutex_lock(&p->mutex); -- -- pdd = kfd_bind_process_to_device(dev, p); -- if (IS_ERR(pdd)) { -- err = PTR_ERR(pdd); -- goto bind_process_to_device_failed; -- } -- -- mem = kfd_process_device_translate_handle(pdd, -- GET_IDR_HANDLE(args->handle)); -- if (!mem) { -- err = PTR_ERR(mem); -- goto get_mem_obj_from_handle_failed; -- } -- -- if (args->device_ids_array_size > 0) { -- num_dev = args->device_ids_array_size / sizeof(uint32_t); -- for (i = 0 ; i < num_dev; i++) { -- peer = kfd_device_by_id(devices_arr[i]); -- if (!peer) { -- pr_err("Getting device by id failed for 0x%x\n", -- devices_arr[i]); -- err = -EFAULT; -- goto get_mem_obj_from_handle_failed; -- } -- -- peer_pdd = kfd_bind_process_to_device(peer, p); -- if (!peer_pdd) { -- err = -EFAULT; -- goto get_mem_obj_from_handle_failed; -- } -- err = peer->kfd2kgd->map_memory_to_gpu( -- peer->kgd, (struct kgd_mem *)mem, peer_pdd->vm); -- if (err != 0) -- pr_err("Failed to map\n"); -- } -- } else { -- err = dev->kfd2kgd->map_memory_to_gpu( -- dev->kgd, (struct kgd_mem *)mem, pdd->vm); -- if (err != 0) -- pr_err("Failed to map\n"); -- } -- -- mutex_unlock(&p->mutex); -- -- err = dev->kfd2kgd->sync_memory(dev->kgd, (struct kgd_mem *) mem, true); -- if (err) { -- pr_debug("Sync memory failed, wait interrupted by user signal\n"); -- goto sync_memory_failed; -- } -- -- /* Flush TLBs after waiting for the page table updates to complete */ -- if (args->device_ids_array_size > 0) { -- for (i = 0; i < num_dev; i++) { -- peer = kfd_device_by_id(devices_arr[i]); -- if (WARN_ON_ONCE(!peer)) -- continue; -- peer_pdd = kfd_get_process_device_data(dev, p); -- if (WARN_ON_ONCE(!peer_pdd)) -- continue; -- kfd_flush_tlb(peer, p->pasid); -- } -- } else { -- kfd_flush_tlb(dev, p->pasid); -- } -- -- if (args->device_ids_array_size > 0 && devices_arr) -- kfree(devices_arr); -- -- return err; -- --bind_process_to_device_failed: --get_mem_obj_from_handle_failed: -- mutex_unlock(&p->mutex); --copy_from_user_failed: --sync_memory_failed: -- kfree(devices_arr); -- return err; --} -- --int kfd_unmap_memory_from_gpu(void *mem, struct kfd_process_device *pdd) --{ -- int err; -- struct kfd_dev *dev = pdd->dev; -- -- err = dev->kfd2kgd->unmap_memory_to_gpu( -- dev->kgd, (struct kgd_mem *) mem, pdd->vm); -- -- if (err != 0) -- return err; -- -- kfd_flush_tlb(dev, pdd->process->pasid); -- -- return 0; --} -- --static int kfd_ioctl_unmap_memory_from_gpu(struct 
file *filep, -- struct kfd_process *p, void *data) --{ -- struct kfd_ioctl_unmap_memory_from_gpu_args *args = data; -- struct kfd_process_device *pdd, *peer_pdd; -- void *mem; -- struct kfd_dev *dev, *peer; -- long err = 0; -- uint32_t *devices_arr = NULL, num_dev, i; -- -- dev = kfd_device_by_id(GET_GPU_ID(args->handle)); -- if (!dev) -- return -EINVAL; -- -- if (args->device_ids_array_size > 0 && -- (args->device_ids_array_size < sizeof(uint32_t))) { -- pr_err("Node IDs array size %u\n", -- args->device_ids_array_size); -- return -EFAULT; -- } -- -- if (args->device_ids_array_size > 0) { -- devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL); -- if (!devices_arr) -- return -ENOMEM; -- -- err = copy_from_user(devices_arr, -- (void __user *)args->device_ids_array_ptr, -- args->device_ids_array_size); -- if (err != 0) { -- err = -EFAULT; -- goto copy_from_user_failed; -- } -- } -- -- mutex_lock(&p->mutex); -- -- pdd = kfd_get_process_device_data(dev, p); -- if (!pdd) { -- pr_err("Process device data doesn't exist\n"); -- err = PTR_ERR(pdd); -- goto bind_process_to_device_failed; -- } -- -- mem = kfd_process_device_translate_handle(pdd, -- GET_IDR_HANDLE(args->handle)); -- if (!mem) { -- err = PTR_ERR(mem); -- goto get_mem_obj_from_handle_failed; -- } -- -- if (args->device_ids_array_size > 0) { -- num_dev = args->device_ids_array_size / sizeof(uint32_t); -- for (i = 0 ; i < num_dev; i++) { -- peer = kfd_device_by_id(devices_arr[i]); -- if (!peer) { -- err = -EFAULT; -- goto get_mem_obj_from_handle_failed; -- } -- -- peer_pdd = kfd_get_process_device_data(peer, p); -- if (!peer_pdd) { -- err = -EFAULT; -- goto get_mem_obj_from_handle_failed; -- } -- kfd_unmap_memory_from_gpu(mem, peer_pdd); -- } -- kfree(devices_arr); -- } else -- kfd_unmap_memory_from_gpu(mem, pdd); -- -- mutex_unlock(&p->mutex); -- -- return 0; -- --bind_process_to_device_failed: --get_mem_obj_from_handle_failed: -- mutex_unlock(&p->mutex); --copy_from_user_failed: -- kfree(devices_arr); -- return err; --} -- --static int kfd_ioctl_set_process_dgpu_aperture(struct file *filep, -- struct kfd_process *p, void *data) --{ -- struct kfd_ioctl_set_process_dgpu_aperture_args *args = data; -- struct kfd_dev *dev; -- struct kfd_process_device *pdd; -- long err; -- -- dev = kfd_device_by_id(args->gpu_id); -- if (!dev) -- return -EINVAL; -- -- mutex_lock(&p->mutex); -- -- pdd = kfd_bind_process_to_device(dev, p); -- if (IS_ERR(pdd)) { -- err = PTR_ERR(pdd); -- goto exit; -- } -- -- err = kfd_set_process_dgpu_aperture(pdd, args->dgpu_base, -- args->dgpu_limit); -- --exit: -- mutex_unlock(&p->mutex); -- return err; --} -- --static int kfd_ioctl_get_dmabuf_info(struct file *filep, -- struct kfd_process *p, void *data) --{ -- struct kfd_ioctl_get_dmabuf_info_args *args = data; -- struct kfd_dev *dev = NULL; -- struct kgd_dev *dma_buf_kgd; -- void *metadata_buffer = NULL; -- uint32_t flags; -- unsigned int i; -- int r; -- -- /* Find a KFD GPU device that supports the get_dmabuf_info query */ -- for (i = 0; kfd_topology_enum_kfd_devices(i, &dev) == 0; i++) -- if (dev && dev->kfd2kgd->get_dmabuf_info) -- break; -- if (!dev) -- return -EINVAL; -- -- if (args->metadata_ptr) { -- metadata_buffer = kzalloc(args->metadata_size, GFP_KERNEL); -- if (!metadata_buffer) -- return -ENOMEM; -- } -- -- /* Get dmabuf info from KGD */ -- r = dev->kfd2kgd->get_dmabuf_info(dev->kgd, args->dmabuf_fd, -- &dma_buf_kgd, &args->size, -- metadata_buffer, args->metadata_size, -- &args->metadata_size, &flags); -- if (r) -- goto exit; -- -- /* 
Reverse-lookup gpu_id from kgd pointer */ -- dev = kfd_device_by_kgd(dma_buf_kgd); -- if (!dev) { -- r = -EINVAL; -- goto exit; -- } -- args->gpu_id = dev->id; -- args->flags = flags; -- -- /* Copy metadata buffer to user mode */ -- if (metadata_buffer) { -- r = copy_to_user((void __user *)args->metadata_ptr, -- metadata_buffer, args->metadata_size); -- if (r != 0) -- r = -EFAULT; -- } -- --exit: -- kfree(metadata_buffer); -- -- return r; --} -- --static int kfd_ioctl_import_dmabuf(struct file *filep, -- struct kfd_process *p, void *data) --{ -- struct kfd_ioctl_import_dmabuf_args *args = data; -- struct kfd_dev *dev; -- int r; -- -- dev = kfd_device_by_id(args->gpu_id); -- if (!dev) -- return -EINVAL; -- -- r = kfd_ipc_import_dmabuf(dev, p, args->gpu_id, args->dmabuf_fd, -- args->va_addr, &args->handle, NULL); -- if (r) -- pr_err("Failed to import dmabuf\n"); -- -- return r; --} -- --static int kfd_ioctl_ipc_export_handle(struct file *filep, -- struct kfd_process *p, -- void *data) --{ -- struct kfd_ioctl_ipc_export_handle_args *args = data; -- struct kfd_dev *dev; -- int r; -- -- dev = kfd_device_by_id(args->gpu_id); -- if (!dev) -- return -EINVAL; -- -- r = kfd_ipc_export_as_handle(dev, p, args->handle, args->share_handle); -- if (r) -- pr_err("Failed to export IPC handle\n"); -- -- return r; --} -- --static int kfd_ioctl_ipc_import_handle(struct file *filep, -- struct kfd_process *p, -- void *data) --{ -- struct kfd_ioctl_ipc_import_handle_args *args = data; -- struct kfd_dev *dev = NULL; -- int r; -- -- dev = kfd_device_by_id(args->gpu_id); -- if (!dev) -- return -EINVAL; -- -- r = kfd_ipc_import_handle(dev, p, args->gpu_id, args->share_handle, -- args->va_addr, &args->handle, -- &args->mmap_offset); -- if (r) -- pr_err("Failed to import IPC handle\n"); -- -- return r; --} -- --static int kfd_ioctl_get_tile_config(struct file *filep, -- struct kfd_process *p, void *data) --{ -- struct kfd_ioctl_get_tile_config_args *args = data; -- struct kfd_dev *dev; -- struct tile_config config; -- int err = 0; -- -- dev = kfd_device_by_id(args->gpu_id); -- -- dev->kfd2kgd->get_tile_config(dev->kgd, &config); -- -- args->gb_addr_config = config.gb_addr_config; -- args->num_banks = config.num_banks; -- args->num_ranks = config.num_ranks; -- -- if (args->num_tile_configs > config.num_tile_configs) -- args->num_tile_configs = config.num_tile_configs; -- err = copy_to_user((void __user *)args->tile_config_ptr, -- config.tile_config_ptr, -- args->num_tile_configs * sizeof(uint32_t)); -- if (err) { -- args->num_tile_configs = 0; -- return -EFAULT; -- } -- -- if (args->num_macro_tile_configs > config.num_macro_tile_configs) -- args->num_macro_tile_configs = -- config.num_macro_tile_configs; -- err = copy_to_user((void __user *)args->macro_tile_config_ptr, -- config.macro_tile_config_ptr, -- args->num_macro_tile_configs * sizeof(uint32_t)); -- if (err) { -- args->num_macro_tile_configs = 0; -- return -EFAULT; -- } -- -- return 0; --} -- --#if defined(BUILD_AS_DKMS) --static int kfd_ioctl_cross_memory_copy(struct file *filep, -- struct kfd_process *local_p, void *data) --{ -- return 0; --} --#else --static int kfd_ioctl_cross_memory_copy(struct file *filep, -- struct kfd_process *local_p, void *data) --{ -- struct kfd_ioctl_cross_memory_copy_args *args = data; -- struct kfd_memory_range *src_array, *dst_array; -- struct kfd_bo *src_bo, *dst_bo; -- struct kfd_process *remote_p, *src_p, *dst_p; -- struct task_struct *remote_task; -- struct mm_struct *remote_mm; -- struct pid *remote_pid; -- struct dma_fence 
*fence = NULL, *lfence = NULL; -- uint64_t dst_va_addr; -- uint64_t copied, total_copied = 0; -- uint64_t src_offset, dst_offset; -- int i, j = 0, err = 0; -- -- /* Check parameters */ -- if (args->src_mem_range_array == 0 || args->dst_mem_range_array == 0 || -- args->src_mem_array_size == 0 || args->dst_mem_array_size == 0) -- return -EINVAL; -- args->bytes_copied = 0; -- -- /* Allocate space for source and destination arrays */ -- src_array = kmalloc_array((args->src_mem_array_size + -- args->dst_mem_array_size), -- sizeof(struct kfd_memory_range), -- GFP_KERNEL); -- if (!src_array) -- return -ENOMEM; -- dst_array = &src_array[args->src_mem_array_size]; -- -- if (copy_from_user(src_array, (void __user *)args->src_mem_range_array, -- args->src_mem_array_size * -- sizeof(struct kfd_memory_range))) { -- err = -EFAULT; -- goto copy_from_user_fail; -- } -- if (copy_from_user(dst_array, (void __user *)args->dst_mem_range_array, -- args->dst_mem_array_size * -- sizeof(struct kfd_memory_range))) { -- err = -EFAULT; -- goto copy_from_user_fail; -- } -- -- /* Get remote process */ -- remote_pid = find_get_pid(args->pid); -- if (!remote_pid) { -- pr_err("Cross mem copy failed. Invalid PID %d\n", args->pid); -- err = -ESRCH; -- goto copy_from_user_fail; -- } -- -- remote_task = get_pid_task(remote_pid, PIDTYPE_PID); -- if (!remote_pid) { -- pr_err("Cross mem copy failed. Invalid PID or task died %d\n", -- args->pid); -- err = -ESRCH; -- goto get_pid_task_fail; -- } -- -- /* Check access permission */ -- remote_mm = mm_access(remote_task, PTRACE_MODE_ATTACH_REALCREDS); -- if (!remote_mm || IS_ERR(remote_mm)) { -- err = IS_ERR(remote_mm) ? PTR_ERR(remote_mm) : -ESRCH; -- if (err == -EACCES) { -- pr_err("Cross mem copy failed. Permission error\n"); -- err = -EPERM; -- } else -- pr_err("Cross mem copy failed. Invalid task %d\n", -- err); -- goto mm_access_fail; -- } -- -- remote_p = kfd_get_process(remote_task); -- if (!remote_p) { -- pr_err("Cross mem copy failed. Invalid kfd process %d\n", -- args->pid); -- err = -EINVAL; -- goto kfd_process_fail; -- } -- -- if (KFD_IS_CROSS_MEMORY_WRITE(args->flags)) { -- src_p = local_p; -- dst_p = remote_p; -- pr_debug("CMA WRITE: local -> remote\n"); -- } else { -- src_p = remote_p; -- dst_p = local_p; -- pr_debug("CMA READ: remote -> local\n"); -- } -- -- -- /* For each source kfd_range: -- * - Find the BO. Each range has to be within the same BO. -- * - Copy this range to single or multiple destination BOs. -- * - dst_va_addr - will point to next va address into which data will -- * be copied. -- * - dst_bo & src_bo - the current destination and source BOs -- * - src_offset & dst_offset - offset into the respective BOs from -- * data will be sourced or copied -- */ -- dst_va_addr = dst_array[0].va_addr; -- mutex_lock(&dst_p->mutex); -- dst_bo = kfd_process_find_bo_from_interval(dst_p, -- dst_va_addr, -- dst_va_addr + dst_array[0].size - 1); -- mutex_unlock(&dst_p->mutex); -- if (!dst_bo) { -- err = -EFAULT; -- goto kfd_process_fail; -- } -- dst_offset = dst_va_addr - dst_bo->it.start; -- -- for (i = 0; i < args->src_mem_array_size; i++) { -- uint64_t src_va_addr_end = src_array[i].va_addr + -- src_array[i].size - 1; -- uint64_t src_size_to_copy = src_array[i].size; -- -- mutex_lock(&src_p->mutex); -- src_bo = kfd_process_find_bo_from_interval(src_p, -- src_array[i].va_addr, -- src_va_addr_end); -- mutex_unlock(&src_p->mutex); -- if (!src_bo || src_va_addr_end > src_bo->it.last) { -- pr_err("Cross mem copy failed. 
Invalid range\n"); -- err = -EFAULT; -- break; -- } -- -- src_offset = src_array[i].va_addr - src_bo->it.start; -- -- /* Copy src_bo to one or multiple dst_bo(s) based on size and -- * and current copy location. -- */ -- while (j < args->dst_mem_array_size) { -- uint64_t copy_size; -- int64_t space_left; -- -- /* Find the current copy_size. This will be smaller of -- * the following -- * - space left in the current dest memory range -- * - data left to copy from source range -- */ -- space_left = (dst_array[j].va_addr + dst_array[j].size) -- - dst_va_addr; -- copy_size = (src_size_to_copy < space_left) ? -- src_size_to_copy : space_left; -- -- /* Check both BOs belong to same device */ -- if (src_bo->dev->kgd != dst_bo->dev->kgd) { -- pr_err("Cross Memory failed. Not same device\n"); -- err = -EINVAL; -- break; -- } -- -- /* Store prev fence. Release it when a later fence is -- * created -- */ -- lfence = fence; -- fence = NULL; -- -- err = dst_bo->dev->kfd2kgd->copy_mem_to_mem( -- src_bo->dev->kgd, -- src_bo->mem, src_offset, -- dst_bo->mem, dst_offset, -- copy_size, -- &fence, &copied); -- -- if (err) { -- pr_err("GPU Cross mem copy failed\n"); -- err = -EFAULT; -- break; -- } -- -- /* Later fence available. Release old fence */ -- if (fence && lfence) { -- dma_fence_put(lfence); -- lfence = NULL; -- } -- -- total_copied += copied; -- src_size_to_copy -= copied; -- space_left -= copied; -- dst_va_addr += copied; -- dst_offset += copied; -- src_offset += copied; -- if (dst_va_addr > dst_bo->it.last + 1) { -- pr_err("Cross mem copy failed. Memory overflow\n"); -- err = -EFAULT; -- break; -- } -- -- /* If the cur dest range is full move to next one */ -- if (space_left <= 0) { -- if (++j >= args->dst_mem_array_size) -- break; -- -- dst_va_addr = dst_array[j].va_addr; -- dst_bo = kfd_process_find_bo_from_interval( -- dst_p, -- dst_va_addr, -- dst_va_addr + -- dst_array[j].size - 1); -- dst_offset = dst_va_addr - dst_bo->it.start; -- } -- -- /* If the cur src range is done, move to next one */ -- if (src_size_to_copy <= 0) -- break; -- } -- if (err) -- break; -- } -- -- /* Wait for the last fence irrespective of error condition */ -- if (fence) { -- if (dma_fence_wait_timeout(fence, false, msecs_to_jiffies(1000)) -- < 0) -- pr_err("Cross mem copy failed. BO timed out\n"); -- dma_fence_put(fence); -- } else if (lfence) { -- pr_debug("GPU copy fail. But wait for prev DMA to finish\n"); -- dma_fence_wait_timeout(lfence, true, msecs_to_jiffies(1000)); -- dma_fence_put(lfence); -- } -- --kfd_process_fail: -- mmput(remote_mm); --mm_access_fail: -- put_task_struct(remote_task); --get_pid_task_fail: -- put_pid(remote_pid); --copy_from_user_fail: -- kfree(src_array); -- -- /* An error could happen after partial copy. 
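
Stripped of the BO lookups and fence chaining, the copy loop above is a two-cursor walk over two lists of ranges, emitting one chunk per iteration sized to the smaller of the two remainders. A self-contained sketch of just that chunking logic, with hypothetical types and no GPU involvement:

#include <stdint.h>
#include <stdio.h>

struct mem_range {
        uint64_t va_addr;
        uint64_t size;
};

/* Walk src[] and dst[] in lockstep, printing one copy per chunk. Each
 * chunk is bounded by whichever range (source or destination) runs out
 * first, mirroring the copy_size = min(src_size_to_copy, space_left)
 * step in the loop above.
 */
static void chunked_copy(const struct mem_range *src, int nsrc,
                         const struct mem_range *dst, int ndst)
{
        uint64_t s_off = 0, d_off = 0;
        int i = 0, j = 0;

        while (i < nsrc && j < ndst) {
                uint64_t s_left = src[i].size - s_off;
                uint64_t d_left = dst[j].size - d_off;
                uint64_t n = s_left < d_left ? s_left : d_left;

                printf("copy %llu bytes: src 0x%llx -> dst 0x%llx\n",
                       (unsigned long long)n,
                       (unsigned long long)(src[i].va_addr + s_off),
                       (unsigned long long)(dst[j].va_addr + d_off));

                s_off += n;
                d_off += n;
                if (s_off == src[i].size) { i++; s_off = 0; }
                if (d_off == dst[j].size) { j++; d_off = 0; }
        }
}
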
In that case this will -- * reflect partial amount of bytes copied -- */ -- args->bytes_copied = total_copied; -- return err; --} --#endif -- --static int kfd_ioctl_get_queue_wave_state(struct file *filep, -- struct kfd_process *p, void *data) --{ -- struct kfd_ioctl_get_queue_wave_state_args *args = data; -- int r; -- -- mutex_lock(&p->mutex); -- -- r = pqm_get_wave_state(&p->pqm, args->queue_id, -- (void __user *)args->ctl_stack_address, -- &args->ctl_stack_used_size, -- &args->save_area_used_size); -- -- mutex_unlock(&p->mutex); -- -- return r; --} - - #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ -- [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \ -- .cmd_drv = 0, .name = #ioctl} -+ [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, .cmd_drv = 0, .name = #ioctl} - - /** Ioctl table */ - static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { -@@ -1999,55 +899,6 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { - - AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_WAVE_CONTROL, - kfd_ioctl_dbg_wave_control, 0), -- -- AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, -- kfd_ioctl_alloc_memory_of_gpu, 0), -- -- AMDKFD_IOCTL_DEF(AMDKFD_IOC_FREE_MEMORY_OF_GPU, -- kfd_ioctl_free_memory_of_gpu, 0), -- -- AMDKFD_IOCTL_DEF(AMDKFD_IOC_MAP_MEMORY_TO_GPU, -- kfd_ioctl_map_memory_to_gpu, 0), -- -- AMDKFD_IOCTL_DEF(AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, -- kfd_ioctl_unmap_memory_from_gpu, 0), -- -- AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH, -- kfd_ioctl_alloc_scratch_memory, 0), -- -- AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_CU_MASK, -- kfd_ioctl_set_cu_mask, 0), -- -- AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE, -- kfd_ioctl_set_process_dgpu_aperture, 0), -- -- AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_TRAP_HANDLER, -- kfd_ioctl_set_trap_handler, 0), -- -- AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_PROCESS_APERTURES_NEW, -- kfd_ioctl_get_process_apertures_new, 0), -- -- AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_DMABUF_INFO, -- kfd_ioctl_get_dmabuf_info, 0), -- -- AMDKFD_IOCTL_DEF(AMDKFD_IOC_IMPORT_DMABUF, -- kfd_ioctl_import_dmabuf, 0), -- -- AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_TILE_CONFIG, -- kfd_ioctl_get_tile_config, 0), -- -- AMDKFD_IOCTL_DEF(AMDKFD_IOC_IPC_IMPORT_HANDLE, -- kfd_ioctl_ipc_import_handle, 0), -- -- AMDKFD_IOCTL_DEF(AMDKFD_IOC_IPC_EXPORT_HANDLE, -- kfd_ioctl_ipc_export_handle, 0), -- -- AMDKFD_IOCTL_DEF(AMDKFD_IOC_CROSS_MEMORY_COPY, -- kfd_ioctl_cross_memory_copy, 0), -- -- AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_QUEUE_WAVE_STATE, -- kfd_ioctl_get_queue_wave_state, 0) -- - }; - - #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) -@@ -2143,37 +994,19 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) - static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) - { - struct kfd_process *process; -- struct kfd_dev *kfd; -- unsigned long vm_pgoff; -- int retval; - - process = kfd_get_process(current); - if (IS_ERR(process)) - return PTR_ERR(process); - -- vm_pgoff = vma->vm_pgoff; -- vma->vm_pgoff = KFD_MMAP_OFFSET_VALUE_GET(vma->vm_pgoff); -- -- switch (vm_pgoff & KFD_MMAP_TYPE_MASK) { -- case KFD_MMAP_TYPE_DOORBELL: -- kfd = kfd_device_by_id(KFD_MMAP_GPU_ID_GET(vm_pgoff)); -- if (!kfd) -- return -EFAULT; -- return kfd_doorbell_mmap(kfd, process, vma); -- -- case KFD_MMAP_TYPE_EVENTS: -+ if ((vma->vm_pgoff & KFD_MMAP_DOORBELL_MASK) == -+ KFD_MMAP_DOORBELL_MASK) { -+ vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_DOORBELL_MASK; -+ return kfd_doorbell_mmap(process, vma); -+ } else if ((vma->vm_pgoff & KFD_MMAP_EVENTS_MASK) == -+ KFD_MMAP_EVENTS_MASK) 
{ -+ vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_EVENTS_MASK; - return kfd_event_mmap(process, vma); -- -- case KFD_MMAP_TYPE_MAP_BO: -- kfd = kfd_device_by_id(KFD_MMAP_GPU_ID_GET(vm_pgoff)); -- if (!kfd) -- return -EFAULT; -- retval = kfd->kfd2kgd->mmap_bo(kfd->kgd, vma); -- return retval; -- -- case KFD_MMAP_TYPE_RESERVED_MEM: -- return kfd_reserved_mem_mmap(process, vma); -- - } - - return -EFAULT; -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c -deleted file mode 100644 -index 4e94081..0000000 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c -+++ /dev/null -@@ -1,1304 +0,0 @@ --#include <linux/kernel.h> --#include <linux/acpi.h> --#include <linux/mm.h> --#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) --#include <linux/amd-iommu.h> --#endif --#include <linux/pci.h> --#include "kfd_crat.h" --#include "kfd_priv.h" --#include "kfd_topology.h" -- --/* GPU Processor ID base for dGPUs for which VCRAT needs to be created. -- * GPU processor ID are expressed with Bit[31]=1. -- * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs -- * used in the CRAT. -- */ --static uint32_t gpu_processor_id_low = 0x80001000; -- --/* Return the next available gpu_processor_id and increment it for next GPU -- * @total_cu_count - Total CUs present in the GPU including ones -- * masked off -- */ --static inline unsigned int get_and_inc_gpu_processor_id( -- unsigned int total_cu_count) --{ -- int current_id = gpu_processor_id_low; -- -- gpu_processor_id_low += total_cu_count; -- return current_id; --} -- --/* Static table to describe GPU Cache information */ --struct kfd_gpu_cache_info { -- uint32_t cache_size; -- uint32_t cache_level; -- uint32_t flags; -- /* Indicates how many Compute Units share this cache -- * Value = 1 indicates the cache is not shared -- */ -- uint32_t num_cu_shared; --}; -- --static struct kfd_gpu_cache_info kaveri_cache_info[] = { -- { -- /* TCP L1 Cache per CU */ -- .cache_size = 16, -- .cache_level = 1, -- .flags = (CRAT_CACHE_FLAGS_ENABLED | -- CRAT_CACHE_FLAGS_DATA_CACHE | -- CRAT_CACHE_FLAGS_SIMD_CACHE), -- .num_cu_shared = 1, -- -- }, -- { -- /* Scalar L1 Instruction Cache (in SQC module) per bank */ -- .cache_size = 16, -- .cache_level = 1, -- .flags = (CRAT_CACHE_FLAGS_ENABLED | -- CRAT_CACHE_FLAGS_INST_CACHE | -- CRAT_CACHE_FLAGS_SIMD_CACHE), -- .num_cu_shared = 2, -- }, -- { -- /* Scalar L1 Data Cache (in SQC module) per bank */ -- .cache_size = 8, -- .cache_level = 1, -- .flags = (CRAT_CACHE_FLAGS_ENABLED | -- CRAT_CACHE_FLAGS_DATA_CACHE | -- CRAT_CACHE_FLAGS_SIMD_CACHE), -- .num_cu_shared = 2, -- }, -- -- /* TODO: Add L2 Cache information */ --}; -- -- --static struct kfd_gpu_cache_info carrizo_cache_info[] = { -- { -- /* TCP L1 Cache per CU */ -- .cache_size = 16, -- .cache_level = 1, -- .flags = (CRAT_CACHE_FLAGS_ENABLED | -- CRAT_CACHE_FLAGS_DATA_CACHE | -- CRAT_CACHE_FLAGS_SIMD_CACHE), -- .num_cu_shared = 1, -- }, -- { -- /* Scalar L1 Instruction Cache (in SQC module) per bank */ -- .cache_size = 8, -- .cache_level = 1, -- .flags = (CRAT_CACHE_FLAGS_ENABLED | -- CRAT_CACHE_FLAGS_INST_CACHE | -- CRAT_CACHE_FLAGS_SIMD_CACHE), -- .num_cu_shared = 4, -- }, -- { -- /* Scalar L1 Data Cache (in SQC module) per bank. 
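
One way to read these tables: cache_size is per cache instance in KiB and num_cu_shared says how many CUs sit behind one instance, so the number of cache entries emitted per type works out to roughly num_cu / num_cu_shared. A toy calculation under an assumed CU count; the CU count is illustrative, not taken from any real ASIC:

#include <stdio.h>
#include <stdint.h>

struct cache_desc {
        uint32_t cache_size_kib; /* per instance */
        uint32_t num_cu_shared;  /* CUs behind one instance */
};

int main(void)
{
        /* Carrizo-style L1 descriptors from the table above */
        const struct cache_desc descs[] = {
                { 16, 1 }, /* TCP L1 data, per CU */
                {  8, 4 }, /* scalar L1 instruction, per SQC bank */
                {  4, 4 }, /* scalar L1 data, per SQC bank */
        };
        const uint32_t num_cu = 8; /* assumed CU count, illustrative only */
        unsigned int i;

        for (i = 0; i < sizeof(descs) / sizeof(descs[0]); i++) {
                uint32_t instances = num_cu / descs[i].num_cu_shared;

                printf("type %u: %u instances, %u KiB total\n", i, instances,
                       instances * descs[i].cache_size_kib);
        }
        return 0;
}
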
*/ -- .cache_size = 4, -- .cache_level = 1, -- .flags = (CRAT_CACHE_FLAGS_ENABLED | -- CRAT_CACHE_FLAGS_DATA_CACHE | -- CRAT_CACHE_FLAGS_SIMD_CACHE), -- .num_cu_shared = 4, -- }, -- -- /* TODO: Add L2 Cache information */ --}; -- --/* NOTE: In future if more information is added to struct kfd_gpu_cache_info -- * the following ASICs may need a separate table. -- */ --#define hawaii_cache_info kaveri_cache_info --#define tonga_cache_info carrizo_cache_info --#define fiji_cache_info carrizo_cache_info --#define polaris10_cache_info carrizo_cache_info --#define polaris11_cache_info carrizo_cache_info --/* TODO - check & update Vega10 cache details */ --#define vega10_cache_info carrizo_cache_info --#define raven_cache_info carrizo_cache_info -- --static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, -- struct crat_subtype_computeunit *cu) --{ -- dev->node_props.cpu_cores_count = cu->num_cpu_cores; -- dev->node_props.cpu_core_id_base = cu->processor_id_low; --#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) -- if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) -- dev->node_props.capability |= HSA_CAP_ATS_PRESENT; --#endif -- -- pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, -- cu->processor_id_low); --} -- --static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, -- struct crat_subtype_computeunit *cu) --{ -- dev->node_props.simd_id_base = cu->processor_id_low; -- dev->node_props.simd_count = cu->num_simd_cores; -- dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; -- dev->node_props.max_waves_per_simd = cu->max_waves_simd; -- dev->node_props.wave_front_size = cu->wave_front_size; -- dev->node_props.array_count = cu->array_count; -- dev->node_props.cu_per_simd_array = cu->num_cu_per_array; -- dev->node_props.simd_per_cu = cu->num_simd_per_cu; -- dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; -- if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) -- dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; -- pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low); --} -- --/* kfd_parse_subtype_cu - parse compute unit subtypes and attach it to correct -- * topology device present in the device_list -- */ --static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu, -- struct list_head *device_list) --{ -- struct kfd_topology_device *dev; -- -- pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", -- cu->proximity_domain, cu->hsa_capability); -- list_for_each_entry(dev, device_list, list) { -- if (cu->proximity_domain == dev->proximity_domain) { -- if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) -- kfd_populated_cu_info_cpu(dev, cu); -- -- if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) -- kfd_populated_cu_info_gpu(dev, cu); -- break; -- } -- } -- -- return 0; --} -- --/* kfd_parse_subtype_mem - parse memory subtypes and attach it to correct -- * topology device present in the device_list -- */ --static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem, -- struct list_head *device_list) --{ -- struct kfd_mem_properties *props; -- struct kfd_topology_device *dev; -- -- pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n", -- mem->proximity_domain); -- list_for_each_entry(dev, device_list, list) { -- if (mem->proximity_domain == dev->proximity_domain) { -- props = kfd_alloc_struct(props); -- if (!props) -- return -ENOMEM; -- -- /* We're on GPU node */ -- if (dev->node_props.cpu_cores_count == 0) { -- /* APU */ -- if (mem->visibility_type == 0) -- props->heap_type = 
-- HSA_MEM_HEAP_TYPE_FB_PRIVATE; -- /* dGPU */ -- else -- props->heap_type = mem->visibility_type; -- } else -- props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; -- -- if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) -- props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; -- if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) -- props->flags |= HSA_MEM_FLAGS_NON_VOLATILE; -- -- props->size_in_bytes = -- ((uint64_t)mem->length_high << 32) + -- mem->length_low; -- props->width = mem->width; -- -- dev->node_props.mem_banks_count++; -- list_add_tail(&props->list, &dev->mem_props); -- -- break; -- } -- } -- -- return 0; --} -- --/* kfd_parse_subtype_cache - parse cache subtypes and attach it to correct -- * topology device present in the device_list -- */ --static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache, -- struct list_head *device_list) --{ -- struct kfd_cache_properties *props; -- struct kfd_topology_device *dev; -- uint32_t id; -- uint32_t total_num_of_cu; -- -- id = cache->processor_id_low; -- -- list_for_each_entry(dev, device_list, list) { -- total_num_of_cu = (dev->node_props.array_count * -- dev->node_props.cu_per_simd_array); -- -- /* Cache infomration in CRAT doesn't have proximity_domain -- * information as it is associated with a CPU core or GPU -- * Compute Unit. So map the cache using CPU core Id or SIMD -- * (GPU) ID. -- * TODO: This works because currently we can safely assume that -- * Compute Units are parsed before caches are parsed. In -- * future, remove this dependency -- */ -- if ((id >= dev->node_props.cpu_core_id_base && -- id <= dev->node_props.cpu_core_id_base + -- dev->node_props.cpu_cores_count) || -- (id >= dev->node_props.simd_id_base && -- id < dev->node_props.simd_id_base + -- total_num_of_cu)) { -- props = kfd_alloc_struct(props); -- if (!props) -- return -ENOMEM; -- -- props->processor_id_low = id; -- props->cache_level = cache->cache_level; -- props->cache_size = cache->cache_size; -- props->cacheline_size = cache->cache_line_size; -- props->cachelines_per_tag = cache->lines_per_tag; -- props->cache_assoc = cache->associativity; -- props->cache_latency = cache->cache_latency; -- memcpy(props->sibling_map, cache->sibling_map, -- sizeof(props->sibling_map)); -- -- if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) -- props->cache_type |= HSA_CACHE_TYPE_DATA; -- if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) -- props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; -- if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) -- props->cache_type |= HSA_CACHE_TYPE_CPU; -- if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) -- props->cache_type |= HSA_CACHE_TYPE_HSACU; -- -- dev->cache_count++; -- dev->node_props.caches_count++; -- list_add_tail(&props->list, &dev->cache_props); -- -- break; -- } -- } -- -- return 0; --} -- --/* kfd_parse_subtype_iolink - parse iolink subtypes and attach it to correct -- * topology device present in the device_list -- */ --static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink, -- struct list_head *device_list) --{ -- struct kfd_iolink_properties *props = NULL, *props2; -- struct kfd_topology_device *dev, *cpu_dev; -- uint32_t id_from; -- uint32_t id_to; -- -- id_from = iolink->proximity_domain_from; -- id_to = iolink->proximity_domain_to; -- -- pr_debug("Found IO link entry in CRAT table with id_from=%d\n", -- id_from); -- list_for_each_entry(dev, device_list, list) { -- if (id_from == dev->proximity_domain) { -- props = kfd_alloc_struct(props); -- if (!props) -- return -ENOMEM; -- -- props->node_from = id_from; -- props->node_to = 
id_to; -- props->ver_maj = iolink->version_major; -- props->ver_min = iolink->version_minor; -- props->iolink_type = iolink->io_interface_type; -- -- if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) -- props->weight = 20; -- else -- props->weight = node_distance(id_from, id_to); -- -- props->min_latency = iolink->minimum_latency; -- props->max_latency = iolink->maximum_latency; -- props->min_bandwidth = iolink->minimum_bandwidth_mbs; -- props->max_bandwidth = iolink->maximum_bandwidth_mbs; -- props->rec_transfer_size = -- iolink->recommended_transfer_size; -- -- dev->io_link_count++; -- dev->node_props.io_links_count++; -- list_add_tail(&props->list, &dev->io_link_props); -- break; -- } -- } -- -- /* CPU topology is created before GPUs are detected, so CPU->GPU -- * links are not built at that time. If a PCIe type is discovered, it -- * means a GPU is detected and we are adding GPU->CPU to the topology. -- * At this time, also add the corresponded CPU->GPU link. -- */ -- if (props && props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) { -- cpu_dev = kfd_topology_device_by_proximity_domain(id_to); -- if (!cpu_dev) -- return -ENODEV; -- /* same everything but the other direction */ -- props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL); -- props2->node_from = id_to; -- props2->node_to = id_from; -- props2->kobj = NULL; -- cpu_dev->io_link_count++; -- cpu_dev->node_props.io_links_count++; -- list_add_tail(&props2->list, &cpu_dev->io_link_props); -- } -- -- return 0; --} -- --/* kfd_parse_subtype - parse subtypes and attach it to correct topology device -- * present in the device_list -- * @sub_type_hdr - subtype section of crat_image -- * @device_list - list of topology devices present in this crat_image -- */ --static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr, -- struct list_head *device_list) --{ -- struct crat_subtype_computeunit *cu; -- struct crat_subtype_memory *mem; -- struct crat_subtype_cache *cache; -- struct crat_subtype_iolink *iolink; -- int ret = 0; -- -- switch (sub_type_hdr->type) { -- case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: -- cu = (struct crat_subtype_computeunit *)sub_type_hdr; -- ret = kfd_parse_subtype_cu(cu, device_list); -- break; -- case CRAT_SUBTYPE_MEMORY_AFFINITY: -- mem = (struct crat_subtype_memory *)sub_type_hdr; -- ret = kfd_parse_subtype_mem(mem, device_list); -- break; -- case CRAT_SUBTYPE_CACHE_AFFINITY: -- cache = (struct crat_subtype_cache *)sub_type_hdr; -- ret = kfd_parse_subtype_cache(cache, device_list); -- break; -- case CRAT_SUBTYPE_TLB_AFFINITY: -- /* For now, nothing to do here */ -- pr_debug("Found TLB entry in CRAT table (not processing)\n"); -- break; -- case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: -- /* For now, nothing to do here */ -- pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n"); -- break; -- case CRAT_SUBTYPE_IOLINK_AFFINITY: -- iolink = (struct crat_subtype_iolink *)sub_type_hdr; -- ret = kfd_parse_subtype_iolink(iolink, device_list); -- break; -- default: -- pr_warn("Unknown subtype %d in CRAT\n", -- sub_type_hdr->type); -- } -- -- return ret; --} -- --/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT -- * create a kfd_topology_device and add in to device_list. 
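
The table walk that follows relies on each subtype carrying its own length, so the parser can stride over entries it does not understand. A minimal sketch of that pattern, with the generic header reduced to the essential fields (names here are simplified stand-ins):

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct subtype_hdr {
        uint8_t type;
        uint8_t length; /* total bytes of this entry, header included */
        uint8_t flags;
};

#define FLAG_ENABLED 0x01

/* Stride over a packed sequence of variable-length entries. A zero
 * length would loop forever, so it is treated as corruption and stops
 * the walk, as the parser above does.
 */
static int walk(const uint8_t *image, size_t image_len)
{
        size_t off = 0;

        while (off + sizeof(struct subtype_hdr) <= image_len) {
                const struct subtype_hdr *hdr =
                        (const struct subtype_hdr *)(image + off);

                if (hdr->length == 0) {
                        fprintf(stderr, "corrupt subtype at offset %zu\n", off);
                        return -1;
                }
                if (hdr->flags & FLAG_ENABLED)
                        printf("entry type %u, %u bytes\n", hdr->type,
                               hdr->length);
                off += hdr->length;
        }
        return 0;
}
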
Also parse -- * CRAT subtypes and attach it to appropriate kfd_topology_device -- * @crat_image - input image containing CRAT -- * @device_list - [OUT] list of kfd_topology_device generated after -- * parsing crat_image -- * @proximity_domain - Proximity domain of the first device in the table -- * -- * Return - 0 if successful else -ve value -- */ --int kfd_parse_crat_table(void *crat_image, -- struct list_head *device_list, -- uint32_t proximity_domain) --{ -- struct kfd_topology_device *top_dev = NULL; -- struct crat_subtype_generic *sub_type_hdr; -- uint16_t node_id; -- int ret = 0; -- struct crat_header *crat_table = (struct crat_header *)crat_image; -- uint16_t num_nodes; -- uint32_t image_len; -- uint32_t last_header_type, last_header_length; -- -- if (!crat_image) -- return -EINVAL; -- -- if (!list_empty(device_list)) { -- pr_warn("Error device list should be empty\n"); -- return -EINVAL; -- } -- -- num_nodes = crat_table->num_domains; -- image_len = crat_table->length; -- -- pr_info("Parsing CRAT table with %d nodes\n", num_nodes); -- -- for (node_id = 0; node_id < num_nodes; node_id++) { -- top_dev = kfd_create_topology_device(device_list); -- if (!top_dev) -- break; -- top_dev->proximity_domain = proximity_domain++; -- } -- -- if (!top_dev) { -- ret = -ENOMEM; -- goto err; -- } -- -- memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH); -- memcpy(top_dev->oem_table_id, crat_table->oem_table_id, -- CRAT_OEMTABLEID_LENGTH); -- top_dev->oem_revision = crat_table->oem_revision; -- -- last_header_type = last_header_length = 0; -- sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); -- while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) < -- ((char *)crat_image) + image_len) { -- pr_debug("Parsing CRAT subtype header %p enabled: %s type: 0x%x length %d\n", -- sub_type_hdr, -- (sub_type_hdr->flags & -- CRAT_SUBTYPE_FLAGS_ENABLED) -- ? "true" : "false", -- sub_type_hdr->type, -- sub_type_hdr->length); -- -- if (sub_type_hdr->length == 0) { -- pr_err("Parsing wrong CRAT's subtype header last header type: %d last header len %d\n", -- last_header_type, last_header_type); -- pr_err("Current header type %d length %d\n", -- sub_type_hdr->type, sub_type_hdr->length); -- break; -- } -- -- if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) { -- ret = kfd_parse_subtype(sub_type_hdr, device_list); -- if (ret != 0) -- break; -- } -- -- last_header_type = sub_type_hdr->type; -- last_header_length = sub_type_hdr->length; -- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + -- sub_type_hdr->length); -- } -- --err: -- if (ret) -- kfd_release_topology_device_list(device_list); -- -- return ret; --} -- --/* Helper function. See kfd_fill_gpu_cache_info for parameter description */ --static int fill_in_pcache(struct crat_subtype_cache *pcache, -- struct kfd_gpu_cache_info *pcache_info, -- struct kfd_cu_info *cu_info, -- int mem_available, -- int cu_bitmask, -- int cache_type, unsigned int cu_processor_id, -- int cu_block) --{ -- unsigned int cu_sibling_map_mask; -- int first_active_cu; -- -- /* First check if enough memory is available */ -- if (mem_available - sizeof(struct crat_subtype_cache) < 0) -- return -ENOMEM; -- -- cu_sibling_map_mask = cu_bitmask; -- cu_sibling_map_mask >>= cu_block; -- cu_sibling_map_mask &= -- ((1 << pcache_info[cache_type].num_cu_shared) - 1); -- first_active_cu = ffs(cu_sibling_map_mask); -- -- /* CU could be inactive. In case of shared cache find the first active -- * CU. 
and incase of non-shared cache check if the CU is inactive. If -- * inactive active skip it -- */ -- if (first_active_cu) { -- memset(pcache, 0, sizeof(struct crat_subtype_cache)); -- pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY; -- pcache->length = sizeof(struct crat_subtype_cache); -- pcache->flags = pcache_info[cache_type].flags; -- pcache->processor_id_low = cu_processor_id -- + (first_active_cu - 1); -- pcache->cache_level = pcache_info[cache_type].cache_level; -- pcache->cache_size = pcache_info[cache_type].cache_size; -- -- /* Sibling map is w.r.t processor_id_low, so shift out -- * inactive CU -- */ -- cu_sibling_map_mask = -- cu_sibling_map_mask >> (first_active_cu - 1); -- -- pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF); -- pcache->sibling_map[1] = -- (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF); -- pcache->sibling_map[2] = -- (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF); -- pcache->sibling_map[3] = -- (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF); -- return 0; -- } -- return 1; --} -- --/* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info -- * tables -- * -- * @kdev - [IN] GPU device -- * @gpu_processor_id - [IN] GPU processor ID to which these caches -- * associate -- * @available_size - [IN] Amount of memory available in pcache -- * @cu_info - [IN] Compute Unit info obtained from KGD -- * @pcache - [OUT] memory into which cache data is to be filled in. -- * @size_filled - [OUT] amount of data used up in pcache. -- * @num_of_entries - [OUT] number of caches added -- */ --static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, -- int gpu_processor_id, -- int available_size, -- struct kfd_cu_info *cu_info, -- struct crat_subtype_cache *pcache, -- int *size_filled, -- int *num_of_entries) --{ -- struct kfd_gpu_cache_info *pcache_info; -- int num_of_cache_types = 0; -- int i, j, k; -- int ct = 0; -- int mem_available = available_size; -- unsigned int cu_processor_id; -- int ret; -- -- switch (kdev->device_info->asic_family) { -- case CHIP_KAVERI: -- pcache_info = kaveri_cache_info; -- num_of_cache_types = ARRAY_SIZE(kaveri_cache_info); -- break; -- case CHIP_HAWAII: -- pcache_info = hawaii_cache_info; -- num_of_cache_types = ARRAY_SIZE(hawaii_cache_info); -- break; -- case CHIP_CARRIZO: -- pcache_info = carrizo_cache_info; -- num_of_cache_types = ARRAY_SIZE(carrizo_cache_info); -- break; -- case CHIP_TONGA: -- pcache_info = tonga_cache_info; -- num_of_cache_types = ARRAY_SIZE(tonga_cache_info); -- break; -- case CHIP_FIJI: -- pcache_info = fiji_cache_info; -- num_of_cache_types = ARRAY_SIZE(fiji_cache_info); -- break; -- case CHIP_POLARIS10: -- pcache_info = polaris10_cache_info; -- num_of_cache_types = ARRAY_SIZE(polaris10_cache_info); -- break; -- case CHIP_POLARIS11: -- pcache_info = polaris11_cache_info; -- num_of_cache_types = ARRAY_SIZE(polaris11_cache_info); -- break; -- case CHIP_VEGA10: -- pcache_info = vega10_cache_info; -- num_of_cache_types = ARRAY_SIZE(vega10_cache_info); -- break; -- case CHIP_RAVEN: -- pcache_info = raven_cache_info; -- num_of_cache_types = ARRAY_SIZE(raven_cache_info); -- break; -- default: -- return -EINVAL; -- } -- -- *size_filled = 0; -- *num_of_entries = 0; -- -- /* For each type of cache listed in the kfd_gpu_cache_info table, -- * go through all available Compute Units. 
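
The sibling-map construction in fill_in_pcache above boils down to a few bit operations on the CU bitmap: shift the group into place, mask it to num_cu_shared bits, and renormalize so bit 0 is the first active CU. A standalone sketch of just that math, where POSIX ffs() from <strings.h> stands in for the kernel's ffs:

#include <stdint.h>
#include <stdio.h>
#include <strings.h>

/* Extract the share group starting at cu_block from a CU bitmap and
 * normalize it so bit 0 is the first *active* CU, as fill_in_pcache
 * does before writing the sibling map. Returns 0 when every CU in the
 * group is masked off and the cache entry should be skipped.
 */
static int sibling_mask(uint32_t cu_bitmap, int cu_block, int num_cu_shared,
                        uint32_t *mask_out)
{
        uint32_t mask = (cu_bitmap >> cu_block) &
                        ((1u << num_cu_shared) - 1);
        int first_active = ffs((int)mask); /* 1-based, 0 if group empty */

        if (!first_active)
                return 0;

        *mask_out = mask >> (first_active - 1);
        return 1;
}

int main(void)
{
        uint32_t m;

        /* CUs 0..7 with CU 4 fused off: group of 4 starting at CU 4 */
        if (sibling_mask(0xEFu, 4, 4, &m))
                printf("sibling map 0x%x\n", m); /* prints 0x7 */
        return 0;
}
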
-- * The [i,j,k] loop will -- * if kfd_gpu_cache_info.num_cu_shared = 1 -- * will parse through all available CU -- * If (kfd_gpu_cache_info.num_cu_shared != 1) -- * then it will consider only one CU from -- * the shared unit -- */ -- -- for (ct = 0; ct < num_of_cache_types; ct++) { -- cu_processor_id = gpu_processor_id; -- for (i = 0; i < cu_info->num_shader_engines; i++) { -- for (j = 0; j < cu_info->num_shader_arrays_per_engine; -- j++) { -- for (k = 0; k < cu_info->num_cu_per_sh; -- k += pcache_info[ct].num_cu_shared) { -- -- ret = fill_in_pcache(pcache, -- pcache_info, -- cu_info, -- mem_available, -- cu_info->cu_bitmap[i][j], -- ct, -- cu_processor_id, -- k); -- -- if (ret < 0) -- break; -- -- if (!ret) { -- pcache++; -- (*num_of_entries)++; -- mem_available -= -- sizeof(*pcache); -- (*size_filled) += -- sizeof(*pcache); -- } -- -- /* Move to next CU block */ -- cu_processor_id += -- pcache_info[ct].num_cu_shared; -- } -- } -- } -- } -- -- pr_debug("Added [%d] GPU cache entries\n", *num_of_entries); -- -- return 0; --} -- --/* -- * kfd_create_crat_image_acpi - Allocates memory for CRAT image and -- * copies CRAT from ACPI (if available). -- * NOTE: Call kfd_destroy_crat_image to free CRAT image memory -- * -- * @crat_image: CRAT read from ACPI. If no CRAT in ACPI then -- * crat_image will be NULL -- * @size: [OUT] size of crat_image -- * -- * Return 0 if successful else return -ve value -- */ --#ifdef CONFIG_ACPI --int kfd_create_crat_image_acpi(void **crat_image, size_t *size) --{ -- struct acpi_table_header *crat_table; -- acpi_status status; -- void *pcrat_image; -- -- if (!crat_image) -- return -EINVAL; -- -- *crat_image = NULL; -- -- /* Fetch the CRAT table from ACPI */ -- status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); -- if (status == AE_NOT_FOUND) { -- pr_warn("CRAT table not found\n"); -- return -ENODATA; -- } else if (ACPI_FAILURE(status)) { -- const char *err = acpi_format_exception(status); -- -- pr_err("CRAT table error: %s\n", err); -- return -EINVAL; -- } -- -- if (ignore_crat) { -- pr_info("CRAT table disabled by module option\n"); -- return -ENODATA; -- } -- -- pcrat_image = kmalloc(crat_table->length, GFP_KERNEL); -- if (!pcrat_image) { -- pr_err("No memory for allocating CRAT image\n"); -- return -ENOMEM; -- } -- -- memcpy(pcrat_image, crat_table, crat_table->length); -- -- *crat_image = pcrat_image; -- *size = crat_table->length; -- -- return 0; --} --#endif -- --/* Memory required to create Virtual CRAT. -- * Since there is no easy way to predict the amount of memory required, the -- * following amount are allocated for CPU and GPU Virtual CRAT. This is -- * expected to cover all known conditions. But to be safe additional check -- * is put in the code to ensure we don't overwrite. 
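
That extra check is the recurring guard each fill helper opens with: reserve the entry's size from the running budget up front, and bail out before touching memory if the budget goes negative. A sketch of the pattern in isolation, with a hypothetical entry type:

#include <string.h>
#include <errno.h>

struct entry { char payload[64]; }; /* stand-in for a CRAT subtype */

/* Reserve space for one entry from the running budget before writing
 * it. avail_size may legitimately reach exactly 0, which is why the
 * test is "< 0" after the subtraction rather than "<= size" before it.
 */
static int fill_entry(int *avail_size, struct entry *out)
{
        *avail_size -= (int)sizeof(struct entry);
        if (*avail_size < 0)
                return -ENOMEM;

        memset(out, 0, sizeof(*out));
        /* ... populate *out ... */
        return 0;
}
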
-- */ --#define VCRAT_SIZE_FOR_CPU (2 * PAGE_SIZE) --#define VCRAT_SIZE_FOR_GPU (3 * PAGE_SIZE) -- --/* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node -- * -- * @numa_node_id: CPU NUMA node id -- * @avail_size: Available size in the memory -- * @sub_type_hdr: Memory into which compute info will be filled in -- * -- * Return 0 if successful else return -ve value -- */ --static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size, -- int proximity_domain, -- struct crat_subtype_computeunit *sub_type_hdr) --{ -- const struct cpumask *cpumask; -- -- *avail_size -= sizeof(struct crat_subtype_computeunit); -- if (*avail_size < 0) -- return -ENOMEM; -- -- memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); -- -- /* Fill in subtype header data */ -- sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; -- sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); -- sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; -- -- cpumask = cpumask_of_node(numa_node_id); -- -- /* Fill in CU data */ -- sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT; -- sub_type_hdr->proximity_domain = proximity_domain; -- sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id); -- if (sub_type_hdr->processor_id_low == -1) -- return -EINVAL; -- -- sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask); -- -- return 0; --} -- --/* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node -- * -- * @numa_node_id: CPU NUMA node id -- * @avail_size: Available size in the memory -- * @sub_type_hdr: Memory into which compute info will be filled in -- * -- * Return 0 if successful else return -ve value -- */ --static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size, -- int proximity_domain, -- struct crat_subtype_memory *sub_type_hdr) --{ -- uint64_t mem_in_bytes = 0; -- pg_data_t *pgdat; -- int zone_type; -- -- *avail_size -= sizeof(struct crat_subtype_memory); -- if (*avail_size < 0) -- return -ENOMEM; -- -- memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); -- -- /* Fill in subtype header data */ -- sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; -- sub_type_hdr->length = sizeof(struct crat_subtype_memory); -- sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; -- -- /* Fill in Memory Subunit data */ -- -- /* Unlike si_meminfo, si_meminfo_node is not exported. 
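
What that unexported helper computes is just the sum of each zone's managed page counts on the node, scaled to bytes by the page size. A self-contained sketch of the arithmetic; the zone counts are made-up inputs:

#include <stdint.h>
#include <stdio.h>

#define MAX_NR_ZONES 4
#define PAGE_SHIFT   12 /* 4 KiB pages assumed */

int main(void)
{
        /* managed pages per zone for one NUMA node, illustrative values */
        const uint64_t managed_pages[MAX_NR_ZONES] = { 4096, 1048576, 0, 0 };
        uint64_t mem_in_bytes = 0;
        int z;

        for (z = 0; z < MAX_NR_ZONES; z++)
                mem_in_bytes += managed_pages[z];
        mem_in_bytes <<= PAGE_SHIFT;

        printf("node memory: %llu bytes\n", (unsigned long long)mem_in_bytes);
        return 0;
}
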
So -- * the following lines are duplicated from si_meminfo_node -- * function -- */ -- pgdat = NODE_DATA(numa_node_id); -- for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) -- mem_in_bytes += pgdat->node_zones[zone_type].managed_pages; -- mem_in_bytes <<= PAGE_SHIFT; -- -- sub_type_hdr->length_low = lower_32_bits(mem_in_bytes); -- sub_type_hdr->length_high = upper_32_bits(mem_in_bytes); -- sub_type_hdr->proximity_domain = proximity_domain; -- -- return 0; --} -- --#ifdef CONFIG_X86_64 --static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size, -- uint32_t *num_entries, -- struct crat_subtype_iolink *sub_type_hdr) --{ -- int nid; -- struct cpuinfo_x86 *c = &cpu_data(0); -- uint8_t link_type; -- -- if (c->x86_vendor == X86_VENDOR_AMD) -- link_type = CRAT_IOLINK_TYPE_HYPERTRANSPORT; -- else -- link_type = CRAT_IOLINK_TYPE_QPI_1_1; -- -- *num_entries = 0; -- -- /* Create IO links from this node to other CPU nodes */ -- for_each_online_node(nid) { -- if (nid == numa_node_id) /* node itself */ -- continue; -- -- *avail_size -= sizeof(struct crat_subtype_iolink); -- if (*avail_size < 0) -- return -ENOMEM; -- -- memset(sub_type_hdr, 0, sizeof(struct crat_subtype_iolink)); -- -- /* Fill in subtype header data */ -- sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY; -- sub_type_hdr->length = sizeof(struct crat_subtype_iolink); -- sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; -- -- /* Fill in IO link data */ -- sub_type_hdr->proximity_domain_from = numa_node_id; -- sub_type_hdr->proximity_domain_to = nid; -- sub_type_hdr->io_interface_type = link_type; -- -- (*num_entries)++; -- sub_type_hdr++; -- } -- -- return 0; --} --#endif -- --/* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU -- * -- * @pcrat_image: Fill in VCRAT for CPU -- * @size: [IN] allocated size of crat_image. -- * [OUT] actual size of data filled in crat_image -- */ --static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size) --{ -- struct crat_header *crat_table = (struct crat_header *)pcrat_image; -- struct crat_subtype_generic *sub_type_hdr; -- int avail_size = *size; -- int numa_node_id; -- int ret = 0; --#ifdef CONFIG_ACPI -- struct acpi_table_header *acpi_table; -- acpi_status status; --#endif --#ifdef CONFIG_X86_64 -- uint32_t entries = 0; --#endif -- -- if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_CPU) -- return -EINVAL; -- -- /* Fill in CRAT Header. -- * Modify length and total_entries as subunits are added. 
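
Concretely, each successful fill below is followed by the same bookkeeping step: grow the header's length, bump total_entries, and advance the write cursor by the entry's own length. A sketch of that append step with the structures trimmed to the fields involved:

#include <stdint.h>

struct table_hdr {
        uint32_t length;        /* bytes used so far, header included */
        uint32_t total_entries;
};

struct subtype_hdr {
        uint8_t type;
        uint8_t length;         /* bytes of this entry */
};

/* Account for one just-written entry and return the next write
 * position. The cursor moves by the entry's self-declared length, so
 * fixed- and variable-sized subtypes are handled the same way.
 */
static struct subtype_hdr *commit_entry(struct table_hdr *table,
                                        struct subtype_hdr *entry)
{
        table->length += entry->length;
        table->total_entries++;
        return (struct subtype_hdr *)((char *)entry + entry->length);
}
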
-- */ -- avail_size -= sizeof(struct crat_header); -- if (avail_size < 0) -- return -ENOMEM; -- -- memset(crat_table, 0, sizeof(struct crat_header)); -- memcpy(&crat_table->signature, CRAT_SIGNATURE, -- sizeof(crat_table->signature)); -- crat_table->length = sizeof(struct crat_header); -- --#ifdef CONFIG_ACPI -- status = acpi_get_table("DSDT", 0, &acpi_table); -- if (status == AE_NOT_FOUND) -- pr_warn("DSDT table not found for OEM information\n"); -- else { -- crat_table->oem_revision = acpi_table->revision; -- memcpy(crat_table->oem_id, acpi_table->oem_id, -- CRAT_OEMID_LENGTH); -- memcpy(crat_table->oem_table_id, acpi_table->oem_table_id, -- CRAT_OEMTABLEID_LENGTH); -- } --#else -- crat_table->oem_revision = 0; -- memcpy(crat_table->oem_id, "INV", CRAT_OEMID_LENGTH); -- memcpy(crat_table->oem_table_id, "UNAVAIL", CRAT_OEMTABLEID_LENGTH); --#endif -- crat_table->total_entries = 0; -- crat_table->num_domains = 0; -- -- sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); -- -- for_each_online_node(numa_node_id) { -- if (kfd_numa_node_to_apic_id(numa_node_id) == -1) -- continue; -- -- /* Fill in Subtype: Compute Unit */ -- ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size, -- crat_table->num_domains, -- (struct crat_subtype_computeunit *)sub_type_hdr); -- if (ret < 0) -- return ret; -- crat_table->length += sub_type_hdr->length; -- crat_table->total_entries++; -- -- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + -- sub_type_hdr->length); -- -- /* Fill in Subtype: Memory */ -- ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size, -- crat_table->num_domains, -- (struct crat_subtype_memory *)sub_type_hdr); -- if (ret < 0) -- return ret; -- crat_table->length += sub_type_hdr->length; -- crat_table->total_entries++; -- -- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + -- sub_type_hdr->length); -- -- /* Fill in Subtype: IO Link */ --#ifdef CONFIG_X86_64 -- ret = kfd_fill_iolink_info_for_cpu(numa_node_id, &avail_size, -- &entries, -- (struct crat_subtype_iolink *)sub_type_hdr); -- if (ret < 0) -- return ret; -- crat_table->length += (sub_type_hdr->length * entries); -- crat_table->total_entries += entries; -- -- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + -- sub_type_hdr->length * entries); --#else -- pr_info("IO link not available for non x86 platforms\n"); --#endif -- -- crat_table->num_domains++; -- } -- -- /* TODO: Add cache Subtype for CPU. -- * Currently, CPU cache information is available in function -- * detect_cache_attributes(cpu) defined in the file -- * ./arch/x86/kernel/cpu/intel_cacheinfo.c. This function is not -- * exported and to get the same information the code needs to be -- * duplicated. 
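
The CPU-to-CPU link pass above emits one entry per ordered pair of distinct online nodes, so a machine with N nodes contributes N * (N - 1) links in total. A toy generator for that full mesh, assuming a contiguous set of online nodes and ignoring the link-type selection:

#include <stdio.h>

/* Emit one directed IO link per ordered pair of distinct nodes,
 * matching the for_each_online_node loop above. With num_nodes = N
 * this prints N * (N - 1) links.
 */
static void emit_cpu_links(int num_nodes)
{
        int from, to;

        for (from = 0; from < num_nodes; from++)
                for (to = 0; to < num_nodes; to++)
                        if (to != from)
                                printf("iolink %d -> %d\n", from, to);
}

int main(void)
{
        emit_cpu_links(2); /* prints 0 -> 1 and 1 -> 0 */
        return 0;
}
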
-- */ -- -- *size = crat_table->length; -- pr_info("Virtual CRAT table created for CPU\n"); -- -- return 0; --} -- --static int kfd_fill_gpu_memory_affinity(int *avail_size, -- struct kfd_dev *kdev, uint8_t type, uint64_t size, -- struct crat_subtype_memory *sub_type_hdr, -- uint32_t proximity_domain, -- const struct kfd_local_mem_info *local_mem_info) --{ -- *avail_size -= sizeof(struct crat_subtype_memory); -- if (*avail_size < 0) -- return -ENOMEM; -- -- memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); -- sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; -- sub_type_hdr->length = sizeof(struct crat_subtype_memory); -- sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED; -- -- sub_type_hdr->proximity_domain = proximity_domain; -- -- pr_debug("Fill gpu memory affinity - type 0x%x size 0x%llx\n", -- type, size); -- -- sub_type_hdr->length_low = lower_32_bits(size); -- sub_type_hdr->length_high = upper_32_bits(size); -- -- sub_type_hdr->width = local_mem_info->vram_width; -- sub_type_hdr->visibility_type = type; -- -- return 0; --} -- --/* kfd_fill_gpu_direct_io_link - Fill in direct io link from GPU -- * to its NUMA node -- * @avail_size: Available size in the memory -- * @kdev - [IN] GPU device -- * @sub_type_hdr: Memory into which io link info will be filled in -- * @proximity_domain - proximity domain of the GPU node -- * -- * Return 0 if successful else return -ve value -- */ --static int kfd_fill_gpu_direct_io_link(int *avail_size, -- struct kfd_dev *kdev, -- struct crat_subtype_iolink *sub_type_hdr, -- uint32_t proximity_domain) --{ -- *avail_size -= sizeof(struct crat_subtype_iolink); -- if (*avail_size < 0) -- return -ENOMEM; -- -- memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink)); -- -- /* Fill in subtype header data */ -- sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY; -- sub_type_hdr->length = sizeof(struct crat_subtype_iolink); -- sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED; -- -- /* Fill in IOLINK subtype. -- * TODO: Fill-in other fields of iolink subtype -- */ -- sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS; -- sub_type_hdr->proximity_domain_from = proximity_domain; --#ifdef CONFIG_NUMA -- if (kdev->pdev->dev.numa_node == NUMA_NO_NODE) -- sub_type_hdr->proximity_domain_to = 0; -- else -- sub_type_hdr->proximity_domain_to = kdev->pdev->dev.numa_node; --#else -- sub_type_hdr->proximity_domain_to = 0; --#endif -- return 0; --} -- --/* kfd_create_vcrat_image_gpu - Create Virtual CRAT for CPU -- * -- * @pcrat_image: Fill in VCRAT for GPU -- * @size: [IN] allocated size of crat_image. -- * [OUT] actual size of data filled in crat_image -- */ --static int kfd_create_vcrat_image_gpu(void *pcrat_image, -- size_t *size, struct kfd_dev *kdev, -- uint32_t proximity_domain) --{ -- struct crat_header *crat_table = (struct crat_header *)pcrat_image; -- struct crat_subtype_generic *sub_type_hdr; -- struct crat_subtype_computeunit *cu; -- struct kfd_cu_info cu_info; -- int avail_size = *size; -- uint32_t total_num_of_cu; -- int num_of_cache_entries = 0; -- int cache_mem_filled = 0; -- int ret = 0; --#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) -- struct amd_iommu_device_info iommu_info; -- const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP | -- AMD_IOMMU_DEVICE_FLAG_PRI_SUP | -- AMD_IOMMU_DEVICE_FLAG_PASID_SUP; --#endif -- struct kfd_local_mem_info local_mem_info; -- -- if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU) -- return -EINVAL; -- -- /* Fill the CRAT Header. 
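The proximity_domain_to fallback in kfd_fill_gpu_direct_io_link above, modeled in isolation: a PCI device with no NUMA affinity reports NUMA_NO_NODE (-1), which must not leak into the unsigned CRAT field, so the link is attributed to domain 0. A sketch under that one assumption:

    #include <stdint.h>
    #include <stdio.h>

    #define NUMA_NO_NODE (-1)

    static uint32_t iolink_domain_to(int dev_numa_node)
    {
        /* Clamp "no affinity" to domain 0 instead of a negative id. */
        return dev_numa_node == NUMA_NO_NODE ? 0 : (uint32_t)dev_numa_node;
    }

    int main(void)
    {
        printf("%u\n", iolink_domain_to(NUMA_NO_NODE)); /* -> 0 */
        printf("%u\n", iolink_domain_to(2));            /* -> 2 */
        return 0;
    }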
-- * Modify length and total_entries as subunits are added. -- */ -- avail_size -= sizeof(struct crat_header); -- if (avail_size < 0) -- return -ENOMEM; -- -- memset(crat_table, 0, sizeof(struct crat_header)); -- -- memcpy(&crat_table->signature, CRAT_SIGNATURE, -- sizeof(crat_table->signature)); -- /* Change length as we add more subtypes*/ -- crat_table->length = sizeof(struct crat_header); -- crat_table->num_domains = 1; -- crat_table->total_entries = 0; -- -- /* Fill in Subtype: Compute Unit -- * First fill in the sub type header and then sub type data -- */ -- avail_size -= sizeof(struct crat_subtype_computeunit); -- if (avail_size < 0) -- return -ENOMEM; -- -- sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1); -- memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); -- -- sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; -- sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); -- sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; -- -- /* Fill CU subtype data */ -- cu = (struct crat_subtype_computeunit *)sub_type_hdr; -- cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT; -- cu->proximity_domain = proximity_domain; -- -- kdev->kfd2kgd->get_cu_info(kdev->kgd, &cu_info); -- cu->num_simd_per_cu = cu_info.simd_per_cu; -- cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number; -- cu->max_waves_simd = cu_info.max_waves_per_simd; -- -- cu->wave_front_size = cu_info.wave_front_size; -- cu->array_count = cu_info.num_shader_arrays_per_engine * -- cu_info.num_shader_engines; -- total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh); -- cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu); -- cu->num_cu_per_array = cu_info.num_cu_per_sh; -- cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu; -- cu->num_banks = cu_info.num_shader_engines; -- cu->lds_size_in_kb = cu_info.lds_size; -- -- cu->hsa_capability = 0; -- -- /* Check if this node supports IOMMU. During parsing this flag will -- * translate to HSA_CAP_ATS_PRESENT -- */ --#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) -- iommu_info.flags = 0; -- if (amd_iommu_device_info(kdev->pdev, &iommu_info) == 0) { -- if ((iommu_info.flags & required_iommu_flags) == -- required_iommu_flags) -- cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT; -- } --#endif -- -- crat_table->length += sub_type_hdr->length; -- crat_table->total_entries++; -- -- /* Fill in Subtype: Memory. Only on systems with large BAR (no -- * private FB), report memory as public. On other systems -- * report the total FB size (public+private) as a single -- * private heap. 
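A worked example of the compute-unit arithmetic above, using made-up Fiji-like topology numbers; the real values come from the get_cu_info callback, so everything below is illustrative only.

    #include <stdio.h>

    int main(void)
    {
        unsigned se = 4, arrays_per_se = 1, cu_per_array = 16, simd_per_cu = 4;
        unsigned active_cu = 64;

        unsigned array_count = arrays_per_se * se;          /* 4   */
        unsigned total_cu    = array_count * cu_per_array;  /* 64  */
        unsigned simd_cores  = simd_per_cu * active_cu;     /* 256 */

        printf("arrays=%u total_cu=%u simd_cores=%u\n",
               array_count, total_cu, simd_cores);
        return 0;
    }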
-- */ -- kdev->kfd2kgd->get_local_mem_info(kdev->kgd, &local_mem_info); -- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + -- sub_type_hdr->length); -- -- if (debug_largebar) -- local_mem_info.local_mem_size_private = 0; -- -- if (local_mem_info.local_mem_size_private == 0) -- ret = kfd_fill_gpu_memory_affinity(&avail_size, -- kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC, -- local_mem_info.local_mem_size_public, -- (struct crat_subtype_memory *)sub_type_hdr, -- proximity_domain, -- &local_mem_info); -- else -- ret = kfd_fill_gpu_memory_affinity(&avail_size, -- kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE, -- local_mem_info.local_mem_size_public + -- local_mem_info.local_mem_size_private, -- (struct crat_subtype_memory *)sub_type_hdr, -- proximity_domain, -- &local_mem_info); -- if (ret < 0) -- return ret; -- -- crat_table->length += sizeof(struct crat_subtype_memory); -- crat_table->total_entries++; -- -- /* TODO: Fill in cache information. This information is NOT readily -- * available in KGD -- */ -- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + -- sub_type_hdr->length); -- ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low, -- avail_size, -- &cu_info, -- (struct crat_subtype_cache *)sub_type_hdr, -- &cache_mem_filled, -- &num_of_cache_entries); -- -- if (ret < 0) -- return ret; -- -- crat_table->length += cache_mem_filled; -- crat_table->total_entries += num_of_cache_entries; -- avail_size -= cache_mem_filled; -- -- /* Fill in Subtype: IO_LINKS -- * Only direct links are added here which is Link from GPU to -- * to its NUMA node. Indirect links are added by userspace. -- */ -- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + -- cache_mem_filled); -- ret = kfd_fill_gpu_direct_io_link(&avail_size, kdev, -- (struct crat_subtype_iolink *)sub_type_hdr, proximity_domain); -- -- if (ret < 0) -- return ret; -- -- crat_table->length += sub_type_hdr->length; -- crat_table->total_entries++; -- -- *size = crat_table->length; -- pr_info("Virtual CRAT table created for GPU\n"); -- -- return ret; --} -- --/* kfd_create_crat_image_virtual - Allocates memory for CRAT image and -- * creates a Virtual CRAT (VCRAT) image -- * -- * NOTE: Call kfd_destroy_crat_image to free CRAT image memory -- * -- * @crat_image: VCRAT image created because ACPI does not have a -- * CRAT for this device -- * @size: [OUT] size of virtual crat_image -- * @flags: COMPUTE_UNIT_CPU - Create VCRAT for CPU device -- * COMPUTE_UNIT_GPU - Create VCRAT for GPU -- * (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU -- * -- this option is not currently implemented. -- * The assumption is that all AMD APUs will have CRAT -- * @kdev: Valid kfd_device required if flags contain COMPUTE_UNIT_GPU -- * -- * Return 0 if successful else return -ve value -- */ --int kfd_create_crat_image_virtual(void **crat_image, size_t *size, -- int flags, struct kfd_dev *kdev, uint32_t proximity_domain) --{ -- void *pcrat_image = NULL; -- int ret = 0; -- -- if (!crat_image) -- return -EINVAL; -- -- *crat_image = NULL; -- -- /* Allocate one VCRAT_SIZE_FOR_CPU for CPU virtual CRAT image and -- * VCRAT_SIZE_FOR_GPU for GPU virtual CRAT image. This should cover -- * all the current conditions. 
A check is put not to overwrite beyond -- * allocated size -- */ -- switch (flags) { -- case COMPUTE_UNIT_CPU: -- pcrat_image = kmalloc(VCRAT_SIZE_FOR_CPU, GFP_KERNEL); -- if (!pcrat_image) -- return -ENOMEM; -- *size = VCRAT_SIZE_FOR_CPU; -- ret = kfd_create_vcrat_image_cpu(pcrat_image, size); -- break; -- case COMPUTE_UNIT_GPU: -- if (!kdev) -- return -EINVAL; -- pcrat_image = kmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL); -- if (!pcrat_image) -- return -ENOMEM; -- *size = VCRAT_SIZE_FOR_GPU; -- ret = kfd_create_vcrat_image_gpu(pcrat_image, size, -- kdev, proximity_domain); -- break; -- case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU): -- /* TODO: */ -- ret = -EINVAL; -- pr_err("VCRAT not implemented for APU\n"); -- break; -- default: -- ret = -EINVAL; -- } -- -- if (!ret) -- *crat_image = pcrat_image; -- else -- kfree(pcrat_image); -- -- return ret; --} -- -- --/* kfd_destroy_crat_image -- * -- * @crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..) -- * -- */ --void kfd_destroy_crat_image(void *crat_image) --{ -- kfree(crat_image); --} -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h -index 00de41f..a374fa3 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h -@@ -24,7 +24,6 @@ - #define KFD_CRAT_H_INCLUDED - - #include <linux/types.h> --#include "kfd_priv.h" - - #pragma pack(1) - -@@ -45,10 +44,6 @@ - - #define CRAT_OEMID_64BIT_MASK ((1ULL << (CRAT_OEMID_LENGTH * 8)) - 1) - --/* Compute Unit flags */ --#define COMPUTE_UNIT_CPU (1 << 0) /* Create Virtual CRAT for CPU */ --#define COMPUTE_UNIT_GPU (1 << 1) /* Create Virtual CRAT for GPU */ -- - struct crat_header { - uint32_t signature; - uint32_t length; -@@ -110,7 +105,7 @@ struct crat_subtype_computeunit { - uint8_t wave_front_size; - uint8_t num_banks; - uint16_t micro_engine_id; -- uint8_t array_count; -+ uint8_t num_arrays; - uint8_t num_cu_per_array; - uint8_t num_simd_per_cu; - uint8_t max_slots_scatch_cu; -@@ -132,14 +127,13 @@ struct crat_subtype_memory { - uint8_t length; - uint16_t reserved; - uint32_t flags; -- uint32_t proximity_domain; -+ uint32_t promixity_domain; - uint32_t base_addr_low; - uint32_t base_addr_high; - uint32_t length_low; - uint32_t length_high; - uint32_t width; -- uint8_t visibility_type; /* for virtual (dGPU) CRAT */ -- uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH - 1]; -+ uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH]; - }; - - /* -@@ -228,12 +222,9 @@ struct crat_subtype_ccompute { - /* - * HSA IO Link Affinity structure and definitions - */ --#define CRAT_IOLINK_FLAGS_ENABLED (1 << 0) --#define CRAT_IOLINK_FLAGS_NON_COHERENT (1 << 1) --#define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT (1 << 2) --#define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT (1 << 3) --#define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA (1 << 4) --#define CRAT_IOLINK_FLAGS_RESERVED_MASK 0xffffffe0 -+#define CRAT_IOLINK_FLAGS_ENABLED 0x00000001 -+#define CRAT_IOLINK_FLAGS_COHERENCY 0x00000002 -+#define CRAT_IOLINK_FLAGS_RESERVED 0xfffffffc - - /* - * IO interface types -@@ -241,16 +232,8 @@ struct crat_subtype_ccompute { - #define CRAT_IOLINK_TYPE_UNDEFINED 0 - #define CRAT_IOLINK_TYPE_HYPERTRANSPORT 1 - #define CRAT_IOLINK_TYPE_PCIEXPRESS 2 --#define CRAT_IOLINK_TYPE_AMBA 3 --#define CRAT_IOLINK_TYPE_MIPI 4 --#define CRAT_IOLINK_TYPE_QPI_1_1 5 --#define CRAT_IOLINK_TYPE_RESERVED1 6 --#define CRAT_IOLINK_TYPE_RESERVED2 7 --#define CRAT_IOLINK_TYPE_RAPID_IO 8 --#define CRAT_IOLINK_TYPE_INFINIBAND 9 --#define CRAT_IOLINK_TYPE_RESERVED3 10 --#define 
CRAT_IOLINK_TYPE_OTHER 11 --#define CRAT_IOLINK_TYPE_MAX 255 -+#define CRAT_IOLINK_TYPE_OTHER 3 -+#define CRAT_IOLINK_TYPE_MAX 255 - - #define CRAT_IOLINK_RESERVED_LENGTH 24 - -@@ -308,13 +291,4 @@ struct cdit_header { - - #pragma pack() - --#ifdef CONFIG_ACPI --int kfd_create_crat_image_acpi(void **crat_image, size_t *size); --#endif --void kfd_destroy_crat_image(void *crat_image); --int kfd_parse_crat_table(void *crat_image, -- struct list_head *device_list, -- uint32_t proximity_domain); --int kfd_create_crat_image_virtual(void **crat_image, size_t *size, -- int flags, struct kfd_dev *kdev, uint32_t proximity_domain); - #endif /* KFD_CRAT_H_INCLUDED */ -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c -index af6d736..d5e19b5 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c -@@ -29,7 +29,7 @@ - #include <linux/mutex.h> - #include <linux/device.h> - --#include "kfd_pm4_headers_vi.h" -+#include "kfd_pm4_headers.h" - #include "kfd_pm4_headers_diq.h" - #include "kfd_kernel_queue.h" - #include "kfd_priv.h" -@@ -42,15 +42,16 @@ - - static void dbgdev_address_watch_disable_nodiq(struct kfd_dev *dev) - { -+ BUG_ON(!dev || !dev->kfd2kgd); -+ - dev->kfd2kgd->address_watch_disable(dev->kgd); - } - - static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, - unsigned int pasid, uint64_t vmid0_address, -- uint32_t *packet_buff, size_t size_in_bytes, -- bool sync) -+ uint32_t *packet_buff, size_t size_in_bytes) - { -- struct pm4_mec_release_mem *rm_packet; -+ struct pm4__release_mem *rm_packet; - struct pm4__indirect_buffer_pasid *ib_packet; - struct kfd_mem_obj *mem_obj; - size_t pq_packets_size_in_bytes; -@@ -61,14 +62,12 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, - unsigned int *ib_packet_buff; - int status; - -- if (WARN_ON(!size_in_bytes)) -- return -EINVAL; -+ BUG_ON(!dbgdev || !dbgdev->kq || !packet_buff || !size_in_bytes); - - kq = dbgdev->kq; - -- pq_packets_size_in_bytes = sizeof(struct pm4__indirect_buffer_pasid); -- if (sync) -- pq_packets_size_in_bytes += sizeof(struct pm4_mec_release_mem); -+ pq_packets_size_in_bytes = sizeof(struct pm4__release_mem) + -+ sizeof(struct pm4__indirect_buffer_pasid); - - /* - * We acquire a buffer from DIQ -@@ -78,8 +77,8 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, - status = kq->ops.acquire_packet_buffer(kq, - pq_packets_size_in_bytes / sizeof(uint32_t), - &ib_packet_buff); -- if (status) { -- pr_err("acquire_packet_buffer failed\n"); -+ if (status != 0) { -+ pr_err("amdkfd: acquire_packet_buffer failed\n"); - return status; - } - -@@ -101,11 +100,6 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, - - ib_packet->bitfields5.pasid = pasid; - -- if (!sync) { -- kq->ops.submit_packet(kq); -- return status; -- } -- - /* - * for now we use release mem for GPU-CPU synchronization - * Consider WaitRegMem + WriteData as a better alternative -@@ -114,15 +108,15 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, - * (a) Sync with HW - * (b) Sync var is written by CP to mem. 
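Two sizing conventions are at work in dbgdev_diq_submit_ib above: ring space is acquired in dwords (bytes divided by sizeof(uint32_t)), and a PM4 type-3 header's count field holds the packet's dword size minus 2 (one dword for the header itself, and count being zero-based). A standalone model with illustrative packet sizes, not the real sizeofs from the PM4 headers:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        size_t ib_pkt_bytes      = 60;  /* stand-in for sizeof(struct pm4__indirect_buffer_pasid) */
        size_t release_mem_bytes = 36;  /* stand-in for sizeof(struct pm4__release_mem) */

        size_t total_bytes  = ib_pkt_bytes + release_mem_bytes;
        size_t total_dwords = total_bytes / sizeof(uint32_t);
        unsigned rm_count   = release_mem_bytes / sizeof(uint32_t) - 2;

        printf("acquire %zu dwords, release_mem header.count=%u\n",
               total_dwords, rm_count);
        return 0;
    }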
- */ -- rm_packet = (struct pm4_mec_release_mem *) (ib_packet_buff + -+ rm_packet = (struct pm4__release_mem *) (ib_packet_buff + - (sizeof(struct pm4__indirect_buffer_pasid) / - sizeof(unsigned int))); - - status = kfd_gtt_sa_allocate(dbgdev->dev, sizeof(uint64_t), - &mem_obj); - -- if (status) { -- pr_err("Failed to allocate GART memory\n"); -+ if (status != 0) { -+ pr_err("amdkfd: Failed to allocate GART memory\n"); - kq->ops.rollback_packet(kq); - return status; - } -@@ -133,7 +127,7 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, - - rm_packet->header.opcode = IT_RELEASE_MEM; - rm_packet->header.type = PM4_TYPE_3; -- rm_packet->header.count = sizeof(struct pm4_mec_release_mem) / -+ rm_packet->header.count = sizeof(struct pm4__release_mem) / - sizeof(unsigned int) - 2; - - rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; -@@ -174,6 +168,8 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, - - static int dbgdev_register_nodiq(struct kfd_dbgdev *dbgdev) - { -+ BUG_ON(!dbgdev); -+ - /* - * no action is needed in this case, - * just make sure diq will not be used -@@ -191,12 +187,14 @@ static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev) - struct kernel_queue *kq = NULL; - int status; - -- properties.type = KFD_QUEUE_TYPE_DIQ; -+ BUG_ON(!dbgdev || !dbgdev->pqm || !dbgdev->dev); -+ - status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL, -- &properties, &qid); -+ &properties, 0, KFD_QUEUE_TYPE_DIQ, -+ &qid); - - if (status) { -- pr_err("Failed to create DIQ\n"); -+ pr_err("amdkfd: Failed to create DIQ\n"); - return status; - } - -@@ -204,8 +202,8 @@ static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev) - - kq = pqm_get_kernel_queue(dbgdev->pqm, qid); - -- if (!kq) { -- pr_err("Error getting DIQ\n"); -+ if (kq == NULL) { -+ pr_err("amdkfd: Error getting DIQ\n"); - pqm_destroy_queue(dbgdev->pqm, qid); - return -EFAULT; - } -@@ -217,6 +215,8 @@ static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev) - - static int dbgdev_unregister_nodiq(struct kfd_dbgdev *dbgdev) - { -+ BUG_ON(!dbgdev || !dbgdev->dev); -+ - /* disable watch address */ - dbgdev_address_watch_disable_nodiq(dbgdev->dev); - return 0; -@@ -227,6 +227,8 @@ static int dbgdev_unregister_diq(struct kfd_dbgdev *dbgdev) - /* todo - disable address watch */ - int status; - -+ BUG_ON(!dbgdev || !dbgdev->pqm || !dbgdev->kq); -+ - status = pqm_destroy_queue(dbgdev->pqm, - dbgdev->kq->queue->properties.queue_id); - dbgdev->kq = NULL; -@@ -239,17 +241,18 @@ static void dbgdev_address_watch_set_registers( - union TCP_WATCH_ADDR_H_BITS *addrHi, - union TCP_WATCH_ADDR_L_BITS *addrLo, - union TCP_WATCH_CNTL_BITS *cntl, -- unsigned int index, unsigned int vmid, -- bool is_apu) -+ unsigned int index, unsigned int vmid) - { - union ULARGE_INTEGER addr; - -+ BUG_ON(!adw_info || !addrHi || !addrLo || !cntl); -+ - addr.quad_part = 0; - addrHi->u32All = 0; - addrLo->u32All = 0; - cntl->u32All = 0; - -- if (adw_info->watch_mask) -+ if (adw_info->watch_mask != NULL) - cntl->bitfields.mask = - (uint32_t) (adw_info->watch_mask[index] & - ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK); -@@ -265,9 +268,9 @@ static void dbgdev_address_watch_set_registers( - - cntl->bitfields.mode = adw_info->watch_mode[index]; - cntl->bitfields.vmid = (uint32_t) vmid; -- /* for APU assume it is an ATC address */ -- if (is_apu) -- cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT; -+ /* for now assume it is an ATC address */ -+ cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT; -+ - pr_debug("\t\t%20s %08x\n", "set reg mask :", 
cntl->bitfields.mask); - pr_debug("\t\t%20s %08x\n", "set reg add high :", - addrHi->bitfields.addr); -@@ -276,7 +279,7 @@ static void dbgdev_address_watch_set_registers( - } - - static int dbgdev_address_watch_nodiq(struct kfd_dbgdev *dbgdev, -- struct dbg_address_watch_info *adw_info) -+ struct dbg_address_watch_info *adw_info) - { - union TCP_WATCH_ADDR_H_BITS addrHi; - union TCP_WATCH_ADDR_L_BITS addrLo; -@@ -284,11 +287,13 @@ static int dbgdev_address_watch_nodiq(struct kfd_dbgdev *dbgdev, - struct kfd_process_device *pdd; - unsigned int i; - -+ BUG_ON(!dbgdev || !dbgdev->dev || !adw_info); -+ - /* taking the vmid for that process on the safe way using pdd */ - pdd = kfd_get_process_device_data(dbgdev->dev, - adw_info->process); - if (!pdd) { -- pr_err("Failed to get pdd for wave control no DIQ\n"); -+ pr_err("amdkfd: Failed to get pdd for wave control no DIQ\n"); - return -EFAULT; - } - -@@ -298,19 +303,19 @@ static int dbgdev_address_watch_nodiq(struct kfd_dbgdev *dbgdev, - - if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) || - (adw_info->num_watch_points == 0)) { -- pr_err("num_watch_points is invalid\n"); -+ pr_err("amdkfd: num_watch_points is invalid\n"); - return -EINVAL; - } - -- if (!adw_info->watch_mode || !adw_info->watch_address) { -- pr_err("adw_info fields are not valid\n"); -+ if ((adw_info->watch_mode == NULL) || -+ (adw_info->watch_address == NULL)) { -+ pr_err("amdkfd: adw_info fields are not valid\n"); - return -EINVAL; - } - -- for (i = 0; i < adw_info->num_watch_points; i++) { -+ for (i = 0 ; i < adw_info->num_watch_points ; i++) { - dbgdev_address_watch_set_registers(adw_info, &addrHi, &addrLo, -- &cntl, i, pdd->qpd.vmid, -- dbgdev->dev->device_info->is_need_iommu_device); -+ &cntl, i, pdd->qpd.vmid); - - pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); - pr_debug("\t\t%20s %08x\n", "register index :", i); -@@ -343,43 +348,48 @@ static int dbgdev_address_watch_nodiq(struct kfd_dbgdev *dbgdev, - } - - static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, -- struct dbg_address_watch_info *adw_info) -+ struct dbg_address_watch_info *adw_info) - { - struct pm4__set_config_reg *packets_vec; - union TCP_WATCH_ADDR_H_BITS addrHi; - union TCP_WATCH_ADDR_L_BITS addrLo; - union TCP_WATCH_CNTL_BITS cntl; -+ struct kfd_mem_obj *mem_obj; - unsigned int aw_reg_add_dword; - uint32_t *packet_buff_uint; -- uint64_t packet_buff_gpu_addr; - unsigned int i; - int status; - size_t ib_size = sizeof(struct pm4__set_config_reg) * 4; - /* we do not control the vmid in DIQ mode, just a place holder */ - unsigned int vmid = 0; - -+ BUG_ON(!dbgdev || !dbgdev->dev || !adw_info); -+ - addrHi.u32All = 0; - addrLo.u32All = 0; - cntl.u32All = 0; - - if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) || - (adw_info->num_watch_points == 0)) { -- pr_err("num_watch_points is invalid\n"); -+ pr_err("amdkfd: num_watch_points is invalid\n"); - return -EINVAL; - } - -- if (!adw_info->watch_mode || !adw_info->watch_address) { -- pr_err("adw_info fields are not valid\n"); -+ if ((NULL == adw_info->watch_mode) || -+ (NULL == adw_info->watch_address)) { -+ pr_err("amdkfd: adw_info fields are not valid\n"); - return -EINVAL; - } - -- status = dbgdev->kq->ops.acquire_inline_ib(dbgdev->kq, -- ib_size/sizeof(uint32_t), -- &packet_buff_uint, &packet_buff_gpu_addr); -- if (status) { -- pr_err("Failed to allocate IB from DIQ ring\n"); -+ status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); -+ -+ if (status != 0) { -+ pr_err("amdkfd: Failed to allocate GART memory\n"); - 
return status; - } -+ -+ packet_buff_uint = mem_obj->cpu_ptr; -+ - memset(packet_buff_uint, 0, ib_size); - - packets_vec = (struct pm4__set_config_reg *) (packet_buff_uint); -@@ -398,9 +408,12 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, - packets_vec[3].bitfields2.insert_vmid = 1; - - for (i = 0; i < adw_info->num_watch_points; i++) { -- dbgdev_address_watch_set_registers(adw_info, &addrHi, &addrLo, -- &cntl, i, vmid, -- dbgdev->dev->device_info->is_need_iommu_device); -+ dbgdev_address_watch_set_registers(adw_info, -+ &addrHi, -+ &addrLo, -+ &cntl, -+ i, -+ vmid); - - pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); - pr_debug("\t\t%20s %08x\n", "register index :", i); -@@ -429,6 +442,8 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, - i, - ADDRESS_WATCH_REG_CNTL); - -+ aw_reg_add_dword /= sizeof(uint32_t); -+ - packets_vec[0].bitfields2.reg_offset = - aw_reg_add_dword - AMD_CONFIG_REG_BASE; - -@@ -440,6 +455,8 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, - i, - ADDRESS_WATCH_REG_ADDR_HI); - -+ aw_reg_add_dword /= sizeof(uint32_t); -+ - packets_vec[1].bitfields2.reg_offset = - aw_reg_add_dword - AMD_CONFIG_REG_BASE; - packets_vec[1].reg_data[0] = addrHi.u32All; -@@ -450,6 +467,8 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, - i, - ADDRESS_WATCH_REG_ADDR_LO); - -+ aw_reg_add_dword /= sizeof(uint32_t); -+ - packets_vec[2].bitfields2.reg_offset = - aw_reg_add_dword - AMD_CONFIG_REG_BASE; - packets_vec[2].reg_data[0] = addrLo.u32All; -@@ -466,6 +485,8 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, - i, - ADDRESS_WATCH_REG_CNTL); - -+ aw_reg_add_dword /= sizeof(uint32_t); -+ - packets_vec[3].bitfields2.reg_offset = - aw_reg_add_dword - AMD_CONFIG_REG_BASE; - packets_vec[3].reg_data[0] = cntl.u32All; -@@ -473,30 +494,32 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, - status = dbgdev_diq_submit_ib( - dbgdev, - adw_info->process->pasid, -- packet_buff_gpu_addr, -+ mem_obj->gpu_addr, - packet_buff_uint, -- ib_size, true); -+ ib_size); - -- if (status) { -- pr_err("Failed to submit IB to DIQ\n"); -- return status; -+ if (status != 0) { -+ pr_err("amdkfd: Failed to submit IB to DIQ\n"); -+ break; - } - } - -+ kfd_gtt_sa_free(dbgdev->dev, mem_obj); - return status; - } - - static int dbgdev_wave_control_set_registers( - struct dbg_wave_control_info *wac_info, - union SQ_CMD_BITS *in_reg_sq_cmd, -- union GRBM_GFX_INDEX_BITS *in_reg_gfx_index, -- unsigned int asic_family) -+ union GRBM_GFX_INDEX_BITS *in_reg_gfx_index) - { - int status = 0; - union SQ_CMD_BITS reg_sq_cmd; - union GRBM_GFX_INDEX_BITS reg_gfx_index; - struct HsaDbgWaveMsgAMDGen2 *pMsg; - -+ BUG_ON(!wac_info || !in_reg_sq_cmd || !in_reg_gfx_index); -+ - reg_sq_cmd.u32All = 0; - reg_gfx_index.u32All = 0; - pMsg = &wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2; -@@ -548,25 +571,11 @@ static int dbgdev_wave_control_set_registers( - - switch (wac_info->operand) { - case HSA_DBG_WAVEOP_HALT: -- if (asic_family == CHIP_KAVERI) { -- reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT; -- pr_debug("Halting KV\n"); -- } else { -- reg_sq_cmd.bits_sethalt.cmd = SQ_IND_CMD_NEW_SETHALT; -- reg_sq_cmd.bits_sethalt.data = SQ_IND_CMD_DATA_HALT; -- pr_debug("Halting CZ\n"); -- } -+ reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT; - break; - - case HSA_DBG_WAVEOP_RESUME: -- if (asic_family == CHIP_KAVERI) { -- reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME; -- pr_debug("Resuming KV\n"); -- } else { -- reg_sq_cmd.bits_sethalt.cmd = 
SQ_IND_CMD_NEW_SETHALT; -- reg_sq_cmd.bits_sethalt.data = SQ_IND_CMD_DATA_RESUME; -- pr_debug("Resuming CZ\n"); -- } -+ reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME; - break; - - case HSA_DBG_WAVEOP_KILL: -@@ -606,21 +615,23 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, - int status; - union SQ_CMD_BITS reg_sq_cmd; - union GRBM_GFX_INDEX_BITS reg_gfx_index; -+ struct kfd_mem_obj *mem_obj; - uint32_t *packet_buff_uint; -- uint64_t packet_buff_gpu_addr; - struct pm4__set_config_reg *packets_vec; - size_t ib_size = sizeof(struct pm4__set_config_reg) * 3; - -+ BUG_ON(!dbgdev || !wac_info); -+ - reg_sq_cmd.u32All = 0; - - status = dbgdev_wave_control_set_registers(wac_info, ®_sq_cmd, -- ®_gfx_index, dbgdev->dev->device_info->asic_family); -+ ®_gfx_index); - if (status) { -- pr_err("Failed to set wave control registers\n"); -+ pr_err("amdkfd: Failed to set wave control registers\n"); - return status; - } - -- /* we do not control the VMID in DIQ, so reset it to a known value */ -+ /* we do not control the VMID in DIQ,so reset it to a known value */ - reg_sq_cmd.bits.vm_id = 0; - - pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); -@@ -653,13 +664,15 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, - - pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); - -- status = dbgdev->kq->ops.acquire_inline_ib(dbgdev->kq, -- ib_size / sizeof(uint32_t), -- &packet_buff_uint, &packet_buff_gpu_addr); -- if (status) { -- pr_err("Failed to allocate IB from DIQ ring\n"); -+ status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); -+ -+ if (status != 0) { -+ pr_err("amdkfd: Failed to allocate GART memory\n"); - return status; - } -+ -+ packet_buff_uint = mem_obj->cpu_ptr; -+ - memset(packet_buff_uint, 0, ib_size); - - packets_vec = (struct pm4__set_config_reg *) packet_buff_uint; -@@ -702,12 +715,14 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, - status = dbgdev_diq_submit_ib( - dbgdev, - wac_info->process->pasid, -- packet_buff_gpu_addr, -+ mem_obj->gpu_addr, - packet_buff_uint, -- ib_size, false); -+ ib_size); - -- if (status) -- pr_err("Failed to submit IB to DIQ\n"); -+ if (status != 0) -+ pr_err("amdkfd: Failed to submit IB to DIQ\n"); -+ -+ kfd_gtt_sa_free(dbgdev->dev, mem_obj); - - return status; - } -@@ -720,19 +735,21 @@ static int dbgdev_wave_control_nodiq(struct kfd_dbgdev *dbgdev, - union GRBM_GFX_INDEX_BITS reg_gfx_index; - struct kfd_process_device *pdd; - -+ BUG_ON(!dbgdev || !dbgdev->dev || !wac_info); -+ - reg_sq_cmd.u32All = 0; - - /* taking the VMID for that process on the safe way using PDD */ - pdd = kfd_get_process_device_data(dbgdev->dev, wac_info->process); - - if (!pdd) { -- pr_err("Failed to get pdd for wave control no DIQ\n"); -+ pr_err("amdkfd: Failed to get pdd for wave control no DIQ\n"); - return -EFAULT; - } - status = dbgdev_wave_control_set_registers(wac_info, ®_sq_cmd, -- ®_gfx_index, dbgdev->dev->device_info->asic_family); -+ ®_gfx_index); - if (status) { -- pr_err("Failed to set wave control registers\n"); -+ pr_err("amdkfd: Failed to set wave control registers\n"); - return status; - } - -@@ -783,8 +800,13 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) - union GRBM_GFX_INDEX_BITS reg_gfx_index; - struct kfd_process_device *pdd; - struct dbg_wave_control_info wac_info; -- int first_vmid_to_scan = dev->vm_info.first_vmid_kfd; -- int last_vmid_to_scan = dev->vm_info.last_vmid_kfd; -+ int temp; -+ int first_vmid_to_scan = 8; -+ int last_vmid_to_scan = 15; -+ -+ 
first_vmid_to_scan = ffs(dev->shared_resources.compute_vmid_bitmap) - 1; -+ temp = dev->shared_resources.compute_vmid_bitmap >> first_vmid_to_scan; -+ last_vmid_to_scan = first_vmid_to_scan + ffz(temp); - - reg_sq_cmd.u32All = 0; - status = 0; -@@ -796,13 +818,12 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) - - /* Scan all registers in the range ATC_VMID8_PASID_MAPPING .. - * ATC_VMID15_PASID_MAPPING -- * to check which VMID the current process is mapped to. -- */ -+ * to check which VMID the current process is mapped to. */ - - for (vmid = first_vmid_to_scan; vmid <= last_vmid_to_scan; vmid++) { - if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_valid - (dev->kgd, vmid)) { -- if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_pasid -+ if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_valid - (dev->kgd, vmid) == p->pasid) { - pr_debug("Killing wave fronts of vmid %d and pasid %d\n", - vmid, p->pasid); -@@ -812,7 +833,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) - } - - if (vmid > last_vmid_to_scan) { -- pr_err("Didn't find vmid for pasid %d\n", p->pasid); -+ pr_err("amdkfd: didn't found vmid for pasid (%d)\n", p->pasid); - return -EFAULT; - } - -@@ -822,7 +843,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) - return -EFAULT; - - status = dbgdev_wave_control_set_registers(&wac_info, ®_sq_cmd, -- ®_gfx_index, dev->device_info->asic_family); -+ ®_gfx_index); - if (status != 0) - return -EINVAL; - -@@ -839,6 +860,8 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) - void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, - enum DBGDEV_TYPE type) - { -+ BUG_ON(!pdbgdev || !pdev); -+ - pdbgdev->dev = pdev; - pdbgdev->kq = NULL; - pdbgdev->type = type; -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h -index 583aaa9..03424c2 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h -@@ -60,24 +60,6 @@ enum { - SH_REG_SIZE = SH_REG_END - SH_REG_BASE - }; - --/* SQ_CMD definitions */ -- --enum { -- SQ_IND_CMD_DATA_RESUME = 0, -- SQ_IND_CMD_DATA_HALT = 1 --}; -- --enum SQ_IND_CMD_NEW { -- SQ_IND_CMD_NEW_NULL = 0x00000000, -- SQ_IND_CMD_NEW_SETHALT = 0x00000001, -- SQ_IND_CMD_NEW_SAVECTX = 0x00000002, -- SQ_IND_CMD_NEW_KILL = 0x00000003, -- SQ_IND_CMD_NEW_DEBUG = 0x00000004, -- SQ_IND_CMD_NEW_TRAP = 0x00000005, -- SQ_IND_CMD_NEW_SET_PRIO = 0x00000006 -- --}; -- - enum SQ_IND_CMD_CMD { - SQ_IND_CMD_CMD_NULL = 0x00000000, - SQ_IND_CMD_CMD_HALT = 0x00000001, -@@ -136,20 +118,6 @@ union SQ_CMD_BITS { - uint32_t:1; - uint32_t vm_id:4; - } bitfields, bits; -- struct { -- uint32_t cmd:3; -- uint32_t:1; -- uint32_t mode:3; -- uint32_t check_vmid:1; -- uint32_t data:3; -- uint32_t:5; -- uint32_t wave_id:4; -- uint32_t simd_id:2; -- uint32_t:2; -- uint32_t queue_id:3; -- uint32_t:1; -- uint32_t vm_id:4; -- } bitfields_sethalt, bits_sethalt; - uint32_t u32All; - signed int i32All; - float f32All; -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c -index 9d4af96..56d6763 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c -@@ -33,7 +33,6 @@ - #include "kfd_pm4_headers_diq.h" - #include "kfd_dbgmgr.h" - #include "kfd_dbgdev.h" --#include "kfd_device_queue_manager.h" - - static DEFINE_MUTEX(kfd_dbgmgr_mutex); - -@@ -45,6 +44,8 @@ struct mutex *kfd_get_dbgmgr_mutex(void) - - static void 
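The VMID scan range above is derived from shared_resources.compute_vmid_bitmap, the same mask the deleted vm_info code reduced with ffs()/fls(); the restored variant instead bounds the scan with ffz() on the shifted mask. A user-space model of the first/last computation, with 0xFF00 standing in for the usual compute VMIDs 8..15:

    #include <stdio.h>
    #include <strings.h>   /* ffs() */

    static int fls32(unsigned v)   /* index of highest set bit, 1-based */
    {
        int n = 0;
        while (v) { n++; v >>= 1; }
        return n;
    }

    int main(void)
    {
        unsigned bitmap = 0xFF00;      /* compute_vmid_bitmap */
        int first = ffs(bitmap) - 1;   /* 8  */
        int last  = fls32(bitmap) - 1; /* 15 */

        printf("scan VMIDs %d..%d\n", first, last);
        return 0;
    }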
kfd_dbgmgr_uninitialize(struct kfd_dbgmgr *pmgr) - { -+ BUG_ON(!pmgr); -+ - kfree(pmgr->dbgdev); - - pmgr->dbgdev = NULL; -@@ -54,7 +55,7 @@ static void kfd_dbgmgr_uninitialize(struct kfd_dbgmgr *pmgr) - - void kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr) - { -- if (pmgr) { -+ if (pmgr != NULL) { - kfd_dbgmgr_uninitialize(pmgr); - kfree(pmgr); - } -@@ -65,12 +66,12 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) - enum DBGDEV_TYPE type = DBGDEV_TYPE_DIQ; - struct kfd_dbgmgr *new_buff; - -- if (WARN_ON(!pdev->init_complete)) -- return false; -+ BUG_ON(pdev == NULL); -+ BUG_ON(!pdev->init_complete); - - new_buff = kfd_alloc_struct(new_buff); - if (!new_buff) { -- pr_err("Failed to allocate dbgmgr instance\n"); -+ pr_err("amdkfd: Failed to allocate dbgmgr instance\n"); - return false; - } - -@@ -78,13 +79,13 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) - new_buff->dev = pdev; - new_buff->dbgdev = kfd_alloc_struct(new_buff->dbgdev); - if (!new_buff->dbgdev) { -- pr_err("Failed to allocate dbgdev instance\n"); -+ pr_err("amdkfd: Failed to allocate dbgdev instance\n"); - kfree(new_buff); - return false; - } - - /* get actual type of DBGDevice cpsch or not */ -- if (pdev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) -+ if (sched_policy == KFD_SCHED_POLICY_NO_HWS) - type = DBGDEV_TYPE_NODIQ; - - kfd_dbgdev_init(new_buff->dbgdev, pdev, type); -@@ -95,6 +96,8 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) - - long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p) - { -+ BUG_ON(!p || !pmgr || !pmgr->dbgdev); -+ - if (pmgr->pasid != 0) { - pr_debug("H/W debugger is already active using pasid %d\n", - pmgr->pasid); -@@ -115,6 +118,8 @@ long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p) - - long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p) - { -+ BUG_ON(!p || !pmgr || !pmgr->dbgdev); -+ - /* Is the requests coming from the already registered process? */ - if (pmgr->pasid != p->pasid) { - pr_debug("H/W debugger is not registered by calling pasid %d\n", -@@ -132,6 +137,8 @@ long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p) - long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, - struct dbg_wave_control_info *wac_info) - { -+ BUG_ON(!pmgr || !pmgr->dbgdev || !wac_info); -+ - /* Is the requests coming from the already registered process? */ - if (pmgr->pasid != wac_info->process->pasid) { - pr_debug("H/W debugger support was not registered for requester pasid %d\n", -@@ -145,6 +152,9 @@ long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, - long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, - struct dbg_address_watch_info *adw_info) - { -+ BUG_ON(!pmgr || !pmgr->dbgdev || !adw_info); -+ -+ - /* Is the requests coming from the already registered process? 
*/ - if (pmgr->pasid != adw_info->process->pasid) { - pr_debug("H/W debugger support was not registered for requester pasid %d\n", -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h -index a04a1fe..257a745 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h -@@ -30,11 +30,13 @@ - #pragma pack(push, 4) - - enum HSA_DBG_WAVEOP { -- HSA_DBG_WAVEOP_HALT = 1, /* Halts a wavefront */ -- HSA_DBG_WAVEOP_RESUME = 2, /* Resumes a wavefront */ -- HSA_DBG_WAVEOP_KILL = 3, /* Kills a wavefront */ -- HSA_DBG_WAVEOP_DEBUG = 4, /* Causes wavefront to enter dbg mode */ -- HSA_DBG_WAVEOP_TRAP = 5, /* Causes wavefront to take a trap */ -+ HSA_DBG_WAVEOP_HALT = 1, /* Halts a wavefront */ -+ HSA_DBG_WAVEOP_RESUME = 2, /* Resumes a wavefront */ -+ HSA_DBG_WAVEOP_KILL = 3, /* Kills a wavefront */ -+ HSA_DBG_WAVEOP_DEBUG = 4, /* Causes wavefront to enter -+ debug mode */ -+ HSA_DBG_WAVEOP_TRAP = 5, /* Causes wavefront to take -+ a trap */ - HSA_DBG_NUM_WAVEOP = 5, - HSA_DBG_MAX_WAVEOP = 0xFFFFFFFF - }; -@@ -79,13 +81,15 @@ struct HsaDbgWaveMsgAMDGen2 { - uint32_t UserData:8; /* user data */ - uint32_t ShaderArray:1; /* Shader array */ - uint32_t Priv:1; /* Privileged */ -- uint32_t Reserved0:4; /* Reserved, should be 0 */ -+ uint32_t Reserved0:4; /* This field is reserved, -+ should be 0 */ - uint32_t WaveId:4; /* wave id */ - uint32_t SIMD:2; /* SIMD id */ - uint32_t HSACU:4; /* Compute unit */ - uint32_t ShaderEngine:2;/* Shader engine */ - uint32_t MessageType:2; /* see HSA_DBG_WAVEMSG_TYPE */ -- uint32_t Reserved1:4; /* Reserved, should be 0 */ -+ uint32_t Reserved1:4; /* This field is reserved, -+ should be 0 */ - } ui32; - uint32_t Value; - }; -@@ -117,23 +121,20 @@ struct HsaDbgWaveMessage { - * in the user mode instruction stream. The OS scheduler event is typically - * associated and signaled by an interrupt issued by the GPU, but other HSA - * system interrupt conditions from other HW (e.g. IOMMUv2) may be surfaced -- * by the KFD by this mechanism, too. -- */ -+ * by the KFD by this mechanism, too. */ - - /* these are the new definitions for events */ - enum HSA_EVENTTYPE { - HSA_EVENTTYPE_SIGNAL = 0, /* user-mode generated GPU signal */ - HSA_EVENTTYPE_NODECHANGE = 1, /* HSA node change (attach/detach) */ - HSA_EVENTTYPE_DEVICESTATECHANGE = 2, /* HSA device state change -- * (start/stop) -- */ -+ (start/stop) */ - HSA_EVENTTYPE_HW_EXCEPTION = 3, /* GPU shader exception event */ - HSA_EVENTTYPE_SYSTEM_EVENT = 4, /* GPU SYSCALL with parameter info */ - HSA_EVENTTYPE_DEBUG_EVENT = 5, /* GPU signal for debugging */ - HSA_EVENTTYPE_PROFILE_EVENT = 6,/* GPU signal for profiling */ - HSA_EVENTTYPE_QUEUE_EVENT = 7, /* GPU signal queue idle state -- * (EOP pm4) -- */ -+ (EOP pm4) */ - /* ... */ - HSA_EVENTTYPE_MAXID, - HSA_EVENTTYPE_TYPE_SIZE = 0xFFFFFFFF -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c -deleted file mode 100644 -index 232e28f..0000000 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c -+++ /dev/null -@@ -1,75 +0,0 @@ --/* -- * Copyright 2014 Advanced Micro Devices, Inc. 
-- * -- * Permission is hereby granted, free of charge, to any person obtaining a -- * copy of this software and associated documentation files (the "Software"), -- * to deal in the Software without restriction, including without limitation -- * the rights to use, copy, modify, merge, publish, distribute, sublicense, -- * and/or sell copies of the Software, and to permit persons to whom the -- * Software is furnished to do so, subject to the following conditions: -- * -- * The above copyright notice and this permission notice shall be included in -- * all copies or substantial portions of the Software. -- * -- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -- * OTHER DEALINGS IN THE SOFTWARE. -- */ -- --#include <linux/debugfs.h> --#include "kfd_priv.h" -- --static struct dentry *debugfs_root; -- --static int kfd_debugfs_open(struct inode *inode, struct file *file) --{ -- int (*show)(struct seq_file *, void *) = inode->i_private; -- -- return single_open(file, show, NULL); --} -- --static const struct file_operations kfd_debugfs_fops = { -- .owner = THIS_MODULE, -- .open = kfd_debugfs_open, -- .read = seq_read, -- .llseek = seq_lseek, -- .release = single_release, --}; -- --void kfd_debugfs_init(void) --{ -- struct dentry *ent; -- -- debugfs_root = debugfs_create_dir("kfd", NULL); -- if (!debugfs_root || debugfs_root == ERR_PTR(-ENODEV)) { -- pr_warn("Failed to create kfd debugfs dir\n"); -- return; -- } -- -- ent = debugfs_create_file("mqds", S_IFREG | 0444, debugfs_root, -- kfd_debugfs_mqds_by_process, -- &kfd_debugfs_fops); -- if (!ent) -- pr_warn("Failed to create mqds in kfd debugfs\n"); -- -- ent = debugfs_create_file("hqds", S_IFREG | 0444, debugfs_root, -- kfd_debugfs_hqds_by_device, -- &kfd_debugfs_fops); -- if (!ent) -- pr_warn("Failed to create hqds in kfd debugfs\n"); -- -- ent = debugfs_create_file("rls", S_IFREG | 0444, debugfs_root, -- kfd_debugfs_rls_by_device, -- &kfd_debugfs_fops); -- if (!ent) -- pr_warn("Failed to create rls in kfd debugfs\n"); --} -- --void kfd_debugfs_fini(void) --{ -- debugfs_remove_recursive(debugfs_root); --} -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c -index 6b3a1fa..3f95f7c 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c -@@ -20,209 +20,36 @@ - * OTHER DEALINGS IN THE SOFTWARE. 
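The deleted kfd_debugfs.c above relies on a common debugfs idiom: the per-file show callback is stashed in inode->i_private by debugfs_create_file() and recovered in a shared open() that hands it to single_open(). A minimal, self-contained module sketch of the same idiom; the names are hypothetical and error handling is trimmed.

    #include <linux/debugfs.h>
    #include <linux/fs.h>
    #include <linux/module.h>
    #include <linux/seq_file.h>

    static struct dentry *root;

    static int demo_show(struct seq_file *m, void *data)
    {
        seq_puts(m, "hello from debugfs\n");
        return 0;
    }

    static int demo_open(struct inode *inode, struct file *file)
    {
        /* The show callback was passed as 'data' at create time. */
        int (*show)(struct seq_file *, void *) = inode->i_private;

        return single_open(file, show, NULL);
    }

    static const struct file_operations demo_fops = {
        .owner   = THIS_MODULE,
        .open    = demo_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release,
    };

    static int __init demo_init(void)
    {
        root = debugfs_create_dir("demo", NULL);
        debugfs_create_file("state", 0444, root, demo_show, &demo_fops);
        return 0;
    }

    static void __exit demo_exit(void)
    {
        debugfs_remove_recursive(root);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");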
- */ - --#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - #include <linux/amd-iommu.h> --#endif -+#include <linux/bsearch.h> - #include <linux/pci.h> - #include <linux/slab.h> --#include <linux/highmem.h> - #include "kfd_priv.h" - #include "kfd_device_queue_manager.h" --#include "kfd_pm4_headers_vi.h" --#include "cwsr_trap_handler_carrizo.h" --#include "cwsr_trap_handler_gfx9.asm" -+#include "kfd_pm4_headers.h" - - #define MQD_SIZE_ALIGNED 768 --static atomic_t kfd_device_suspended = ATOMIC_INIT(0); - --#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - static const struct kfd_device_info kaveri_device_info = { - .asic_family = CHIP_KAVERI, - .max_pasid_bits = 16, - /* max num of queues for KV.TODO should be a dynamic value */ - .max_no_of_hqd = 24, -- .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, -- .mqd_size_aligned = MQD_SIZE_ALIGNED, -- .is_need_iommu_device = true, -- .supports_cwsr = false, -- .needs_pci_atomics = false, -+ .mqd_size_aligned = MQD_SIZE_ALIGNED - }; --#endif - --static const struct kfd_device_info hawaii_device_info = { -- .asic_family = CHIP_HAWAII, -- .max_pasid_bits = 16, -- /* max num of queues for KV.TODO should be a dynamic value */ -- .max_no_of_hqd = 24, -- .doorbell_size = 4, -- .ih_ring_entry_size = 4 * sizeof(uint32_t), -- .event_interrupt_class = &event_interrupt_class_cik, -- .num_of_watch_points = 4, -- .mqd_size_aligned = MQD_SIZE_ALIGNED, -- .is_need_iommu_device = false, -- .supports_cwsr = false, -- .needs_pci_atomics = false, --}; -- --#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - static const struct kfd_device_info carrizo_device_info = { - .asic_family = CHIP_CARRIZO, - .max_pasid_bits = 16, - /* max num of queues for CZ.TODO should be a dynamic value */ - .max_no_of_hqd = 24, -- .doorbell_size = 4, -- .ih_ring_entry_size = 4 * sizeof(uint32_t), -- .event_interrupt_class = &event_interrupt_class_cik, -- .num_of_watch_points = 4, -- .mqd_size_aligned = MQD_SIZE_ALIGNED, -- .is_need_iommu_device = true, -- .supports_cwsr = true, -- .needs_pci_atomics = false, --}; --#endif -- --static const struct kfd_device_info tonga_device_info = { -- .asic_family = CHIP_TONGA, -- .max_pasid_bits = 16, -- .max_no_of_hqd = 24, -- .doorbell_size = 4, -- .ih_ring_entry_size = 4 * sizeof(uint32_t), -- .event_interrupt_class = &event_interrupt_class_cik, -- .num_of_watch_points = 4, -- .mqd_size_aligned = MQD_SIZE_ALIGNED, -- .is_need_iommu_device = false, -- .supports_cwsr = false, -- .needs_pci_atomics = true, --}; -- --static const struct kfd_device_info tonga_vf_device_info = { -- .asic_family = CHIP_TONGA, -- .max_pasid_bits = 16, -- .max_no_of_hqd = 24, -- .doorbell_size = 4, -- .ih_ring_entry_size = 4 * sizeof(uint32_t), -- .event_interrupt_class = &event_interrupt_class_cik, -- .num_of_watch_points = 4, -- .mqd_size_aligned = MQD_SIZE_ALIGNED, -- .is_need_iommu_device = false, -- .supports_cwsr = false, -- .needs_pci_atomics = false, --}; -- --static const struct kfd_device_info fiji_device_info = { -- .asic_family = CHIP_FIJI, -- .max_pasid_bits = 16, -- .max_no_of_hqd = 24, -- .doorbell_size = 4, -- .ih_ring_entry_size = 4 * sizeof(uint32_t), -- .event_interrupt_class = &event_interrupt_class_cik, -- .num_of_watch_points = 4, -- .mqd_size_aligned = MQD_SIZE_ALIGNED, -- .is_need_iommu_device = false, -- .supports_cwsr = true, -- .needs_pci_atomics = true, --}; -- --static const 
struct kfd_device_info fiji_vf_device_info = { -- .asic_family = CHIP_FIJI, -- .max_pasid_bits = 16, -- .max_no_of_hqd = 24, -- .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, -- .mqd_size_aligned = MQD_SIZE_ALIGNED, -- .is_need_iommu_device = false, -- .supports_cwsr = true, -- .needs_pci_atomics = false, --}; -- -- --static const struct kfd_device_info polaris10_device_info = { -- .asic_family = CHIP_POLARIS10, -- .max_pasid_bits = 16, -- .max_no_of_hqd = 24, -- .doorbell_size = 4, -- .ih_ring_entry_size = 4 * sizeof(uint32_t), -- .event_interrupt_class = &event_interrupt_class_cik, -- .num_of_watch_points = 4, -- .mqd_size_aligned = MQD_SIZE_ALIGNED, -- .is_need_iommu_device = false, -- .supports_cwsr = true, -- .needs_pci_atomics = true, --}; -- --static const struct kfd_device_info polaris10_vf_device_info = { -- .asic_family = CHIP_POLARIS10, -- .max_pasid_bits = 16, -- .max_no_of_hqd = 24, -- .doorbell_size = 4, -- .ih_ring_entry_size = 4 * sizeof(uint32_t), -- .event_interrupt_class = &event_interrupt_class_cik, -- .num_of_watch_points = 4, -- .mqd_size_aligned = MQD_SIZE_ALIGNED, -- .is_need_iommu_device = false, -- .supports_cwsr = true, -- .needs_pci_atomics = false, --}; -- --static const struct kfd_device_info polaris11_device_info = { -- .asic_family = CHIP_POLARIS11, -- .max_pasid_bits = 16, -- .max_no_of_hqd = 24, -- .doorbell_size = 4, -- .ih_ring_entry_size = 4 * sizeof(uint32_t), -- .event_interrupt_class = &event_interrupt_class_cik, -- .num_of_watch_points = 4, -- .mqd_size_aligned = MQD_SIZE_ALIGNED, -- .is_need_iommu_device = false, -- .supports_cwsr = true, -- .needs_pci_atomics = true, --}; -- --static const struct kfd_device_info vega10_device_info = { -- .asic_family = CHIP_VEGA10, -- .max_pasid_bits = 16, -- .max_no_of_hqd = 24, -- .doorbell_size = 8, -- .ih_ring_entry_size = 8 * sizeof(uint32_t), -- .event_interrupt_class = &event_interrupt_class_v9, -- .num_of_watch_points = 4, -- .mqd_size_aligned = MQD_SIZE_ALIGNED, -- .is_need_iommu_device = false, -- .supports_cwsr = true, -- .needs_pci_atomics = true, --}; -- --static const struct kfd_device_info vega10_vf_device_info = { -- .asic_family = CHIP_VEGA10, -- .max_pasid_bits = 16, -- .max_no_of_hqd = 24, -- .doorbell_size = 8, -- .ih_ring_entry_size = 8 * sizeof(uint32_t), -- .event_interrupt_class = &event_interrupt_class_v9, -- .num_of_watch_points = 4, -- .mqd_size_aligned = MQD_SIZE_ALIGNED, -- .is_need_iommu_device = false, -- .supports_cwsr = true, -- .needs_pci_atomics = false, --}; -- --static const struct kfd_device_info raven_device_info = { -- .asic_family = CHIP_RAVEN, -- .max_pasid_bits = 16, -- .max_no_of_hqd = 24, -- .doorbell_size = 8, -- .ih_ring_entry_size = 8 * sizeof(uint32_t), -- .event_interrupt_class = &event_interrupt_class_v9, -- .num_of_watch_points = 4, -- .mqd_size_aligned = MQD_SIZE_ALIGNED, -- .is_need_iommu_device = true, -- .supports_cwsr = true, -- .needs_pci_atomics = true, -+ .mqd_size_aligned = MQD_SIZE_ALIGNED - }; - - struct kfd_deviceid { -@@ -232,7 +59,6 @@ struct kfd_deviceid { - - /* Please keep this sorted by increasing device id. 
*/ - static const struct kfd_deviceid supported_devices[] = { --#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - { 0x1304, &kaveri_device_info }, /* Kaveri */ - { 0x1305, &kaveri_device_info }, /* Kaveri */ - { 0x1306, &kaveri_device_info }, /* Kaveri */ -@@ -255,90 +81,28 @@ static const struct kfd_deviceid supported_devices[] = { - { 0x131B, &kaveri_device_info }, /* Kaveri */ - { 0x131C, &kaveri_device_info }, /* Kaveri */ - { 0x131D, &kaveri_device_info }, /* Kaveri */ --#endif -- { 0x67A0, &hawaii_device_info }, /* Hawaii */ -- { 0x67A1, &hawaii_device_info }, /* Hawaii */ -- { 0x67A2, &hawaii_device_info }, /* Hawaii */ -- { 0x67A8, &hawaii_device_info }, /* Hawaii */ -- { 0x67A9, &hawaii_device_info }, /* Hawaii */ -- { 0x67AA, &hawaii_device_info }, /* Hawaii */ -- { 0x67B0, &hawaii_device_info }, /* Hawaii */ -- { 0x67B1, &hawaii_device_info }, /* Hawaii */ -- { 0x67B8, &hawaii_device_info }, /* Hawaii */ -- { 0x67B9, &hawaii_device_info }, /* Hawaii */ -- { 0x67BA, &hawaii_device_info }, /* Hawaii */ -- { 0x67BE, &hawaii_device_info }, /* Hawaii */ --#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - { 0x9870, &carrizo_device_info }, /* Carrizo */ - { 0x9874, &carrizo_device_info }, /* Carrizo */ - { 0x9875, &carrizo_device_info }, /* Carrizo */ - { 0x9876, &carrizo_device_info }, /* Carrizo */ -- { 0x9877, &carrizo_device_info }, /* Carrizo */ --#endif -- { 0x6920, &tonga_device_info }, /* Tonga */ -- { 0x6921, &tonga_device_info }, /* Tonga */ -- { 0x6928, &tonga_device_info }, /* Tonga */ -- { 0x6929, &tonga_device_info }, /* Tonga */ -- { 0x692B, &tonga_device_info }, /* Tonga */ -- { 0x692F, &tonga_vf_device_info }, /* Tonga vf */ -- { 0x6938, &tonga_device_info }, /* Tonga */ -- { 0x6939, &tonga_device_info }, /* Tonga */ -- { 0x7300, &fiji_device_info }, /* Fiji */ -- { 0x730F, &fiji_vf_device_info }, /* Fiji vf*/ -- { 0x67C0, &polaris10_device_info }, /* Polaris10 */ -- { 0x67C1, &polaris10_device_info }, /* Polaris10 */ -- { 0x67C2, &polaris10_device_info }, /* Polaris10 */ -- { 0x67C4, &polaris10_device_info }, /* Polaris10 */ -- { 0x67C7, &polaris10_device_info }, /* Polaris10 */ -- { 0x67C8, &polaris10_device_info }, /* Polaris10 */ -- { 0x67C9, &polaris10_device_info }, /* Polaris10 */ -- { 0x67CA, &polaris10_device_info }, /* Polaris10 */ -- { 0x67CC, &polaris10_device_info }, /* Polaris10 */ -- { 0x67CF, &polaris10_device_info }, /* Polaris10 */ -- { 0x67D0, &polaris10_vf_device_info }, /* Polaris10 vf*/ -- { 0x67DF, &polaris10_device_info }, /* Polaris10 */ -- { 0x67E0, &polaris11_device_info }, /* Polaris11 */ -- { 0x67E1, &polaris11_device_info }, /* Polaris11 */ -- { 0x67E3, &polaris11_device_info }, /* Polaris11 */ -- { 0x67E7, &polaris11_device_info }, /* Polaris11 */ -- { 0x67E8, &polaris11_device_info }, /* Polaris11 */ -- { 0x67E9, &polaris11_device_info }, /* Polaris11 */ -- { 0x67EB, &polaris11_device_info }, /* Polaris11 */ -- { 0x67EF, &polaris11_device_info }, /* Polaris11 */ -- { 0x67FF, &polaris11_device_info }, /* Polaris11 */ -- { 0x6860, &vega10_device_info }, /* Vega10 */ -- { 0x6861, &vega10_device_info }, /* Vega10 */ -- { 0x6862, &vega10_device_info }, /* Vega10 */ -- { 0x6863, &vega10_device_info }, /* Vega10 */ -- { 0x6864, &vega10_device_info }, /* Vega10 */ -- { 0x6867, &vega10_device_info }, /* Vega10 */ -- { 0x6868, &vega10_device_info }, /* Vega10 */ -- { 0x686C, &vega10_vf_device_info }, /* Vega10 vf*/ -- { 0x687F, &vega10_device_info }, /* Vega10 */ --#if 
defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) -- { 0x15DD, &raven_device_info } /* Raven */ --#endif -+ { 0x9877, &carrizo_device_info } /* Carrizo */ - }; - - static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, - unsigned int chunk_size); - static void kfd_gtt_sa_fini(struct kfd_dev *kfd); - --static int kfd_resume(struct kfd_dev *kfd); -- - static const struct kfd_device_info *lookup_device_info(unsigned short did) - { - size_t i; - - for (i = 0; i < ARRAY_SIZE(supported_devices); i++) { - if (supported_devices[i].did == did) { -- WARN_ON(!supported_devices[i].device_info); -+ BUG_ON(supported_devices[i].device_info == NULL); - return supported_devices[i].device_info; - } - } - -- WARN(1, "device is not added to supported_devices\n"); -- - return NULL; - } - -@@ -350,21 +114,8 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, - const struct kfd_device_info *device_info = - lookup_device_info(pdev->device); - -- if (!device_info) { -- dev_err(kfd_device, "kgd2kfd_probe failed\n"); -+ if (!device_info) - return NULL; -- } -- -- if (device_info->needs_pci_atomics) { -- /* Allow BIF to recode atomics to PCIe 3.0 AtomicOps. -- */ -- if (pci_enable_atomic_ops_to_root(pdev) < 0) { -- dev_info(kfd_device, -- "skipped device %x:%x, PCI rejects atomics", -- pdev->vendor, pdev->device); -- return NULL; -- } -- } - - kfd = kzalloc(sizeof(*kfd), GFP_KERNEL); - if (!kfd) -@@ -383,7 +134,6 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, - return kfd; - } - --#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - static bool device_iommu_pasid_init(struct kfd_dev *kfd) - { - const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP | -@@ -402,16 +152,15 @@ static bool device_iommu_pasid_init(struct kfd_dev *kfd) - } - - if ((iommu_info.flags & required_iommu_flags) != required_iommu_flags) { -- dev_err(kfd_device, "error required iommu flags ats %i, pri %i, pasid %i\n", -+ dev_err(kfd_device, "error required iommu flags ats(%i), pri(%i), pasid(%i)\n", - (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP) != 0, - (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) != 0, -- (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) -- != 0); -+ (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) != 0); - return false; - } - - pasid_limit = min_t(unsigned int, -- (unsigned int)(1 << kfd->device_info->max_pasid_bits), -+ (unsigned int)1 << kfd->device_info->max_pasid_bits, - iommu_info.max_pasids); - /* - * last pasid is used for kernel queues doorbells -@@ -421,8 +170,15 @@ static bool device_iommu_pasid_init(struct kfd_dev *kfd) - pasid_limit, - kfd->doorbell_process_limit - 1); - -+ err = amd_iommu_init_device(kfd->pdev, pasid_limit); -+ if (err < 0) { -+ dev_err(kfd_device, "error initializing iommu device\n"); -+ return false; -+ } -+ - if (!kfd_set_pasid_limit(pasid_limit)) { - dev_err(kfd_device, "error setting pasid limit\n"); -+ amd_iommu_free_device(kfd->pdev); - return false; - } - -@@ -434,7 +190,7 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid) - struct kfd_dev *dev = kfd_device_by_pci_dev(pdev); - - if (dev) -- kfd_process_iommu_unbind_callback(dev, pasid); -+ kfd_unbind_process_from_device(dev, pasid); - } - - /* -@@ -455,108 +211,21 @@ static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid, - flags); - - dev = kfd_device_by_pci_dev(pdev); -- if (!WARN_ON(!dev)) -- kfd_signal_iommu_event(dev, pasid, address, -+ BUG_ON(dev == NULL); -+ -+ kfd_signal_iommu_event(dev, pasid, address, - 
flags & PPR_FAULT_WRITE, flags & PPR_FAULT_EXEC); - - return AMD_IOMMU_INV_PRI_RSP_INVALID; - } --#endif /* CONFIG_AMD_IOMMU_V2 */ -- --static int kfd_cwsr_init(struct kfd_dev *kfd) --{ -- /* -- * Initialize the CWSR required memory for TBA and TMA -- */ -- if (cwsr_enable && kfd->device_info->supports_cwsr) { -- const uint32_t *cwsr_hex; -- void *cwsr_addr = NULL; -- unsigned int size; -- -- if (kfd->device_info->asic_family < CHIP_VEGA10) { -- cwsr_hex = cwsr_trap_carrizo_hex; -- size = sizeof(cwsr_trap_carrizo_hex); -- } else { -- cwsr_hex = cwsr_trap_gfx9_hex; -- size = sizeof(cwsr_trap_gfx9_hex); -- } -- -- if (size > PAGE_SIZE) { -- pr_err("Wrong CWSR ISA size.\n"); -- return -EINVAL; -- } -- kfd->cwsr_size = -- ALIGN(size, PAGE_SIZE) + PAGE_SIZE; -- kfd->cwsr_pages = alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, -- get_order(kfd->cwsr_size)); -- if (!kfd->cwsr_pages) { -- pr_err("Failed to allocate CWSR isa memory.\n"); -- return -ENOMEM; -- } -- /*Only first page used for cwsr ISA code */ -- cwsr_addr = kmap(kfd->cwsr_pages); -- memset(cwsr_addr, 0, PAGE_SIZE); -- memcpy(cwsr_addr, cwsr_hex, size); -- kunmap(kfd->cwsr_pages); -- kfd->tma_offset = ALIGN(size, PAGE_SIZE); -- kfd->cwsr_enabled = true; -- dev_info(kfd_device, -- "Reserved %d pages for cwsr.\n", -- (kfd->cwsr_size >> PAGE_SHIFT)); -- } -- -- return 0; --} -- --static void kfd_cwsr_fini(struct kfd_dev *kfd) --{ -- if (kfd->cwsr_pages) -- __free_pages(kfd->cwsr_pages, get_order(kfd->cwsr_size)); --} -- --static void kfd_ib_mem_init(struct kfd_dev *kdev) --{ -- /* In certain cases we need to send IB from kernel using the GPU address -- * space created by user applications. -- * For example, on GFX v7, we need to flush TC associated to the VMID -- * before tearing down the VMID. In order to do so, we need an address -- * valid to the VMID to place the IB while this space was created on -- * the user's side, not the kernel. -- * Since kfd_set_process_dgpu_aperture reserves "cwsr_base + cwsr_size" -- * but CWSR only uses pages above cwsr_base, we'll use one page memory -- * under cwsr_base for IB submissions -- */ -- kdev->ib_size = PAGE_SIZE; --} - - bool kgd2kfd_device_init(struct kfd_dev *kfd, - const struct kgd2kfd_shared_resources *gpu_resources) - { - unsigned int size; -- unsigned int vmid_bitmap_kfd, vmid_num_kfd; -- -- kfd->mec_fw_version = kfd->kfd2kgd->get_fw_version(kfd->kgd, -- KGD_ENGINE_MEC1); - - kfd->shared_resources = *gpu_resources; - -- vmid_bitmap_kfd = kfd->shared_resources.compute_vmid_bitmap; -- kfd->vm_info.first_vmid_kfd = ffs(vmid_bitmap_kfd) - 1; -- kfd->vm_info.last_vmid_kfd = fls(vmid_bitmap_kfd) - 1; -- vmid_num_kfd = kfd->vm_info.last_vmid_kfd -- - kfd->vm_info.first_vmid_kfd + 1; -- kfd->vm_info.vmid_num_kfd = vmid_num_kfd; -- -- /* Verify module parameters regarding mapped process number*/ -- if ((hws_max_conc_proc < 0) -- || (hws_max_conc_proc > vmid_num_kfd)) { -- dev_err(kfd_device, -- "hws_max_conc_proc %d must be between 0 and %d, use %d instead\n", -- hws_max_conc_proc, vmid_num_kfd, vmid_num_kfd); -- kfd->max_proc_per_quantum = vmid_num_kfd; -- } else -- kfd->max_proc_per_quantum = hws_max_conc_proc; -- - /* calculate max size of mqds needed for queues */ - size = max_num_of_queues_per_device * - kfd->device_info->mqd_size_aligned; -@@ -565,9 +234,9 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, - * calculate max size of runlist packet. 
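The GTT reservation computed just below sizes one buffer for all MQDs plus two full runlists (a map_process packet per process, a map_queues packet per queue, and the runlist packet itself, doubled because two runlists can be in flight) plus the HIQ and DIQ kernel queues. A worked example with illustrative packet sizes; the real numbers come from the PM4 headers and module parameters:

    #include <stdio.h>

    int main(void)
    {
        unsigned max_queues = 1024, mqd_aligned = 768, max_procs = 512;
        unsigned map_process = 64, map_queues = 28, runlist = 16; /* bytes, illustrative */
        unsigned kq_size = 2048;                                  /* stand-in for KFD_KERNEL_QUEUE_SIZE */

        unsigned size = max_queues * mqd_aligned;                 /* all MQDs */
        size += (max_procs * map_process +
                 max_queues * map_queues + runlist) * 2;          /* two runlists */
        size += kq_size * 2;                                      /* HIQ + DIQ */

        printf("GTT reservation: %u bytes\n", size);
        return 0;
    }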
- * There can be only 2 packets at once - */ -- size += (KFD_MAX_NUM_OF_PROCESSES * sizeof(struct pm4_mes_map_process) + -- max_num_of_queues_per_device * sizeof(struct pm4_mes_map_queues) -- + sizeof(struct pm4_mes_runlist)) * 2; -+ size += (KFD_MAX_NUM_OF_PROCESSES * sizeof(struct pm4_map_process) + -+ max_num_of_queues_per_device * -+ sizeof(struct pm4_map_queues) + sizeof(struct pm4_runlist)) * 2; - - /* Add size of HIQ & DIQ */ - size += KFD_KERNEL_QUEUE_SIZE * 2; -@@ -578,88 +247,89 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, - if (kfd->kfd2kgd->init_gtt_mem_allocation( - kfd->kgd, size, &kfd->gtt_mem, - &kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr)){ -- dev_err(kfd_device, "Could not allocate %d bytes\n", size); -+ dev_err(kfd_device, -+ "Could not allocate %d bytes for device (%x:%x)\n", -+ size, kfd->pdev->vendor, kfd->pdev->device); - goto out; - } - -- dev_info(kfd_device, "Allocated %d bytes on gart\n", size); -+ dev_info(kfd_device, -+ "Allocated %d bytes on gart for device(%x:%x)\n", -+ size, kfd->pdev->vendor, kfd->pdev->device); - - /* Initialize GTT sa with 512 byte chunk size */ - if (kfd_gtt_sa_init(kfd, size, 512) != 0) { -- dev_err(kfd_device, "Error initializing gtt sub-allocator\n"); -+ dev_err(kfd_device, -+ "Error initializing gtt sub-allocator\n"); - goto kfd_gtt_sa_init_error; - } - -- if (kfd_doorbell_init(kfd)) { -- dev_err(kfd_device, -- "Error initializing doorbell aperture\n"); -- goto kfd_doorbell_error; -- } -+ kfd_doorbell_init(kfd); - -- if (kfd_topology_add_device(kfd)) { -- dev_err(kfd_device, "Error adding device to topology\n"); -+ if (kfd_topology_add_device(kfd) != 0) { -+ dev_err(kfd_device, -+ "Error adding device (%x:%x) to topology\n", -+ kfd->pdev->vendor, kfd->pdev->device); - goto kfd_topology_add_device_error; - } - - if (kfd_interrupt_init(kfd)) { -- dev_err(kfd_device, "Error initializing interrupts\n"); -+ dev_err(kfd_device, -+ "Error initializing interrupts for device (%x:%x)\n", -+ kfd->pdev->vendor, kfd->pdev->device); - goto kfd_interrupt_error; - } - -+ if (!device_iommu_pasid_init(kfd)) { -+ dev_err(kfd_device, -+ "Error initializing iommuv2 for device (%x:%x)\n", -+ kfd->pdev->vendor, kfd->pdev->device); -+ goto device_iommu_pasid_error; -+ } -+ amd_iommu_set_invalidate_ctx_cb(kfd->pdev, -+ iommu_pasid_shutdown_callback); -+ amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb); -+ - kfd->dqm = device_queue_manager_init(kfd); - if (!kfd->dqm) { -- dev_err(kfd_device, "Error initializing queue manager\n"); -+ dev_err(kfd_device, -+ "Error initializing queue manager for device (%x:%x)\n", -+ kfd->pdev->vendor, kfd->pdev->device); - goto device_queue_manager_error; - } - --#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) -- if (kfd->device_info->is_need_iommu_device) { -- if (!device_iommu_pasid_init(kfd)) { -- dev_err(kfd_device, "Error initializing iommuv2\n"); -- goto device_iommu_pasid_error; -- } -- } --#endif -- -- if (kfd_cwsr_init(kfd)) { -- dev_err(kfd_device, "Error initializing cwsr\n"); -- goto device_iommu_pasid_error; -- } -- -- kfd_ib_mem_init(kfd); -- -- if (kfd_resume(kfd)) { -- dev_err(kfd_device, "Error resuming kfd\n"); -- goto kfd_resume_error; -+ if (kfd->dqm->ops.start(kfd->dqm) != 0) { -+ dev_err(kfd_device, -+ "Error starting queuen manager for device (%x:%x)\n", -+ kfd->pdev->vendor, kfd->pdev->device); -+ goto dqm_start_error; - } - - kfd->dbgmgr = NULL; - - kfd->init_complete = true; -- dev_info(kfd_device, "added device %x:%x\n", kfd->pdev->vendor, -+ 
dev_info(kfd_device, "added device (%x:%x)\n", kfd->pdev->vendor, - kfd->pdev->device); - -- pr_debug("Starting kfd with the following scheduling policy %d\n", -- kfd->dqm->sched_policy); -+ pr_debug("kfd: Starting kfd with the following scheduling policy %d\n", -+ sched_policy); - - goto out; - --kfd_resume_error: -- kfd_cwsr_fini(kfd); --device_iommu_pasid_error: -+dqm_start_error: - device_queue_manager_uninit(kfd->dqm); - device_queue_manager_error: -+ amd_iommu_free_device(kfd->pdev); -+device_iommu_pasid_error: - kfd_interrupt_exit(kfd); - kfd_interrupt_error: - kfd_topology_remove_device(kfd); - kfd_topology_add_device_error: -- kfd_doorbell_fini(kfd); --kfd_doorbell_error: - kfd_gtt_sa_fini(kfd); - kfd_gtt_sa_init_error: - kfd->kfd2kgd->free_gtt_mem(kfd->kgd, kfd->gtt_mem); - dev_err(kfd_device, -- "device %x:%x NOT added due to errors\n", -+ "device (%x:%x) NOT added due to errors\n", - kfd->pdev->vendor, kfd->pdev->device); - out: - return kfd->init_complete; -@@ -668,12 +338,10 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, - void kgd2kfd_device_exit(struct kfd_dev *kfd) - { - if (kfd->init_complete) { -- kgd2kfd_suspend(kfd); -- kfd_cwsr_fini(kfd); - device_queue_manager_uninit(kfd->dqm); -+ amd_iommu_free_device(kfd->pdev); - kfd_interrupt_exit(kfd); - kfd_topology_remove_device(kfd); -- kfd_doorbell_fini(kfd); - kfd_gtt_sa_fini(kfd); - kfd->kfd2kgd->free_gtt_mem(kfd->kgd, kfd->gtt_mem); - } -@@ -683,419 +351,77 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd) - - void kgd2kfd_suspend(struct kfd_dev *kfd) - { -- if (!kfd->init_complete) -- return; -- -- /* For first KFD device suspend all the KFD processes */ -- if (atomic_inc_return(&kfd_device_suspended) == 1) -- kfd_suspend_all_processes(); -- -- kfd->dqm->ops.stop(kfd->dqm); -- --#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) -- if (!kfd->device_info->is_need_iommu_device) -- return; -+ BUG_ON(kfd == NULL); - -- kfd_unbind_processes_from_device(kfd); -- -- amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); -- amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); -- amd_iommu_free_device(kfd->pdev); --#endif -+ if (kfd->init_complete) { -+ kfd->dqm->ops.stop(kfd->dqm); -+ amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); -+ amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); -+ amd_iommu_free_device(kfd->pdev); -+ } - } - - int kgd2kfd_resume(struct kfd_dev *kfd) - { -- int ret; -- -- if (!kfd->init_complete) -- return 0; -- -- ret = kfd_resume(kfd); -- if (ret) -- return ret; -- -- if (atomic_dec_return(&kfd_device_suspended) == 0) -- ret = kfd_resume_all_processes(); -- WARN(atomic_read(&kfd_device_suspended) < 0, -- "KFD suspend / resume ref. 
error\n"); -- return ret; --} -+ unsigned int pasid_limit; -+ int err; - --static int kfd_resume(struct kfd_dev *kfd) --{ -- int err = 0; -+ BUG_ON(kfd == NULL); - --#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) -- if (kfd->device_info->is_need_iommu_device) { -- unsigned int pasid_limit = kfd_get_pasid_limit(); -+ pasid_limit = kfd_get_pasid_limit(); - -+ if (kfd->init_complete) { - err = amd_iommu_init_device(kfd->pdev, pasid_limit); -- if (err) { -- dev_err(kfd_device, "failed to initialize iommu\n"); -+ if (err < 0) - return -ENXIO; -- } -- - amd_iommu_set_invalidate_ctx_cb(kfd->pdev, -- iommu_pasid_shutdown_callback); -- amd_iommu_set_invalid_ppr_cb(kfd->pdev, -- iommu_invalid_ppr_cb); -- -- err = kfd_bind_processes_to_device(kfd); -- if (err) { -- dev_err(kfd_device, -- "failed to bind process to device\n"); -- return -ENXIO; -- } -- } --#endif -- -- err = kfd->dqm->ops.start(kfd->dqm); -- if (err) { -- dev_err(kfd_device, -- "Error starting queue manager for device %x:%x\n", -- kfd->pdev->vendor, kfd->pdev->device); -- goto dqm_start_error; -+ iommu_pasid_shutdown_callback); -+ amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb); -+ kfd->dqm->ops.start(kfd->dqm); - } - -- kfd->kfd2kgd->write_config_static_mem(kfd->kgd, true, 1, 3, 0); -- -- return err; -- --dqm_start_error: --#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) -- if (kfd->device_info->is_need_iommu_device) -- amd_iommu_free_device(kfd->pdev); --#endif -- -- return err; -+ return 0; - } - - /* This is called directly from KGD at ISR. */ - void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) - { -- uint32_t patched_ihre[DIV_ROUND_UP( -- kfd->device_info->ih_ring_entry_size, -- sizeof(uint32_t))]; -- bool is_patched = false; -- - if (!kfd->init_complete) - return; - - spin_lock(&kfd->interrupt_lock); - -- if (kfd->interrupts_active && interrupt_is_wanted(kfd, ih_ring_entry, -- patched_ihre, &is_patched) -- && enqueue_ih_ring_entry(kfd, -- is_patched ? patched_ihre : ih_ring_entry)) -- queue_work(kfd->ih_wq, &kfd->interrupt_work); -+ if (kfd->interrupts_active -+ && interrupt_is_wanted(kfd, ih_ring_entry) -+ && enqueue_ih_ring_entry(kfd, ih_ring_entry)) -+ schedule_work(&kfd->interrupt_work); - - spin_unlock(&kfd->interrupt_lock); - } - --/* quiesce_process_mm - -- * Quiesce all user queues that belongs to given process p -- */ --int quiesce_process_mm(struct kfd_process *p) --{ -- struct kfd_process_device *pdd; -- int r = 0; -- unsigned int n_evicted = 0; -- -- list_for_each_entry(pdd, &p->per_device_data, per_device_list) { -- r = process_evict_queues(pdd->dev->dqm, &pdd->qpd); -- if (r != 0) { -- pr_err("Failed to evict process queues\n"); -- goto fail; -- } -- n_evicted++; -- } -- -- return r; -- --fail: -- /* To keep state consistent, roll back partial eviction by -- * restoring queues -- */ -- list_for_each_entry(pdd, &p->per_device_data, per_device_list) { -- if (n_evicted == 0) -- break; -- if (process_restore_queues(pdd->dev->dqm, &pdd->qpd)) -- pr_err("Failed to restore queues\n"); -- -- n_evicted--; -- } -- -- return r; --} -- --/* resume_process_mm - -- * Resume all user queues that belongs to given process p. The caller must -- * ensure that process p context is valid. 
-- */ --static int resume_process_mm(struct kfd_process *p) --{ -- struct kfd_process_device *pdd; -- struct mm_struct *mm = (struct mm_struct *)p->mm; -- int r, ret = 0; -- -- list_for_each_entry(pdd, &p->per_device_data, per_device_list) { -- if (pdd->dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) -- down_read(&mm->mmap_sem); -- -- r = process_restore_queues(pdd->dev->dqm, &pdd->qpd); -- if (r != 0) { -- pr_err("Failed to restore process queues\n"); -- if (ret == 0) -- ret = r; -- } -- -- if (pdd->dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) -- up_read(&mm->mmap_sem); -- } -- -- return ret; --} -- --int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm) --{ -- struct kfd_process *p; -- struct kfd_process_device *pdd; -- int r; -- -- /* Because we are called from arbitrary context (workqueue) as opposed -- * to process context, kfd_process could attempt to exit while we are -- * running so the lookup function increments the process ref count. -- */ -- p = kfd_lookup_process_by_mm(mm); -- if (!p) -- return -ENODEV; -- -- if (kfd) { -- r = -ENODEV; -- pdd = kfd_get_process_device_data(kfd, p); -- if (pdd) -- r = process_evict_queues(kfd->dqm, &pdd->qpd); -- } else { -- r = quiesce_process_mm(p); -- } -- -- kfd_unref_process(p); -- return r; --} -- --int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm) --{ -- struct kfd_process *p; -- struct kfd_process_device *pdd; -- int r; -- -- /* Because we are called from arbitrary context (workqueue) as opposed -- * to process context, kfd_process could attempt to exit while we are -- * running so the lookup function increments the process ref count. -- */ -- p = kfd_lookup_process_by_mm(mm); -- if (!p) -- return -ENODEV; -- -- if (kfd) { -- r = -ENODEV; -- pdd = kfd_get_process_device_data(kfd, p); -- if (pdd) -- r = process_restore_queues(kfd->dqm, &pdd->qpd); -- } else { -- r = resume_process_mm(p); -- } -- -- kfd_unref_process(p); -- return r; --} -- -- --void kfd_restore_bo_worker(struct work_struct *work) --{ -- struct delayed_work *dwork; -- struct kfd_process *p; -- struct kfd_process_device *pdd; -- int ret = 0; -- -- dwork = to_delayed_work(work); -- -- /* Process termination destroys this worker thread. So during the -- * lifetime of this thread, kfd_process p will be valid -- */ -- p = container_of(dwork, struct kfd_process, restore_work); -- -- /* Call restore_process_bos on the first KGD device. This function -- * takes care of restoring the whole process including other devices. -- * Restore can fail if enough memory is not available. If so, -- * reschedule again. -- */ -- pdd = list_first_entry(&p->per_device_data, -- struct kfd_process_device, -- per_device_list); -- -- pr_info("Started restoring process of pasid %d\n", p->pasid); -- -- /* Setting last_restore_timestamp before successful restoration. -- * Otherwise this would have to be set by KGD (restore_process_bos) -- * before KFD BOs are unreserved. If not, the process can be evicted -- * again before the timestamp is set. -- * If restore fails, the timestamp will be set again in the next -- * attempt. 
This would mean that the minimum GPU quanta would be -- * PROCESS_ACTIVE_TIME_MS - (time to execute the following two -- * functions) -- */ -- -- p->last_restore_timestamp = get_jiffies_64(); -- ret = pdd->dev->kfd2kgd->restore_process_bos(p->process_info, &p->ef); -- if (ret) { -- pr_info("Restore failed, try again after %d ms\n", -- PROCESS_BACK_OFF_TIME_MS); -- ret = schedule_delayed_work(&p->restore_work, -- msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS)); -- WARN(!ret, "reschedule restore work failed\n"); -- return; -- } -- -- ret = resume_process_mm(p); -- if (ret) -- pr_err("Failed to resume user queues\n"); -- -- pr_info("Finished restoring process of pasid %d\n", p->pasid); --} -- --/** kgd2kfd_schedule_evict_and_restore_process - Schedules work queue that will -- * prepare for safe eviction of KFD BOs that belong to the specified -- * process. -- * -- * @mm: mm_struct that identifies the specified KFD process -- * @fence: eviction fence attached to KFD process BOs -- * -- */ --int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, -- struct dma_fence *fence) --{ -- struct kfd_process *p; -- unsigned long active_time; -- unsigned long delay_jiffies = msecs_to_jiffies(PROCESS_ACTIVE_TIME_MS); -- -- if (!fence) -- return -EINVAL; -- -- if (dma_fence_is_signaled(fence)) -- return 0; -- -- p = kfd_lookup_process_by_mm(mm); -- if (!p) -- return -ENODEV; -- -- if (delayed_work_pending(&p->eviction_work.dwork)) { -- /* It is possible has TTM has lined up couple of BOs of the same -- * process to be evicted. Check if the fence is same which -- * indicates that previous work item scheduled is not completed -- */ -- if (p->eviction_work.quiesce_fence == fence) -- goto out; -- else { -- WARN(1, "Starting new evict with previous evict is not completed\n"); -- if (cancel_delayed_work_sync(&p->eviction_work.dwork)) -- dma_fence_put(p->eviction_work.quiesce_fence); -- } -- } -- -- p->eviction_work.quiesce_fence = dma_fence_get(fence); -- -- /* Avoid KFD process starvation. Wait for at least -- * PROCESS_ACTIVE_TIME_MS before evicting the process again -- */ -- active_time = get_jiffies_64() - p->last_restore_timestamp; -- if (delay_jiffies > active_time) -- delay_jiffies -= active_time; -- else -- delay_jiffies = 0; -- -- /* During process initialization eviction_work.dwork is initialized -- * to kfd_evict_bo_worker -- */ -- schedule_delayed_work(&p->eviction_work.dwork, delay_jiffies); --out: -- kfd_unref_process(p); -- return 0; --} -- --void kfd_evict_bo_worker(struct work_struct *work) --{ -- int ret; -- struct kfd_process *p; -- struct kfd_eviction_work *eviction_work; -- struct delayed_work *dwork; -- -- dwork = to_delayed_work(work); -- eviction_work = container_of(dwork, struct kfd_eviction_work, -- dwork); -- -- /* Process termination destroys this worker thread. So during the -- * lifetime of this thread, kfd_process p will be valid -- */ -- p = container_of(eviction_work, struct kfd_process, eviction_work); -- -- /* Narrow window of overlap between restore and evict work item is -- * possible. Once amdgpu_amdkfd_gpuvm_restore_process_bos unreserves -- * KFD BOs, it is possible to evicted again. But restore has few more -- * steps of finish. 
So lets wait for the restore work to complete -- */ -- if (delayed_work_pending(&p->restore_work)) -- flush_delayed_work(&p->restore_work); -- -- pr_info("Started evicting process of pasid %d\n", p->pasid); -- ret = quiesce_process_mm(p); -- if (!ret) { -- dma_fence_signal(eviction_work->quiesce_fence); -- WARN_ONCE(eviction_work->quiesce_fence != p->ef, -- "Eviction fence mismatch\n"); -- dma_fence_put(p->ef); -- /* TODO: quiesce_fence is same as kfd_process->ef. But -- * quiesce_fence is also used to avoid starting multiple -- * eviction work items. This might not be necessary and -- * one of the variables could be removed -- */ -- p->ef = NULL; -- schedule_delayed_work(&p->restore_work, -- msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)); -- } else -- pr_err("Failed to quiesce user queues. Cannot evict BOs\n"); -- -- dma_fence_put(eviction_work->quiesce_fence); -- -- pr_info("Finished evicting process of pasid %d\n", p->pasid); -- --} -- - static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, - unsigned int chunk_size) - { -- unsigned int num_of_longs; -+ unsigned int num_of_bits; - -- if (WARN_ON(buf_size < chunk_size)) -- return -EINVAL; -- if (WARN_ON(buf_size == 0)) -- return -EINVAL; -- if (WARN_ON(chunk_size == 0)) -- return -EINVAL; -+ BUG_ON(!kfd); -+ BUG_ON(!kfd->gtt_mem); -+ BUG_ON(buf_size < chunk_size); -+ BUG_ON(buf_size == 0); -+ BUG_ON(chunk_size == 0); - - kfd->gtt_sa_chunk_size = chunk_size; - kfd->gtt_sa_num_of_chunks = buf_size / chunk_size; - -- num_of_longs = (kfd->gtt_sa_num_of_chunks + BITS_PER_LONG - 1) / -- BITS_PER_LONG; -+ num_of_bits = kfd->gtt_sa_num_of_chunks / BITS_PER_BYTE; -+ BUG_ON(num_of_bits == 0); - -- kfd->gtt_sa_bitmap = kcalloc(num_of_longs, sizeof(long), GFP_KERNEL); -+ kfd->gtt_sa_bitmap = kzalloc(num_of_bits, GFP_KERNEL); - - if (!kfd->gtt_sa_bitmap) - return -ENOMEM; - -- pr_debug("gtt_sa_num_of_chunks = %d, gtt_sa_bitmap = %p\n", -+ pr_debug("kfd: gtt_sa_num_of_chunks = %d, gtt_sa_bitmap = %p\n", - kfd->gtt_sa_num_of_chunks, kfd->gtt_sa_bitmap); - - mutex_init(&kfd->gtt_sa_lock); -@@ -1129,17 +455,19 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, - { - unsigned int found, start_search, cur_size; - -+ BUG_ON(!kfd); -+ - if (size == 0) - return -EINVAL; - - if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size) - return -ENOMEM; - -- *mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO); -- if (!(*mem_obj)) -+ *mem_obj = kmalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); -+ if ((*mem_obj) == NULL) - return -ENOMEM; - -- pr_debug("Allocated mem_obj = %p for size = %d\n", *mem_obj, size); -+ pr_debug("kfd: allocated mem_obj = %p for size = %d\n", *mem_obj, size); - - start_search = 0; - -@@ -1151,7 +479,7 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, - kfd->gtt_sa_num_of_chunks, - start_search); - -- pr_debug("Found = %d\n", found); -+ pr_debug("kfd: found = %d\n", found); - - /* If there wasn't any free chunk, bail out */ - if (found == kfd->gtt_sa_num_of_chunks) -@@ -1169,12 +497,12 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, - found, - kfd->gtt_sa_chunk_size); - -- pr_debug("gpu_addr = %p, cpu_addr = %p\n", -+ pr_debug("kfd: gpu_addr = %p, cpu_addr = %p\n", - (uint64_t *) (*mem_obj)->gpu_addr, (*mem_obj)->cpu_ptr); - - /* If we need only one chunk, mark it as allocated and get out */ - if (size <= kfd->gtt_sa_chunk_size) { -- pr_debug("Single bit\n"); -+ pr_debug("kfd: single bit\n"); - set_bit(found, kfd->gtt_sa_bitmap); - goto kfd_gtt_out; - } -@@ -1209,7 
+537,7 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, - - } while (cur_size > 0); - -- pr_debug("range_start = %d, range_end = %d\n", -+ pr_debug("kfd: range_start = %d, range_end = %d\n", - (*mem_obj)->range_start, (*mem_obj)->range_end); - - /* Mark the chunks as allocated */ -@@ -1223,7 +551,7 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, - return 0; - - kfd_gtt_no_free_chunk: -- pr_debug("Allocation failed with mem_obj = %p\n", mem_obj); -+ pr_debug("kfd: allocation failed with mem_obj = %p\n", mem_obj); - mutex_unlock(&kfd->gtt_sa_lock); - kfree(mem_obj); - return -ENOMEM; -@@ -1233,11 +561,13 @@ int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj) - { - unsigned int bit; - -+ BUG_ON(!kfd); -+ - /* Act like kfree when trying to free a NULL object */ - if (!mem_obj) - return 0; - -- pr_debug("Free mem_obj = %p, range_start = %d, range_end = %d\n", -+ pr_debug("kfd: free mem_obj = %p, range_start = %d, range_end = %d\n", - mem_obj, mem_obj->range_start, mem_obj->range_end); - - mutex_lock(&kfd->gtt_sa_lock); -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c -index 1abbaa0..42de22b 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c -@@ -44,13 +44,9 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, - struct queue *q, - struct qcm_process_device *qpd); - --static int execute_queues_cpsch(struct device_queue_manager *dqm, -- bool static_queues_included); --static int unmap_queues_cpsch(struct device_queue_manager *dqm, -- enum kfd_unmap_queues_filter filter, -- uint32_t filter_param); -- --static int map_queues_cpsch(struct device_queue_manager *dqm); -+static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock); -+static int destroy_queues_cpsch(struct device_queue_manager *dqm, -+ bool preempt_static_queues, bool lock); - - static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, - struct queue *q, -@@ -83,17 +79,20 @@ static bool is_pipe_enabled(struct device_queue_manager *dqm, int mec, int pipe) - - unsigned int get_queues_num(struct device_queue_manager *dqm) - { -+ BUG_ON(!dqm || !dqm->dev); - return bitmap_weight(dqm->dev->shared_resources.queue_bitmap, - KGD_MAX_QUEUES); - } - - unsigned int get_queues_per_pipe(struct device_queue_manager *dqm) - { -+ BUG_ON(!dqm || !dqm->dev); - return dqm->dev->shared_resources.num_queue_per_pipe; - } - - unsigned int get_pipes_per_mec(struct device_queue_manager *dqm) - { -+ BUG_ON(!dqm || !dqm->dev); - return dqm->dev->shared_resources.num_pipe_per_mec; - } - -@@ -108,57 +107,6 @@ void program_sh_mem_settings(struct device_queue_manager *dqm, - qpd->sh_mem_bases); - } - --static int allocate_doorbell(struct qcm_process_device *qpd, struct queue *q) --{ -- struct kfd_dev *dev = qpd->dqm->dev; -- -- if (!KFD_IS_SOC15(dev->device_info->asic_family)) { -- /* On pre-SOC15 chips we need to use the queue ID to -- * preserve the user mode ABI. -- */ -- q->doorbell_id = q->properties.queue_id; -- } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { -- /* For SDMA queues on SOC15, use static doorbell -- * assignments based on the engine and queue. 
-- */ -- q->doorbell_id = dev->shared_resources.sdma_doorbell -- [q->properties.sdma_engine_id] -- [q->properties.sdma_queue_id]; -- } else { -- /* For CP queues on SOC15 reserve a free doorbell ID */ -- unsigned int found; -- -- found = find_first_zero_bit(qpd->doorbell_bitmap, -- KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); -- if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) { -- pr_debug("No doorbells available"); -- return -EBUSY; -- } -- set_bit(found, qpd->doorbell_bitmap); -- q->doorbell_id = found; -- } -- -- q->properties.doorbell_off = -- kfd_doorbell_id_to_offset(dev, q->process, -- q->doorbell_id); -- -- return 0; --} -- --static void deallocate_doorbell(struct qcm_process_device *qpd, -- struct queue *q) --{ -- unsigned int old; -- struct kfd_dev *dev = qpd->dqm->dev; -- -- if (!KFD_IS_SOC15(dev->device_info->asic_family) || -- q->properties.type == KFD_QUEUE_TYPE_SDMA) -- return; -- -- old = test_and_clear_bit(q->doorbell_id, qpd->doorbell_bitmap); -- WARN_ON(!old); --} -- - static int allocate_vmid(struct device_queue_manager *dqm, - struct qcm_process_device *qpd, - struct queue *q) -@@ -168,59 +116,31 @@ static int allocate_vmid(struct device_queue_manager *dqm, - if (dqm->vmid_bitmap == 0) - return -ENOMEM; - -- bit = ffs(dqm->vmid_bitmap) - 1; -- dqm->vmid_bitmap &= ~(1 << bit); -+ bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap, CIK_VMID_NUM); -+ clear_bit(bit, (unsigned long *)&dqm->vmid_bitmap); - -- allocated_vmid = bit + dqm->dev->vm_info.first_vmid_kfd; -- pr_debug("vmid allocation %d\n", allocated_vmid); -+ /* Kaveri kfd vmid's starts from vmid 8 */ -+ allocated_vmid = bit + KFD_VMID_START_OFFSET; -+ pr_debug("kfd: vmid allocation %d\n", allocated_vmid); - qpd->vmid = allocated_vmid; - q->properties.vmid = allocated_vmid; - - set_pasid_vmid_mapping(dqm, q->process->pasid, q->properties.vmid); - program_sh_mem_settings(dqm, qpd); - -- /* qpd->page_table_base is set earlier when register_process() -- * is called, i.e. when the first queue is created. 
-- */ -- dqm->dev->kfd2kgd->set_vm_context_page_table_base(dqm->dev->kgd, -- qpd->vmid, -- qpd->page_table_base); -- /*invalidate the VM context after pasid and vmid mapping is set up*/ -- kfd_flush_tlb(dqm->dev, qpd->pqm->process->pasid); -- - return 0; - } - --static int flush_texture_cache_nocpsch(struct kfd_dev *kdev, -- struct qcm_process_device *qpd) --{ -- uint32_t len; -- -- if (!qpd->ib_kaddr) -- return -ENOMEM; -- -- len = qpd->dqm->packets.pmf->release_mem(qpd->ib_base, -- (uint32_t *)qpd->ib_kaddr); -- -- return kdev->kfd2kgd->submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid, -- qpd->ib_base, (uint32_t *)qpd->ib_kaddr, len); --} -- - static void deallocate_vmid(struct device_queue_manager *dqm, - struct qcm_process_device *qpd, - struct queue *q) - { -- int bit = qpd->vmid - dqm->dev->vm_info.first_vmid_kfd; -- -- /* On GFX v7, CP doesn't flush TC at dequeue */ -- if (q->device->device_info->asic_family == CHIP_HAWAII) -- if (flush_texture_cache_nocpsch(q->device, qpd)) -- pr_err("Failed to flush TC\n"); -+ int bit = qpd->vmid - KFD_VMID_START_OFFSET; - - /* Release the vmid mapping */ - set_pasid_vmid_mapping(dqm, 0, qpd->vmid); - -- dqm->vmid_bitmap |= (1 << bit); -+ set_bit(bit, (unsigned long *)&dqm->vmid_bitmap); - qpd->vmid = 0; - q->properties.vmid = 0; - } -@@ -230,53 +150,47 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, - struct qcm_process_device *qpd, - int *allocated_vmid) - { -- int retval = 0; -+ int retval; -+ -+ BUG_ON(!dqm || !q || !qpd || !allocated_vmid); - -+ pr_debug("kfd: In func %s\n", __func__); - print_queue(q); - - mutex_lock(&dqm->lock); - - if (dqm->total_queue_count >= max_num_of_queues_per_device) { -- pr_warn("Can't create new usermode queue because %d queues were already created\n", -+ pr_warn("amdkfd: Can't create new usermode queue because %d queues were already created\n", - dqm->total_queue_count); -- retval = -EPERM; -- goto out_unlock; -+ mutex_unlock(&dqm->lock); -+ return -EPERM; - } - - if (list_empty(&qpd->queues_list)) { - retval = allocate_vmid(dqm, qpd, q); -- if (retval) -- goto out_unlock; -+ if (retval != 0) { -+ mutex_unlock(&dqm->lock); -+ return retval; -+ } - } - *allocated_vmid = qpd->vmid; - q->properties.vmid = qpd->vmid; -- /* -- * Eviction state logic: we only mark active queues as evicted -- * to avoid the overhead of restoring inactive queues later -- */ -- if (qpd->evicted) -- q->properties.is_evicted = (q->properties.queue_size > 0 && -- q->properties.queue_percent > 0 && -- q->properties.queue_address != 0); -- -- q->properties.tba_addr = qpd->tba_addr; -- q->properties.tma_addr = qpd->tma_addr; - - if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) - retval = create_compute_queue_nocpsch(dqm, q, qpd); -- else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) -+ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) - retval = create_sdma_queue_nocpsch(dqm, q, qpd); - -- if (retval) { -+ if (retval != 0) { - if (list_empty(&qpd->queues_list)) { - deallocate_vmid(dqm, qpd, q); - *allocated_vmid = 0; - } -- goto out_unlock; -+ mutex_unlock(&dqm->lock); -+ return retval; - } - - list_add(&q->list, &qpd->queues_list); -- qpd->queue_count++; - if (q->properties.is_active) - dqm->queue_count++; - -@@ -291,9 +205,8 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, - pr_debug("Total of %d queues are accountable so far\n", - dqm->total_queue_count); - --out_unlock: - mutex_unlock(&dqm->lock); -- return retval; -+ return 0; - } - - static int allocate_hqd(struct device_queue_manager *dqm, struct 
queue *q) -@@ -303,16 +216,19 @@ static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q) - - set = false; - -- for (pipe = dqm->next_pipe_to_allocate, i = 0; -- i < get_pipes_per_mec(dqm); -+ for (pipe = dqm->next_pipe_to_allocate, i = 0; i < get_pipes_per_mec(dqm); - pipe = ((pipe + 1) % get_pipes_per_mec(dqm)), ++i) { - - if (!is_pipe_enabled(dqm, 0, pipe)) - continue; - - if (dqm->allocated_queues[pipe] != 0) { -- bit = ffs(dqm->allocated_queues[pipe]) - 1; -- dqm->allocated_queues[pipe] &= ~(1 << bit); -+ bit = find_first_bit( -+ (unsigned long *)&dqm->allocated_queues[pipe], -+ get_queues_per_pipe(dqm)); -+ -+ clear_bit(bit, -+ (unsigned long *)&dqm->allocated_queues[pipe]); - q->pipe = pipe; - q->queue = bit; - set = true; -@@ -323,7 +239,8 @@ static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q) - if (!set) - return -EBUSY; - -- pr_debug("hqd slot - pipe %d, queue %d\n", q->pipe, q->queue); -+ pr_debug("kfd: DQM %s hqd slot - pipe (%d) queue(%d)\n", -+ __func__, q->pipe, q->queue); - /* horizontal hqd allocation */ - dqm->next_pipe_to_allocate = (pipe + 1) % get_pipes_per_mec(dqm); - -@@ -333,7 +250,7 @@ static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q) - static inline void deallocate_hqd(struct device_queue_manager *dqm, - struct queue *q) - { -- dqm->allocated_queues[q->pipe] |= (1 << q->queue); -+ set_bit(q->queue, (unsigned long *)&dqm->allocated_queues[q->pipe]); - } - - static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, -@@ -343,203 +260,138 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, - int retval; - struct mqd_manager *mqd; - -+ BUG_ON(!dqm || !q || !qpd); -+ - mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); -- if (!mqd) -+ if (mqd == NULL) - return -ENOMEM; - - retval = allocate_hqd(dqm, q); -- if (retval) -+ if (retval != 0) - return retval; - -- retval = allocate_doorbell(qpd, q); -- if (retval) -- goto out_deallocate_hqd; -- - retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, - &q->gart_mqd_addr, &q->properties); -- if (retval) -- goto out_deallocate_doorbell; -- -- pr_debug("Loading mqd to hqd on pipe %d, queue %d\n", -- q->pipe, q->queue); -- -- dqm->dev->kfd2kgd->alloc_memory_of_scratch( -- dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid); -+ if (retval != 0) { -+ deallocate_hqd(dqm, q); -+ return retval; -+ } - -- if (!q->properties.is_active) -- return 0; -+ pr_debug("kfd: loading mqd to hqd on pipe (%d) queue (%d)\n", -+ q->pipe, -+ q->queue); - -- retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue, &q->properties, -- q->process->mm); -- if (retval) -- goto out_uninit_mqd; -+ retval = mqd->load_mqd(mqd, q->mqd, q->pipe, -+ q->queue, (uint32_t __user *) q->properties.write_ptr); -+ if (retval != 0) { -+ deallocate_hqd(dqm, q); -+ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); -+ return retval; -+ } - - return 0; -- --out_uninit_mqd: -- mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); --out_deallocate_doorbell: -- deallocate_doorbell(qpd, q); --out_deallocate_hqd: -- deallocate_hqd(dqm, q); -- -- return retval; - } - --/* Access to DQM has to be locked before calling destroy_queue_nocpsch_locked -- * to avoid asynchronized access -- */ --static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, -+static int destroy_queue_nocpsch(struct device_queue_manager *dqm, - struct qcm_process_device *qpd, - struct queue *q) - { - int retval; - struct mqd_manager *mqd; - -- mqd = dqm->ops.get_mqd_manager(dqm, -- 
get_mqd_type_from_queue_type(q->properties.type)); -- if (!mqd) -- return -ENOMEM; -+ BUG_ON(!dqm || !q || !q->mqd || !qpd); -+ -+ retval = 0; - -- deallocate_doorbell(qpd, q); -+ pr_debug("kfd: In Func %s\n", __func__); - -- if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) -+ mutex_lock(&dqm->lock); -+ -+ if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) { -+ mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); -+ if (mqd == NULL) { -+ retval = -ENOMEM; -+ goto out; -+ } - deallocate_hqd(dqm, q); -- else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { -+ } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { -+ mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_SDMA); -+ if (mqd == NULL) { -+ retval = -ENOMEM; -+ goto out; -+ } - dqm->sdma_queue_count--; - deallocate_sdma_queue(dqm, q->sdma_id); - } else { -- pr_debug("q->properties.type %d is invalid\n", -+ pr_debug("q->properties.type is invalid (%d)\n", - q->properties.type); - retval = -EINVAL; -+ goto out; - } -- dqm->total_queue_count--; - - retval = mqd->destroy_mqd(mqd, q->mqd, - KFD_PREEMPT_TYPE_WAVEFRONT_RESET, -- KFD_HIQ_TIMEOUT, -+ QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS, - q->pipe, q->queue); -- if (retval == -ETIME) -- qpd->reset_wavefronts = true; -+ -+ if (retval != 0) -+ goto out; - - mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); - - list_del(&q->list); -- if (list_empty(&qpd->queues_list)) { -- if (qpd->reset_wavefronts) { -- pr_warn("Resetting wave fronts (nocpsch) on dev %p\n", -- dqm->dev); -- /* dbgdev_wave_reset_wavefronts has to be called before -- * deallocate_vmid(), i.e. when vmid is still in use. -- */ -- dbgdev_wave_reset_wavefronts(dqm->dev, -- qpd->pqm->process); -- qpd->reset_wavefronts = false; -- } -- -+ if (list_empty(&qpd->queues_list)) - deallocate_vmid(dqm, qpd, q); -- } -- qpd->queue_count--; - if (q->properties.is_active) - dqm->queue_count--; - -- return retval; --} -- --static int destroy_queue_nocpsch(struct device_queue_manager *dqm, -- struct qcm_process_device *qpd, -- struct queue *q) --{ -- int retval; -+ /* -+ * Unconditionally decrement this counter, regardless of the queue's -+ * type -+ */ -+ dqm->total_queue_count--; -+ pr_debug("Total of %d queues are accountable so far\n", -+ dqm->total_queue_count); - -- mutex_lock(&dqm->lock); -- retval = destroy_queue_nocpsch_locked(dqm, qpd, q); -+out: - mutex_unlock(&dqm->lock); -- - return retval; - } - --static bool is_queue_nocpsch(struct device_queue_manager *dqm, struct queue *q) --{ -- return (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS && -- (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || -- q->properties.type == KFD_QUEUE_TYPE_SDMA)); --} -- - static int update_queue(struct device_queue_manager *dqm, struct queue *q) - { - int retval; - struct mqd_manager *mqd; -- struct kfd_process_device *pdd; -- - bool prev_active = false; - -- mutex_lock(&dqm->lock); -+ BUG_ON(!dqm || !q || !q->mqd); - -- pdd = kfd_get_process_device_data(q->device, q->process); -- if (!pdd) { -- retval = -ENODEV; -- goto out_unlock; -- } -+ mutex_lock(&dqm->lock); - mqd = dqm->ops.get_mqd_manager(dqm, - get_mqd_type_from_queue_type(q->properties.type)); -- if (!mqd) { -- retval = -ENOMEM; -- goto out_unlock; -- } -- /* -- * Eviction state logic: we only mark active queues as evicted -- * to avoid the overhead of restoring inactive queues later -- */ -- if (pdd->qpd.evicted > 0) -- q->properties.is_evicted = (q->properties.queue_size > 0 && -- q->properties.queue_percent > 0 && -- q->properties.queue_address != 0); -- -- /* save previous activity state for counters 
*/ -- prev_active = q->properties.is_active; -- -- /* HWS mode, unmap first to own mqd */ -- if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) { -- retval = unmap_queues_cpsch(dqm, -- KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); -- if (retval) { -- pr_err("unmap queue failed"); -- goto out_unlock; -- } -- } else if (is_queue_nocpsch(dqm, q) && prev_active) { -- retval = mqd->destroy_mqd(mqd, q->mqd, -- KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN, -- KFD_UNMAP_LATENCY_MS, q->pipe, q->queue); -- if (retval) { -- pr_err("destroy mqd failed"); -- goto out_unlock; -- } -+ if (mqd == NULL) { -+ mutex_unlock(&dqm->lock); -+ return -ENOMEM; - } - -- retval = mqd->update_mqd(mqd, q->mqd, &q->properties); -+ if (q->properties.is_active) -+ prev_active = true; - -- if (is_queue_nocpsch(dqm, q)) { -- if (q->properties.is_active) -- retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue, -- &q->properties, q->process->mm); -- } - /* - * - * check active state vs. the previous state - * and modify counter accordingly - */ -- if (q->properties.is_active && !prev_active) -+ retval = mqd->update_mqd(mqd, q->mqd, &q->properties); -+ if ((q->properties.is_active) && (!prev_active)) - dqm->queue_count++; -- else if (!q->properties.is_active && prev_active) -+ else if ((!q->properties.is_active) && (prev_active)) - dqm->queue_count--; - -- if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) -- retval = map_queues_cpsch(dqm); -+ if (sched_policy != KFD_SCHED_POLICY_NO_HWS) -+ retval = execute_queues_cpsch(dqm, false); - --out_unlock: - mutex_unlock(&dqm->lock); -- - return retval; - } - -@@ -548,169 +400,41 @@ static struct mqd_manager *get_mqd_manager_nocpsch( - { - struct mqd_manager *mqd; - -- if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) -- return NULL; -+ BUG_ON(!dqm || type >= KFD_MQD_TYPE_MAX); - -- pr_debug("mqd type %d\n", type); -+ pr_debug("kfd: In func %s mqd type %d\n", __func__, type); - - mqd = dqm->mqds[type]; - if (!mqd) { - mqd = mqd_manager_init(type, dqm->dev); -- if (!mqd) -- pr_err("mqd manager is NULL"); -+ if (mqd == NULL) -+ pr_err("kfd: mqd manager is NULL"); - dqm->mqds[type] = mqd; - } - - return mqd; - } - --int process_evict_queues(struct device_queue_manager *dqm, -- struct qcm_process_device *qpd) --{ -- struct queue *q, *next; -- struct mqd_manager *mqd; -- struct kfd_process_device *pdd; -- int retval = 0; -- -- mutex_lock(&dqm->lock); -- if (qpd->evicted++ > 0) /* already evicted, do nothing */ -- goto out; -- -- pdd = qpd_to_pdd(qpd); -- pr_info_ratelimited("Evicting PASID %u queues\n", -- pdd->process->pasid); -- -- /* unactivate all active queues on the qpd */ -- list_for_each_entry_safe(q, next, &qpd->queues_list, list) { -- mqd = dqm->ops.get_mqd_manager(dqm, -- get_mqd_type_from_queue_type(q->properties.type)); -- if (!mqd) { /* should not be here */ -- pr_err("Cannot evict queue, mqd is NULL\n"); -- retval = -ENOMEM; -- goto out; -- } -- /* if the queue is not active anyway, it is not evicted */ -- if (q->properties.is_active) { -- q->properties.is_evicted = true; -- q->properties.is_active = false; -- } -- -- if (is_queue_nocpsch(dqm, q) && -- q->properties.is_evicted) -- retval = mqd->destroy_mqd(mqd, q->mqd, -- KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN, -- KFD_UNMAP_LATENCY_MS, q->pipe, q->queue); -- if (q->properties.is_evicted) -- dqm->queue_count--; -- } -- if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) -- retval = execute_queues_cpsch(dqm, qpd->is_debug); -- --out: -- mutex_unlock(&dqm->lock); -- return retval; -- --} -- --int process_restore_queues(struct device_queue_manager *dqm, -- 
struct qcm_process_device *qpd) --{ -- struct queue *q, *next; -- struct mqd_manager *mqd; -- int retval = 0; -- struct kfd_process_device *pdd; -- uint32_t pd_base; -- -- pdd = qpd_to_pdd(qpd); -- /* Retrieve PD base */ -- pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm); -- -- mutex_lock(&dqm->lock); -- if (qpd->evicted == 0) /* already restored, do nothing */ -- goto out_unlock; -- -- if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */ -- qpd->evicted--; -- goto out_unlock; -- } -- -- pr_info_ratelimited("Restoring PASID %u queues\n", -- pdd->process->pasid); -- -- /* Update PD Base in QPD */ -- qpd->page_table_base = pd_base; -- pr_debug("Updated PD address to 0x%08x\n", pd_base); -- -- if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS && -- !list_empty(&qpd->queues_list)) { -- dqm->dev->kfd2kgd->set_vm_context_page_table_base( -- dqm->dev->kgd, -- qpd->vmid, -- qpd->page_table_base); -- -- kfd_flush_tlb(dqm->dev, pdd->process->pasid); -- } -- -- /* activate all active queues on the qpd */ -- list_for_each_entry_safe(q, next, &qpd->queues_list, list) { -- mqd = dqm->ops.get_mqd_manager(dqm, -- get_mqd_type_from_queue_type(q->properties.type)); -- if (!mqd) { /* should not be here */ -- pr_err("Cannot restore queue, mqd is NULL\n"); -- retval = -ENOMEM; -- goto out_unlock; -- } -- if (q->properties.is_evicted) { -- q->properties.is_evicted = false; -- q->properties.is_active = true; -- -- if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS && -- (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || -- q->properties.type == KFD_QUEUE_TYPE_SDMA)) -- retval = mqd->load_mqd(mqd, q->mqd, q->pipe, -- q->queue, &q->properties, -- q->process->mm); -- dqm->queue_count++; -- } -- } -- if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) -- retval = execute_queues_cpsch(dqm, false); -- -- if (retval == 0) -- qpd->evicted = 0; -- --out_unlock: -- mutex_unlock(&dqm->lock); -- -- return retval; --} -- --static int register_process(struct device_queue_manager *dqm, -+static int register_process_nocpsch(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) - { - struct device_process_node *n; - int retval; -- struct kfd_process_device *pdd; -- uint32_t pd_base; - -- n = kzalloc(sizeof(*n), GFP_KERNEL); -+ BUG_ON(!dqm || !qpd); -+ -+ pr_debug("kfd: In func %s\n", __func__); -+ -+ n = kzalloc(sizeof(struct device_process_node), GFP_KERNEL); - if (!n) - return -ENOMEM; - - n->qpd = qpd; - -- pdd = qpd_to_pdd(qpd); -- /* Retrieve PD base */ -- pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm); -- - mutex_lock(&dqm->lock); - list_add(&n->list, &dqm->queues); - -- /* Update PD Base in QPD */ -- qpd->page_table_base = pd_base; -- pr_debug("Updated PD address to 0x%08x\n", pd_base); -- -- retval = dqm->asic_ops.update_qpd(dqm, qpd); -+ retval = dqm->ops_asic_specific.register_process(dqm, qpd); - - dqm->processes_count++; - -@@ -719,12 +443,16 @@ static int register_process(struct device_queue_manager *dqm, - return retval; - } - --static int unregister_process(struct device_queue_manager *dqm, -+static int unregister_process_nocpsch(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) - { - int retval; - struct device_process_node *cur, *next; - -+ BUG_ON(!dqm || !qpd); -+ -+ pr_debug("In func %s\n", __func__); -+ - pr_debug("qpd->queues_list is %s\n", - list_empty(&qpd->queues_list) ? 
"empty" : "not empty"); - -@@ -765,41 +493,48 @@ static void init_interrupts(struct device_queue_manager *dqm) - { - unsigned int i; - -+ BUG_ON(dqm == NULL); -+ - for (i = 0 ; i < get_pipes_per_mec(dqm) ; i++) - if (is_pipe_enabled(dqm, 0, i)) - dqm->dev->kfd2kgd->init_interrupts(dqm->dev->kgd, i); - } -+ - static int init_scheduler(struct device_queue_manager *dqm) - { -- return 0; -+ int retval = 0; -+ -+ BUG_ON(!dqm); -+ -+ pr_debug("kfd: In %s\n", __func__); -+ -+ return retval; - } - - static int initialize_nocpsch(struct device_queue_manager *dqm) - { -- int pipe, queue; -+ int i; - -- pr_debug("num of pipes: %d\n", get_pipes_per_mec(dqm)); -+ BUG_ON(!dqm); - -- dqm->allocated_queues = kcalloc(get_pipes_per_mec(dqm), -- sizeof(unsigned int), GFP_KERNEL); -- if (!dqm->allocated_queues) -- return -ENOMEM; -+ pr_debug("kfd: In func %s num of pipes: %d\n", -+ __func__, get_pipes_per_mec(dqm)); - - mutex_init(&dqm->lock); - INIT_LIST_HEAD(&dqm->queues); - dqm->queue_count = dqm->next_pipe_to_allocate = 0; - dqm->sdma_queue_count = 0; -- -- for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) { -- int pipe_offset = pipe * get_queues_per_pipe(dqm); -- -- for (queue = 0; queue < get_queues_per_pipe(dqm); queue++) -- if (test_bit(pipe_offset + queue, -- dqm->dev->shared_resources.queue_bitmap)) -- dqm->allocated_queues[pipe] |= 1 << queue; -+ dqm->allocated_queues = kcalloc(get_pipes_per_mec(dqm), -+ sizeof(unsigned int), GFP_KERNEL); -+ if (!dqm->allocated_queues) { -+ mutex_destroy(&dqm->lock); -+ return -ENOMEM; - } - -- dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1; -+ for (i = 0; i < get_pipes_per_mec(dqm); i++) -+ dqm->allocated_queues[i] = (1 << get_queues_per_pipe(dqm)) - 1; -+ -+ dqm->vmid_bitmap = (1 << VMID_PER_DEVICE) - 1; - dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1; - - init_scheduler(dqm); -@@ -810,7 +545,9 @@ static void uninitialize_nocpsch(struct device_queue_manager *dqm) - { - int i; - -- WARN_ON(dqm->queue_count > 0 || dqm->processes_count > 0); -+ BUG_ON(!dqm); -+ -+ BUG_ON(dqm->queue_count > 0 || dqm->processes_count > 0); - - kfree(dqm->allocated_queues); - for (i = 0 ; i < KFD_MQD_TYPE_MAX ; i++) -@@ -822,12 +559,11 @@ static void uninitialize_nocpsch(struct device_queue_manager *dqm) - static int start_nocpsch(struct device_queue_manager *dqm) - { - init_interrupts(dqm); -- return pm_init(&dqm->packets, dqm, dqm->dev->mec_fw_version); -+ return 0; - } - - static int stop_nocpsch(struct device_queue_manager *dqm) - { -- pm_uninit(&dqm->packets); - return 0; - } - -@@ -839,8 +575,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, - if (dqm->sdma_bitmap == 0) - return -ENOMEM; - -- bit = ffs(dqm->sdma_bitmap) - 1; -- dqm->sdma_bitmap &= ~(1 << bit); -+ bit = find_first_bit((unsigned long *)&dqm->sdma_bitmap, -+ CIK_SDMA_QUEUES); -+ -+ clear_bit(bit, (unsigned long *)&dqm->sdma_bitmap); - *sdma_queue_id = bit; - - return 0; -@@ -851,7 +589,7 @@ static void deallocate_sdma_queue(struct device_queue_manager *dqm, - { - if (sdma_queue_id >= CIK_SDMA_QUEUES) - return; -- dqm->sdma_bitmap |= (1 << sdma_queue_id); -+ set_bit(sdma_queue_id, (unsigned long *)&dqm->sdma_bitmap); - } - - static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, -@@ -866,40 +604,33 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, - return -ENOMEM; - - retval = allocate_sdma_queue(dqm, &q->sdma_id); -- if (retval) -+ if (retval != 0) - return retval; - -- q->properties.sdma_queue_id = q->sdma_id / 
CIK_SDMA_QUEUES_PER_ENGINE; -- q->properties.sdma_engine_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; -- -- retval = allocate_doorbell(qpd, q); -- if (retval) -- goto out_deallocate_sdma_queue; -+ q->properties.sdma_queue_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; -+ q->properties.sdma_engine_id = q->sdma_id / CIK_SDMA_ENGINE_NUM; - -- pr_debug("SDMA id is: %d\n", q->sdma_id); -- pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id); -- pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id); -+ pr_debug("kfd: sdma id is: %d\n", q->sdma_id); -+ pr_debug(" sdma queue id: %d\n", q->properties.sdma_queue_id); -+ pr_debug(" sdma engine id: %d\n", q->properties.sdma_engine_id); - -- dqm->asic_ops.init_sdma_vm(dqm, q, qpd); -+ dqm->ops_asic_specific.init_sdma_vm(dqm, q, qpd); - retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, - &q->gart_mqd_addr, &q->properties); -- if (retval) -- goto out_deallocate_doorbell; -+ if (retval != 0) { -+ deallocate_sdma_queue(dqm, q->sdma_id); -+ return retval; -+ } - -- retval = mqd->load_mqd(mqd, q->mqd, 0, 0, &q->properties, NULL); -- if (retval) -- goto out_uninit_mqd; -+ retval = mqd->load_mqd(mqd, q->mqd, 0, -+ 0, NULL); -+ if (retval != 0) { -+ deallocate_sdma_queue(dqm, q->sdma_id); -+ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); -+ return retval; -+ } - - return 0; -- --out_uninit_mqd: -- mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); --out_deallocate_doorbell: -- deallocate_doorbell(qpd, q); --out_deallocate_sdma_queue: -- deallocate_sdma_queue(dqm, q->sdma_id); -- -- return retval; - } - - /* -@@ -911,7 +642,12 @@ static int set_sched_resources(struct device_queue_manager *dqm) - int i, mec; - struct scheduling_resources res; - -- res.vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap; -+ BUG_ON(!dqm); -+ -+ pr_debug("kfd: In func %s\n", __func__); -+ -+ res.vmid_mask = (1 << VMID_PER_DEVICE) - 1; -+ res.vmid_mask <<= KFD_VMID_START_OFFSET; - - res.queue_mask = 0; - for (i = 0; i < KGD_MAX_QUEUES; ++i) { -@@ -927,8 +663,7 @@ static int set_sched_resources(struct device_queue_manager *dqm) - - /* This situation may be hit in the future if a new HW - * generation exposes more than 64 queues. 
If so, the -- * definition of res.queue_mask needs updating -- */ -+ * definition of res.queue_mask needs updating */ - if (WARN_ON(i >= (sizeof(res.queue_mask)*8))) { - pr_err("Invalid queue enabled by amdgpu: %d\n", i); - break; -@@ -939,9 +674,9 @@ static int set_sched_resources(struct device_queue_manager *dqm) - res.gws_mask = res.oac_mask = res.gds_heap_base = - res.gds_heap_size = 0; - -- pr_debug("Scheduling resources:\n" -- "vmid mask: 0x%8X\n" -- "queue mask: 0x%8llX\n", -+ pr_debug("kfd: scheduling resources:\n" -+ " vmid mask: 0x%8X\n" -+ " queue mask: 0x%8llX\n", - res.vmid_mask, res.queue_mask); - - return pm_send_set_resources(&dqm->packets, &res); -@@ -951,42 +686,51 @@ static int initialize_cpsch(struct device_queue_manager *dqm) - { - int retval; - -- pr_debug("num of pipes: %d\n", get_pipes_per_mec(dqm)); -+ BUG_ON(!dqm); -+ -+ pr_debug("kfd: In func %s num of pipes: %d\n", -+ __func__, get_pipes_per_mec(dqm)); - - mutex_init(&dqm->lock); - INIT_LIST_HEAD(&dqm->queues); - dqm->queue_count = dqm->processes_count = 0; - dqm->sdma_queue_count = 0; - dqm->active_runlist = false; -- dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1; -- retval = dqm->asic_ops.init_cpsch(dqm); -- if (retval) -- mutex_destroy(&dqm->lock); -+ retval = dqm->ops_asic_specific.initialize(dqm); -+ if (retval != 0) -+ goto fail_init_pipelines; -+ -+ return 0; - -+fail_init_pipelines: -+ mutex_destroy(&dqm->lock); - return retval; - } - - static int start_cpsch(struct device_queue_manager *dqm) - { -+ struct device_process_node *node; - int retval; - -+ BUG_ON(!dqm); -+ - retval = 0; - -- retval = pm_init(&dqm->packets, dqm, dqm->dev->mec_fw_version); -- if (retval) -+ retval = pm_init(&dqm->packets, dqm); -+ if (retval != 0) - goto fail_packet_manager_init; - - retval = set_sched_resources(dqm); -- if (retval) -+ if (retval != 0) - goto fail_set_sched_resources; - -- pr_debug("Allocating fence memory\n"); -+ pr_debug("kfd: allocating fence memory\n"); - - /* allocate fence memory on the gart */ - retval = kfd_gtt_sa_allocate(dqm->dev, sizeof(*dqm->fence_addr), - &dqm->fence_mem); - -- if (retval) -+ if (retval != 0) - goto fail_allocate_vidmem; - - dqm->fence_addr = dqm->fence_mem->cpu_ptr; -@@ -994,9 +738,12 @@ static int start_cpsch(struct device_queue_manager *dqm) - - init_interrupts(dqm); - -- mutex_lock(&dqm->lock); -- execute_queues_cpsch(dqm, false); -- mutex_unlock(&dqm->lock); -+ list_for_each_entry(node, &dqm->queues, list) -+ if (node->qpd->pqm->process && dqm->dev) -+ kfd_bind_process_to_device(dqm->dev, -+ node->qpd->pqm->process); -+ -+ execute_queues_cpsch(dqm, true); - - return 0; - fail_allocate_vidmem: -@@ -1008,12 +755,17 @@ static int start_cpsch(struct device_queue_manager *dqm) - - static int stop_cpsch(struct device_queue_manager *dqm) - { -- mutex_lock(&dqm->lock); -+ struct device_process_node *node; -+ struct kfd_process_device *pdd; - -- unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); -+ BUG_ON(!dqm); - -- mutex_unlock(&dqm->lock); -+ destroy_queues_cpsch(dqm, true, true); - -+ list_for_each_entry(node, &dqm->queues, list) { -+ pdd = qpd_to_pdd(node->qpd); -+ pdd->bound = false; -+ } - kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); - pm_uninit(&dqm->packets); - -@@ -1024,9 +776,13 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm, - struct kernel_queue *kq, - struct qcm_process_device *qpd) - { -+ BUG_ON(!dqm || !kq || !qpd); -+ -+ pr_debug("kfd: In func %s\n", __func__); -+ - mutex_lock(&dqm->lock); - if (dqm->total_queue_count >= 
max_num_of_queues_per_device) { -- pr_warn("Can't create new kernel queue because %d queues were already created\n", -+ pr_warn("amdkfd: Can't create new kernel queue because %d queues were already created\n", - dqm->total_queue_count); - mutex_unlock(&dqm->lock); - return -EPERM; -@@ -1053,12 +809,17 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm, - struct kernel_queue *kq, - struct qcm_process_device *qpd) - { -+ BUG_ON(!dqm || !kq); -+ -+ pr_debug("kfd: In %s\n", __func__); -+ - mutex_lock(&dqm->lock); - /* here we actually preempt the DIQ */ -+ destroy_queues_cpsch(dqm, true, false); - list_del(&kq->list); - dqm->queue_count--; - qpd->is_debug = false; -- execute_queues_cpsch(dqm, true); -+ execute_queues_cpsch(dqm, false); - /* - * Unconditionally decrement this counter, regardless of the queue's - * type. -@@ -1069,12 +830,22 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm, - mutex_unlock(&dqm->lock); - } - -+static void select_sdma_engine_id(struct queue *q) -+{ -+ static int sdma_id; -+ -+ q->sdma_id = sdma_id; -+ sdma_id = (sdma_id + 1) % 2; -+} -+ - static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, - struct qcm_process_device *qpd, int *allocate_vmid) - { - int retval; - struct mqd_manager *mqd; - -+ BUG_ON(!dqm || !q || !qpd); -+ - retval = 0; - - if (allocate_vmid) -@@ -1083,60 +854,37 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, - mutex_lock(&dqm->lock); - - if (dqm->total_queue_count >= max_num_of_queues_per_device) { -- pr_warn("Can't create new usermode queue because %d queues were already created\n", -+ pr_warn("amdkfd: Can't create new usermode queue because %d queues were already created\n", - dqm->total_queue_count); - retval = -EPERM; -- goto out_unlock; -- } -- -- if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { -- retval = allocate_sdma_queue(dqm, &q->sdma_id); -- if (retval) -- goto out_unlock; -- q->properties.sdma_queue_id = -- q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; -- q->properties.sdma_engine_id = -- q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; -+ goto out; - } - -- retval = allocate_doorbell(qpd, q); -- if (retval) -- goto out_deallocate_sdma_queue; -+ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) -+ select_sdma_engine_id(q); - - mqd = dqm->ops.get_mqd_manager(dqm, - get_mqd_type_from_queue_type(q->properties.type)); - -- if (!mqd) { -- retval = -ENOMEM; -- goto out_deallocate_doorbell; -+ if (mqd == NULL) { -+ mutex_unlock(&dqm->lock); -+ return -ENOMEM; - } -- /* -- * Eviction state logic: we only mark active queues as evicted -- * to avoid the overhead of restoring inactive queues later -- */ -- if (qpd->evicted) -- q->properties.is_evicted = (q->properties.queue_size > 0 && -- q->properties.queue_percent > 0 && -- q->properties.queue_address != 0); -- -- dqm->asic_ops.init_sdma_vm(dqm, q, qpd); - -- q->properties.tba_addr = qpd->tba_addr; -- q->properties.tma_addr = qpd->tma_addr; -+ dqm->ops_asic_specific.init_sdma_vm(dqm, q, qpd); - retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, - &q->gart_mqd_addr, &q->properties); -- if (retval) -- goto out_deallocate_doorbell; -+ if (retval != 0) -+ goto out; - - list_add(&q->list, &qpd->queues_list); -- qpd->queue_count++; - if (q->properties.is_active) { - dqm->queue_count++; - retval = execute_queues_cpsch(dqm, false); - } - - if (q->properties.type == KFD_QUEUE_TYPE_SDMA) -- dqm->sdma_queue_count++; -+ dqm->sdma_queue_count++; - /* - * Unconditionally increment this counter, regardless 
of the queue's - * type or whether the queue is active. -@@ -1146,31 +894,21 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, - pr_debug("Total of %d queues are accountable so far\n", - dqm->total_queue_count); - -+out: - mutex_unlock(&dqm->lock); - return retval; -- --out_deallocate_doorbell: -- deallocate_doorbell(qpd, q); --out_deallocate_sdma_queue: -- if (q->properties.type == KFD_QUEUE_TYPE_SDMA) -- deallocate_sdma_queue(dqm, q->sdma_id); --out_unlock: -- mutex_unlock(&dqm->lock); -- -- return retval; - } - - int amdkfd_fence_wait_timeout(unsigned int *fence_addr, - unsigned int fence_value, -- unsigned long timeout_ms) -+ unsigned long timeout) - { -- unsigned long end_jiffies; -- -- end_jiffies = (timeout_ms * HZ / 1000) + jiffies; -+ BUG_ON(!fence_addr); -+ timeout += jiffies; - - while (*fence_addr != fence_value) { -- if (time_after(jiffies, end_jiffies)) { -- pr_err("qcm fence wait loop timeout expired\n"); -+ if (time_after(jiffies, timeout)) { -+ pr_err("kfd: qcm fence wait loop timeout expired\n"); - return -ETIME; - } - schedule(); -@@ -1179,63 +917,46 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr, - return 0; - } - --static int unmap_sdma_queues(struct device_queue_manager *dqm, -- unsigned int sdma_engine) -+static int destroy_sdma_queues(struct device_queue_manager *dqm, -+ unsigned int sdma_engine) - { - return pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA, -- KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false, -+ KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES, 0, false, - sdma_engine); - } - --/* dqm->lock mutex has to be locked before calling this function */ --static int map_queues_cpsch(struct device_queue_manager *dqm) -+static int destroy_queues_cpsch(struct device_queue_manager *dqm, -+ bool preempt_static_queues, bool lock) - { - int retval; -+ enum kfd_preempt_type_filter preempt_type; -+ struct kfd_process_device *pdd; - -- if (dqm->queue_count <= 0 || dqm->processes_count <= 0) { -- retval = 0; -- return retval; -- } -- -- if (dqm->active_runlist) { -- retval = 0; -- return retval; -- } -- -- retval = pm_send_runlist(&dqm->packets, &dqm->queues); -- if (retval) { -- pr_err("failed to execute runlist\n"); -- return retval; -- } -- dqm->active_runlist = true; -- -- return retval; --} -- --/* dqm->lock mutex has to be locked before calling this function */ --static int unmap_queues_cpsch(struct device_queue_manager *dqm, -- enum kfd_unmap_queues_filter filter, -- uint32_t filter_param) --{ -- int retval; -+ BUG_ON(!dqm); - - retval = 0; - -+ if (lock) -+ mutex_lock(&dqm->lock); - if (!dqm->active_runlist) -- return retval; -+ goto out; - -- pr_debug("Before destroying queues, sdma queue count is : %u\n", -+ pr_debug("kfd: Before destroying queues, sdma queue count is : %u\n", - dqm->sdma_queue_count); - - if (dqm->sdma_queue_count > 0) { -- unmap_sdma_queues(dqm, 0); -- unmap_sdma_queues(dqm, 1); -+ destroy_sdma_queues(dqm, 0); -+ destroy_sdma_queues(dqm, 1); - } - -+ preempt_type = preempt_static_queues ? 
-+ KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES : -+ KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES; -+ - retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE, -- filter, filter_param, false, 0); -- if (retval) -- return retval; -+ preempt_type, 0, false, 0); -+ if (retval != 0) -+ goto out; - - *dqm->fence_addr = KFD_FENCE_INIT; - pm_send_query_status(&dqm->packets, dqm->fence_gpu_addr, -@@ -1243,36 +964,56 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, - /* should be timed out */ - retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED, - QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS); -- if (retval) { -- pr_err("Unmapping queues failed.\n"); -- return retval; -+ if (retval != 0) { -+ pdd = kfd_get_process_device_data(dqm->dev, -+ kfd_get_process(current)); -+ pdd->reset_wavefronts = true; -+ goto out; - } -- - pm_release_ib(&dqm->packets); - dqm->active_runlist = false; - -+out: -+ if (lock) -+ mutex_unlock(&dqm->lock); - return retval; - } - --/* dqm->lock mutex has to be locked before calling this function */ --static int execute_queues_cpsch(struct device_queue_manager *dqm, -- bool static_queues_included) -+static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock) - { - int retval; -- enum kfd_unmap_queues_filter filter; - -- filter = static_queues_included ? -- KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES : -- KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES; -+ BUG_ON(!dqm); - -- retval = unmap_queues_cpsch(dqm, filter, 0); -- if (retval) { -- pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n"); -- return retval; -+ if (lock) -+ mutex_lock(&dqm->lock); -+ -+ retval = destroy_queues_cpsch(dqm, false, false); -+ if (retval != 0) { -+ pr_err("kfd: the cp might be in an unrecoverable state due to an unsuccessful queues preemption"); -+ goto out; -+ } -+ -+ if (dqm->queue_count <= 0 || dqm->processes_count <= 0) { -+ retval = 0; -+ goto out; - } - -- retval = map_queues_cpsch(dqm); -+ if (dqm->active_runlist) { -+ retval = 0; -+ goto out; -+ } - -+ retval = pm_send_runlist(&dqm->packets, &dqm->queues); -+ if (retval != 0) { -+ pr_err("kfd: failed to execute runlist"); -+ goto out; -+ } -+ dqm->active_runlist = true; -+ -+out: -+ if (lock) -+ mutex_unlock(&dqm->lock); - return retval; - } - -@@ -1284,6 +1025,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, - struct mqd_manager *mqd; - bool preempt_all_queues; - -+ BUG_ON(!dqm || !qpd || !q); -+ - preempt_all_queues = false; - - retval = 0; -@@ -1308,21 +1051,14 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, - goto failed; - } - -- deallocate_doorbell(qpd, q); -- -- if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { -+ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) - dqm->sdma_queue_count--; -- deallocate_sdma_queue(dqm, q->sdma_id); -- } - - list_del(&q->list); -- qpd->queue_count--; - if (q->properties.is_active) - dqm->queue_count--; - -- retval = execute_queues_cpsch(dqm, false); -- if (retval == -ETIME) -- qpd->reset_wavefronts = true; -+ execute_queues_cpsch(dqm, false); - - mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); - -@@ -1336,7 +1072,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, - - mutex_unlock(&dqm->lock); - -- return retval; -+ return 0; - - failed: - failed_try_destroy_debugged_queue: -@@ -1360,10 +1096,9 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm, - void __user *alternate_aperture_base, - uint64_t alternate_aperture_size) - { -- bool retval = true; -+ bool retval; - -- 
if (!dqm->asic_ops.set_cache_memory_policy) -- return retval; -+ pr_debug("kfd: In func %s\n", __func__); - - mutex_lock(&dqm->lock); - -@@ -1385,17 +1120,20 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm, - uint64_t base = (uintptr_t)alternate_aperture_base; - uint64_t limit = base + alternate_aperture_size - 1; - -- if (limit <= base || (base & APE1_FIXED_BITS_MASK) != 0 || -- (limit & APE1_FIXED_BITS_MASK) != APE1_LIMIT_ALIGNMENT) { -- retval = false; -+ if (limit <= base) -+ goto out; -+ -+ if ((base & APE1_FIXED_BITS_MASK) != 0) -+ goto out; -+ -+ if ((limit & APE1_FIXED_BITS_MASK) != APE1_LIMIT_ALIGNMENT) - goto out; -- } - - qpd->sh_mem_ape1_base = base >> 16; - qpd->sh_mem_ape1_limit = limit >> 16; - } - -- retval = dqm->asic_ops.set_cache_memory_policy( -+ retval = dqm->ops_asic_specific.set_cache_memory_policy( - dqm, - qpd, - default_policy, -@@ -1403,199 +1141,35 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm, - alternate_aperture_base, - alternate_aperture_size); - -- if ((dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) && (qpd->vmid != 0)) -+ if ((sched_policy == KFD_SCHED_POLICY_NO_HWS) && (qpd->vmid != 0)) - program_sh_mem_settings(dqm, qpd); - -- pr_debug("sh_mem_config: 0x%x, ape1_base: 0x%x, ape1_limit: 0x%x\n", -+ pr_debug("kfd: sh_mem_config: 0x%x, ape1_base: 0x%x, ape1_limit: 0x%x\n", - qpd->sh_mem_config, qpd->sh_mem_ape1_base, - qpd->sh_mem_ape1_limit); - --out: - mutex_unlock(&dqm->lock); - return retval; --} -- --static int set_trap_handler(struct device_queue_manager *dqm, -- struct qcm_process_device *qpd, -- uint64_t tba_addr, -- uint64_t tma_addr) --{ -- uint64_t *tma; -- -- if (dqm->dev->cwsr_enabled) { -- /* Jump from CWSR trap handler to user trap */ -- tma = (uint64_t *)(qpd->cwsr_kaddr + dqm->dev->tma_offset); -- tma[0] = tba_addr; -- tma[1] = tma_addr; -- } else { -- qpd->tba_addr = tba_addr; -- qpd->tma_addr = tma_addr; -- } -- -- return 0; --} -- --static int process_termination_nocpsch(struct device_queue_manager *dqm, -- struct qcm_process_device *qpd) --{ -- struct queue *q, *next; -- struct device_process_node *cur, *next_dpn; -- int retval = 0; -- -- mutex_lock(&dqm->lock); -- -- /* Clear all user mode queues */ -- list_for_each_entry_safe(q, next, &qpd->queues_list, list) { -- int ret; -- -- ret = destroy_queue_nocpsch_locked(dqm, qpd, q); -- if (ret) -- retval = ret; -- } -- -- /* Unregister process */ -- list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) { -- if (qpd == cur->qpd) { -- list_del(&cur->list); -- kfree(cur); -- dqm->processes_count--; -- break; -- } -- } -- -- mutex_unlock(&dqm->lock); -- return retval; --} -- --static int get_wave_state(struct device_queue_manager *dqm, -- struct queue *q, -- void __user *ctl_stack, -- u32 *ctl_stack_used_size, -- u32 *save_area_used_size) --{ -- struct mqd_manager *mqd; -- int r; -- -- mutex_lock(&dqm->lock); -- -- if (q->properties.type != KFD_QUEUE_TYPE_COMPUTE || -- q->properties.is_active || !q->device->cwsr_enabled) { -- r = -EINVAL; -- goto dqm_unlock; -- } -- -- mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); -- if (!mqd) { -- r = -ENOMEM; -- goto dqm_unlock; -- } -- -- if (!mqd->get_wave_state) { -- r = -EINVAL; -- goto dqm_unlock; -- } -- -- r = mqd->get_wave_state(mqd, q->mqd, ctl_stack, ctl_stack_used_size, -- save_area_used_size); -- --dqm_unlock: -- mutex_unlock(&dqm->lock); -- return r; --} -- --static int process_termination_cpsch(struct device_queue_manager *dqm, -- struct qcm_process_device *qpd) --{ -- int 
retval; -- struct queue *q, *next; -- struct kernel_queue *kq, *kq_next; -- struct mqd_manager *mqd; -- struct device_process_node *cur, *next_dpn; -- bool unmap_static_queues = false; -- -- retval = 0; -- -- mutex_lock(&dqm->lock); -- -- /* Clean all kernel queues */ -- list_for_each_entry_safe(kq, kq_next, &qpd->priv_queue_list, list) { -- list_del(&kq->list); -- dqm->queue_count--; -- qpd->is_debug = false; -- dqm->total_queue_count--; -- unmap_static_queues = true; -- } -- -- /* Clear all user mode queues */ -- list_for_each_entry(q, &qpd->queues_list, list) { -- if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { -- dqm->sdma_queue_count--; -- deallocate_sdma_queue(dqm, q->sdma_id); -- } -- -- if (q->properties.is_active) -- dqm->queue_count--; -- -- dqm->total_queue_count--; -- } -- -- /* Unregister process */ -- list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) { -- if (qpd == cur->qpd) { -- list_del(&cur->list); -- kfree(cur); -- dqm->processes_count--; -- break; -- } -- } -- -- retval = execute_queues_cpsch(dqm, unmap_static_queues); -- if (retval || qpd->reset_wavefronts) { -- pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev); -- dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process); -- qpd->reset_wavefronts = false; -- } -- -- /* lastly, free mqd resources */ -- list_for_each_entry_safe(q, next, &qpd->queues_list, list) { -- mqd = dqm->ops.get_mqd_manager(dqm, -- get_mqd_type_from_queue_type(q->properties.type)); -- if (!mqd) { -- retval = -ENOMEM; -- goto out; -- } -- list_del(&q->list); -- qpd->queue_count--; -- mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); -- } - - out: - mutex_unlock(&dqm->lock); -- return retval; -+ return false; - } - - struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) - { - struct device_queue_manager *dqm; - -- pr_debug("Loading device queue manager\n"); -+ BUG_ON(!dev); -+ -+ pr_debug("kfd: loading device queue manager\n"); - -- dqm = kzalloc(sizeof(*dqm), GFP_KERNEL); -+ dqm = kzalloc(sizeof(struct device_queue_manager), GFP_KERNEL); - if (!dqm) - return NULL; - -- switch (dev->device_info->asic_family) { -- case CHIP_HAWAII: -- case CHIP_TONGA: -- dqm->sched_policy = KFD_SCHED_POLICY_NO_HWS; -- break; -- default: -- dqm->sched_policy = sched_policy; -- break; -- } -- - dqm->dev = dev; -- switch (dqm->sched_policy) { -+ switch (sched_policy) { - case KFD_SCHED_POLICY_HWS: - case KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION: - /* initialize dqm for cp scheduling */ -@@ -1606,15 +1180,12 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) - dqm->ops.destroy_queue = destroy_queue_cpsch; - dqm->ops.update_queue = update_queue; - dqm->ops.get_mqd_manager = get_mqd_manager_nocpsch; -- dqm->ops.register_process = register_process; -- dqm->ops.unregister_process = unregister_process; -+ dqm->ops.register_process = register_process_nocpsch; -+ dqm->ops.unregister_process = unregister_process_nocpsch; - dqm->ops.uninitialize = uninitialize_nocpsch; - dqm->ops.create_kernel_queue = create_kernel_queue_cpsch; - dqm->ops.destroy_kernel_queue = destroy_kernel_queue_cpsch; - dqm->ops.set_cache_memory_policy = set_cache_memory_policy; -- dqm->ops.set_trap_handler = set_trap_handler; -- dqm->ops.process_termination = process_termination_cpsch; -- dqm->ops.get_wave_state = get_wave_state; - break; - case KFD_SCHED_POLICY_NO_HWS: - /* initialize dqm for no cp scheduling */ -@@ -1624,142 +1195,39 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) - dqm->ops.destroy_queue = 
destroy_queue_nocpsch; - dqm->ops.update_queue = update_queue; - dqm->ops.get_mqd_manager = get_mqd_manager_nocpsch; -- dqm->ops.register_process = register_process; -- dqm->ops.unregister_process = unregister_process; -+ dqm->ops.register_process = register_process_nocpsch; -+ dqm->ops.unregister_process = unregister_process_nocpsch; - dqm->ops.initialize = initialize_nocpsch; - dqm->ops.uninitialize = uninitialize_nocpsch; - dqm->ops.set_cache_memory_policy = set_cache_memory_policy; -- dqm->ops.set_trap_handler = set_trap_handler; -- dqm->ops.process_termination = process_termination_nocpsch; -- dqm->ops.get_wave_state = get_wave_state; - break; - default: -- WARN(1, "Invalid scheduling policy %d", dqm->sched_policy); -- goto out_free; -+ BUG(); -+ break; - } - - switch (dev->device_info->asic_family) { - case CHIP_CARRIZO: -- device_queue_manager_init_vi(&dqm->asic_ops); -+ device_queue_manager_init_vi(&dqm->ops_asic_specific); - break; - - case CHIP_KAVERI: -- device_queue_manager_init_cik(&dqm->asic_ops); -- break; -- -- case CHIP_HAWAII: -- device_queue_manager_init_cik_hawaii(&dqm->asic_ops); -- break; -- -- case CHIP_TONGA: -- case CHIP_FIJI: -- case CHIP_POLARIS10: -- case CHIP_POLARIS11: -- device_queue_manager_init_vi_tonga(&dqm->asic_ops); -+ device_queue_manager_init_cik(&dqm->ops_asic_specific); - break; -- -- case CHIP_VEGA10: -- case CHIP_RAVEN: -- device_queue_manager_init_v9_vega10(&dqm->asic_ops); -- break; -- default: -- BUG(); - } - -- if (!dqm->ops.initialize(dqm)) -- return dqm; -+ if (dqm->ops.initialize(dqm) != 0) { -+ kfree(dqm); -+ return NULL; -+ } - --out_free: -- kfree(dqm); -- return NULL; -+ return dqm; - } - - void device_queue_manager_uninit(struct device_queue_manager *dqm) - { -+ BUG_ON(!dqm); -+ - dqm->ops.uninitialize(dqm); - kfree(dqm); - } -- --int kfd_process_vm_fault(struct device_queue_manager *dqm, -- unsigned int pasid) --{ -- struct kfd_process_device *pdd; -- struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); -- int ret = 0; -- -- if (!p) -- return -EINVAL; -- pdd = kfd_get_process_device_data(dqm->dev, p); -- if (pdd) -- ret = process_evict_queues(dqm, &pdd->qpd); -- kfd_unref_process(p); -- -- return ret; --} -- --static void seq_reg_dump(struct seq_file *m, -- uint32_t (*dump)[2], uint32_t n_regs) --{ -- uint32_t i, count; -- -- for (i = 0, count = 0; i < n_regs; i++) { -- if (count == 0 || -- dump[i-1][0] + sizeof(uint32_t) != dump[i][0]) { -- seq_printf(m, "%s %08x: %08x", -- i ? 
"\n" : "", -- dump[i][0], dump[i][1]); -- count = 7; -- } else { -- seq_printf(m, " %08x", dump[i][1]); -- count--; -- } -- } -- -- seq_puts(m, "\n"); --} -- --int device_queue_manager_debugfs_hqds(struct seq_file *m, void *data) --{ -- struct device_queue_manager *dqm = data; -- uint32_t (*dump)[2], n_regs; -- int pipe, queue; -- int r = 0; -- -- for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) { -- int pipe_offset = pipe * get_queues_per_pipe(dqm); -- -- for (queue = 0; queue < get_queues_per_pipe(dqm); queue++) { -- if (!test_bit(pipe_offset + queue, -- dqm->dev->shared_resources.queue_bitmap)) -- continue; -- -- r = dqm->dev->kfd2kgd->hqd_dump( -- dqm->dev->kgd, pipe, queue, &dump, &n_regs); -- if (r) -- break; -- -- seq_printf(m, " CP Pipe %d, Queue %d\n", -- pipe, queue); -- seq_reg_dump(m, dump, n_regs); -- -- kfree(dump); -- } -- } -- -- for (pipe = 0; pipe < CIK_SDMA_ENGINE_NUM; pipe++) { -- for (queue = 0; queue < CIK_SDMA_QUEUES_PER_ENGINE; queue++) { -- r = dqm->dev->kfd2kgd->hqd_sdma_dump( -- dqm->dev->kgd, pipe, queue, &dump, &n_regs); -- if (r) -- break; -- -- seq_printf(m, " SDMA Engine %d, RLC %d\n", -- pipe, queue); -- seq_reg_dump(m, dump, n_regs); -- -- kfree(dump); -- } -- } -- -- return r; --} -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h -index 841283a..faf820a 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h -@@ -29,10 +29,10 @@ - #include "kfd_priv.h" - #include "kfd_mqd_manager.h" - --#define KFD_HIQ_TIMEOUT (500) --#define KFD_UNMAP_LATENCY_MS (4000) --#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000) -- -+#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (500) -+#define CIK_VMID_NUM (8) -+#define KFD_VMID_START_OFFSET (8) -+#define VMID_PER_DEVICE CIK_VMID_NUM - #define KFD_DQM_FIRST_PIPE (0) - #define CIK_SDMA_QUEUES (4) - #define CIK_SDMA_QUEUES_PER_ENGINE (2) -@@ -79,14 +79,6 @@ struct device_process_node { - * @set_cache_memory_policy: Sets memory policy (cached/ non cached) for the - * memory apertures. - * -- * @set_page_directory_base: Sets the PD base address (GPU local memory) -- * in all the queues of the relevant process running on the specified device. -- * It preempts the queues, updates the value and execute the runlist again. -- * -- * @process_termination: Clears all process queues belongs to that device. -- * -- * @get_wave_state: Retrieves context save state and optionally copies the -- * control stack, if kept in the MQD, to the given userspace address. 
- */ - - struct device_queue_manager_ops { -@@ -130,26 +122,12 @@ struct device_queue_manager_ops { - enum cache_policy alternate_policy, - void __user *alternate_aperture_base, - uint64_t alternate_aperture_size); -- -- int (*set_trap_handler)(struct device_queue_manager *dqm, -- struct qcm_process_device *qpd, -- uint64_t tba_addr, -- uint64_t tma_addr); -- -- int (*process_termination)(struct device_queue_manager *dqm, -- struct qcm_process_device *qpd); -- -- int (*get_wave_state)(struct device_queue_manager *dqm, -- struct queue *q, -- void __user *ctl_stack, -- u32 *ctl_stack_used_size, -- u32 *save_area_used_size); - }; - - struct device_queue_manager_asic_ops { -- int (*update_qpd)(struct device_queue_manager *dqm, -+ int (*register_process)(struct device_queue_manager *dqm, - struct qcm_process_device *qpd); -- int (*init_cpsch)(struct device_queue_manager *dqm); -+ int (*initialize)(struct device_queue_manager *dqm); - bool (*set_cache_memory_policy)(struct device_queue_manager *dqm, - struct qcm_process_device *qpd, - enum cache_policy default_policy, -@@ -175,7 +153,7 @@ struct device_queue_manager_asic_ops { - - struct device_queue_manager { - struct device_queue_manager_ops ops; -- struct device_queue_manager_asic_ops asic_ops; -+ struct device_queue_manager_asic_ops ops_asic_specific; - - struct mqd_manager *mqds[KFD_MQD_TYPE_MAX]; - struct packet_manager packets; -@@ -196,37 +174,21 @@ struct device_queue_manager { - unsigned int *fence_addr; - struct kfd_mem_obj *fence_mem; - bool active_runlist; -- int sched_policy; - }; - --void device_queue_manager_init_cik( -- struct device_queue_manager_asic_ops *asic_ops); --void device_queue_manager_init_cik_hawaii( -- struct device_queue_manager_asic_ops *asic_ops); --void device_queue_manager_init_vi( -- struct device_queue_manager_asic_ops *asic_ops); --void device_queue_manager_init_vi_tonga( -- struct device_queue_manager_asic_ops *asic_ops); --void device_queue_manager_init_v9_vega10( -- struct device_queue_manager_asic_ops *asic_ops); -+void device_queue_manager_init_cik(struct device_queue_manager_asic_ops *ops); -+void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops); - void program_sh_mem_settings(struct device_queue_manager *dqm, - struct qcm_process_device *qpd); - unsigned int get_queues_num(struct device_queue_manager *dqm); - unsigned int get_queues_per_pipe(struct device_queue_manager *dqm); - unsigned int get_pipes_per_mec(struct device_queue_manager *dqm); - --int process_evict_queues(struct device_queue_manager *dqm, -- struct qcm_process_device *qpd); --int process_restore_queues(struct device_queue_manager *dqm, -- struct qcm_process_device *qpd); -- -- - static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) - { - return (pdd->lds_base >> 16) & 0xFF; - } - --/* This function is only useful for GFXv7 and v8 */ - static inline unsigned int - get_sh_mem_bases_nybble_64(struct kfd_process_device *pdd) - { -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c -index 8e1eb24..48dc056 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c -@@ -24,7 +24,6 @@ - #include "kfd_device_queue_manager.h" - #include "cik_regs.h" - #include "oss/oss_2_4_sh_mask.h" --#include "gca/gfx_7_2_sh_mask.h" - - static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, - struct qcm_process_device *qpd, -@@ -32,33 
+31,18 @@ static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, - enum cache_policy alternate_policy, - void __user *alternate_aperture_base, - uint64_t alternate_aperture_size); --static int update_qpd_cik(struct device_queue_manager *dqm, -- struct qcm_process_device *qpd); --static int update_qpd_cik_hawaii(struct device_queue_manager *dqm, -+static int register_process_cik(struct device_queue_manager *dqm, - struct qcm_process_device *qpd); - static int initialize_cpsch_cik(struct device_queue_manager *dqm); - static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, - struct qcm_process_device *qpd); --static void init_sdma_vm_hawaii(struct device_queue_manager *dqm, -- struct queue *q, -- struct qcm_process_device *qpd); -- --void device_queue_manager_init_cik( -- struct device_queue_manager_asic_ops *asic_ops) --{ -- asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik; -- asic_ops->update_qpd = update_qpd_cik; -- asic_ops->init_cpsch = initialize_cpsch_cik; -- asic_ops->init_sdma_vm = init_sdma_vm; --} - --void device_queue_manager_init_cik_hawaii( -- struct device_queue_manager_asic_ops *asic_ops) -+void device_queue_manager_init_cik(struct device_queue_manager_asic_ops *ops) - { -- asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik; -- asic_ops->update_qpd = update_qpd_cik_hawaii; -- asic_ops->init_cpsch = initialize_cpsch_cik; -- asic_ops->init_sdma_vm = init_sdma_vm_hawaii; -+ ops->set_cache_memory_policy = set_cache_memory_policy_cik; -+ ops->register_process = register_process_cik; -+ ops->initialize = initialize_cpsch_cik; -+ ops->init_sdma_vm = init_sdma_vm; - } - - static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) -@@ -81,7 +65,7 @@ static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) - * for LDS/Scratch and GPUVM. 
- */ - -- WARN_ON((top_address_nybble & 1) || top_address_nybble > 0xE || -+ BUG_ON((top_address_nybble & 1) || top_address_nybble > 0xE || - top_address_nybble == 0); - - return PRIVATE_BASE(top_address_nybble << 12) | -@@ -114,12 +98,14 @@ static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, - return true; - } - --static int update_qpd_cik(struct device_queue_manager *dqm, -+static int register_process_cik(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) - { - struct kfd_process_device *pdd; - unsigned int temp; - -+ BUG_ON(!dqm || !qpd); -+ - pdd = qpd_to_pdd(qpd); - - /* check if sh_mem_config register already configured */ -@@ -139,40 +125,9 @@ static int update_qpd_cik(struct device_queue_manager *dqm, - } else { - temp = get_sh_mem_bases_nybble_64(pdd); - qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); -- qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__PRIVATE_ATC__SHIFT; - } - -- pr_debug("is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", -- qpd->pqm->process->is_32bit_user_mode, temp, qpd->sh_mem_bases); -- -- return 0; --} -- --static int update_qpd_cik_hawaii(struct device_queue_manager *dqm, -- struct qcm_process_device *qpd) --{ -- struct kfd_process_device *pdd; -- unsigned int temp; -- -- pdd = qpd_to_pdd(qpd); -- -- /* check if sh_mem_config register already configured */ -- if (qpd->sh_mem_config == 0) { -- qpd->sh_mem_config = -- ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) | -- DEFAULT_MTYPE(MTYPE_NONCACHED) | -- APE1_MTYPE(MTYPE_NONCACHED); -- qpd->sh_mem_ape1_limit = 0; -- qpd->sh_mem_ape1_base = 0; -- } -- -- /* On dGPU we're always in GPUVM64 addressing mode with 64-bit -- * aperture addresses. -- */ -- temp = get_sh_mem_bases_nybble_64(pdd); -- qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); -- -- pr_debug("is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", -+ pr_debug("kfd: is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", - qpd->pqm->process->is_32bit_user_mode, temp, qpd->sh_mem_bases); - - return 0; -@@ -194,19 +149,6 @@ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, - q->properties.sdma_vm_addr = value; - } - --static void init_sdma_vm_hawaii(struct device_queue_manager *dqm, -- struct queue *q, -- struct qcm_process_device *qpd) --{ -- /* On dGPU we're always in GPUVM64 addressing mode with 64-bit -- * aperture addresses. -- */ -- q->properties.sdma_vm_addr = -- ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) << -- SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) & -- SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK; --} -- - static int initialize_cpsch_cik(struct device_queue_manager *dqm) - { - return 0; -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c -deleted file mode 100644 -index dde5882..0000000 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c -+++ /dev/null -@@ -1,90 +0,0 @@ --/* -- * Copyright 2016 Advanced Micro Devices, Inc. 
-- * -- * Permission is hereby granted, free of charge, to any person obtaining a -- * copy of this software and associated documentation files (the "Software"), -- * to deal in the Software without restriction, including without limitation -- * the rights to use, copy, modify, merge, publish, distribute, sublicense, -- * and/or sell copies of the Software, and to permit persons to whom the -- * Software is furnished to do so, subject to the following conditions: -- * -- * The above copyright notice and this permission notice shall be included in -- * all copies or substantial portions of the Software. -- * -- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -- * OTHER DEALINGS IN THE SOFTWARE. -- * -- */ -- --#include "kfd_device_queue_manager.h" --#include "vega10/vega10_enum.h" --#include "vega10/GC/gc_9_0_offset.h" --#include "vega10/GC/gc_9_0_sh_mask.h" --#include "vega10/SDMA0/sdma0_4_0_sh_mask.h" -- --static int update_qpd_v9(struct device_queue_manager *dqm, -- struct qcm_process_device *qpd); --static int initialize_cpsch_v9(struct device_queue_manager *dqm); --static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, -- struct qcm_process_device *qpd); -- --void device_queue_manager_init_v9_vega10( -- struct device_queue_manager_asic_ops *asic_ops) --{ -- asic_ops->update_qpd = update_qpd_v9; -- asic_ops->init_cpsch = initialize_cpsch_v9; -- asic_ops->init_sdma_vm = init_sdma_vm_v9; --} -- --static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) --{ -- uint32_t shared_base = pdd->lds_base >> 48; -- uint32_t private_base = pdd->scratch_base >> 48; -- -- return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) | -- private_base; --} -- --static int update_qpd_v9(struct device_queue_manager *dqm, -- struct qcm_process_device *qpd) --{ -- struct kfd_process_device *pdd; -- -- pdd = qpd_to_pdd(qpd); -- -- /* check if sh_mem_config register already configured */ -- if (qpd->sh_mem_config == 0) { -- qpd->sh_mem_config = -- SH_MEM_ALIGNMENT_MODE_UNALIGNED << -- SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT; -- if (vega10_noretry) -- qpd->sh_mem_config |= -- 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT; -- -- qpd->sh_mem_ape1_limit = 0; -- qpd->sh_mem_ape1_base = 0; -- } -- -- qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd); -- -- pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases); -- -- return 0; --} -- --static int initialize_cpsch_v9(struct device_queue_manager *dqm) --{ -- return 0; --} -- --static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, -- struct qcm_process_device *qpd) --{ -- /* Not needed on SDMAv4 any more */ -- q->properties.sdma_vm_addr = 0; --} -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c -index ac8d852..7e9cae9 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c -@@ -33,44 +33,18 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, - enum cache_policy alternate_policy, - void __user *alternate_aperture_base, - uint64_t 
alternate_aperture_size); --static int update_qpd_vi(struct device_queue_manager *dqm, -+static int register_process_vi(struct device_queue_manager *dqm, - struct qcm_process_device *qpd); - static int initialize_cpsch_vi(struct device_queue_manager *dqm); - static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, - struct qcm_process_device *qpd); - --/* -- * Tonga device queue manager functions -- */ --static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm, -- struct qcm_process_device *qpd, -- enum cache_policy default_policy, -- enum cache_policy alternate_policy, -- void __user *alternate_aperture_base, -- uint64_t alternate_aperture_size); --static int update_qpd_vi_tonga(struct device_queue_manager *dqm, -- struct qcm_process_device *qpd); --static void init_sdma_vm_tonga(struct device_queue_manager *dqm, -- struct queue *q, -- struct qcm_process_device *qpd); -- --void device_queue_manager_init_vi_tonga( -- struct device_queue_manager_asic_ops *asic_ops) --{ -- asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi_tonga; -- asic_ops->update_qpd = update_qpd_vi_tonga; -- asic_ops->init_cpsch = initialize_cpsch_vi; -- asic_ops->init_sdma_vm = init_sdma_vm_tonga; --} -- -- --void device_queue_manager_init_vi( -- struct device_queue_manager_asic_ops *asic_ops) -+void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops) - { -- asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi; -- asic_ops->update_qpd = update_qpd_vi; -- asic_ops->init_cpsch = initialize_cpsch_vi; -- asic_ops->init_sdma_vm = init_sdma_vm; -+ ops->set_cache_memory_policy = set_cache_memory_policy_vi; -+ ops->register_process = register_process_vi; -+ ops->initialize = initialize_cpsch_vi; -+ ops->init_sdma_vm = init_sdma_vm; - } - - static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) -@@ -93,7 +67,7 @@ static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) - * for LDS/Scratch and GPUVM. - */ - -- WARN_ON((top_address_nybble & 1) || top_address_nybble > 0xE || -+ BUG_ON((top_address_nybble & 1) || top_address_nybble > 0xE || - top_address_nybble == 0); - - return top_address_nybble << 12 | -@@ -130,39 +104,14 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, - return true; - } - --static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm, -- struct qcm_process_device *qpd, -- enum cache_policy default_policy, -- enum cache_policy alternate_policy, -- void __user *alternate_aperture_base, -- uint64_t alternate_aperture_size) --{ -- uint32_t default_mtype; -- uint32_t ape1_mtype; -- -- default_mtype = (default_policy == cache_policy_coherent) ? -- MTYPE_UC : -- MTYPE_NC; -- -- ape1_mtype = (alternate_policy == cache_policy_coherent) ? 
-- MTYPE_UC : -- MTYPE_NC; -- -- qpd->sh_mem_config = -- SH_MEM_ALIGNMENT_MODE_UNALIGNED << -- SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | -- default_mtype << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | -- ape1_mtype << SH_MEM_CONFIG__APE1_MTYPE__SHIFT; -- -- return true; --} -- --static int update_qpd_vi(struct device_queue_manager *dqm, -+static int register_process_vi(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) - { - struct kfd_process_device *pdd; - unsigned int temp; - -+ BUG_ON(!dqm || !qpd); -+ - pdd = qpd_to_pdd(qpd); - - /* check if sh_mem_config register already configured */ -@@ -188,50 +137,14 @@ static int update_qpd_vi(struct device_queue_manager *dqm, - qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); - qpd->sh_mem_config |= SH_MEM_ADDRESS_MODE_HSA64 << - SH_MEM_CONFIG__ADDRESS_MODE__SHIFT; -- qpd->sh_mem_config |= 1 << -- SH_MEM_CONFIG__PRIVATE_ATC__SHIFT; - } - -- pr_debug("is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", -+ pr_debug("kfd: is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", - qpd->pqm->process->is_32bit_user_mode, temp, qpd->sh_mem_bases); - - return 0; - } - --static int update_qpd_vi_tonga(struct device_queue_manager *dqm, -- struct qcm_process_device *qpd) --{ -- struct kfd_process_device *pdd; -- unsigned int temp; -- -- pdd = qpd_to_pdd(qpd); -- -- /* check if sh_mem_config register already configured */ -- if (qpd->sh_mem_config == 0) { -- qpd->sh_mem_config = -- SH_MEM_ALIGNMENT_MODE_UNALIGNED << -- SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | -- MTYPE_UC << -- SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | -- MTYPE_UC << -- SH_MEM_CONFIG__APE1_MTYPE__SHIFT; -- -- qpd->sh_mem_ape1_limit = 0; -- qpd->sh_mem_ape1_base = 0; -- } -- -- /* On dGPU we're always in GPUVM64 addressing mode with 64-bit -- * aperture addresses. -- */ -- temp = get_sh_mem_bases_nybble_64(pdd); -- qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); -- -- pr_debug("sh_mem_bases nybble: 0x%X and register 0x%X\n", -- temp, qpd->sh_mem_bases); -- -- return 0; --} -- - static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, - struct qcm_process_device *qpd) - { -@@ -248,20 +161,6 @@ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, - q->properties.sdma_vm_addr = value; - } - --static void init_sdma_vm_tonga(struct device_queue_manager *dqm, -- struct queue *q, -- struct qcm_process_device *qpd) --{ -- /* On dGPU we're always in GPUVM64 addressing mode with 64-bit -- * aperture addresses. -- */ -- q->properties.sdma_vm_addr = -- ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) << -- SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) & -- SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK; --} -- -- - static int initialize_cpsch_vi(struct device_queue_manager *dqm) - { - return 0; -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c -index 008d258..453c5d6 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c -@@ -34,6 +34,7 @@ - */ - - #define KERNEL_DOORBELL_PASID 1 -+#define KFD_SIZE_OF_DOORBELL_IN_BYTES 4 - - /* - * Each device exposes a doorbell aperture, a PCI MMIO aperture that -@@ -50,15 +51,15 @@ - */ - - /* # of doorbell bytes allocated for each process. 
*/ --size_t kfd_doorbell_process_slice(struct kfd_dev *kfd) -+static inline size_t doorbell_process_allocation(void) - { -- return roundup(kfd->device_info->doorbell_size * -+ return roundup(KFD_SIZE_OF_DOORBELL_IN_BYTES * - KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, - PAGE_SIZE); - } - - /* Doorbell calculations for device init. */ --int kfd_doorbell_init(struct kfd_dev *kfd) -+void kfd_doorbell_init(struct kfd_dev *kfd) - { - size_t doorbell_start_offset; - size_t doorbell_aperture_size; -@@ -72,16 +73,16 @@ int kfd_doorbell_init(struct kfd_dev *kfd) - - doorbell_start_offset = - roundup(kfd->shared_resources.doorbell_start_offset, -- kfd_doorbell_process_slice(kfd)); -+ doorbell_process_allocation()); - - doorbell_aperture_size = - rounddown(kfd->shared_resources.doorbell_aperture_size, -- kfd_doorbell_process_slice(kfd)); -+ doorbell_process_allocation()); - - if (doorbell_aperture_size > doorbell_start_offset) - doorbell_process_limit = - (doorbell_aperture_size - doorbell_start_offset) / -- kfd_doorbell_process_slice(kfd); -+ doorbell_process_allocation(); - else - doorbell_process_limit = 0; - -@@ -92,49 +93,45 @@ int kfd_doorbell_init(struct kfd_dev *kfd) - kfd->doorbell_process_limit = doorbell_process_limit - 1; - - kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base, -- kfd_doorbell_process_slice(kfd)); -+ doorbell_process_allocation()); - -- if (!kfd->doorbell_kernel_ptr) -- return -ENOMEM; -+ BUG_ON(!kfd->doorbell_kernel_ptr); - -- pr_debug("Doorbell initialization:\n"); -- pr_debug("doorbell base == 0x%08lX\n", -+ pr_debug("kfd: doorbell initialization:\n"); -+ pr_debug("kfd: doorbell base == 0x%08lX\n", - (uintptr_t)kfd->doorbell_base); - -- pr_debug("doorbell_id_offset == 0x%08lX\n", -+ pr_debug("kfd: doorbell_id_offset == 0x%08lX\n", - kfd->doorbell_id_offset); - -- pr_debug("doorbell_process_limit == 0x%08lX\n", -+ pr_debug("kfd: doorbell_process_limit == 0x%08lX\n", - doorbell_process_limit); - -- pr_debug("doorbell_kernel_offset == 0x%08lX\n", -+ pr_debug("kfd: doorbell_kernel_offset == 0x%08lX\n", - (uintptr_t)kfd->doorbell_base); - -- pr_debug("doorbell aperture size == 0x%08lX\n", -+ pr_debug("kfd: doorbell aperture size == 0x%08lX\n", - kfd->shared_resources.doorbell_aperture_size); - -- pr_debug("doorbell kernel address == 0x%08lX\n", -+ pr_debug("kfd: doorbell kernel address == 0x%08lX\n", - (uintptr_t)kfd->doorbell_kernel_ptr); -- -- return 0; --} -- --void kfd_doorbell_fini(struct kfd_dev *kfd) --{ -- if (kfd->doorbell_kernel_ptr) -- iounmap(kfd->doorbell_kernel_ptr); - } - --int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, -- struct vm_area_struct *vma) -+int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma) - { - phys_addr_t address; -+ struct kfd_dev *dev; - - /* - * For simplicitly we only allow mapping of the entire doorbell - * allocation of a single device & process. 
- */ -- if (vma->vm_end - vma->vm_start != kfd_doorbell_process_slice(dev)) -+ if (vma->vm_end - vma->vm_start != doorbell_process_allocation()) -+ return -EINVAL; -+ -+ /* Find kfd device according to gpu id */ -+ dev = kfd_device_by_id(vma->vm_pgoff); -+ if (dev == NULL) - return -EINVAL; - - /* Calculate physical address of doorbell */ -@@ -145,29 +142,32 @@ int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, - - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - -- pr_debug("Mapping doorbell page\n" -+ pr_debug("kfd: mapping doorbell page in %s\n" - " target user address == 0x%08llX\n" - " physical address == 0x%08llX\n" - " vm_flags == 0x%04lX\n" - " size == 0x%04lX\n", -+ __func__, - (unsigned long long) vma->vm_start, address, vma->vm_flags, -- kfd_doorbell_process_slice(dev)); -+ doorbell_process_allocation()); - - - return io_remap_pfn_range(vma, - vma->vm_start, - address >> PAGE_SHIFT, -- kfd_doorbell_process_slice(dev), -+ doorbell_process_allocation(), - vma->vm_page_prot); - } - - - /* get kernel iomem pointer for a doorbell */ --void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, -+u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, - unsigned int *doorbell_off) - { - u32 inx; - -+ BUG_ON(!kfd || !doorbell_off); -+ - mutex_lock(&kfd->doorbell_mutex); - inx = find_first_zero_bit(kfd->doorbell_available_index, - KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); -@@ -178,17 +178,14 @@ void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, - if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) - return NULL; - -- inx *= kfd->device_info->doorbell_size / sizeof(u32); -- - /* - * Calculating the kernel doorbell offset using "faked" kernel -- * pasid that allocated for kernel queues only. Offset is in -- * dword units regardless of the ASIC-dependent doorbell size. 
-+ * pasid that allocated for kernel queues only - */ -- *doorbell_off = KERNEL_DOORBELL_PASID * -- (kfd_doorbell_process_slice(kfd) / sizeof(u32)) + inx; -+ *doorbell_off = KERNEL_DOORBELL_PASID * (doorbell_process_allocation() / -+ sizeof(u32)) + inx; - -- pr_debug("Get kernel queue doorbell\n" -+ pr_debug("kfd: get kernel queue doorbell\n" - " doorbell offset == 0x%08X\n" - " kernel address == 0x%08lX\n", - *doorbell_off, (uintptr_t)(kfd->doorbell_kernel_ptr + inx)); -@@ -200,6 +197,8 @@ void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr) - { - unsigned int inx; - -+ BUG_ON(!kfd || !db_addr); -+ - inx = (unsigned int)(db_addr - kfd->doorbell_kernel_ptr); - - mutex_lock(&kfd->doorbell_mutex); -@@ -207,21 +206,11 @@ void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr) - mutex_unlock(&kfd->doorbell_mutex); - } - --void write_kernel_doorbell(void __iomem *db, u32 value) -+inline void write_kernel_doorbell(u32 __iomem *db, u32 value) - { - if (db) { - writel(value, db); -- pr_debug("Writing %d to doorbell address 0x%p\n", value, db); -- } --} -- --void write_kernel_doorbell64(void __iomem *db, u64 value) --{ -- if (db) { -- WARN(((unsigned long)db & 7) != 0, -- "Unaligned 64-bit doorbell"); -- writeq(value, (u64 __iomem *)db); -- pr_debug("writing %llu to doorbell address 0x%p\n", value, db); -+ pr_debug("writing %d to doorbell address 0x%p\n", value, db); - } - } - -@@ -229,26 +218,25 @@ void write_kernel_doorbell64(void __iomem *db, u64 value) - * queue_ids are in the range [0,MAX_PROCESS_QUEUES) and are mapped 1:1 - * to doorbells with the process's doorbell page - */ --unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd, -+unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, - struct kfd_process *process, -- unsigned int doorbell_id) -+ unsigned int queue_id) - { - /* - * doorbell_id_offset accounts for doorbells taken by KGD. -- * pasid * kfd_doorbell_process_slice/sizeof(u32) adjusts to -- * the process's doorbells. The offset returned is in dword -- * units regardless of the ASIC-dependent doorbell size. 
-+ * pasid * doorbell_process_allocation/sizeof(u32) adjusts -+ * to the process's doorbells - */ - return kfd->doorbell_id_offset + -- process->pasid * (kfd_doorbell_process_slice(kfd)/sizeof(u32)) + -- doorbell_id * kfd->device_info->doorbell_size / sizeof(u32); -+ process->pasid * (doorbell_process_allocation()/sizeof(u32)) + -+ queue_id; - } - - uint64_t kfd_get_number_elems(struct kfd_dev *kfd) - { - uint64_t num_of_elems = (kfd->shared_resources.doorbell_aperture_size - - kfd->shared_resources.doorbell_start_offset) / -- kfd_doorbell_process_slice(kfd) + 1; -+ doorbell_process_allocation() + 1; - - return num_of_elems; - -@@ -258,5 +246,5 @@ phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, - struct kfd_process *process) - { - return dev->doorbell_base + -- process->pasid * kfd_doorbell_process_slice(dev); -+ process->pasid * doorbell_process_allocation(); - } -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c -index 7eacf42..d1ce83d 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c -@@ -23,9 +23,9 @@ - #include <linux/mm_types.h> - #include <linux/slab.h> - #include <linux/types.h> --#include <linux/uaccess.h> --#include <linux/sched/mm.h> - #include <linux/sched/signal.h> -+#include <linux/uaccess.h> -+#include <linux/mm.h> - #include <linux/mman.h> - #include <linux/memory.h> - #include "kfd_priv.h" -@@ -52,9 +52,6 @@ struct kfd_event_waiter { - uint32_t input_index; - }; - --#define SLOTS_PER_PAGE KFD_SIGNAL_EVENT_LIMIT --#define SLOT_BITMAP_LONGS BITS_TO_LONGS(SLOTS_PER_PAGE) -- - /* - * Over-complicated pooled allocator for event notification slots. - * -@@ -68,19 +65,24 @@ struct kfd_event_waiter { - struct signal_page { - struct list_head event_pages; /* kfd_process.signal_event_pages */ - uint64_t *kernel_address; -- uint64_t handle; - uint64_t __user *user_address; - uint32_t page_index; /* Index into the mmap aperture. */ - unsigned int free_slots; -- unsigned long used_slot_bitmap[SLOT_BITMAP_LONGS]; -+ unsigned long used_slot_bitmap[0]; - }; - -+#define SLOTS_PER_PAGE KFD_SIGNAL_EVENT_LIMIT -+#define SLOT_BITMAP_SIZE BITS_TO_LONGS(SLOTS_PER_PAGE) -+#define BITS_PER_PAGE (ilog2(SLOTS_PER_PAGE)+1) -+#define SIGNAL_PAGE_SIZE (sizeof(struct signal_page) + \ -+ SLOT_BITMAP_SIZE * sizeof(long)) -+ - /* - * For signal events, the event ID is used as the interrupt user data. - * For SQ s_sendmsg interrupts, this is limited to 8 bits. 
- */ - --#define INTERRUPT_DATA_BITS 12 -+#define INTERRUPT_DATA_BITS 8 - #define SIGNAL_EVENT_ID_SLOT_SHIFT 0 - - static uint64_t *page_slots(struct signal_page *page) -@@ -108,7 +110,7 @@ static bool allocate_free_slot(struct kfd_process *process, - *out_page = page; - *out_slot_index = slot; - -- pr_debug("Allocated event signal slot in page %p, slot %d\n", -+ pr_debug("allocated event signal slot in page %p, slot %d\n", - page, slot); - - return true; -@@ -129,7 +131,7 @@ static bool allocate_signal_page(struct file *devkfd, struct kfd_process *p) - void *backing_store; - struct signal_page *page; - -- page = kzalloc(sizeof(*page), GFP_KERNEL); -+ page = kzalloc(SIGNAL_PAGE_SIZE, GFP_KERNEL); - if (!page) - goto fail_alloc_signal_page; - -@@ -153,9 +155,9 @@ static bool allocate_signal_page(struct file *devkfd, struct kfd_process *p) - struct signal_page, - event_pages)->page_index + 1; - -- pr_debug("Allocated new event signal page at %p, for process %p\n", -+ pr_debug("allocated new event signal page at %p, for process %p\n", - page, p); -- pr_debug("Page index is %d\n", page->page_index); -+ pr_debug("page index is %d\n", page->page_index); - - list_add(&page->event_pages, &p->signal_event_pages); - -@@ -184,53 +186,6 @@ static bool allocate_event_notification_slot(struct file *devkfd, - return ret; - } - --static bool allocate_signal_page_dgpu(struct kfd_process *p, -- uint64_t *kernel_address, uint64_t handle) --{ -- struct signal_page *my_page; -- -- my_page = kzalloc(sizeof(*my_page), GFP_KERNEL); -- if (!my_page) -- return false; -- -- /* prevent user-mode info leaks */ -- memset(kernel_address, (uint8_t) UNSIGNALED_EVENT_SLOT, -- KFD_SIGNAL_EVENT_LIMIT * 8); -- -- my_page->kernel_address = kernel_address; -- my_page->handle = handle; -- my_page->user_address = NULL; -- my_page->free_slots = SLOTS_PER_PAGE; -- if (list_empty(&p->signal_event_pages)) -- my_page->page_index = 0; -- else -- my_page->page_index = list_tail_entry(&p->signal_event_pages, -- struct signal_page, -- event_pages)->page_index + 1; -- -- pr_debug("Allocated new event signal page at %p, for process %p\n", -- my_page, p); -- pr_debug("Page index is %d\n", my_page->page_index); -- -- list_add(&my_page->event_pages, &p->signal_event_pages); -- -- return true; --} -- --void kfd_free_signal_page_dgpu(struct kfd_process *p, uint64_t handle) --{ -- struct signal_page *page, *tmp; -- -- list_for_each_entry_safe(page, tmp, &p->signal_event_pages, -- event_pages) { -- if (page->handle == handle) { -- list_del(&page->event_pages); -- kfree(page); -- break; -- } -- } --} -- - /* Assumes that the process's event_mutex is locked. */ - static void release_event_notification_slot(struct signal_page *page, - size_t slot_index) -@@ -239,8 +194,7 @@ static void release_event_notification_slot(struct signal_page *page, - page->free_slots++; - - /* We don't free signal pages, they are retained by the process -- * and reused until it exits. -- */ -+ * and reused until it exits. 
*/ - } - - static struct signal_page *lookup_signal_page_by_index(struct kfd_process *p, -@@ -292,7 +246,7 @@ static u32 make_nonsignal_event_id(struct kfd_process *p) - - for (id = p->next_nonsignal_event_id; - id < KFD_LAST_NONSIGNAL_EVENT_ID && -- lookup_event_by_id(p, id); -+ lookup_event_by_id(p, id) != NULL; - id++) - ; - -@@ -311,7 +265,7 @@ static u32 make_nonsignal_event_id(struct kfd_process *p) - - for (id = KFD_FIRST_NONSIGNAL_EVENT_ID; - id < KFD_LAST_NONSIGNAL_EVENT_ID && -- lookup_event_by_id(p, id); -+ lookup_event_by_id(p, id) != NULL; - id++) - ; - -@@ -337,16 +291,13 @@ static int create_signal_event(struct file *devkfd, - struct kfd_event *ev) - { - if (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT) { -- if (!p->signal_event_limit_reached) { -- pr_warn("Signal event wasn't created because limit was reached\n"); -- p->signal_event_limit_reached = true; -- } -+ pr_warn("amdkfd: Signal event wasn't created because limit was reached\n"); - return -ENOMEM; - } - - if (!allocate_event_notification_slot(devkfd, p, &ev->signal_page, -- &ev->signal_slot_index)) { -- pr_warn("Signal event wasn't created because out of kernel memory\n"); -+ &ev->signal_slot_index)) { -+ pr_warn("amdkfd: Signal event wasn't created because out of kernel memory\n"); - return -ENOMEM; - } - -@@ -358,7 +309,11 @@ static int create_signal_event(struct file *devkfd, - ev->event_id = make_signal_event_id(ev->signal_page, - ev->signal_slot_index); - -- pr_debug("Signal event number %zu created with id %d, address %p\n", -+ pr_debug("signal event number %zu created with id %d, address %p\n", -+ p->signal_event_count, ev->event_id, -+ ev->user_signal_address); -+ -+ pr_debug("signal event number %zu created with id %d, address %p\n", - p->signal_event_count, ev->event_id, - ev->user_signal_address); - -@@ -390,7 +345,7 @@ void kfd_event_init_process(struct kfd_process *p) - - static void destroy_event(struct kfd_process *p, struct kfd_event *ev) - { -- if (ev->signal_page) { -+ if (ev->signal_page != NULL) { - release_event_notification_slot(ev->signal_page, - ev->signal_slot_index); - p->signal_event_count--; -@@ -426,9 +381,8 @@ static void shutdown_signal_pages(struct kfd_process *p) - - list_for_each_entry_safe(page, tmp, &p->signal_event_pages, - event_pages) { -- if (page->user_address) -- free_pages((unsigned long)page->kernel_address, -- get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); -+ free_pages((unsigned long)page->kernel_address, -+ get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); - kfree(page); - } - } -@@ -453,8 +407,7 @@ static bool event_can_be_cpu_signaled(const struct kfd_event *ev) - int kfd_event_create(struct file *devkfd, struct kfd_process *p, - uint32_t event_type, bool auto_reset, uint32_t node_id, - uint32_t *event_id, uint32_t *event_trigger_data, -- uint64_t *event_page_offset, uint32_t *event_slot_index, -- void *kern_addr) -+ uint64_t *event_page_offset, uint32_t *event_slot_index) - { - int ret = 0; - struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL); -@@ -468,20 +421,17 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, - - INIT_LIST_HEAD(&ev->waiters); - -- mutex_lock(&p->event_mutex); -- -- if (kern_addr && list_empty(&p->signal_event_pages)) -- allocate_signal_page_dgpu(p, kern_addr, *event_page_offset); -- - *event_page_offset = 0; - -+ mutex_lock(&p->event_mutex); -+ - switch (event_type) { - case KFD_EVENT_TYPE_SIGNAL: - case KFD_EVENT_TYPE_DEBUG: - ret = create_signal_event(devkfd, p, ev); - if (!ret) { - *event_page_offset = (ev->signal_page->page_index | -- 
KFD_MMAP_TYPE_EVENTS); -+ KFD_MMAP_EVENTS_MASK); - *event_page_offset <<= PAGE_SHIFT; - *event_slot_index = ev->signal_slot_index; - } -@@ -614,7 +564,7 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, - /* - * Because we are called from arbitrary context (workqueue) as opposed - * to process context, kfd_process could attempt to exit while we are -- * running so the lookup function increments the process ref count. -+ * running so the lookup function returns a locked process. - */ - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); - -@@ -634,7 +584,7 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, - * search faster. - */ - struct signal_page *page; -- unsigned int i; -+ unsigned i; - - list_for_each_entry(page, &p->signal_event_pages, event_pages) - for (i = 0; i < SLOTS_PER_PAGE; i++) -@@ -646,7 +596,7 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, - } - - mutex_unlock(&p->event_mutex); -- kfd_unref_process(p); -+ mutex_unlock(&p->mutex); - } - - static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) -@@ -667,7 +617,7 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) - return event_waiters; - } - --static int init_event_waiter_get_status(struct kfd_process *p, -+static int init_event_waiter(struct kfd_process *p, - struct kfd_event_waiter *waiter, - uint32_t event_id, - uint32_t input_index) -@@ -682,18 +632,9 @@ static int init_event_waiter_get_status(struct kfd_process *p, - waiter->activated = ev->signaled; - ev->signaled = ev->signaled && !ev->auto_reset; - -- return 0; --} -+ list_add(&waiter->waiters, &ev->waiters); - --static void init_event_waiter_add_to_waitlist(struct kfd_event_waiter *waiter) --{ -- struct kfd_event *ev = waiter->event; -- -- /* Only add to the wait list if we actually need to -- * wait on this event. -- */ -- if (!waiter->activated) -- list_add(&waiter->waiters, &ev->waiters); -+ return 0; - } - - static bool test_event_condition(bool all, uint32_t num_events, -@@ -788,11 +729,6 @@ int kfd_wait_on_events(struct kfd_process *p, - - mutex_lock(&p->event_mutex); - -- /* Set to something unreasonable - this is really -- * just a bool for now. -- */ -- *wait_result = KFD_WAIT_TIMEOUT; -- - event_waiters = alloc_event_waiters(num_events); - if (!event_waiters) { - ret = -ENOMEM; -@@ -808,34 +744,14 @@ int kfd_wait_on_events(struct kfd_process *p, - goto fail; - } - -- ret = init_event_waiter_get_status(p, &event_waiters[i], -+ ret = init_event_waiter(p, &event_waiters[i], - event_data.event_id, i); - if (ret) - goto fail; - } - -- /* Check condition once. */ -- if (test_event_condition(all, num_events, event_waiters)) { -- if (copy_signaled_event_data(num_events, -- event_waiters, events)) -- *wait_result = KFD_WAIT_COMPLETE; -- else -- *wait_result = KFD_WAIT_ERROR; -- free_waiters(num_events, event_waiters); -- } else { -- /* Add to wait lists if we need to wait. */ -- for (i = 0; i < num_events; i++) -- init_event_waiter_add_to_waitlist(&event_waiters[i]); -- } -- - mutex_unlock(&p->event_mutex); - -- /* Return if all waits were already satisfied. */ -- if (*wait_result != KFD_WAIT_TIMEOUT) { -- __set_current_state(TASK_RUNNING); -- return ret; -- } -- - while (true) { - if (fatal_signal_pending(current)) { - ret = -EINTR; -@@ -855,17 +771,6 @@ int kfd_wait_on_events(struct kfd_process *p, - break; - } - -- /* Set task state to interruptible sleep before -- * checking wake-up conditions. 
A concurrent wake-up -- * will put the task back into runnable state. In that -- * case schedule_timeout will not put the task to -- * sleep and we'll get a chance to re-check the -- * updated conditions almost immediately. Otherwise, -- * this race condition would lead to a soft hang or a -- * very long sleep. -- */ -- set_current_state(TASK_INTERRUPTIBLE); -- - if (test_event_condition(all, num_events, event_waiters)) { - if (copy_signaled_event_data(num_events, - event_waiters, events)) -@@ -880,7 +785,7 @@ int kfd_wait_on_events(struct kfd_process *p, - break; - } - -- timeout = schedule_timeout(timeout); -+ timeout = schedule_timeout_interruptible(timeout); - } - __set_current_state(TASK_RUNNING); - -@@ -911,7 +816,7 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) - /* check required size is logical */ - if (get_order(KFD_SIGNAL_EVENT_LIMIT * 8) != - get_order(vma->vm_end - vma->vm_start)) { -- pr_err("Event page mmap requested illegal size\n"); -+ pr_err("amdkfd: event page mmap requested illegal size\n"); - return -EINVAL; - } - -@@ -920,7 +825,7 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) - page = lookup_signal_page_by_index(p, page_index); - if (!page) { - /* Probably KFD bug, but mmap is user-accessible. */ -- pr_debug("Signal page could not be found for page_index %u\n", -+ pr_debug("signal page could not be found for page_index %u\n", - page_index); - return -EINVAL; - } -@@ -931,7 +836,7 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) - vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE - | VM_DONTDUMP | VM_PFNMAP; - -- pr_debug("Mapping signal page\n"); -+ pr_debug("mapping signal page\n"); - pr_debug(" start user address == 0x%08lx\n", vma->vm_start); - pr_debug(" end user address == 0x%08lx\n", vma->vm_end); - pr_debug(" pfn == 0x%016lX\n", pfn); -@@ -971,13 +876,6 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, - ev->memory_exception_data = *ev_data; - } - -- if (type == KFD_EVENT_TYPE_MEMORY) { -- dev_warn(kfd_device, -- "Sending SIGSEGV to HSA Process with PID %d ", -- p->lead_thread->pid); -- send_sig(SIGSEGV, p->lead_thread, 0); -- } -- - /* Send SIGTERM no event of type "type" has been found*/ - if (send_signal) { - if (send_sigterm) { -@@ -993,7 +891,6 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, - } - } - --#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, - unsigned long address, bool is_write_requested, - bool is_execute_requested) -@@ -1004,27 +901,17 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, - /* - * Because we are called from arbitrary context (workqueue) as opposed - * to process context, kfd_process could attempt to exit while we are -- * running so the lookup function increments the process ref count. -+ * running so the lookup function returns a locked process. - */ - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); -- struct mm_struct *mm; - - if (!p) - return; /* Presumably process exited. */ - -- /* Take a safe reference to the mm_struct, which may otherwise -- * disappear even while the kfd_process is still referenced. 
-- */ -- mm = get_task_mm(p->lead_thread); -- if (!mm) { -- kfd_unref_process(p); -- return; /* Process is exiting */ -- } -- - memset(&memory_exception_data, 0, sizeof(memory_exception_data)); - -- down_read(&mm->mmap_sem); -- vma = find_vma(mm, address); -+ down_read(&p->mm->mmap_sem); -+ vma = find_vma(p->mm, address); - - memory_exception_data.gpu_id = dev->id; - memory_exception_data.va = address; -@@ -1050,8 +937,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, - } - } - -- up_read(&mm->mmap_sem); -- mmput(mm); -+ up_read(&p->mm->mmap_sem); - - mutex_lock(&p->event_mutex); - -@@ -1060,17 +946,15 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, - &memory_exception_data); - - mutex_unlock(&p->event_mutex); -- -- kfd_unref_process(p); -+ mutex_unlock(&p->mutex); - } --#endif /* CONFIG_AMD_IOMMU_V2_MODULE */ - - void kfd_signal_hw_exception_event(unsigned int pasid) - { - /* - * Because we are called from arbitrary context (workqueue) as opposed - * to process context, kfd_process could attempt to exit while we are -- * running so the lookup function increments the process ref count. -+ * running so the lookup function returns a locked process. - */ - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); - -@@ -1083,42 +967,5 @@ void kfd_signal_hw_exception_event(unsigned int pasid) - lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_HW_EXCEPTION, NULL); - - mutex_unlock(&p->event_mutex); -- kfd_unref_process(p); --} -- --void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, -- struct kfd_vm_fault_info *info) --{ -- struct kfd_event *ev; -- int bkt; -- struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); -- struct kfd_hsa_memory_exception_data memory_exception_data; -- -- if (!p) -- return; /* Presumably process exited. */ -- memset(&memory_exception_data, 0, sizeof(memory_exception_data)); -- memory_exception_data.gpu_id = dev->id; -- memory_exception_data.failure.imprecise = true; -- /* Set failure reason */ -- if (info) { -- memory_exception_data.va = (info->page_addr) << PAGE_SHIFT; -- memory_exception_data.failure.NotPresent = -- info->prot_valid ? 1 : 0; -- memory_exception_data.failure.NoExecute = -- info->prot_exec ? 1 : 0; -- memory_exception_data.failure.ReadOnly = -- info->prot_write ? 1 : 0; -- memory_exception_data.failure.imprecise = 0; -- } -- mutex_lock(&p->event_mutex); -- -- hash_for_each(p->events, bkt, ev, events) { -- if (ev->type == KFD_EVENT_TYPE_MEMORY) { -- ev->memory_exception_data = memory_exception_data; -- set_event(ev); -- } -- } -- -- mutex_unlock(&p->event_mutex); -- kfd_unref_process(p); -+ mutex_unlock(&p->mutex); - } -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c -index 09595a9..2b65510 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c -@@ -275,80 +275,24 @@ - * for FLAT_* / S_LOAD operations. 
- */ - --#define MAKE_GPUVM_APP_BASE_VI(gpu_num) \ -+#define MAKE_GPUVM_APP_BASE(gpu_num) \ - (((uint64_t)(gpu_num) << 61) + 0x1000000000000L) - --#define MAKE_GPUVM_APP_LIMIT(base, size) \ -- (((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1) -+#define MAKE_GPUVM_APP_LIMIT(base) \ -+ (((uint64_t)(base) & \ -+ 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL) - --#define MAKE_SCRATCH_APP_BASE_VI() \ -- (((uint64_t)(0x1UL) << 61) + 0x100000000L) -+#define MAKE_SCRATCH_APP_BASE(gpu_num) \ -+ (((uint64_t)(gpu_num) << 61) + 0x100000000L) - - #define MAKE_SCRATCH_APP_LIMIT(base) \ - (((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) - --#define MAKE_LDS_APP_BASE_VI() \ -- (((uint64_t)(0x1UL) << 61) + 0x0) -- -+#define MAKE_LDS_APP_BASE(gpu_num) \ -+ (((uint64_t)(gpu_num) << 61) + 0x0) - #define MAKE_LDS_APP_LIMIT(base) \ - (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) - --/* On GFXv9 the LDS and scratch apertures are programmed independently -- * using the high 16 bits of the 64-bit virtual address. They must be -- * in the hole, which will be the case as long as the high 16 bits are -- * not 0. -- * -- * The aperture sizes are still 4GB implicitly. -- * -- * A GPUVM aperture is not applicable on GFXv9. -- */ --#define MAKE_LDS_APP_BASE_V9() ((uint64_t)(0x1UL) << 48) --#define MAKE_SCRATCH_APP_BASE_V9() ((uint64_t)(0x2UL) << 48) -- --/* Some VM address space reserved for kernel use (CWSR trap handlers -- * and kernel IBs) -- */ --#define DGPU_VM_BASE_DEFAULT 0x100000 --#define DGPU_IB_BASE_DEFAULT (DGPU_VM_BASE_DEFAULT - PAGE_SIZE) -- --int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd, -- uint64_t base, uint64_t limit) --{ -- if (base < (pdd->qpd.cwsr_base + pdd->dev->cwsr_size)) { -- pr_err("Set dgpu vm base 0x%llx failed.\n", base); -- return -EINVAL; -- } -- pdd->dgpu_base = base; -- pdd->dgpu_limit = limit; -- return 0; --} -- --void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id) --{ -- /* -- * node id couldn't be 0 - the three MSB bits of -- * aperture shoudn't be 0 -- */ -- pdd->lds_base = MAKE_LDS_APP_BASE_VI(); -- pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); -- -- pdd->gpuvm_base = MAKE_GPUVM_APP_BASE_VI(id + 1); -- pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT( -- pdd->gpuvm_base, pdd->dev->shared_resources.gpuvm_size); -- -- pdd->scratch_base = MAKE_SCRATCH_APP_BASE_VI(); -- pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); --} -- --void kfd_init_apertures_v9(struct kfd_process_device *pdd, uint8_t id) --{ -- pdd->lds_base = MAKE_LDS_APP_BASE_V9(); -- pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); -- -- pdd->scratch_base = MAKE_SCRATCH_APP_BASE_V9(); -- pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); --} -- - int kfd_init_apertures(struct kfd_process *process) - { - uint8_t id = 0; -@@ -356,14 +300,11 @@ int kfd_init_apertures(struct kfd_process *process) - struct kfd_process_device *pdd; - - /*Iterating over all devices*/ -- while (kfd_topology_enum_kfd_devices(id, &dev) == 0) { -- if (!dev) { -- id++; /* Skip non GPU devices */ -- continue; -- } -+ while ((dev = kfd_topology_enum_kfd_devices(id)) != NULL && -+ id < NUM_OF_SUPPORTED_GPUS) { - - pdd = kfd_create_process_device_data(dev, process); -- if (!pdd) { -+ if (pdd == NULL) { - pr_err("Failed to create process device data\n"); - return -1; - } -@@ -377,29 +318,23 @@ int kfd_init_apertures(struct kfd_process *process) - pdd->gpuvm_base = pdd->gpuvm_limit = 0; - pdd->scratch_base = pdd->scratch_limit = 0; - } else { -- switch 
(dev->device_info->asic_family) { -- case CHIP_KAVERI: -- case CHIP_HAWAII: -- case CHIP_CARRIZO: -- case CHIP_TONGA: -- case CHIP_FIJI: -- case CHIP_POLARIS10: -- case CHIP_POLARIS11: -- kfd_init_apertures_vi(pdd, id); -- break; -- case CHIP_VEGA10: -- case CHIP_RAVEN: -- kfd_init_apertures_v9(pdd, id); -- break; -- default: -- pr_err("Unknown chip in kfd_init_apertures\n"); -- return -1; -- } -+ /* -+ * node id couldn't be 0 - the three MSB bits of -+ * aperture shoudn't be 0 -+ */ -+ pdd->lds_base = MAKE_LDS_APP_BASE(id + 1); -+ -+ pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); -+ -+ pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1); -+ -+ pdd->gpuvm_limit = -+ MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base); - -- if (!dev->device_info->is_need_iommu_device) { -- pdd->qpd.cwsr_base = DGPU_VM_BASE_DEFAULT; -- pdd->qpd.ib_base = DGPU_IB_BASE_DEFAULT; -- } -+ pdd->scratch_base = MAKE_SCRATCH_APP_BASE(id + 1); -+ -+ pdd->scratch_limit = -+ MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); - } - - dev_dbg(kfd_device, "node id %u\n", id); -@@ -417,9 +352,4 @@ int kfd_init_apertures(struct kfd_process *process) - return 0; - } - --void kfd_flush_tlb(struct kfd_dev *dev, uint32_t pasid) --{ -- const struct kfd2kgd_calls *f2g = dev->kfd2kgd; - -- f2g->invalidate_tlbs(dev->kgd, pasid); --} -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c -deleted file mode 100644 -index b2c6b52..0000000 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c -+++ /dev/null -@@ -1,133 +0,0 @@ --/* -- * Copyright 2016 Advanced Micro Devices, Inc. -- * -- * Permission is hereby granted, free of charge, to any person obtaining a -- * copy of this software and associated documentation files (the "Software"), -- * to deal in the Software without restriction, including without limitation -- * the rights to use, copy, modify, merge, publish, distribute, sublicense, -- * and/or sell copies of the Software, and to permit persons to whom the -- * Software is furnished to do so, subject to the following conditions: -- * -- * The above copyright notice and this permission notice shall be included in -- * all copies or substantial portions of the Software. -- * -- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -- * OTHER DEALINGS IN THE SOFTWARE. 
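
For concreteness, here is what the restored aperture macros above evaluate to for the first device (gpu_num == id + 1 == 1), as a standalone C program; each window is carved out of the per-GPU region selected by the top three address bits:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int gpu_num = 1;	/* id + 1 for the first device */

	uint64_t lds_base      = ((uint64_t)gpu_num << 61) + 0x0;
	uint64_t lds_limit     = (lds_base & 0xFFFFFFFF00000000ULL) | 0xFFFFFFFF;
	uint64_t gpuvm_base    = ((uint64_t)gpu_num << 61) + 0x1000000000000ULL;
	uint64_t gpuvm_limit   = (gpuvm_base & 0xFFFFFF0000000000ULL) | 0xFFFFFFFFFFULL;
	uint64_t scratch_base  = ((uint64_t)gpu_num << 61) + 0x100000000ULL;
	uint64_t scratch_limit = (scratch_base & 0xFFFFFFFF00000000ULL) | 0xFFFFFFFF;

	/* LDS:     0x2000000000000000 .. 0x20000000FFFFFFFF (4 GiB)
	 * GPUVM:   0x2001000000000000 .. 0x200100FFFFFFFFFF (1 TiB)
	 * scratch: 0x2000000100000000 .. 0x20000001FFFFFFFF (4 GiB) */
	printf("LDS     %#018llx .. %#018llx\n",
	       (unsigned long long)lds_base, (unsigned long long)lds_limit);
	printf("GPUVM   %#018llx .. %#018llx\n",
	       (unsigned long long)gpuvm_base, (unsigned long long)gpuvm_limit);
	printf("scratch %#018llx .. %#018llx\n",
	       (unsigned long long)scratch_base, (unsigned long long)scratch_limit);
	return 0;
}
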
-- */ -- --#include "kfd_priv.h" --#include "kfd_events.h" --#include "soc15_int.h" -- -- --static uint32_t kfd_get_pasid_from_vmid(struct kfd_dev *dev, uint8_t vmid) --{ -- uint32_t pasid = 0; -- const struct kfd2kgd_calls *f2g = dev->kfd2kgd; -- -- if (f2g->get_atc_vmid_pasid_mapping_valid(dev->kgd, vmid)) -- pasid = f2g->get_atc_vmid_pasid_mapping_pasid(dev->kgd, vmid); -- -- return pasid; --} -- --static bool event_interrupt_isr_v9(struct kfd_dev *dev, -- const uint32_t *ih_ring_entry, -- uint32_t *patched_ihre, -- bool *patched_flag) --{ -- uint16_t source_id, client_id, pasid, vmid; -- bool result = false; -- -- source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); -- client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); -- pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); -- vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); -- -- if (pasid) { -- const uint32_t *data = ih_ring_entry; -- -- pr_debug("client id 0x%x, source id %d, pasid 0x%x. raw data:\n", -- client_id, source_id, pasid); -- pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", -- data[0], data[1], data[2], data[3], -- data[4], data[5], data[6], data[7]); -- } -- -- if ((vmid >= dev->vm_info.first_vmid_kfd && -- vmid <= dev->vm_info.last_vmid_kfd) && -- (source_id == SOC15_INTSRC_CP_END_OF_PIPE || -- source_id == SOC15_INTSRC_SDMA_TRAP || -- source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG || -- source_id == SOC15_INTSRC_CP_BAD_OPCODE || -- client_id == SOC15_IH_CLIENTID_VMC || -- client_id == SOC15_IH_CLIENTID_UTCL2)) { -- -- /* -- * KFD want to handle this INT, but MEC firmware did -- * not send pasid. Try to get it from vmid mapping -- * and patch the ih entry. It's a temp workaround. -- */ -- WARN_ONCE((!pasid), "Fix me.\n"); -- if (!pasid) { -- uint32_t temp = le32_to_cpu(ih_ring_entry[3]); -- -- pasid = kfd_get_pasid_from_vmid(dev, vmid); -- memcpy(patched_ihre, ih_ring_entry, -- dev->device_info->ih_ring_entry_size); -- patched_ihre[3] = cpu_to_le32(temp | pasid); -- *patched_flag = true; -- } -- result = pasid ? true : false; -- } -- -- /* Do not process in ISR, just request it to be forwarded to WQ. 
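
The SOC15_*_FROM_IH_ENTRY() accessors used above are plain shift-and-mask extractions over the 8-dword IH ring entry. A generic sketch of the idea; the dword indices and bit positions below are illustrative only, the real layout lives in soc15_int.h:

#include <linux/types.h>

/* Extract @width bits starting at @shift from dword @dw of an entry. */
#define IH_FIELD(entry, dw, shift, width) \
	(((entry)[(dw)] >> (shift)) & ((1u << (width)) - 1))

/* Illustrative positions, not the real SOC15 layout: */
static u16 ih_source_id(const u32 *e) { return IH_FIELD(e, 0,  0,  8); }
static u16 ih_client_id(const u32 *e) { return IH_FIELD(e, 0,  8,  8); }
static u8  ih_vmid(const u32 *e)      { return IH_FIELD(e, 1, 24,  4); }
static u16 ih_pasid(const u32 *e)     { return IH_FIELD(e, 3, 16, 16); }
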
*/ -- return result; -- --} -- --static void event_interrupt_wq_v9(struct kfd_dev *dev, -- const uint32_t *ih_ring_entry) --{ -- uint16_t source_id, client_id, pasid, vmid; -- -- source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); -- client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); -- pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); -- vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); -- -- if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) -- kfd_signal_event_interrupt(pasid, 0, 0); -- else if (source_id == SOC15_INTSRC_SDMA_TRAP) -- kfd_signal_event_interrupt(pasid, 0, 0); -- else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) -- kfd_signal_event_interrupt(pasid, 0, 0); /*todo */ -- else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) -- kfd_signal_hw_exception_event(pasid); -- else if (client_id == SOC15_IH_CLIENTID_VMC || -- client_id == SOC15_IH_CLIENTID_UTCL2) { -- struct kfd_vm_fault_info info = {0}; -- uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); -- -- info.vmid = vmid; -- info.mc_id = client_id; -- info.page_addr = ih_ring_entry[4] | -- (uint64_t)(ih_ring_entry[5] & 0xf) << 32; -- info.prot_valid = ring_id & 0x08; -- info.prot_read = ring_id & 0x10; -- info.prot_write = ring_id & 0x20; -- -- kfd_process_vm_fault(dev->dqm, pasid); -- kfd_signal_vm_fault_event(dev, pasid, &info); -- } --} -- --const struct kfd_event_interrupt_class event_interrupt_class_v9 = { -- .interrupt_isr = event_interrupt_isr_v9, -- .interrupt_wq = event_interrupt_wq_v9, --}; -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c -index 47dcf4a..7f134aa 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c -@@ -44,24 +44,24 @@ - #include <linux/device.h> - #include "kfd_priv.h" - --#define KFD_IH_NUM_ENTRIES 8192 -+#define KFD_INTERRUPT_RING_SIZE 1024 - - static void interrupt_wq(struct work_struct *); - - int kfd_interrupt_init(struct kfd_dev *kfd) - { -- int r; -- -- r = kfifo_alloc(&kfd->ih_fifo, -- KFD_IH_NUM_ENTRIES * -- kfd->device_info->ih_ring_entry_size, -- GFP_KERNEL); -- if (r) { -- dev_err(kfd_chardev(), "Failed to allocate IH fifo\n"); -- return r; -- } -+ void *interrupt_ring = kmalloc_array(KFD_INTERRUPT_RING_SIZE, -+ kfd->device_info->ih_ring_entry_size, -+ GFP_KERNEL); -+ if (!interrupt_ring) -+ return -ENOMEM; -+ -+ kfd->interrupt_ring = interrupt_ring; -+ kfd->interrupt_ring_size = -+ KFD_INTERRUPT_RING_SIZE * kfd->device_info->ih_ring_entry_size; -+ atomic_set(&kfd->interrupt_ring_wptr, 0); -+ atomic_set(&kfd->interrupt_ring_rptr, 0); - -- kfd->ih_wq = alloc_workqueue("KFD IH", WQ_HIGHPRI, 1); - spin_lock_init(&kfd->interrupt_lock); - - INIT_WORK(&kfd->interrupt_work, interrupt_wq); -@@ -92,47 +92,74 @@ void kfd_interrupt_exit(struct kfd_dev *kfd) - spin_unlock_irqrestore(&kfd->interrupt_lock, flags); - - /* -- * flush_work ensures that there are no outstanding -+ * Flush_scheduled_work ensures that there are no outstanding - * work-queue items that will access interrupt_ring. New work items - * can't be created because we stopped interrupt handling above. - */ -- flush_workqueue(kfd->ih_wq); -+ flush_scheduled_work(); - -- kfifo_free(&kfd->ih_fifo); -+ kfree(kfd->interrupt_ring); - } - - /* -- * Assumption: single reader/writer. This function is not re-entrant -+ * This assumes that it can't be called concurrently with itself -+ * but only with dequeue_ih_ring_entry. 
- */ - bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry) - { -- int count; -+ unsigned int rptr = atomic_read(&kfd->interrupt_ring_rptr); -+ unsigned int wptr = atomic_read(&kfd->interrupt_ring_wptr); - -- count = kfifo_in(&kfd->ih_fifo, ih_ring_entry, -- kfd->device_info->ih_ring_entry_size); -- if (count != kfd->device_info->ih_ring_entry_size) { -+ if ((rptr - wptr) % kfd->interrupt_ring_size == -+ kfd->device_info->ih_ring_entry_size) { -+ /* This is very bad, the system is likely to hang. */ - dev_err_ratelimited(kfd_chardev(), -- "Interrupt ring overflow, dropping interrupt %d\n", -- count); -+ "Interrupt ring overflow, dropping interrupt.\n"); - return false; - } - -+ memcpy(kfd->interrupt_ring + wptr, ih_ring_entry, -+ kfd->device_info->ih_ring_entry_size); -+ -+ wptr = (wptr + kfd->device_info->ih_ring_entry_size) % -+ kfd->interrupt_ring_size; -+ smp_wmb(); /* Ensure memcpy'd data is visible before wptr update. */ -+ atomic_set(&kfd->interrupt_ring_wptr, wptr); -+ - return true; - } - - /* -- * Assumption: single reader/writer. This function is not re-entrant -+ * This assumes that it can't be called concurrently with itself -+ * but only with enqueue_ih_ring_entry. - */ - static bool dequeue_ih_ring_entry(struct kfd_dev *kfd, void *ih_ring_entry) - { -- int count; -+ /* -+ * Assume that wait queues have an implicit barrier, i.e. anything that -+ * happened in the ISR before it queued work is visible. -+ */ -+ -+ unsigned int wptr = atomic_read(&kfd->interrupt_ring_wptr); -+ unsigned int rptr = atomic_read(&kfd->interrupt_ring_rptr); - -- count = kfifo_out(&kfd->ih_fifo, ih_ring_entry, -- kfd->device_info->ih_ring_entry_size); -+ if (rptr == wptr) -+ return false; - -- WARN_ON(count && count != kfd->device_info->ih_ring_entry_size); -+ memcpy(ih_ring_entry, kfd->interrupt_ring + rptr, -+ kfd->device_info->ih_ring_entry_size); - -- return count == kfd->device_info->ih_ring_entry_size; -+ rptr = (rptr + kfd->device_info->ih_ring_entry_size) % -+ kfd->interrupt_ring_size; -+ -+ /* -+ * Ensure the rptr write update is not visible until -+ * memcpy has finished reading. -+ */ -+ smp_mb(); -+ atomic_set(&kfd->interrupt_ring_rptr, rptr); -+ -+ return true; - } - - static void interrupt_wq(struct work_struct *work) -@@ -149,15 +176,13 @@ static void interrupt_wq(struct work_struct *work) - ih_ring_entry); - } - --bool interrupt_is_wanted(struct kfd_dev *dev, -- const uint32_t *ih_ring_entry, -- uint32_t *patched_ihre, bool *flag) -+bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry) - { - /* integer and bitwise OR so there is no boolean short-circuiting */ -- unsigned int wanted = 0; -+ unsigned wanted = 0; - - wanted |= dev->device_info->event_interrupt_class->interrupt_isr(dev, -- ih_ring_entry, patched_ihre, flag); -+ ih_ring_entry); - - return wanted != 0; - } -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c -deleted file mode 100644 -index e67eb9f..0000000 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c -+++ /dev/null -@@ -1,275 +0,0 @@ --/* -- * Copyright 2014 Advanced Micro Devices, Inc. 
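
The enqueue/dequeue pair above is a classic single-producer/single-consumer ring: rptr == wptr means empty, and the overflow test (rptr - wptr) % size == entry_size keeps one slot permanently unused so a full ring is never mistaken for an empty one. A self-contained sketch of the same scheme, barriers included:

#include <linux/compiler.h>
#include <linux/string.h>
#include <linux/types.h>
#include <asm/barrier.h>

struct spsc_ring {
	unsigned char *buf;
	unsigned int size;	/* bytes; a multiple of entry_size */
	unsigned int entry_size;
	unsigned int rptr;	/* advanced only by the consumer */
	unsigned int wptr;	/* advanced only by the producer */
};

static bool spsc_push(struct spsc_ring *r, const void *entry)
{
	unsigned int rptr = READ_ONCE(r->rptr);
	unsigned int wptr = READ_ONCE(r->wptr);

	if ((rptr - wptr) % r->size == r->entry_size)
		return false;	/* full: writing would make wptr == rptr */

	memcpy(r->buf + wptr, entry, r->entry_size);
	smp_wmb();		/* publish the data before the new wptr */
	WRITE_ONCE(r->wptr, (wptr + r->entry_size) % r->size);
	return true;
}

static bool spsc_pop(struct spsc_ring *r, void *entry)
{
	unsigned int wptr = READ_ONCE(r->wptr);
	unsigned int rptr = READ_ONCE(r->rptr);

	if (rptr == wptr)
		return false;	/* empty */

	memcpy(entry, r->buf + rptr, r->entry_size);
	smp_mb();		/* finish reading before releasing the slot */
	WRITE_ONCE(r->rptr, (rptr + r->entry_size) % r->size);
	return true;
}
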
-- * -- * Permission is hereby granted, free of charge, to any person obtaining a -- * copy of this software and associated documentation files (the "Software"), -- * to deal in the Software without restriction, including without limitation -- * the rights to use, copy, modify, merge, publish, distribute, sublicense, -- * and/or sell copies of the Software, and to permit persons to whom the -- * Software is furnished to do so, subject to the following conditions: -- * -- * The above copyright notice and this permission notice shall be included in -- * all copies or substantial portions of the Software. -- * -- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -- * OTHER DEALINGS IN THE SOFTWARE. -- */ -- --#include <linux/dma-buf.h> --#include <linux/slab.h> --#include <linux/random.h> -- --#include "kfd_ipc.h" --#include "kfd_priv.h" -- --#define KFD_IPC_HASH_TABLE_SIZE_SHIFT 4 --#define KFD_IPC_HASH_TABLE_SIZE_MASK ((1 << KFD_IPC_HASH_TABLE_SIZE_SHIFT) - 1) -- --static struct kfd_ipc_handles { -- DECLARE_HASHTABLE(handles, KFD_IPC_HASH_TABLE_SIZE_SHIFT); -- struct mutex lock; --} kfd_ipc_handles; -- --/* Since, handles are random numbers, it can be used directly as hashing key. -- * The least 4 bits of the handle are used as key. However, during import all -- * 128 bits of the handle are checked to prevent handle snooping. -- */ --#define HANDLE_TO_KEY(sh) ((*(uint64_t *)sh) & KFD_IPC_HASH_TABLE_SIZE_MASK) -- --static int ipc_store_insert(void *val, void *sh, struct kfd_ipc_obj **ipc_obj) --{ -- struct kfd_ipc_obj *obj; -- -- obj = kmalloc(sizeof(*obj), GFP_KERNEL); -- if (!obj) -- return -ENOMEM; -- -- /* The initial ref belongs to the allocator process. -- * The IPC object store itself does not hold a ref since -- * there is no specific moment in time where that ref should -- * be dropped, except "when there are no more userspace processes -- * holding a ref to the object". Therefore the removal from IPC -- * storage happens at ipc_obj release time. 
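
A condensed sketch of the scheme this comment and HANDLE_TO_KEY describe: 128 random bits per handle, the low four bits pick the bucket, and the full handle is compared on import so that knowing a bucket is not enough to attach to someone else's buffer. Locking is omitted here; the real code holds kfd_ipc_handles.lock around both operations:

#include <linux/hashtable.h>
#include <linux/random.h>
#include <linux/string.h>
#include <linux/types.h>

#define KEY_BITS 4	/* mirrors KFD_IPC_HASH_TABLE_SIZE_SHIFT */

static DEFINE_HASHTABLE(handle_table, KEY_BITS);

struct shared_obj {
	struct hlist_node node;
	u32 handle[4];		/* 128 random bits */
};

/* Only the low KEY_BITS bits of the handle select the bucket. */
static u64 handle_key(const u32 *h)
{
	return *(const u64 *)h & ((1 << KEY_BITS) - 1);
}

static void obj_publish(struct shared_obj *obj)
{
	get_random_bytes(obj->handle, sizeof(obj->handle));
	hash_add(handle_table, &obj->node, handle_key(obj->handle));
}

/* The search walks one bucket but compares all 128 bits, so guessing
 * a bucket index reveals nothing. */
static struct shared_obj *obj_lookup(const u32 *handle)
{
	struct shared_obj *obj;

	hash_for_each_possible(handle_table, obj, node, handle_key(handle))
		if (!memcmp(obj->handle, handle, sizeof(obj->handle)))
			return obj;
	return NULL;
}
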
-- */ -- kref_init(&obj->ref); -- obj->data = val; -- get_random_bytes(obj->share_handle, sizeof(obj->share_handle)); -- -- memcpy(sh, obj->share_handle, sizeof(obj->share_handle)); -- -- mutex_lock(&kfd_ipc_handles.lock); -- hlist_add_head(&obj->node, -- &kfd_ipc_handles.handles[HANDLE_TO_KEY(obj->share_handle)]); -- mutex_unlock(&kfd_ipc_handles.lock); -- -- if (ipc_obj) -- *ipc_obj = obj; -- -- return 0; --} -- --static void ipc_obj_release(struct kref *r) --{ -- struct kfd_ipc_obj *obj; -- -- obj = container_of(r, struct kfd_ipc_obj, ref); -- -- mutex_lock(&kfd_ipc_handles.lock); -- hash_del(&obj->node); -- mutex_unlock(&kfd_ipc_handles.lock); -- -- dma_buf_put(obj->data); -- kfree(obj); --} -- --void ipc_obj_get(struct kfd_ipc_obj *obj) --{ -- kref_get(&obj->ref); --} -- --void ipc_obj_put(struct kfd_ipc_obj **obj) --{ -- kref_put(&(*obj)->ref, ipc_obj_release); -- *obj = NULL; --} -- --int kfd_ipc_init(void) --{ -- mutex_init(&kfd_ipc_handles.lock); -- hash_init(kfd_ipc_handles.handles); -- return 0; --} -- --static int kfd_import_dmabuf_create_kfd_bo(struct kfd_dev *dev, -- struct kfd_process *p, -- uint32_t gpu_id, struct dma_buf *dmabuf, -- uint64_t va_addr, uint64_t *handle, -- uint64_t *mmap_offset, -- struct kfd_ipc_obj *ipc_obj) --{ -- int r; -- void *mem; -- uint64_t size; -- int idr_handle; -- struct kfd_process_device *pdd = NULL; -- uint64_t kfd_mmap_flags = KFD_MMAP_TYPE_MAP_BO | -- KFD_MMAP_GPU_ID(gpu_id); -- -- if (!handle) -- return -EINVAL; -- -- if (!dev || !dev->kfd2kgd->import_dmabuf) -- return -EINVAL; -- -- mutex_lock(&p->mutex); -- -- pdd = kfd_bind_process_to_device(dev, p); -- if (IS_ERR(pdd)) { -- r = PTR_ERR(pdd); -- goto err_unlock; -- } -- -- r = dev->kfd2kgd->import_dmabuf(dev->kgd, dmabuf, -- va_addr, pdd->vm, -- (struct kgd_mem **)&mem, &size, -- mmap_offset); -- if (r) -- goto err_unlock; -- -- idr_handle = kfd_process_device_create_obj_handle(pdd, mem, -- va_addr, size, -- ipc_obj); -- if (idr_handle < 0) { -- r = -EFAULT; -- goto err_free; -- } -- -- mutex_unlock(&p->mutex); -- -- *handle = MAKE_HANDLE(gpu_id, idr_handle); -- if (mmap_offset) -- *mmap_offset = (kfd_mmap_flags << PAGE_SHIFT) | *mmap_offset; -- -- return 0; -- --err_free: -- dev->kfd2kgd->free_memory_of_gpu(dev->kgd, -- (struct kgd_mem *)mem, -- pdd->vm); --err_unlock: -- mutex_unlock(&p->mutex); -- return r; --} -- --int kfd_ipc_import_dmabuf(struct kfd_dev *dev, -- struct kfd_process *p, -- uint32_t gpu_id, int dmabuf_fd, -- uint64_t va_addr, uint64_t *handle, -- uint64_t *mmap_offset) --{ -- int r; -- struct dma_buf *dmabuf = dma_buf_get(dmabuf_fd); -- -- if (!dmabuf) -- return -EINVAL; -- -- r = kfd_import_dmabuf_create_kfd_bo(dev, p, gpu_id, dmabuf, -- va_addr, handle, mmap_offset, -- NULL); -- dma_buf_put(dmabuf); -- return r; --} -- --int kfd_ipc_import_handle(struct kfd_dev *dev, struct kfd_process *p, -- uint32_t gpu_id, uint32_t *share_handle, -- uint64_t va_addr, uint64_t *handle, -- uint64_t *mmap_offset) --{ -- int r; -- struct kfd_ipc_obj *entry, *found = NULL; -- -- mutex_lock(&kfd_ipc_handles.lock); -- /* Convert the user provided handle to hash key and search only in that -- * bucket -- */ -- hlist_for_each_entry(entry, -- &kfd_ipc_handles.handles[HANDLE_TO_KEY(share_handle)], node) { -- if (!memcmp(entry->share_handle, share_handle, -- sizeof(entry->share_handle))) { -- found = entry; -- break; -- } -- } -- mutex_unlock(&kfd_ipc_handles.lock); -- -- if (!found) -- return -EINVAL; -- ipc_obj_get(found); -- -- pr_debug("Found ipc_dma_buf: %p\n", found->data); -- -- r = 
kfd_import_dmabuf_create_kfd_bo(dev, p, gpu_id, found->data, -- va_addr, handle, mmap_offset, -- found); -- if (r) -- goto error_unref; -- -- return r; -- --error_unref: -- ipc_obj_put(&found); -- return r; --} -- --int kfd_ipc_export_as_handle(struct kfd_dev *dev, struct kfd_process *p, -- uint64_t handle, uint32_t *ipc_handle) --{ -- struct kfd_process_device *pdd = NULL; -- struct kfd_ipc_obj *obj; -- struct kfd_bo *kfd_bo = NULL; -- struct dma_buf *dmabuf; -- int r; -- -- if (!dev || !ipc_handle) -- return -EINVAL; -- -- mutex_lock(&p->mutex); -- pdd = kfd_bind_process_to_device(dev, p); -- if (IS_ERR(pdd)) { -- mutex_unlock(&p->mutex); -- pr_err("Failed to get pdd\n"); -- return PTR_ERR(pdd); -- } -- -- kfd_bo = kfd_process_device_find_bo(pdd, GET_IDR_HANDLE(handle)); -- mutex_unlock(&p->mutex); -- -- if (!kfd_bo) { -- pr_err("Failed to get bo"); -- return -EINVAL; -- } -- if (kfd_bo->kfd_ipc_obj) { -- memcpy(ipc_handle, kfd_bo->kfd_ipc_obj->share_handle, -- sizeof(kfd_bo->kfd_ipc_obj->share_handle)); -- return 0; -- } -- -- r = dev->kfd2kgd->export_dmabuf(dev->kgd, pdd->vm, -- (struct kgd_mem *)kfd_bo->mem, -- &dmabuf); -- if (r) -- return r; -- -- r = ipc_store_insert(dmabuf, ipc_handle, &obj); -- if (r) -- return r; -- -- kfd_bo->kfd_ipc_obj = obj; -- -- return r; --} -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.h b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.h -deleted file mode 100644 -index 9ee8627..0000000 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.h -+++ /dev/null -@@ -1,51 +0,0 @@ --/* -- * Copyright 2014 Advanced Micro Devices, Inc. -- * -- * Permission is hereby granted, free of charge, to any person obtaining a -- * copy of this software and associated documentation files (the "Software"), -- * to deal in the Software without restriction, including without limitation -- * the rights to use, copy, modify, merge, publish, distribute, sublicense, -- * and/or sell copies of the Software, and to permit persons to whom the -- * Software is furnished to do so, subject to the following conditions: -- * -- * The above copyright notice and this permission notice shall be included in -- * all copies or substantial portions of the Software. -- * -- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -- * OTHER DEALINGS IN THE SOFTWARE. 
-- * -- */ -- --#ifndef KFD_IPC_H_ --#define KFD_IPC_H_ -- --#include <linux/types.h> --#include "kfd_priv.h" -- --struct kfd_ipc_obj { -- struct hlist_node node; -- struct kref ref; -- void *data; -- uint32_t share_handle[4]; --}; -- --int kfd_ipc_import_handle(struct kfd_dev *dev, struct kfd_process *p, -- uint32_t gpu_id, uint32_t *share_handle, -- uint64_t va_addr, uint64_t *handle, -- uint64_t *mmap_offset); --int kfd_ipc_import_dmabuf(struct kfd_dev *kfd, struct kfd_process *p, -- uint32_t gpu_id, int dmabuf_fd, -- uint64_t va_addr, uint64_t *handle, -- uint64_t *mmap_offset); --int kfd_ipc_export_as_handle(struct kfd_dev *dev, struct kfd_process *p, -- uint64_t handle, uint32_t *ipc_handle); -- --void ipc_obj_get(struct kfd_ipc_obj *obj); --void ipc_obj_put(struct kfd_ipc_obj **obj); -- --#endif /* KFD_IPC_H_ */ -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c -index 903ef25..d135cd0 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c -@@ -41,8 +41,11 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, - int retval; - union PM4_MES_TYPE_3_HEADER nop; - -- pr_debug("Initializing queue type %d size %d\n", KFD_QUEUE_TYPE_HIQ, -- queue_size); -+ BUG_ON(!kq || !dev); -+ BUG_ON(type != KFD_QUEUE_TYPE_DIQ && type != KFD_QUEUE_TYPE_HIQ); -+ -+ pr_debug("amdkfd: In func %s initializing queue type %d size %d\n", -+ __func__, KFD_QUEUE_TYPE_HIQ, queue_size); - - memset(&prop, 0, sizeof(prop)); - memset(&nop, 0, sizeof(nop)); -@@ -60,23 +63,23 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, - KFD_MQD_TYPE_HIQ); - break; - default: -- pr_err("Invalid queue type %d\n", type); -- return false; -+ BUG(); -+ break; - } - -- if (!kq->mqd) -+ if (kq->mqd == NULL) - return false; - - prop.doorbell_ptr = kfd_get_kernel_doorbell(dev, &prop.doorbell_off); - -- if (!prop.doorbell_ptr) { -- pr_err("Failed to initialize doorbell"); -+ if (prop.doorbell_ptr == NULL) { -+ pr_err("amdkfd: error init doorbell"); - goto err_get_kernel_doorbell; - } - - retval = kfd_gtt_sa_allocate(dev, queue_size, &kq->pq); - if (retval != 0) { -- pr_err("Failed to init pq queues size %d\n", queue_size); -+ pr_err("amdkfd: error init pq queues size (%d)\n", queue_size); - goto err_pq_allocate_vidmem; - } - -@@ -84,7 +87,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, - kq->pq_gpu_addr = kq->pq->gpu_addr; - - retval = kq->ops_asic_specific.initialize(kq, dev, type, queue_size); -- if (!retval) -+ if (retval == false) - goto err_eop_allocate_vidmem; - - retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->rptr_kernel), -@@ -96,7 +99,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, - kq->rptr_kernel = kq->rptr_mem->cpu_ptr; - kq->rptr_gpu_addr = kq->rptr_mem->gpu_addr; - -- retval = kfd_gtt_sa_allocate(dev, dev->device_info->doorbell_size, -+ retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->wptr_kernel), - &kq->wptr_mem); - - if (retval != 0) -@@ -120,7 +123,6 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, - prop.write_ptr = (uint32_t *) kq->wptr_gpu_addr; - prop.eop_ring_buffer_address = kq->eop_gpu_addr; - prop.eop_ring_buffer_size = PAGE_SIZE; -- prop.cu_mask = NULL; - - if (init_queue(&kq->queue, &prop) != 0) - goto err_init_queue; -@@ -137,12 +139,11 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, - - /* assign HIQ to HQD */ - if (type == KFD_QUEUE_TYPE_HIQ) { -- 
pr_debug("Assigning hiq to hqd\n"); -+ pr_debug("assigning hiq to hqd\n"); - kq->queue->pipe = KFD_CIK_HIQ_PIPE; - kq->queue->queue = KFD_CIK_HIQ_QUEUE; - kq->mqd->load_mqd(kq->mqd, kq->queue->mqd, kq->queue->pipe, -- kq->queue->queue, &kq->queue->properties, -- NULL); -+ kq->queue->queue, NULL); - } else { - /* allocate fence for DIQ */ - -@@ -179,10 +180,12 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, - - static void uninitialize(struct kernel_queue *kq) - { -+ BUG_ON(!kq); -+ - if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ) - kq->mqd->destroy_mqd(kq->mqd, -- kq->queue->mqd, -- KFD_PREEMPT_TYPE_WAVEFRONT_RESET, -+ NULL, -+ false, - QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS, - kq->queue->pipe, - kq->queue->queue); -@@ -206,17 +209,12 @@ static int acquire_packet_buffer(struct kernel_queue *kq, - size_t available_size; - size_t queue_size_dwords; - uint32_t wptr, rptr; -- uint64_t wptr64; - unsigned int *queue_address; - -- /* When rptr == wptr, the buffer is empty. -- * When rptr == wptr + 1, the buffer is full. -- * It is always rptr that advances to the position of wptr, rather than -- * the opposite. So we can only use up to queue_size_dwords - 1 dwords. -- */ -+ BUG_ON(!kq || !buffer_ptr); -+ - rptr = *kq->rptr_kernel; -- wptr = kq->pending_wptr; -- wptr64 = kq->pending_wptr64; -+ wptr = *kq->wptr_kernel; - queue_address = (unsigned int *)kq->pq_kernel_addr; - queue_size_dwords = kq->queue->properties.queue_size / sizeof(uint32_t); - -@@ -224,72 +222,28 @@ static int acquire_packet_buffer(struct kernel_queue *kq, - pr_debug("wptr: %d\n", wptr); - pr_debug("queue_address 0x%p\n", queue_address); - -- available_size = (rptr + queue_size_dwords - 1 - wptr) % -+ available_size = (rptr - 1 - wptr + queue_size_dwords) % - queue_size_dwords; - -- if (packet_size_in_dwords > available_size) { -+ if (packet_size_in_dwords >= queue_size_dwords || -+ packet_size_in_dwords >= available_size) { - /* - * make sure calling functions know - * acquire_packet_buffer() failed - */ -- goto err_no_space; -+ *buffer_ptr = NULL; -+ return -ENOMEM; - } - - if (wptr + packet_size_in_dwords >= queue_size_dwords) { -- /* make sure after rolling back to position 0, there is -- * still enough space. -- */ -- if (packet_size_in_dwords >= rptr) -- goto err_no_space; -- -- /* fill nops, roll back and start at position 0 */ - while (wptr > 0) { - queue_address[wptr] = kq->nop_packet; - wptr = (wptr + 1) % queue_size_dwords; -- wptr64++; - } - } - - *buffer_ptr = &queue_address[wptr]; - kq->pending_wptr = wptr + packet_size_in_dwords; -- kq->pending_wptr64 = wptr64 + packet_size_in_dwords; -- -- return 0; -- --err_no_space: -- *buffer_ptr = NULL; -- return -ENOMEM; --} -- --static int acquire_inline_ib(struct kernel_queue *kq, -- size_t size_in_dwords, -- unsigned int **buffer_ptr, -- uint64_t *gpu_addr) --{ -- int ret; -- unsigned int *buf; -- union PM4_MES_TYPE_3_HEADER nop; -- -- if (size_in_dwords >= (1 << 14)) -- return -EINVAL; -- -- /* Allocate size_in_dwords on the ring, plus an extra dword -- * for a NOP packet header -- */ -- ret = acquire_packet_buffer(kq, size_in_dwords + 1, &buf); -- if (ret) -- return ret; -- -- /* Build a NOP packet that contains the IB as "payload". 
*/ -- nop.u32all = 0; -- nop.opcode = IT_NOP; -- nop.count = size_in_dwords - 1; -- nop.type = PM4_TYPE_3; -- -- *buf = nop.u32all; -- *buffer_ptr = buf + 1; -- *gpu_addr = kq->pq_gpu_addr + ((unsigned long)*buffer_ptr - -- (unsigned long)kq->pq_kernel_addr); - - return 0; - } -@@ -298,7 +252,11 @@ static void submit_packet(struct kernel_queue *kq) - { - #ifdef DEBUG - int i; -+#endif -+ -+ BUG_ON(!kq); - -+#ifdef DEBUG - for (i = *kq->wptr_kernel; i < kq->pending_wptr; i++) { - pr_debug("0x%2X ", kq->pq_kernel_addr[i]); - if (i % 15 == 0) -@@ -307,11 +265,14 @@ static void submit_packet(struct kernel_queue *kq) - pr_debug("\n"); - #endif - -- kq->ops_asic_specific.submit_packet(kq); -+ *kq->wptr_kernel = kq->pending_wptr; -+ write_kernel_doorbell(kq->queue->properties.doorbell_ptr, -+ kq->pending_wptr); - } - - static void rollback_packet(struct kernel_queue *kq) - { -+ BUG_ON(!kq); - kq->pending_wptr = *kq->queue->properties.write_ptr; - } - -@@ -320,41 +281,30 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, - { - struct kernel_queue *kq; - -- kq = kzalloc(sizeof(*kq), GFP_KERNEL); -+ BUG_ON(!dev); -+ -+ kq = kzalloc(sizeof(struct kernel_queue), GFP_KERNEL); - if (!kq) - return NULL; - - kq->ops.initialize = initialize; - kq->ops.uninitialize = uninitialize; - kq->ops.acquire_packet_buffer = acquire_packet_buffer; -- kq->ops.acquire_inline_ib = acquire_inline_ib; - kq->ops.submit_packet = submit_packet; - kq->ops.rollback_packet = rollback_packet; - - switch (dev->device_info->asic_family) { - case CHIP_CARRIZO: -- case CHIP_TONGA: -- case CHIP_FIJI: -- case CHIP_POLARIS10: -- case CHIP_POLARIS11: - kernel_queue_init_vi(&kq->ops_asic_specific); - break; - - case CHIP_KAVERI: -- case CHIP_HAWAII: - kernel_queue_init_cik(&kq->ops_asic_specific); - break; -- -- case CHIP_VEGA10: -- case CHIP_RAVEN: -- kernel_queue_init_v9(&kq->ops_asic_specific); -- break; -- default: -- BUG(); - } - - if (!kq->ops.initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE)) { -- pr_err("Failed to init kernel queue\n"); -+ pr_err("amdkfd: failed to init kernel queue\n"); - kfree(kq); - return NULL; - } -@@ -363,37 +313,32 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, - - void kernel_queue_uninit(struct kernel_queue *kq) - { -+ BUG_ON(!kq); -+ - kq->ops.uninitialize(kq); - kfree(kq); - } - --/* FIXME: Can this test be removed? 
*/ - static __attribute__((unused)) void test_kq(struct kfd_dev *dev) - { - struct kernel_queue *kq; - uint32_t *buffer, i; - int retval; - -- pr_err("Starting kernel queue test\n"); -+ BUG_ON(!dev); -+ -+ pr_err("amdkfd: starting kernel queue test\n"); - - kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_HIQ); -- if (unlikely(!kq)) { -- pr_err(" Failed to initialize HIQ\n"); -- pr_err("Kernel queue test failed\n"); -- return; -- } -+ BUG_ON(!kq); - - retval = kq->ops.acquire_packet_buffer(kq, 5, &buffer); -- if (unlikely(retval != 0)) { -- pr_err(" Failed to acquire packet buffer\n"); -- pr_err("Kernel queue test failed\n"); -- return; -- } -+ BUG_ON(retval != 0); - for (i = 0; i < 5; i++) - buffer[i] = kq->nop_packet; - kq->ops.submit_packet(kq); - -- pr_err("Ending kernel queue test\n"); -+ pr_err("amdkfd: ending kernel queue test\n"); - } - - -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h -index 82c94a6..5940531 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h -@@ -42,12 +42,6 @@ - * pending write pointer to that location so subsequent calls to - * acquire_packet_buffer will get a correct write pointer - * -- * @acquire_inline_ib: Returns a pointer to the location in the kernel -- * queue ring buffer where the calling function can write an inline IB. It is -- * Guaranteed that there is enough space for that IB. It also updates the -- * pending write pointer to that location so subsequent calls to -- * acquire_packet_buffer will get a correct write pointer -- * - * @submit_packet: Update the write pointer and doorbell of a kernel queue. - * - * @sync_with_hw: Wait until the write pointer and the read pointer of a kernel -@@ -65,10 +59,6 @@ struct kernel_queue_ops { - int (*acquire_packet_buffer)(struct kernel_queue *kq, - size_t packet_size_in_dwords, - unsigned int **buffer_ptr); -- int (*acquire_inline_ib)(struct kernel_queue *kq, -- size_t packet_size_in_dwords, -- unsigned int **buffer_ptr, -- uint64_t *gpu_addr); - - void (*submit_packet)(struct kernel_queue *kq); - void (*rollback_packet)(struct kernel_queue *kq); -@@ -82,7 +72,6 @@ struct kernel_queue { - struct kfd_dev *dev; - struct mqd_manager *mqd; - struct queue *queue; -- uint64_t pending_wptr64; - uint32_t pending_wptr; - unsigned int nop_packet; - -@@ -90,10 +79,7 @@ struct kernel_queue { - uint32_t *rptr_kernel; - uint64_t rptr_gpu_addr; - struct kfd_mem_obj *wptr_mem; -- union { -- uint64_t *wptr64_kernel; -- uint32_t *wptr_kernel; -- }; -+ uint32_t *wptr_kernel; - uint64_t wptr_gpu_addr; - struct kfd_mem_obj *pq; - uint64_t pq_gpu_addr; -@@ -111,6 +97,5 @@ struct kernel_queue { - - void kernel_queue_init_cik(struct kernel_queue_ops *ops); - void kernel_queue_init_vi(struct kernel_queue_ops *ops); --void kernel_queue_init_v9(struct kernel_queue_ops *ops); - - #endif /* KFD_KERNEL_QUEUE_H_ */ -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c -index 2808422..a90eb44 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c -@@ -22,19 +22,15 @@ - */ - - #include "kfd_kernel_queue.h" --#include "kfd_pm4_headers.h" --#include "kfd_pm4_opcodes.h" - - static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, - enum kfd_queue_type type, unsigned int queue_size); - static void uninitialize_cik(struct kernel_queue *kq); --static void submit_packet_cik(struct kernel_queue 
*kq); - - void kernel_queue_init_cik(struct kernel_queue_ops *ops) - { - ops->initialize = initialize_cik; - ops->uninitialize = uninitialize_cik; -- ops->submit_packet = submit_packet_cik; - } - - static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, -@@ -46,127 +42,3 @@ static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, - static void uninitialize_cik(struct kernel_queue *kq) - { - } -- --static void submit_packet_cik(struct kernel_queue *kq) --{ -- *kq->wptr_kernel = kq->pending_wptr; -- write_kernel_doorbell(kq->queue->properties.doorbell_ptr, -- kq->pending_wptr); --} -- --static int pm_map_process_cik(struct packet_manager *pm, uint32_t *buffer, -- struct qcm_process_device *qpd) --{ -- struct pm4_map_process *packet; -- -- packet = (struct pm4_map_process *)buffer; -- -- memset(buffer, 0, sizeof(struct pm4_map_process)); -- -- packet->header.u32all = pm_build_pm4_header(IT_MAP_PROCESS, -- sizeof(struct pm4_map_process)); -- packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; -- packet->bitfields2.process_quantum = 1; -- packet->bitfields2.pasid = qpd->pqm->process->pasid; -- packet->bitfields3.page_table_base = qpd->page_table_base; -- packet->bitfields10.gds_size = qpd->gds_size; -- packet->bitfields10.num_gws = qpd->num_gws; -- packet->bitfields10.num_oac = qpd->num_oac; -- packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; -- -- packet->sh_mem_config = qpd->sh_mem_config; -- packet->sh_mem_bases = qpd->sh_mem_bases; -- packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; -- packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; -- -- packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); -- packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); -- -- return 0; --} -- --static int pm_map_process_scratch_cik(struct packet_manager *pm, -- uint32_t *buffer, struct qcm_process_device *qpd) --{ -- struct pm4_map_process_scratch_kv *packet; -- -- packet = (struct pm4_map_process_scratch_kv *)buffer; -- -- memset(buffer, 0, sizeof(struct pm4_map_process_scratch_kv)); -- -- packet->header.u32all = pm_build_pm4_header(IT_MAP_PROCESS, -- sizeof(struct pm4_map_process_scratch_kv)); -- packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; -- packet->bitfields2.process_quantum = 1; -- packet->bitfields2.pasid = qpd->pqm->process->pasid; -- packet->bitfields3.page_table_base = qpd->page_table_base; -- packet->bitfields14.gds_size = qpd->gds_size; -- packet->bitfields14.num_gws = qpd->num_gws; -- packet->bitfields14.num_oac = qpd->num_oac; -- packet->bitfields14.num_queues = (qpd->is_debug) ? 
0 : qpd->queue_count; -- -- packet->sh_mem_config = qpd->sh_mem_config; -- packet->sh_mem_bases = qpd->sh_mem_bases; -- packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; -- packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; -- -- packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; -- -- packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); -- packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); -- -- return 0; --} -- --static uint32_t pm_get_map_process_packet_size_cik(void) --{ -- return sizeof(struct pm4_map_process); --} --static uint32_t pm_get_map_process_scratch_packet_size_cik(void) --{ -- return sizeof(struct pm4_map_process_scratch_kv); --} -- -- --static struct packet_manager_funcs kfd_cik_pm_funcs = { -- .map_process = pm_map_process_cik, -- .runlist = pm_runlist_vi, -- .set_resources = pm_set_resources_vi, -- .map_queues = pm_map_queues_vi, -- .unmap_queues = pm_unmap_queues_vi, -- .query_status = pm_query_status_vi, -- .release_mem = pm_release_mem_vi, -- .get_map_process_packet_size = pm_get_map_process_packet_size_cik, -- .get_runlist_packet_size = pm_get_runlist_packet_size_vi, -- .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, -- .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi, -- .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi, -- .get_query_status_packet_size = pm_get_query_status_packet_size_vi, -- .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi, --}; -- --static struct packet_manager_funcs kfd_cik_scratch_pm_funcs = { -- .map_process = pm_map_process_scratch_cik, -- .runlist = pm_runlist_vi, -- .set_resources = pm_set_resources_vi, -- .map_queues = pm_map_queues_vi, -- .unmap_queues = pm_unmap_queues_vi, -- .query_status = pm_query_status_vi, -- .release_mem = pm_release_mem_vi, -- .get_map_process_packet_size = -- pm_get_map_process_scratch_packet_size_cik, -- .get_runlist_packet_size = pm_get_runlist_packet_size_vi, -- .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, -- .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi, -- .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi, -- .get_query_status_packet_size = pm_get_query_status_packet_size_vi, -- .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi, --}; -- --void kfd_pm_func_init_cik(struct packet_manager *pm, uint16_t fw_ver) --{ -- if (fw_ver >= KFD_SCRATCH_KV_FW_VER) -- pm->pmf = &kfd_cik_scratch_pm_funcs; -- else -- pm->pmf = &kfd_cik_pm_funcs; --} -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c -deleted file mode 100644 -index 5fe4f60..0000000 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c -+++ /dev/null -@@ -1,377 +0,0 @@ --/* -- * Copyright 2016 Advanced Micro Devices, Inc. -- * -- * Permission is hereby granted, free of charge, to any person obtaining a -- * copy of this software and associated documentation files (the "Software"), -- * to deal in the Software without restriction, including without limitation -- * the rights to use, copy, modify, merge, publish, distribute, sublicense, -- * and/or sell copies of the Software, and to permit persons to whom the -- * Software is furnished to do so, subject to the following conditions: -- * -- * The above copyright notice and this permission notice shall be included in -- * all copies or substantial portions of the Software. 
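
kfd_pm_func_init_cik() above is a compact example of capability-based dispatch: one of two function-pointer tables is chosen once, at init time, and every packet-building call thereafter goes through pm->pmf with no per-call branching. A minimal sketch; the packet contents, sizes, and firmware-version threshold below are placeholders:

#include <stdint.h>

struct pm_funcs {
	int (*map_process)(uint32_t *buffer);
	uint32_t (*map_process_size)(void);
};

static int map_process_plain(uint32_t *buf)   { buf[0] = 0x1; return 0; }
static int map_process_scratch(uint32_t *buf) { buf[0] = 0x2; return 0; }
static uint32_t plain_size(void)   { return 15 * sizeof(uint32_t); }
static uint32_t scratch_size(void) { return 19 * sizeof(uint32_t); }

static const struct pm_funcs plain_funcs = {
	.map_process      = map_process_plain,
	.map_process_size = plain_size,
};

static const struct pm_funcs scratch_funcs = {
	.map_process      = map_process_scratch,
	.map_process_size = scratch_size,
};

#define SCRATCH_FW_VER 413	/* stand-in for KFD_SCRATCH_KV_FW_VER */

static const struct pm_funcs *pm_funcs_for(uint16_t fw_ver)
{
	return fw_ver >= SCRATCH_FW_VER ? &scratch_funcs : &plain_funcs;
}
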
-- * -- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -- * OTHER DEALINGS IN THE SOFTWARE. -- * -- */ -- --#include "kfd_kernel_queue.h" --#include "kfd_device_queue_manager.h" --#include "kfd_pm4_headers_ai.h" --#include "kfd_pm4_opcodes.h" -- --static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev, -- enum kfd_queue_type type, unsigned int queue_size); --static void uninitialize_v9(struct kernel_queue *kq); --static void submit_packet_v9(struct kernel_queue *kq); -- --void kernel_queue_init_v9(struct kernel_queue_ops *ops) --{ -- ops->initialize = initialize_v9; -- ops->uninitialize = uninitialize_v9; -- ops->submit_packet = submit_packet_v9; --} -- --static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev, -- enum kfd_queue_type type, unsigned int queue_size) --{ -- int retval; -- -- retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem); -- if (retval != 0) -- return false; -- -- kq->eop_gpu_addr = kq->eop_mem->gpu_addr; -- kq->eop_kernel_addr = kq->eop_mem->cpu_ptr; -- -- memset(kq->eop_kernel_addr, 0, PAGE_SIZE); -- -- return true; --} -- --static void uninitialize_v9(struct kernel_queue *kq) --{ -- kfd_gtt_sa_free(kq->dev, kq->eop_mem); --} -- --static void submit_packet_v9(struct kernel_queue *kq) --{ -- *kq->wptr64_kernel = kq->pending_wptr64; -- write_kernel_doorbell64(kq->queue->properties.doorbell_ptr, -- kq->pending_wptr64); --} -- --static int pm_map_process_v9(struct packet_manager *pm, -- uint32_t *buffer, struct qcm_process_device *qpd) --{ -- struct pm4_mes_map_process *packet; -- uint64_t vm_page_table_base_addr = -- (uint64_t)(qpd->page_table_base) << 12; -- -- packet = (struct pm4_mes_map_process *)buffer; -- memset(buffer, 0, sizeof(struct pm4_mes_map_process)); -- -- packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, -- sizeof(struct pm4_mes_map_process)); -- packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; -- packet->bitfields2.process_quantum = 1; -- packet->bitfields2.pasid = qpd->pqm->process->pasid; -- packet->bitfields14.gds_size = qpd->gds_size; -- packet->bitfields14.num_gws = qpd->num_gws; -- packet->bitfields14.num_oac = qpd->num_oac; -- packet->bitfields14.sdma_enable = 1; -- packet->bitfields14.num_queues = (qpd->is_debug) ? 
0 : qpd->queue_count; -- -- packet->sh_mem_config = qpd->sh_mem_config; -- packet->sh_mem_bases = qpd->sh_mem_bases; -- packet->sq_shader_tba_lo = lower_32_bits(qpd->tba_addr >> 8); -- packet->sq_shader_tba_hi = upper_32_bits(qpd->tba_addr >> 8); -- packet->sq_shader_tma_lo = lower_32_bits(qpd->tma_addr >> 8); -- packet->sq_shader_tma_hi = upper_32_bits(qpd->tma_addr >> 8); -- -- packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); -- packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); -- -- packet->vm_context_page_table_base_addr_lo32 = -- lower_32_bits(vm_page_table_base_addr); -- packet->vm_context_page_table_base_addr_hi32 = -- upper_32_bits(vm_page_table_base_addr); -- -- return 0; --} -- --static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer, -- uint64_t ib, size_t ib_size_in_dwords, bool chain) --{ -- struct pm4_mes_runlist *packet; -- -- int concurrent_proc_cnt = 0; -- struct kfd_dev *kfd = pm->dqm->dev; -- -- /* Determine the number of processes to map together to HW: -- * it can not exceed the number of VMIDs available to the -- * scheduler, and it is determined by the smaller of the number -- * of processes in the runlist and kfd module parameter -- * hws_max_conc_proc. -- * Note: the arbitration between the number of VMIDs and -- * hws_max_conc_proc has been done in -- * kgd2kfd_device_init(). -- */ -- concurrent_proc_cnt = min(pm->dqm->processes_count, -- kfd->max_proc_per_quantum); -- -- -- packet = (struct pm4_mes_runlist *)buffer; -- -- memset(buffer, 0, sizeof(struct pm4_mes_runlist)); -- packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, -- sizeof(struct pm4_mes_runlist)); -- -- packet->bitfields4.ib_size = ib_size_in_dwords; -- packet->bitfields4.chain = chain ? 1 : 0; -- packet->bitfields4.offload_polling = 0; -- packet->bitfields4.valid = 1; -- packet->bitfields4.process_cnt = concurrent_proc_cnt; -- packet->ordinal2 = lower_32_bits(ib); -- packet->ib_base_hi = upper_32_bits(ib); -- -- return 0; --} -- --static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, -- struct queue *q, bool is_static) --{ -- struct pm4_mes_map_queues *packet; -- bool use_static = is_static; -- -- packet = (struct pm4_mes_map_queues *)buffer; -- memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); -- -- packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, -- sizeof(struct pm4_mes_map_queues)); -- packet->bitfields2.alloc_format = -- alloc_format__mes_map_queues__one_per_pipe_vi; -- packet->bitfields2.num_queues = 1; -- packet->bitfields2.queue_sel = -- queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; -- -- packet->bitfields2.engine_sel = -- engine_sel__mes_map_queues__compute_vi; -- packet->bitfields2.queue_type = -- queue_type__mes_map_queues__normal_compute_vi; -- -- switch (q->properties.type) { -- case KFD_QUEUE_TYPE_COMPUTE: -- if (use_static) -- packet->bitfields2.queue_type = -- queue_type__mes_map_queues__normal_latency_static_queue_vi; -- break; -- case KFD_QUEUE_TYPE_DIQ: -- packet->bitfields2.queue_type = -- queue_type__mes_map_queues__debug_interface_queue_vi; -- break; -- case KFD_QUEUE_TYPE_SDMA: -- packet->bitfields2.engine_sel = q->properties.sdma_engine_id + -- engine_sel__mes_map_queues__sdma0_vi; -- use_static = false; /* no static queues under SDMA */ -- break; -- default: -- WARN(1, "queue type %d", q->properties.type); -- return -EINVAL; -- } -- packet->bitfields3.doorbell_offset = -- q->properties.doorbell_off; -- -- packet->mqd_addr_lo = -- lower_32_bits(q->gart_mqd_addr); -- -- 
packet->mqd_addr_hi = -- upper_32_bits(q->gart_mqd_addr); -- -- packet->wptr_addr_lo = -- lower_32_bits((uint64_t)q->properties.write_ptr); -- -- packet->wptr_addr_hi = -- upper_32_bits((uint64_t)q->properties.write_ptr); -- -- return 0; --} -- --static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer, -- enum kfd_queue_type type, -- enum kfd_unmap_queues_filter filter, -- uint32_t filter_param, bool reset, -- unsigned int sdma_engine) --{ -- struct pm4_mes_unmap_queues *packet; -- -- packet = (struct pm4_mes_unmap_queues *)buffer; -- memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); -- -- packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, -- sizeof(struct pm4_mes_unmap_queues)); -- switch (type) { -- case KFD_QUEUE_TYPE_COMPUTE: -- case KFD_QUEUE_TYPE_DIQ: -- packet->bitfields2.engine_sel = -- engine_sel__mes_unmap_queues__compute; -- break; -- case KFD_QUEUE_TYPE_SDMA: -- packet->bitfields2.engine_sel = -- engine_sel__mes_unmap_queues__sdma0 + sdma_engine; -- break; -- default: -- WARN(1, "queue type %d", type); -- return -EINVAL; -- } -- -- if (reset) -- packet->bitfields2.action = -- action__mes_unmap_queues__reset_queues; -- else -- packet->bitfields2.action = -- action__mes_unmap_queues__preempt_queues; -- -- switch (filter) { -- case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: -- packet->bitfields2.queue_sel = -- queue_sel__mes_unmap_queues__perform_request_on_specified_queues; -- packet->bitfields2.num_queues = 1; -- packet->bitfields3b.doorbell_offset0 = filter_param; -- break; -- case KFD_UNMAP_QUEUES_FILTER_BY_PASID: -- packet->bitfields2.queue_sel = -- queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; -- packet->bitfields3a.pasid = filter_param; -- break; -- case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: -- packet->bitfields2.queue_sel = -- queue_sel__mes_unmap_queues__unmap_all_queues; -- break; -- case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: -- /* in this case, we do not preempt static queues */ -- packet->bitfields2.queue_sel = -- queue_sel__mes_unmap_queues__unmap_all_non_static_queues; -- break; -- default: -- WARN(1, "filter %d", filter); -- return -EINVAL; -- } -- -- return 0; -- --} -- --static int pm_query_status_v9(struct packet_manager *pm, uint32_t *buffer, -- uint64_t fence_address, uint32_t fence_value) --{ -- struct pm4_mes_query_status *packet; -- -- packet = (struct pm4_mes_query_status *)buffer; -- memset(buffer, 0, sizeof(struct pm4_mes_query_status)); -- -- -- packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, -- sizeof(struct pm4_mes_query_status)); -- -- packet->bitfields2.context_id = 0; -- packet->bitfields2.interrupt_sel = -- interrupt_sel__mes_query_status__completion_status; -- packet->bitfields2.command = -- command__mes_query_status__fence_only_after_write_ack; -- -- packet->addr_hi = upper_32_bits((uint64_t)fence_address); -- packet->addr_lo = lower_32_bits((uint64_t)fence_address); -- packet->data_hi = upper_32_bits((uint64_t)fence_value); -- packet->data_lo = lower_32_bits((uint64_t)fence_value); -- -- return 0; --} -- -- --static uint32_t pm_release_mem_v9(uint64_t gpu_addr, uint32_t *buffer) --{ -- struct pm4_mec_release_mem *packet; -- -- packet = (struct pm4_mec_release_mem *)buffer; -- memset(buffer, 0, sizeof(struct pm4_mec_release_mem)); -- -- packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, -- sizeof(struct pm4_mec_release_mem)); -- -- packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; -- packet->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe; -- 
packet->bitfields2.tcl1_action_ena = 1; -- packet->bitfields2.tc_action_ena = 1; -- packet->bitfields2.cache_policy = cache_policy__mec_release_mem__lru; -- -- packet->bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low; -- packet->bitfields3.int_sel = -- int_sel__mec_release_mem__send_interrupt_after_write_confirm; -- -- packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; -- packet->address_hi = upper_32_bits(gpu_addr); -- -- packet->data_lo = 0; -- -- return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int); --} -- --static uint32_t pm_get_map_process_packet_size_v9(void) --{ -- return sizeof(struct pm4_mes_map_process); --} -- --static uint32_t pm_get_runlist_packet_size_v9(void) --{ -- return sizeof(struct pm4_mes_runlist); --} -- --static uint32_t pm_get_map_queues_packet_size_v9(void) --{ -- return sizeof(struct pm4_mes_map_queues); --} -- --static uint32_t pm_get_unmap_queues_packet_size_v9(void) --{ -- return sizeof(struct pm4_mes_unmap_queues); --} -- --static uint32_t pm_get_query_status_packet_size_v9(void) --{ -- return sizeof(struct pm4_mes_query_status); --} -- --static uint32_t pm_get_release_mem_packet_size_v9(void) --{ -- return sizeof(struct pm4_mec_release_mem); --} -- --static struct packet_manager_funcs kfd_v9_pm_funcs = { -- .map_process = pm_map_process_v9, -- .runlist = pm_runlist_v9, -- .set_resources = pm_set_resources_vi, -- .map_queues = pm_map_queues_v9, -- .unmap_queues = pm_unmap_queues_v9, -- .query_status = pm_query_status_v9, -- .release_mem = pm_release_mem_v9, -- .get_map_process_packet_size = pm_get_map_process_packet_size_v9, -- .get_runlist_packet_size = pm_get_runlist_packet_size_v9, -- .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, -- .get_map_queues_packet_size = pm_get_map_queues_packet_size_v9, -- .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_v9, -- .get_query_status_packet_size = pm_get_query_status_packet_size_v9, -- .get_release_mem_packet_size = pm_get_release_mem_packet_size_v9, --}; -- --void kfd_pm_func_init_v9(struct packet_manager *pm, uint16_t fw_ver) --{ -- pm->pmf = &kfd_v9_pm_funcs; --} -- -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c -index ecf4a33..f1d4828 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c -@@ -22,20 +22,15 @@ - */ - - #include "kfd_kernel_queue.h" --#include "kfd_device_queue_manager.h" --#include "kfd_pm4_headers_vi.h" --#include "kfd_pm4_opcodes.h" - - static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, - enum kfd_queue_type type, unsigned int queue_size); - static void uninitialize_vi(struct kernel_queue *kq); --static void submit_packet_vi(struct kernel_queue *kq); - - void kernel_queue_init_vi(struct kernel_queue_ops *ops) - { - ops->initialize = initialize_vi; - ops->uninitialize = uninitialize_vi; -- ops->submit_packet = submit_packet_vi; - } - - static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, -@@ -59,359 +54,3 @@ static void uninitialize_vi(struct kernel_queue *kq) - { - kfd_gtt_sa_free(kq->dev, kq->eop_mem); - } -- --static void submit_packet_vi(struct kernel_queue *kq) --{ -- *kq->wptr_kernel = kq->pending_wptr; -- write_kernel_doorbell(kq->queue->properties.doorbell_ptr, -- kq->pending_wptr); --} -- --static int pm_map_process_vi(struct packet_manager *pm, -- uint32_t *buffer, struct qcm_process_device *qpd) --{ -- struct pm4_mes_map_process *packet; 
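
Throughout these builders a 64-bit value is split with lower_32_bits()/upper_32_bits(), and release_mem above additionally shifts a dword-aligned address right by two so the low word fits the hardware's 30-bit address_lo_32b field. A small sketch with invented names:

#include <stdint.h>

static inline uint32_t lo32(uint64_t v) { return (uint32_t)v; }
static inline uint32_t hi32(uint64_t v) { return (uint32_t)(v >> 32); }

/* A dword-aligned GPU address always has its two low bits clear, so
 * they are stripped to make the low half fit a 30-bit packet field. */
static void pack_release_mem_addr(uint64_t gpu_addr,
				  uint32_t *addr_lo_30b, uint32_t *addr_hi)
{
	*addr_lo_30b = lo32(gpu_addr) >> 2;
	*addr_hi     = hi32(gpu_addr);
}
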
-- -- packet = (struct pm4_mes_map_process *)buffer; -- memset(buffer, 0, sizeof(struct pm4_mes_map_process)); -- -- packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, -- sizeof(struct pm4_mes_map_process)); -- packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; -- packet->bitfields2.process_quantum = 1; -- packet->bitfields2.pasid = qpd->pqm->process->pasid; -- packet->bitfields3.page_table_base = qpd->page_table_base; -- packet->bitfields10.gds_size = qpd->gds_size; -- packet->bitfields10.num_gws = qpd->num_gws; -- packet->bitfields10.num_oac = qpd->num_oac; -- packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; -- -- packet->sh_mem_config = qpd->sh_mem_config; -- packet->sh_mem_bases = qpd->sh_mem_bases; -- packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; -- packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; -- -- packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; -- -- packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); -- packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); -- -- return 0; --} -- -- --unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size) --{ -- union PM4_MES_TYPE_3_HEADER header; -- -- header.u32All = 0; -- header.opcode = opcode; -- header.count = packet_size/sizeof(uint32_t) - 2; -- header.type = PM4_TYPE_3; -- -- return header.u32All; --} -- --int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer, -- uint64_t ib, size_t ib_size_in_dwords, bool chain) --{ -- struct pm4_mes_runlist *packet; -- -- int concurrent_proc_cnt = 0; -- struct kfd_dev *kfd = pm->dqm->dev; -- -- /* Determine the number of processes to map together to HW: -- * it can not exceed the number of VMIDs available to the -- * scheduler, and it is determined by the smaller of the number -- * of processes in the runlist and kfd module parameter -- * hws_max_conc_proc. -- * Note: the arbitration between the number of VMIDs and -- * hws_max_conc_proc has been done in -- * kgd2kfd_device_init(). -- */ -- concurrent_proc_cnt = min(pm->dqm->processes_count, -- kfd->max_proc_per_quantum); -- -- -- packet = (struct pm4_mes_runlist *)buffer; -- -- memset(buffer, 0, sizeof(struct pm4_mes_runlist)); -- packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, -- sizeof(struct pm4_mes_runlist)); -- -- packet->bitfields4.ib_size = ib_size_in_dwords; -- packet->bitfields4.chain = chain ? 
1 : 0; -- packet->bitfields4.offload_polling = 0; -- packet->bitfields4.valid = 1; -- packet->bitfields4.process_cnt = concurrent_proc_cnt; -- packet->ordinal2 = lower_32_bits(ib); -- packet->bitfields3.ib_base_hi = upper_32_bits(ib); -- -- return 0; --} -- --int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, -- struct queue *q, bool is_static) --{ -- struct pm4_mes_map_queues *packet; -- bool use_static = is_static; -- -- packet = (struct pm4_mes_map_queues *)buffer; -- memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); -- -- packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, -- sizeof(struct pm4_mes_map_queues)); -- packet->bitfields2.alloc_format = -- alloc_format__mes_map_queues__one_per_pipe_vi; -- packet->bitfields2.num_queues = 1; -- packet->bitfields2.queue_sel = -- queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; -- -- packet->bitfields2.engine_sel = -- engine_sel__mes_map_queues__compute_vi; -- packet->bitfields2.queue_type = -- queue_type__mes_map_queues__normal_compute_vi; -- -- switch (q->properties.type) { -- case KFD_QUEUE_TYPE_COMPUTE: -- if (use_static) -- packet->bitfields2.queue_type = -- queue_type__mes_map_queues__normal_latency_static_queue_vi; -- break; -- case KFD_QUEUE_TYPE_DIQ: -- packet->bitfields2.queue_type = -- queue_type__mes_map_queues__debug_interface_queue_vi; -- break; -- case KFD_QUEUE_TYPE_SDMA: -- packet->bitfields2.engine_sel = q->properties.sdma_engine_id + -- engine_sel__mes_map_queues__sdma0_vi; -- use_static = false; /* no static queues under SDMA */ -- break; -- default: -- WARN(1, "queue type %d", q->properties.type); -- return -EINVAL; -- } -- packet->bitfields3.doorbell_offset = -- q->properties.doorbell_off; -- -- packet->mqd_addr_lo = -- lower_32_bits(q->gart_mqd_addr); -- -- packet->mqd_addr_hi = -- upper_32_bits(q->gart_mqd_addr); -- -- packet->wptr_addr_lo = -- lower_32_bits((uint64_t)q->properties.write_ptr); -- -- packet->wptr_addr_hi = -- upper_32_bits((uint64_t)q->properties.write_ptr); -- -- return 0; --} -- --int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, -- struct scheduling_resources *res) --{ -- struct pm4_mes_set_resources *packet; -- -- packet = (struct pm4_mes_set_resources *)buffer; -- memset(buffer, 0, sizeof(struct pm4_mes_set_resources)); -- -- packet->header.u32All = pm_build_pm4_header(IT_SET_RESOURCES, -- sizeof(struct pm4_mes_set_resources)); -- -- packet->bitfields2.queue_type = -- queue_type__mes_set_resources__hsa_interface_queue_hiq; -- packet->bitfields2.vmid_mask = res->vmid_mask; -- packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100; -- packet->bitfields7.oac_mask = res->oac_mask; -- packet->bitfields8.gds_heap_base = res->gds_heap_base; -- packet->bitfields8.gds_heap_size = res->gds_heap_size; -- -- packet->gws_mask_lo = lower_32_bits(res->gws_mask); -- packet->gws_mask_hi = upper_32_bits(res->gws_mask); -- -- packet->queue_mask_lo = lower_32_bits(res->queue_mask); -- packet->queue_mask_hi = upper_32_bits(res->queue_mask); -- -- return 0; --} -- --int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, -- enum kfd_queue_type type, -- enum kfd_unmap_queues_filter filter, -- uint32_t filter_param, bool reset, -- unsigned int sdma_engine) --{ -- struct pm4_mes_unmap_queues *packet; -- -- packet = (struct pm4_mes_unmap_queues *)buffer; -- memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); -- -- packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, -- sizeof(struct pm4_mes_unmap_queues)); -- switch (type) { -- case 
KFD_QUEUE_TYPE_COMPUTE: -- case KFD_QUEUE_TYPE_DIQ: -- packet->bitfields2.engine_sel = -- engine_sel__mes_unmap_queues__compute; -- break; -- case KFD_QUEUE_TYPE_SDMA: -- packet->bitfields2.engine_sel = -- engine_sel__mes_unmap_queues__sdma0 + sdma_engine; -- break; -- default: -- WARN(1, "queue type %d", type); -- return -EINVAL; -- } -- -- if (reset) -- packet->bitfields2.action = -- action__mes_unmap_queues__reset_queues; -- else -- packet->bitfields2.action = -- action__mes_unmap_queues__preempt_queues; -- -- switch (filter) { -- case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: -- packet->bitfields2.queue_sel = -- queue_sel__mes_unmap_queues__perform_request_on_specified_queues; -- packet->bitfields2.num_queues = 1; -- packet->bitfields3b.doorbell_offset0 = filter_param; -- break; -- case KFD_UNMAP_QUEUES_FILTER_BY_PASID: -- packet->bitfields2.queue_sel = -- queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; -- packet->bitfields3a.pasid = filter_param; -- break; -- case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: -- packet->bitfields2.queue_sel = -- queue_sel__mes_unmap_queues__unmap_all_queues; -- break; -- case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: -- /* in this case, we do not preempt static queues */ -- packet->bitfields2.queue_sel = -- queue_sel__mes_unmap_queues__unmap_all_non_static_queues; -- break; -- default: -- WARN(1, "filter %d", filter); -- return -EINVAL; -- } -- -- return 0; -- --} -- --int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer, -- uint64_t fence_address, uint32_t fence_value) --{ -- struct pm4_mes_query_status *packet; -- -- packet = (struct pm4_mes_query_status *)buffer; -- memset(buffer, 0, sizeof(struct pm4_mes_query_status)); -- -- -- packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, -- sizeof(struct pm4_mes_query_status)); -- -- packet->bitfields2.context_id = 0; -- packet->bitfields2.interrupt_sel = -- interrupt_sel__mes_query_status__completion_status; -- packet->bitfields2.command = -- command__mes_query_status__fence_only_after_write_ack; -- -- packet->addr_hi = upper_32_bits((uint64_t)fence_address); -- packet->addr_lo = lower_32_bits((uint64_t)fence_address); -- packet->data_hi = upper_32_bits((uint64_t)fence_value); -- packet->data_lo = lower_32_bits((uint64_t)fence_value); -- -- return 0; --} -- -- --uint32_t pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer) --{ -- struct pm4_mec_release_mem *packet; -- -- packet = (struct pm4_mec_release_mem *)buffer; -- memset(buffer, 0, sizeof(struct pm4_mec_release_mem)); -- -- packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, -- sizeof(struct pm4_mec_release_mem)); -- -- packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; -- packet->bitfields2.event_index = event_index___release_mem__end_of_pipe; -- packet->bitfields2.tcl1_action_ena = 1; -- packet->bitfields2.tc_action_ena = 1; -- packet->bitfields2.cache_policy = cache_policy___release_mem__lru; -- packet->bitfields2.atc = 0; -- -- packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low; -- packet->bitfields3.int_sel = -- int_sel___release_mem__send_interrupt_after_write_confirm; -- -- packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; -- packet->address_hi = upper_32_bits(gpu_addr); -- -- packet->data_lo = 0; -- -- return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int); --} -- --uint32_t pm_get_map_process_packet_size_vi(void) --{ -- return sizeof(struct pm4_mes_map_process); --} -- --uint32_t pm_get_runlist_packet_size_vi(void) --{ -- return sizeof(struct 
pm4_mes_runlist); --} -- --uint32_t pm_get_set_resources_packet_size_vi(void) --{ -- return sizeof(struct pm4_mes_set_resources); --} -- --uint32_t pm_get_map_queues_packet_size_vi(void) --{ -- return sizeof(struct pm4_mes_map_queues); --} -- --uint32_t pm_get_unmap_queues_packet_size_vi(void) --{ -- return sizeof(struct pm4_mes_unmap_queues); --} -- --uint32_t pm_get_query_status_packet_size_vi(void) --{ -- return sizeof(struct pm4_mes_query_status); --} -- --uint32_t pm_get_release_mem_packet_size_vi(void) --{ -- return sizeof(struct pm4_mec_release_mem); --} -- -- --static struct packet_manager_funcs kfd_vi_pm_funcs = { -- .map_process = pm_map_process_vi, -- .runlist = pm_runlist_vi, -- .set_resources = pm_set_resources_vi, -- .map_queues = pm_map_queues_vi, -- .unmap_queues = pm_unmap_queues_vi, -- .query_status = pm_query_status_vi, -- .release_mem = pm_release_mem_vi, -- .get_map_process_packet_size = pm_get_map_process_packet_size_vi, -- .get_runlist_packet_size = pm_get_runlist_packet_size_vi, -- .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, -- .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi, -- .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi, -- .get_query_status_packet_size = pm_get_query_status_packet_size_vi, -- .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi, --}; -- --void kfd_pm_func_init_vi(struct packet_manager *pm, uint16_t fw_ver) --{ -- pm->pmf = &kfd_vi_pm_funcs; --} -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c -index ba4d5de..850a562 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c -@@ -29,10 +29,10 @@ - #define KFD_DRIVER_AUTHOR "AMD Inc. and others" - - #define KFD_DRIVER_DESC "Standalone HSA driver for AMD's GPUs" --#define KFD_DRIVER_DATE "20160408" --#define KFD_DRIVER_MAJOR 2 --#define KFD_DRIVER_MINOR 0 --#define KFD_DRIVER_PATCHLEVEL 0 -+#define KFD_DRIVER_DATE "20150421" -+#define KFD_DRIVER_MAJOR 0 -+#define KFD_DRIVER_MINOR 7 -+#define KFD_DRIVER_PATCHLEVEL 2 - - static const struct kgd2kfd_calls kgd2kfd = { - .exit = kgd2kfd_exit, -@@ -42,10 +42,6 @@ static const struct kgd2kfd_calls kgd2kfd = { - .interrupt = kgd2kfd_interrupt, - .suspend = kgd2kfd_suspend, - .resume = kgd2kfd_resume, -- .quiesce_mm = kgd2kfd_quiesce_mm, -- .resume_mm = kgd2kfd_resume_mm, -- .schedule_evict_and_restore_process = -- kgd2kfd_schedule_evict_and_restore_process, - }; - - int sched_policy = KFD_SCHED_POLICY_HWS; -@@ -53,15 +49,6 @@ module_param(sched_policy, int, 0444); - MODULE_PARM_DESC(sched_policy, - "Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)"); - --int hws_max_conc_proc = 8; --module_param(hws_max_conc_proc, int, 0444); --MODULE_PARM_DESC(hws_max_conc_proc, -- "Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency, #VMIDs for KFD = Maximum(default))"); -- --int cwsr_enable = 1; --module_param(cwsr_enable, int, 0444); --MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))"); -- - int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT; - module_param(max_num_of_queues_per_device, int, 0444); - MODULE_PARM_DESC(max_num_of_queues_per_device, -@@ -74,28 +61,7 @@ MODULE_PARM_DESC(send_sigterm, - - static int amdkfd_init_completed; - --int debug_largebar; --module_param(debug_largebar, int, 0444); --MODULE_PARM_DESC(debug_largebar, -- "Debug large-bar flag used to 
simulate large-bar capability on non-large bar machine (0 = disable, 1 = enable)"); -- --int ignore_crat; --module_param(ignore_crat, int, 0444); --MODULE_PARM_DESC(ignore_crat, -- "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)"); -- --int vega10_noretry; --module_param_named(noretry, vega10_noretry, int, 0644); --MODULE_PARM_DESC(noretry, -- "Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)"); -- --int priv_cp_queues; --module_param(priv_cp_queues, int, 0644); --MODULE_PARM_DESC(priv_cp_queues, -- "Enable privileged mode for CP queues (0 = off (default), 1 = on)"); -- --int kgd2kfd_init(unsigned int interface_version, -- const struct kgd2kfd_calls **g2f) -+int kgd2kfd_init(unsigned interface_version, const struct kgd2kfd_calls **g2f) - { - if (!amdkfd_init_completed) - return -EPROBE_DEFER; -@@ -124,7 +90,7 @@ static int __init kfd_module_init(void) - /* Verify module parameters */ - if ((sched_policy < KFD_SCHED_POLICY_HWS) || - (sched_policy > KFD_SCHED_POLICY_NO_HWS)) { -- pr_err("sched_policy has invalid value\n"); -+ pr_err("kfd: sched_policy has invalid value\n"); - return -1; - } - -@@ -132,13 +98,13 @@ static int __init kfd_module_init(void) - if ((max_num_of_queues_per_device < 1) || - (max_num_of_queues_per_device > - KFD_MAX_NUM_OF_QUEUES_PER_DEVICE)) { -- pr_err("max_num_of_queues_per_device must be between 1 to KFD_MAX_NUM_OF_QUEUES_PER_DEVICE\n"); -+ pr_err("kfd: max_num_of_queues_per_device must be between 1 to KFD_MAX_NUM_OF_QUEUES_PER_DEVICE\n"); - return -1; - } - - err = kfd_pasid_init(); - if (err < 0) -- return err; -+ goto err_pasid; - - err = kfd_chardev_init(); - if (err < 0) -@@ -148,16 +114,8 @@ static int __init kfd_module_init(void) - if (err < 0) - goto err_topology; - -- err = kfd_ipc_init(); -- if (err < 0) -- goto err_topology; -- - kfd_process_create_wq(); - -- kfd_init_peer_direct(); -- -- kfd_debugfs_init(); -- - amdkfd_init_completed = 1; - - dev_info(kfd_device, "Initialized module\n"); -@@ -168,6 +126,7 @@ static int __init kfd_module_init(void) - kfd_chardev_exit(); - err_ioctl: - kfd_pasid_exit(); -+err_pasid: - return err; - } - -@@ -175,8 +134,6 @@ static void __exit kfd_module_exit(void) - { - amdkfd_init_completed = 0; - -- kfd_debugfs_fini(); -- kfd_close_peer_direct(); - kfd_process_destroy_wq(); - kfd_topology_shutdown(); - kfd_chardev_exit(); -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c -index 9eb2d54..b1ef136 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c -@@ -23,68 +23,14 @@ - - #include "kfd_priv.h" - --/* Mapping queue priority to pipe priority, indexed by queue priority */ --int pipe_priority_map[] = { -- KFD_PIPE_PRIORITY_CS_LOW, -- KFD_PIPE_PRIORITY_CS_LOW, -- KFD_PIPE_PRIORITY_CS_LOW, -- KFD_PIPE_PRIORITY_CS_LOW, -- KFD_PIPE_PRIORITY_CS_LOW, -- KFD_PIPE_PRIORITY_CS_LOW, -- KFD_PIPE_PRIORITY_CS_LOW, -- KFD_PIPE_PRIORITY_CS_MEDIUM, -- KFD_PIPE_PRIORITY_CS_MEDIUM, -- KFD_PIPE_PRIORITY_CS_MEDIUM, -- KFD_PIPE_PRIORITY_CS_MEDIUM, -- KFD_PIPE_PRIORITY_CS_HIGH, -- KFD_PIPE_PRIORITY_CS_HIGH, -- KFD_PIPE_PRIORITY_CS_HIGH, -- KFD_PIPE_PRIORITY_CS_HIGH, -- KFD_PIPE_PRIORITY_CS_HIGH --}; -- --/* Mapping queue priority to SPI priority, indexed by queue priority -- * SPI priority 2 and 3 are reserved for trap handler context save -- */ --int spi_priority_map[] = { -- KFD_SPI_PRIORITY_EXTRA_LOW, -- KFD_SPI_PRIORITY_EXTRA_LOW, -- 
KFD_SPI_PRIORITY_EXTRA_LOW, -- KFD_SPI_PRIORITY_EXTRA_LOW, -- KFD_SPI_PRIORITY_EXTRA_LOW, -- KFD_SPI_PRIORITY_EXTRA_LOW, -- KFD_SPI_PRIORITY_EXTRA_LOW, -- KFD_SPI_PRIORITY_EXTRA_LOW, -- KFD_SPI_PRIORITY_LOW, -- KFD_SPI_PRIORITY_LOW, -- KFD_SPI_PRIORITY_LOW, -- KFD_SPI_PRIORITY_LOW, -- KFD_SPI_PRIORITY_LOW, -- KFD_SPI_PRIORITY_LOW, -- KFD_SPI_PRIORITY_LOW, -- KFD_SPI_PRIORITY_LOW --}; -- - struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, - struct kfd_dev *dev) - { - switch (dev->device_info->asic_family) { - case CHIP_KAVERI: - return mqd_manager_init_cik(type, dev); -- case CHIP_HAWAII: -- return mqd_manager_init_cik_hawaii(type, dev); - case CHIP_CARRIZO: - return mqd_manager_init_vi(type, dev); -- case CHIP_TONGA: -- case CHIP_FIJI: -- case CHIP_POLARIS10: -- case CHIP_POLARIS11: -- return mqd_manager_init_vi_tonga(type, dev); -- case CHIP_VEGA10: -- case CHIP_RAVEN: -- return mqd_manager_init_v9(type, dev); -- default: -- BUG(); - } - - return NULL; -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h -index dcaeda8..213a71e 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h -@@ -43,9 +43,6 @@ - * - * @is_occupied: Checks if the relevant HQD slot is occupied. - * -- * @get_wave_state: Retrieves context save state and optionally copies the -- * control stack, if kept in the MQD, to the given userspace address. -- * - * @mqd_mutex: Mqd manager mutex. - * - * @dev: The kfd device structure coupled with this module. -@@ -62,8 +59,7 @@ - * per KFD_MQD_TYPE for each device. - * - */ --extern int pipe_priority_map[]; --extern int spi_priority_map[]; -+ - struct mqd_manager { - int (*init_mqd)(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, -@@ -71,8 +67,7 @@ struct mqd_manager { - - int (*load_mqd)(struct mqd_manager *mm, void *mqd, - uint32_t pipe_id, uint32_t queue_id, -- struct queue_properties *p, -- struct mm_struct *mms); -+ uint32_t __user *wptr); - - int (*update_mqd)(struct mqd_manager *mm, void *mqd, - struct queue_properties *q); -@@ -89,15 +84,6 @@ struct mqd_manager { - uint64_t queue_address, uint32_t pipe_id, - uint32_t queue_id); - -- int (*get_wave_state)(struct mqd_manager *mm, void *mqd, -- void __user *ctl_stack, -- u32 *ctl_stack_used_size, -- u32 *save_area_used_size); -- --#if defined(CONFIG_DEBUG_FS) -- int (*debugfs_show_mqd)(struct seq_file *m, void *data); --#endif -- - struct mutex mqd_mutex; - struct kfd_dev *dev; - }; -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c -index 5724d33..6acc431 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c -@@ -30,80 +30,12 @@ - #include "cik_regs.h" - #include "cik_structs.h" - #include "oss/oss_2_4_sh_mask.h" --#include "gca/gfx_7_2_sh_mask.h" - - static inline struct cik_mqd *get_mqd(void *mqd) - { - return (struct cik_mqd *)mqd; - } - --static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) --{ -- return (struct cik_sdma_rlc_registers *)mqd; --} -- --static void update_cu_mask(struct mqd_manager *mm, void *mqd, -- struct queue_properties *q) --{ -- struct cik_mqd *m; -- struct kfd_cu_info cu_info; -- uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ -- uint32_t cu_mask_count = q->cu_mask_count; -- const uint32_t *cu_mask = q->cu_mask; -- int se, cu_per_sh, cu_index, i; -- -- if (cu_mask_count == 0) -- return; 
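-- /* The loop below consumes the flat user CU mask SE by SE: each
-- * shader engine takes as many bits as it has active CUs (counted
-- * from cu_bitmap), so bit i of the request maps to the i-th CU in
-- * SE-major order. */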
-- -- m = get_mqd(mqd); -- m->compute_static_thread_mgmt_se0 = 0; -- m->compute_static_thread_mgmt_se1 = 0; -- m->compute_static_thread_mgmt_se2 = 0; -- m->compute_static_thread_mgmt_se3 = 0; -- -- mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); -- -- /* If # CU mask bits > # CUs, set it to the # of CUs */ -- if (cu_mask_count > cu_info.cu_active_number) -- cu_mask_count = cu_info.cu_active_number; -- -- cu_index = 0; -- for (se = 0; se < cu_info.num_shader_engines; se++) { -- cu_per_sh = 0; -- -- /* Get the number of CUs on this Shader Engine */ -- for (i = 0; i < 4; i++) -- cu_per_sh += hweight32(cu_info.cu_bitmap[se][i]); -- -- se_mask[se] = cu_mask[cu_index / 32] >> (cu_index % 32); -- if ((cu_per_sh + (cu_index % 32)) > 32) -- se_mask[se] |= cu_mask[(cu_index / 32) + 1] -- << (32 - (cu_index % 32)); -- se_mask[se] &= (1 << cu_per_sh) - 1; -- cu_index += cu_per_sh; -- } -- m->compute_static_thread_mgmt_se0 = se_mask[0]; -- m->compute_static_thread_mgmt_se1 = se_mask[1]; -- m->compute_static_thread_mgmt_se2 = se_mask[2]; -- m->compute_static_thread_mgmt_se3 = se_mask[3]; -- -- pr_debug("Update cu mask to %#x %#x %#x %#x\n", -- m->compute_static_thread_mgmt_se0, -- m->compute_static_thread_mgmt_se1, -- m->compute_static_thread_mgmt_se2, -- m->compute_static_thread_mgmt_se3); --} -- --static void set_priority(struct cik_mqd *m, struct queue_properties *q) --{ -- m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; -- m->cp_hqd_queue_priority = q->priority; -- m->compute_pgm_rsrc1 = (m->compute_pgm_rsrc1 & -- (~COMPUTE_PGM_RSRC1__PRIORITY_MASK)) | -- (spi_priority_map[q->priority] << -- COMPUTE_PGM_RSRC1__PRIORITY__SHIFT); --} -- - static int init_mqd(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, - struct queue_properties *q) -@@ -112,6 +44,10 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, - struct cik_mqd *m; - int retval; - -+ BUG_ON(!mm || !q || !mqd); -+ -+ pr_debug("kfd: In func %s\n", __func__); -+ - retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct cik_mqd), - mqd_mem_obj); - -@@ -142,6 +78,10 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, - m->cp_mqd_base_addr_lo = lower_32_bits(addr); - m->cp_mqd_base_addr_hi = upper_32_bits(addr); - -+ m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE | IB_ATC_EN; -+ /* Although WinKFD writes this, I suspect it should not be necessary */ -+ m->cp_hqd_ib_control = IB_ATC_EN | DEFAULT_MIN_IB_AVAIL_SIZE; -+ - m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS | - QUANTUM_DURATION(10); - -@@ -154,17 +94,14 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, - * 1 = CS_MEDIUM (typically between HP3D and GFX - * 2 = CS_HIGH (typically above HP3D) - */ -- set_priority(m, q); -+ m->cp_hqd_pipe_priority = 1; -+ m->cp_hqd_queue_priority = 15; - - if (q->format == KFD_QUEUE_FORMAT_AQL) - m->cp_hqd_iq_rptr = AQL_ENABLE; - -- if (priv_cp_queues) -- m->cp_hqd_pq_control |= -- 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT; -- - *mqd = m; -- if (gart_addr) -+ if (gart_addr != NULL) - *gart_addr = addr; - retval = mm->update_mqd(mm, m, q); - -@@ -178,6 +115,8 @@ static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, - int retval; - struct cik_sdma_rlc_registers *m; - -+ BUG_ON(!mm || !mqd || !mqd_mem_obj); -+ - retval = kfd_gtt_sa_allocate(mm->dev, - sizeof(struct cik_sdma_rlc_registers), - mqd_mem_obj); -@@ -190,7 +129,7 @@ static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, - memset(m, 0, sizeof(struct cik_sdma_rlc_registers)); - - *mqd = m; -- if (gart_addr) 
-+ if (gart_addr != NULL) - *gart_addr = (*mqd_mem_obj)->gpu_addr; - - retval = mm->update_mqd(mm, m, q); -@@ -201,50 +140,43 @@ static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, - static void uninit_mqd(struct mqd_manager *mm, void *mqd, - struct kfd_mem_obj *mqd_mem_obj) - { -+ BUG_ON(!mm || !mqd); - kfd_gtt_sa_free(mm->dev, mqd_mem_obj); - } - - static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, - struct kfd_mem_obj *mqd_mem_obj) - { -+ BUG_ON(!mm || !mqd); - kfd_gtt_sa_free(mm->dev, mqd_mem_obj); - } - - static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, -- uint32_t queue_id, struct queue_properties *p, -- struct mm_struct *mms) -+ uint32_t queue_id, uint32_t __user *wptr) - { -- /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ -- uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0); -- uint32_t wptr_mask = (uint32_t)((p->queue_size / sizeof(uint32_t)) - 1); -- -- return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, -- (uint32_t __user *)p->write_ptr, -- wptr_shift, wptr_mask, mms); -+ return mm->dev->kfd2kgd->hqd_load -+ (mm->dev->kgd, mqd, pipe_id, queue_id, wptr); - } - - static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, -- uint32_t pipe_id, uint32_t queue_id, -- struct queue_properties *p, struct mm_struct *mms) -+ uint32_t pipe_id, uint32_t queue_id, -+ uint32_t __user *wptr) - { -- return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, -- (uint32_t __user *)p->write_ptr, -- mms); -+ return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd); - } - --static int __update_mqd(struct mqd_manager *mm, void *mqd, -- struct queue_properties *q, unsigned int atc_bit) -+static int update_mqd(struct mqd_manager *mm, void *mqd, -+ struct queue_properties *q) - { - struct cik_mqd *m; - -+ BUG_ON(!mm || !q || !mqd); -+ -+ pr_debug("kfd: In func %s\n", __func__); -+ - m = get_mqd(mqd); - m->cp_hqd_pq_control = DEFAULT_RPTR_BLOCK_SIZE | -- DEFAULT_MIN_AVAIL_SIZE; -- m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE; -- if (atc_bit) { -- m->cp_hqd_pq_control |= PQ_ATC_EN; -- m->cp_hqd_ib_control |= IB_ATC_EN; -- } -+ DEFAULT_MIN_AVAIL_SIZE | PQ_ATC_EN; - - /* - * Calculating queue size which is log base 2 of actual queue size -1 -@@ -256,47 +188,37 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, - m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); - m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); - m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); -- m->cp_hqd_pq_doorbell_control = DOORBELL_OFFSET(q->doorbell_off); -+ m->cp_hqd_pq_doorbell_control = DOORBELL_EN | -+ DOORBELL_OFFSET(q->doorbell_off); - - m->cp_hqd_vmid = q->vmid; - -- if (q->format == KFD_QUEUE_FORMAT_AQL) -+ if (q->format == KFD_QUEUE_FORMAT_AQL) { - m->cp_hqd_pq_control |= NO_UPDATE_RPTR; -+ } - -- update_cu_mask(mm, mqd, q); -- set_priority(m, q); -- -+ m->cp_hqd_active = 0; - q->is_active = false; - if (q->queue_size > 0 && - q->queue_address != 0 && -- q->queue_percent > 0 && -- !q->is_evicted) { -+ q->queue_percent > 0) { -+ m->cp_hqd_active = 1; - q->is_active = true; - } - - return 0; - } - --static int update_mqd(struct mqd_manager *mm, void *mqd, -- struct queue_properties *q) --{ -- return __update_mqd(mm, mqd, q, 1); --} -- --static int update_mqd_hawaii(struct mqd_manager *mm, void *mqd, -- struct queue_properties *q) --{ -- return __update_mqd(mm, mqd, q, 0); --} -- - static int update_mqd_sdma(struct mqd_manager *mm, void 
*mqd, - struct queue_properties *q) - { - struct cik_sdma_rlc_registers *m; - -+ BUG_ON(!mm || !mqd || !q); -+ - m = get_sdma_mqd(mqd); -- m->sdma_rlc_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) -- << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | -+ m->sdma_rlc_rb_cntl = ffs(q->queue_size / sizeof(unsigned int)) << -+ SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | - q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | - 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | - 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; -@@ -305,8 +227,9 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, - m->sdma_rlc_rb_base_hi = upper_32_bits(q->queue_address >> 8); - m->sdma_rlc_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); - m->sdma_rlc_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); -- m->sdma_rlc_doorbell = -- q->doorbell_off << SDMA0_RLC0_DOORBELL__OFFSET__SHIFT; -+ m->sdma_rlc_doorbell = q->doorbell_off << -+ SDMA0_RLC0_DOORBELL__OFFSET__SHIFT | -+ 1 << SDMA0_RLC0_DOORBELL__ENABLE__SHIFT; - - m->sdma_rlc_virtual_addr = q->sdma_vm_addr; - -@@ -316,8 +239,10 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, - q->is_active = false; - if (q->queue_size > 0 && - q->queue_address != 0 && -- q->queue_percent > 0 && -- !q->is_evicted) { -+ q->queue_percent > 0) { -+ m->sdma_rlc_rb_cntl |= -+ 1 << SDMA0_RLC0_RB_CNTL__RB_ENABLE__SHIFT; -+ - q->is_active = true; - } - -@@ -329,7 +254,7 @@ static int destroy_mqd(struct mqd_manager *mm, void *mqd, - unsigned int timeout, uint32_t pipe_id, - uint32_t queue_id) - { -- return mm->dev->kfd2kgd->hqd_destroy(mm->dev->kgd, mqd, type, timeout, -+ return mm->dev->kfd2kgd->hqd_destroy(mm->dev->kgd, type, timeout, - pipe_id, queue_id); - } - -@@ -376,6 +301,10 @@ static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, - struct cik_mqd *m; - int retval; - -+ BUG_ON(!mm || !q || !mqd || !mqd_mem_obj); -+ -+ pr_debug("kfd: In func %s\n", __func__); -+ - retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct cik_mqd), - mqd_mem_obj); - -@@ -414,7 +343,8 @@ static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, - * 1 = CS_MEDIUM (typically between HP3D and GFX - * 2 = CS_HIGH (typically above HP3D) - */ -- set_priority(m, q); -+ m->cp_hqd_pipe_priority = 1; -+ m->cp_hqd_queue_priority = 15; - - *mqd = m; - if (gart_addr) -@@ -429,6 +359,10 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, - { - struct cik_mqd *m; - -+ BUG_ON(!mm || !q || !mqd); -+ -+ pr_debug("kfd: In func %s\n", __func__); -+ - m = get_mqd(mqd); - m->cp_hqd_pq_control = DEFAULT_RPTR_BLOCK_SIZE | - DEFAULT_MIN_AVAIL_SIZE | -@@ -445,50 +379,45 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, - m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); - m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); - m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); -- m->cp_hqd_pq_doorbell_control = DOORBELL_OFFSET(q->doorbell_off); -+ m->cp_hqd_pq_doorbell_control = DOORBELL_EN | -+ DOORBELL_OFFSET(q->doorbell_off); - - m->cp_hqd_vmid = q->vmid; - -+ m->cp_hqd_active = 0; - q->is_active = false; - if (q->queue_size > 0 && - q->queue_address != 0 && -- q->queue_percent > 0 && -- !q->is_evicted) { -+ q->queue_percent > 0) { -+ m->cp_hqd_active = 1; - q->is_active = true; - } - -- set_priority(m, q); - return 0; - } - --#if defined(CONFIG_DEBUG_FS) -- --static int debugfs_show_mqd(struct seq_file *m, void *data) -+struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) - { -- 
seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, -- data, sizeof(struct cik_mqd), false); -- return 0; --} -+ struct cik_sdma_rlc_registers *m; - --static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) --{ -- seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, -- data, sizeof(struct cik_sdma_rlc_registers), false); -- return 0; --} -+ BUG_ON(!mqd); - --#endif -+ m = (struct cik_sdma_rlc_registers *)mqd; - -+ return m; -+} - - struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, - struct kfd_dev *dev) - { - struct mqd_manager *mqd; - -- if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) -- return NULL; -+ BUG_ON(!dev); -+ BUG_ON(type >= KFD_MQD_TYPE_MAX); - -- mqd = kzalloc(sizeof(*mqd), GFP_NOIO); -+ pr_debug("kfd: In func %s\n", __func__); -+ -+ mqd = kzalloc(sizeof(struct mqd_manager), GFP_KERNEL); - if (!mqd) - return NULL; - -@@ -503,9 +432,6 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, - mqd->update_mqd = update_mqd; - mqd->destroy_mqd = destroy_mqd; - mqd->is_occupied = is_occupied; --#if defined(CONFIG_DEBUG_FS) -- mqd->debugfs_show_mqd = debugfs_show_mqd; --#endif - break; - case KFD_MQD_TYPE_HIQ: - mqd->init_mqd = init_mqd_hiq; -@@ -514,9 +440,6 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, - mqd->update_mqd = update_mqd_hiq; - mqd->destroy_mqd = destroy_mqd; - mqd->is_occupied = is_occupied; --#if defined(CONFIG_DEBUG_FS) -- mqd->debugfs_show_mqd = debugfs_show_mqd; --#endif - break; - case KFD_MQD_TYPE_SDMA: - mqd->init_mqd = init_mqd_sdma; -@@ -525,9 +448,6 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, - mqd->update_mqd = update_mqd_sdma; - mqd->destroy_mqd = destroy_mqd_sdma; - mqd->is_occupied = is_occupied_sdma; --#if defined(CONFIG_DEBUG_FS) -- mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; --#endif - break; - default: - kfree(mqd); -@@ -537,15 +457,3 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, - return mqd; - } - --struct mqd_manager *mqd_manager_init_cik_hawaii(enum KFD_MQD_TYPE type, -- struct kfd_dev *dev) --{ -- struct mqd_manager *mqd; -- -- mqd = mqd_manager_init_cik(type, dev); -- if (!mqd) -- return NULL; -- if ((type == KFD_MQD_TYPE_CP) || (type == KFD_MQD_TYPE_COMPUTE)) -- mqd->update_mqd = update_mqd_hawaii; -- return mqd; --} -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c -deleted file mode 100644 -index 6c302d2..0000000 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c -+++ /dev/null -@@ -1,528 +0,0 @@ --/* -- * Copyright 2016 Advanced Micro Devices, Inc. -- * -- * Permission is hereby granted, free of charge, to any person obtaining a -- * copy of this software and associated documentation files (the "Software"), -- * to deal in the Software without restriction, including without limitation -- * the rights to use, copy, modify, merge, publish, distribute, sublicense, -- * and/or sell copies of the Software, and to permit persons to whom the -- * Software is furnished to do so, subject to the following conditions: -- * -- * The above copyright notice and this permission notice shall be included in -- * all copies or substantial portions of the Software. -- * -- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -- * OTHER DEALINGS IN THE SOFTWARE. -- * -- */ -- --#include <linux/printk.h> --#include <linux/slab.h> --#include <linux/uaccess.h> --#include "kfd_priv.h" --#include "kfd_mqd_manager.h" --#include "v9_structs.h" --#include "vega10/GC/gc_9_0_offset.h" --#include "vega10/GC/gc_9_0_sh_mask.h" --#include "vega10/SDMA0/sdma0_4_0_sh_mask.h" -- --static inline struct v9_mqd *get_mqd(void *mqd) --{ -- return (struct v9_mqd *)mqd; --} -- --static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) --{ -- return (struct v9_sdma_mqd *)mqd; --} -- --static void update_cu_mask(struct mqd_manager *mm, void *mqd, -- struct queue_properties *q) --{ -- struct v9_mqd *m; -- struct kfd_cu_info cu_info; -- uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ -- uint32_t cu_mask_count = q->cu_mask_count; -- const uint32_t *cu_mask = q->cu_mask; -- int se, cu_per_sh, cu_index, i; -- -- if (cu_mask_count == 0) -- return; -- -- m = get_mqd(mqd); -- m->compute_static_thread_mgmt_se0 = 0; -- m->compute_static_thread_mgmt_se1 = 0; -- m->compute_static_thread_mgmt_se2 = 0; -- m->compute_static_thread_mgmt_se3 = 0; -- -- mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); -- -- /* If # CU mask bits > # CUs, set it to the # of CUs */ -- if (cu_mask_count > cu_info.cu_active_number) -- cu_mask_count = cu_info.cu_active_number; -- -- cu_index = 0; -- for (se = 0; se < cu_info.num_shader_engines; se++) { -- cu_per_sh = 0; -- -- /* Get the number of CUs on this Shader Engine */ -- for (i = 0; i < 4; i++) -- cu_per_sh += hweight32(cu_info.cu_bitmap[se][i]); -- -- se_mask[se] = cu_mask[cu_index / 32] >> (cu_index % 32); -- if ((cu_per_sh + (cu_index % 32)) > 32) -- se_mask[se] |= cu_mask[(cu_index / 32) + 1] -- << (32 - (cu_index % 32)); -- se_mask[se] &= (1 << cu_per_sh) - 1; -- cu_index += cu_per_sh; -- } -- m->compute_static_thread_mgmt_se0 = se_mask[0]; -- m->compute_static_thread_mgmt_se1 = se_mask[1]; -- m->compute_static_thread_mgmt_se2 = se_mask[2]; -- m->compute_static_thread_mgmt_se3 = se_mask[3]; -- -- pr_debug("update cu mask to %#x %#x %#x %#x\n", -- m->compute_static_thread_mgmt_se0, -- m->compute_static_thread_mgmt_se1, -- m->compute_static_thread_mgmt_se2, -- m->compute_static_thread_mgmt_se3); --} -- --static int init_mqd(struct mqd_manager *mm, void **mqd, -- struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, -- struct queue_properties *q) --{ -- int retval; -- uint64_t addr; -- struct v9_mqd *m; -- struct kfd_dev *kfd = mm->dev; -- -- /* From V9, for CWSR, the control stack is located on the next page -- * boundary after the mqd, we will use the gtt allocation function -- * instead of sub-allocation function. 
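-- * That way the MQD occupies the first page and the control stack
-- * starts at the following page boundary, which appears to be the
-- * layout get_wave_state() assumes when it copies the control stack
-- * back to user mode.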
-- */ -- if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) { -- *mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO); -- retval = kfd->kfd2kgd->init_gtt_mem_allocation(kfd->kgd, -- ALIGN(q->ctl_stack_size, PAGE_SIZE) + -- ALIGN(sizeof(struct v9_mqd), PAGE_SIZE), -- &((*mqd_mem_obj)->gtt_mem), -- &((*mqd_mem_obj)->gpu_addr), -- (void *)&((*mqd_mem_obj)->cpu_ptr)); -- } else -- retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd), -- mqd_mem_obj); -- if (retval != 0) -- return -ENOMEM; -- -- m = (struct v9_mqd *) (*mqd_mem_obj)->cpu_ptr; -- addr = (*mqd_mem_obj)->gpu_addr; -- -- memset(m, 0, sizeof(struct v9_mqd)); -- -- m->header = 0xC0310800; -- m->compute_pipelinestat_enable = 1; -- m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; -- m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; -- m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; -- m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; -- -- m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | -- 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; -- -- m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; -- -- m->cp_mqd_base_addr_lo = lower_32_bits(addr); -- m->cp_mqd_base_addr_hi = upper_32_bits(addr); -- -- m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | -- 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | -- 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; -- -- m->cp_hqd_pipe_priority = 1; -- m->cp_hqd_queue_priority = 15; -- -- if (q->format == KFD_QUEUE_FORMAT_AQL) { -- m->cp_hqd_aql_control = -- 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT; -- } -- -- if (q->tba_addr) { -- m->compute_pgm_rsrc2 |= -- (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); -- } -- -- if (mm->dev->cwsr_enabled) { -- m->cp_hqd_persistent_state |= -- (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); -- m->cp_hqd_ctx_save_base_addr_lo = -- lower_32_bits(q->ctx_save_restore_area_address); -- m->cp_hqd_ctx_save_base_addr_hi = -- upper_32_bits(q->ctx_save_restore_area_address); -- m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; -- m->cp_hqd_cntl_stack_size = q->ctl_stack_size; -- m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; -- m->cp_hqd_wg_state_offset = q->ctl_stack_size; -- } -- -- if (priv_cp_queues) -- m->cp_hqd_pq_control |= -- 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT; -- -- *mqd = m; -- if (gart_addr) -- *gart_addr = addr; -- retval = mm->update_mqd(mm, m, q); -- -- return retval; --} -- --static int load_mqd(struct mqd_manager *mm, void *mqd, -- uint32_t pipe_id, uint32_t queue_id, -- struct queue_properties *p, struct mm_struct *mms) --{ -- /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ -- uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 
4 : 0); -- -- return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, -- (uint32_t __user *)p->write_ptr, -- wptr_shift, 0, mms); --} -- --static int update_mqd(struct mqd_manager *mm, void *mqd, -- struct queue_properties *q) --{ -- struct v9_mqd *m; -- -- m = get_mqd(mqd); -- -- m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; -- m->cp_hqd_pq_control |= -- ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; -- pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); -- -- m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); -- m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); -- -- m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); -- m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); -- m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); -- m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); -- -- m->cp_hqd_pq_doorbell_control = -- q->doorbell_off << -- CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; -- pr_debug("cp_hqd_pq_doorbell_control 0x%x\n", -- m->cp_hqd_pq_doorbell_control); -- -- m->cp_hqd_ib_control = 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT; -- -- /* -- * HW does not clamp this field correctly. Maximum EOP queue size -- * is constrained by per-SE EOP done signal count, which is 8-bit. -- * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit -- * more than (EOP entry count - 1) so a queue size of 0x800 dwords -- * is safe, giving a maximum field value of 0xA. -- */ -- m->cp_hqd_eop_control = min(0xA, -- ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1); -- m->cp_hqd_eop_base_addr_lo = -- lower_32_bits(q->eop_ring_buffer_address >> 8); -- m->cp_hqd_eop_base_addr_hi = -- upper_32_bits(q->eop_ring_buffer_address >> 8); -- -- m->cp_hqd_iq_timer = 0; -- -- m->cp_hqd_vmid = q->vmid; -- -- if (q->format == KFD_QUEUE_FORMAT_AQL) { -- m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | -- 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT | -- 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT | -- 1 << CP_HQD_PQ_CONTROL__WPP_CLAMP_EN__SHIFT; -- m->cp_hqd_pq_doorbell_control |= -- 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT; -- } -- if (mm->dev->cwsr_enabled) -- m->cp_hqd_ctx_save_control = 0; -- -- update_cu_mask(mm, mqd, q); -- -- q->is_active = false; -- if (q->queue_size > 0 && -- q->queue_address != 0 && -- q->queue_percent > 0 && -- !q->is_evicted) { -- q->is_active = true; -- } -- -- return 0; --} -- -- --static int destroy_mqd(struct mqd_manager *mm, void *mqd, -- enum kfd_preempt_type type, -- unsigned int timeout, uint32_t pipe_id, -- uint32_t queue_id) --{ -- return mm->dev->kfd2kgd->hqd_destroy -- (mm->dev->kgd, mqd, type, timeout, -- pipe_id, queue_id); --} -- --static void uninit_mqd(struct mqd_manager *mm, void *mqd, -- struct kfd_mem_obj *mqd_mem_obj) --{ -- struct kfd_dev *kfd = mm->dev; -- -- if (mqd_mem_obj->gtt_mem) { -- kfd->kfd2kgd->free_gtt_mem(kfd->kgd, mqd_mem_obj->gtt_mem); -- kfree(mqd_mem_obj); -- } else { -- kfd_gtt_sa_free(mm->dev, mqd_mem_obj); -- } --} -- --static bool is_occupied(struct mqd_manager *mm, void *mqd, -- uint64_t queue_address, uint32_t pipe_id, -- uint32_t queue_id) --{ -- return mm->dev->kfd2kgd->hqd_is_occupied( -- mm->dev->kgd, queue_address, -- pipe_id, queue_id); --} -- --static int get_wave_state(struct mqd_manager *mm, void *mqd, -- void __user *ctl_stack, -- u32 *ctl_stack_used_size, -- u32 *save_area_used_size) --{ -- 
struct v9_mqd *m; -- -- /* Control stack is located one page after MQD. */ -- void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE); -- -- m = get_mqd(mqd); -- -- *ctl_stack_used_size = m->cp_hqd_cntl_stack_size - -- m->cp_hqd_cntl_stack_offset; -- *save_area_used_size = m->cp_hqd_wg_state_offset - -- m->cp_hqd_cntl_stack_size; -- -- if (copy_to_user(ctl_stack, mqd_ctl_stack, m->cp_hqd_cntl_stack_size)) -- return -EFAULT; -- -- return 0; --} -- --static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, -- struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, -- struct queue_properties *q) --{ -- struct v9_mqd *m; -- int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); -- -- if (retval != 0) -- return retval; -- -- m = get_mqd(*mqd); -- -- m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | -- 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; -- -- return retval; --} -- --static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, -- struct queue_properties *q) --{ -- struct v9_mqd *m; -- int retval = update_mqd(mm, mqd, q); -- -- if (retval != 0) -- return retval; -- -- /* TODO: what's the point? update_mqd already does this. */ -- m = get_mqd(mqd); -- m->cp_hqd_vmid = q->vmid; -- return retval; --} -- --static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, -- struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, -- struct queue_properties *q) --{ -- int retval; -- struct v9_sdma_mqd *m; -- -- -- retval = kfd_gtt_sa_allocate(mm->dev, -- sizeof(struct v9_sdma_mqd), -- mqd_mem_obj); -- -- if (retval != 0) -- return -ENOMEM; -- -- m = (struct v9_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; -- -- memset(m, 0, sizeof(struct v9_sdma_mqd)); -- -- *mqd = m; -- if (gart_addr) -- *gart_addr = (*mqd_mem_obj)->gpu_addr; -- -- retval = mm->update_mqd(mm, m, q); -- -- return retval; --} -- --static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, -- struct kfd_mem_obj *mqd_mem_obj) --{ -- kfd_gtt_sa_free(mm->dev, mqd_mem_obj); --} -- --static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, -- uint32_t pipe_id, uint32_t queue_id, -- struct queue_properties *p, struct mm_struct *mms) --{ -- return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, -- (uint32_t __user *)p->write_ptr, -- mms); --} -- --#define SDMA_RLC_DUMMY_DEFAULT 0xf -- --static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, -- struct queue_properties *q) --{ -- struct v9_sdma_mqd *m; -- -- m = get_sdma_mqd(mqd); -- m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) -- << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | -- q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | -- 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | -- 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; -- -- m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); -- m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); -- m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); -- m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); -- m->sdmax_rlcx_doorbell_offset = -- q->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT; -- -- m->sdma_engine_id = q->sdma_engine_id; -- m->sdma_queue_id = q->sdma_queue_id; -- m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT; -- -- q->is_active = false; -- if (q->queue_size > 0 && -- q->queue_address != 0 && -- q->queue_percent > 0 && -- !q->is_evicted) { -- q->is_active = true; -- } -- -- return 0; --} -- --/* -- * * preempt type here is ignored because there is only one way -- * * to preempt sdma queue -- */ 
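-- /* SDMA RLC queues are addressed by their MQD alone; pipe_id and
-- * queue_id below exist only to match the compute callback signature
-- * and are ignored by the hqd_sdma_* kgd hooks. */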
--static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, -- enum kfd_preempt_type type, -- unsigned int timeout, uint32_t pipe_id, -- uint32_t queue_id) --{ -- return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); --} -- --static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, -- uint64_t queue_address, uint32_t pipe_id, -- uint32_t queue_id) --{ -- return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); --} -- --#if defined(CONFIG_DEBUG_FS) -- --static int debugfs_show_mqd(struct seq_file *m, void *data) --{ -- seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, -- data, sizeof(struct v9_mqd), false); -- return 0; --} -- --static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) --{ -- seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, -- data, sizeof(struct v9_sdma_mqd), false); -- return 0; --} -- --#endif -- --struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, -- struct kfd_dev *dev) --{ -- struct mqd_manager *mqd; -- -- if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) -- return NULL; -- -- mqd = kzalloc(sizeof(*mqd), GFP_NOIO); -- if (!mqd) -- return NULL; -- -- mqd->dev = dev; -- -- switch (type) { -- case KFD_MQD_TYPE_CP: -- case KFD_MQD_TYPE_COMPUTE: -- mqd->init_mqd = init_mqd; -- mqd->uninit_mqd = uninit_mqd; -- mqd->load_mqd = load_mqd; -- mqd->update_mqd = update_mqd; -- mqd->destroy_mqd = destroy_mqd; -- mqd->is_occupied = is_occupied; -- mqd->get_wave_state = get_wave_state; --#if defined(CONFIG_DEBUG_FS) -- mqd->debugfs_show_mqd = debugfs_show_mqd; --#endif -- break; -- case KFD_MQD_TYPE_HIQ: -- mqd->init_mqd = init_mqd_hiq; -- mqd->uninit_mqd = uninit_mqd; -- mqd->load_mqd = load_mqd; -- mqd->update_mqd = update_mqd_hiq; -- mqd->destroy_mqd = destroy_mqd; -- mqd->is_occupied = is_occupied; --#if defined(CONFIG_DEBUG_FS) -- mqd->debugfs_show_mqd = debugfs_show_mqd; --#endif -- break; -- case KFD_MQD_TYPE_SDMA: -- mqd->init_mqd = init_mqd_sdma; -- mqd->uninit_mqd = uninit_mqd_sdma; -- mqd->load_mqd = load_mqd_sdma; -- mqd->update_mqd = update_mqd_sdma; -- mqd->destroy_mqd = destroy_mqd_sdma; -- mqd->is_occupied = is_occupied_sdma; --#if defined(CONFIG_DEBUG_FS) -- mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; --#endif -- break; -- default: -- kfree(mqd); -- return NULL; -- } -- -- return mqd; --} -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c -index 5c26e5a..a9b9882 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c -@@ -30,7 +30,6 @@ - #include "vi_structs.h" - #include "gca/gfx_8_0_sh_mask.h" - #include "gca/gfx_8_0_enum.h" --#include "oss/oss_3_0_sh_mask.h" - - #define CP_MQD_CONTROL__PRIV_STATE__SHIFT 0x8 - -@@ -39,73 +38,6 @@ static inline struct vi_mqd *get_mqd(void *mqd) - return (struct vi_mqd *)mqd; - } - --static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) --{ -- return (struct vi_sdma_mqd *)mqd; --} -- --static void update_cu_mask(struct mqd_manager *mm, void *mqd, -- struct queue_properties *q) --{ -- struct vi_mqd *m; -- struct kfd_cu_info cu_info; -- uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ -- uint32_t cu_mask_count = q->cu_mask_count; -- const uint32_t *cu_mask = q->cu_mask; -- int se, cu_per_sh, cu_index, i; -- -- if (cu_mask_count == 0) -- return; -- -- m = get_mqd(mqd); -- m->compute_static_thread_mgmt_se0 = 0; -- m->compute_static_thread_mgmt_se1 = 0; -- m->compute_static_thread_mgmt_se2 = 0; -- m->compute_static_thread_mgmt_se3 = 0; -- -- 
mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); -- -- /* If # CU mask bits > # CUs, set it to the # of CUs */ -- if (cu_mask_count > cu_info.cu_active_number) -- cu_mask_count = cu_info.cu_active_number; -- -- cu_index = 0; -- for (se = 0; se < cu_info.num_shader_engines; se++) { -- cu_per_sh = 0; -- -- /* Get the number of CUs on this Shader Engine */ -- for (i = 0; i < 4; i++) -- cu_per_sh += hweight32(cu_info.cu_bitmap[se][i]); -- -- se_mask[se] = cu_mask[cu_index / 32] >> (cu_index % 32); -- if ((cu_per_sh + (cu_index % 32)) > 32) -- se_mask[se] |= cu_mask[(cu_index / 32) + 1] -- << (32 - (cu_index % 32)); -- se_mask[se] &= (1 << cu_per_sh) - 1; -- cu_index += cu_per_sh; -- } -- m->compute_static_thread_mgmt_se0 = se_mask[0]; -- m->compute_static_thread_mgmt_se1 = se_mask[1]; -- m->compute_static_thread_mgmt_se2 = se_mask[2]; -- m->compute_static_thread_mgmt_se3 = se_mask[3]; -- -- pr_debug("Update cu mask to %#x %#x %#x %#x\n", -- m->compute_static_thread_mgmt_se0, -- m->compute_static_thread_mgmt_se1, -- m->compute_static_thread_mgmt_se2, -- m->compute_static_thread_mgmt_se3); --} -- --static void set_priority(struct vi_mqd *m, struct queue_properties *q) --{ -- m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; -- m->cp_hqd_queue_priority = q->priority; -- m->compute_pgm_rsrc1 = (m->compute_pgm_rsrc1 & -- (~COMPUTE_PGM_RSRC1__PRIORITY_MASK)) | -- (spi_priority_map[q->priority] << -- COMPUTE_PGM_RSRC1__PRIORITY__SHIFT); --} -- - static int init_mqd(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, - struct queue_properties *q) -@@ -144,40 +76,16 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, - 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | - 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; - -- set_priority(m, q); -+ m->cp_hqd_pipe_priority = 1; -+ m->cp_hqd_queue_priority = 15; -+ - m->cp_hqd_eop_rptr = 1 << CP_HQD_EOP_RPTR__INIT_FETCHER__SHIFT; - - if (q->format == KFD_QUEUE_FORMAT_AQL) - m->cp_hqd_iq_rptr = 1; - -- if (q->tba_addr) { -- m->compute_tba_lo = lower_32_bits(q->tba_addr >> 8); -- m->compute_tba_hi = upper_32_bits(q->tba_addr >> 8); -- m->compute_tma_lo = lower_32_bits(q->tma_addr >> 8); -- m->compute_tma_hi = upper_32_bits(q->tma_addr >> 8); -- m->compute_pgm_rsrc2 |= -- (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); -- } -- -- if (mm->dev->cwsr_enabled) { -- m->cp_hqd_persistent_state |= -- (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); -- m->cp_hqd_ctx_save_base_addr_lo = -- lower_32_bits(q->ctx_save_restore_area_address); -- m->cp_hqd_ctx_save_base_addr_hi = -- upper_32_bits(q->ctx_save_restore_area_address); -- m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; -- m->cp_hqd_cntl_stack_size = q->ctl_stack_size; -- m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; -- m->cp_hqd_wg_state_offset = q->ctl_stack_size; -- } -- -- if (priv_cp_queues) -- m->cp_hqd_pq_control |= -- 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT; -- - *mqd = m; -- if (gart_addr) -+ if (gart_addr != NULL) - *gart_addr = addr; - retval = mm->update_mqd(mm, m, q); - -@@ -186,15 +94,10 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, - - static int load_mqd(struct mqd_manager *mm, void *mqd, - uint32_t pipe_id, uint32_t queue_id, -- struct queue_properties *p, struct mm_struct *mms) -+ uint32_t __user *wptr) - { -- /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ -- uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 
4 : 0); -- uint32_t wptr_mask = (uint32_t)((p->queue_size / sizeof(uint32_t)) - 1); -- -- return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, -- (uint32_t __user *)p->write_ptr, -- wptr_shift, wptr_mask, mms); -+ return mm->dev->kfd2kgd->hqd_load -+ (mm->dev->kgd, mqd, pipe_id, queue_id, wptr); - } - - static int __update_mqd(struct mqd_manager *mm, void *mqd, -@@ -203,6 +106,10 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, - { - struct vi_mqd *m; - -+ BUG_ON(!mm || !q || !mqd); -+ -+ pr_debug("kfd: In func %s\n", __func__); -+ - m = get_mqd(mqd); - - m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT | -@@ -210,20 +117,19 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, - mtype << CP_HQD_PQ_CONTROL__MTYPE__SHIFT; - m->cp_hqd_pq_control |= - ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; -- pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); -+ pr_debug("kfd: cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); - - m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); - m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); - - m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); - m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); -- m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); -- m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); - - m->cp_hqd_pq_doorbell_control = -+ 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_EN__SHIFT | - q->doorbell_off << - CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; -- pr_debug("cp_hqd_pq_doorbell_control 0x%x\n", -+ pr_debug("kfd: cp_hqd_pq_doorbell_control 0x%x\n", - m->cp_hqd_pq_doorbell_control); - - m->cp_hqd_eop_control = atc_bit << CP_HQD_EOP_CONTROL__EOP_ATC__SHIFT | -@@ -233,15 +139,8 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, - 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT | - mtype << CP_HQD_IB_CONTROL__MTYPE__SHIFT; - -- /* -- * HW does not clamp this field correctly. Maximum EOP queue size -- * is constrained by per-SE EOP done signal count, which is 8-bit. -- * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit -- * more than (EOP entry count - 1) so a queue size of 0x800 dwords -- * is safe, giving a maximum field value of 0xA. 
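-- * (Working the numbers: 0xFF entries * 8 dwords each = 0x7F8 dwords;
-- * a 0x800-dword queue holds 0x100 entries, of which CP submits at
-- * most 0x100 - 1 = 0xFF, and 0x800 = 2^11 encodes as 11 - 1 = 0xA.)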
-- */ -- m->cp_hqd_eop_control |= min(0xA, -- ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1); -+ m->cp_hqd_eop_control |= -+ ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1; - m->cp_hqd_eop_base_addr_lo = - lower_32_bits(q->eop_ring_buffer_address >> 8); - m->cp_hqd_eop_base_addr_hi = -@@ -256,19 +155,13 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, - m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | - 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT; - } -- if (mm->dev->cwsr_enabled) -- m->cp_hqd_ctx_save_control = -- atc_bit << CP_HQD_CTX_SAVE_CONTROL__ATC__SHIFT | -- mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT; -- -- update_cu_mask(mm, mqd, q); -- set_priority(m, q); - -+ m->cp_hqd_active = 0; - q->is_active = false; - if (q->queue_size > 0 && - q->queue_address != 0 && -- q->queue_percent > 0 && -- !q->is_evicted) { -+ q->queue_percent > 0) { -+ m->cp_hqd_active = 1; - q->is_active = true; - } - -@@ -282,25 +175,20 @@ static int update_mqd(struct mqd_manager *mm, void *mqd, - return __update_mqd(mm, mqd, q, MTYPE_CC, 1); - } - --static int update_mqd_tonga(struct mqd_manager *mm, void *mqd, -- struct queue_properties *q) --{ -- return __update_mqd(mm, mqd, q, MTYPE_UC, 0); --} -- - static int destroy_mqd(struct mqd_manager *mm, void *mqd, - enum kfd_preempt_type type, - unsigned int timeout, uint32_t pipe_id, - uint32_t queue_id) - { - return mm->dev->kfd2kgd->hqd_destroy -- (mm->dev->kgd, mqd, type, timeout, -+ (mm->dev->kgd, type, timeout, - pipe_id, queue_id); - } - - static void uninit_mqd(struct mqd_manager *mm, void *mqd, - struct kfd_mem_obj *mqd_mem_obj) - { -+ BUG_ON(!mm || !mqd); - kfd_gtt_sa_free(mm->dev, mqd_mem_obj); - } - -@@ -313,28 +201,6 @@ static bool is_occupied(struct mqd_manager *mm, void *mqd, - pipe_id, queue_id); - } - --static int get_wave_state(struct mqd_manager *mm, void *mqd, -- void __user *ctl_stack, -- u32 *ctl_stack_used_size, -- u32 *save_area_used_size) --{ -- struct vi_mqd *m; -- -- m = get_mqd(mqd); -- -- *ctl_stack_used_size = m->cp_hqd_cntl_stack_size - -- m->cp_hqd_cntl_stack_offset; -- *save_area_used_size = m->cp_hqd_wg_state_offset - -- m->cp_hqd_cntl_stack_size; -- -- /* Control stack is not copied to user mode for GFXv8 because -- * it's part of the context save area that is already -- * accessible to user mode -- */ -- -- return 0; --} -- - static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, - struct queue_properties *q) -@@ -367,130 +233,17 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, - return retval; - } - --static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, -- struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, -- struct queue_properties *q) --{ -- int retval; -- struct vi_sdma_mqd *m; -- -- -- retval = kfd_gtt_sa_allocate(mm->dev, -- sizeof(struct vi_sdma_mqd), -- mqd_mem_obj); -- -- if (retval != 0) -- return -ENOMEM; -- -- m = (struct vi_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; -- -- memset(m, 0, sizeof(struct vi_sdma_mqd)); -- -- *mqd = m; -- if (gart_addr) -- *gart_addr = (*mqd_mem_obj)->gpu_addr; -- -- retval = mm->update_mqd(mm, m, q); -- -- return retval; --} -- --static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, -- struct kfd_mem_obj *mqd_mem_obj) --{ -- kfd_gtt_sa_free(mm->dev, mqd_mem_obj); --} -- --static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, -- uint32_t pipe_id, uint32_t queue_id, -- struct queue_properties *p, struct mm_struct *mms) --{ -- return 
mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, -- (uint32_t __user *)p->write_ptr, -- mms); --} -- --static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, -- struct queue_properties *q) --{ -- struct vi_sdma_mqd *m; -- -- m = get_sdma_mqd(mqd); -- m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) -- << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | -- q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | -- 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | -- 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; -- -- m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); -- m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); -- m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); -- m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); -- m->sdmax_rlcx_doorbell = -- q->doorbell_off << SDMA0_RLC0_DOORBELL__OFFSET__SHIFT; -- -- m->sdmax_rlcx_virtual_addr = q->sdma_vm_addr; -- -- m->sdma_engine_id = q->sdma_engine_id; -- m->sdma_queue_id = q->sdma_queue_id; -- -- q->is_active = false; -- if (q->queue_size > 0 && -- q->queue_address != 0 && -- q->queue_percent > 0 && -- !q->is_evicted) { -- q->is_active = true; -- } -- -- return 0; --} -- --/* -- * * preempt type here is ignored because there is only one way -- * * to preempt sdma queue -- */ --static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, -- enum kfd_preempt_type type, -- unsigned int timeout, uint32_t pipe_id, -- uint32_t queue_id) --{ -- return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); --} -- --static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, -- uint64_t queue_address, uint32_t pipe_id, -- uint32_t queue_id) --{ -- return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); --} -- --#if defined(CONFIG_DEBUG_FS) -- --static int debugfs_show_mqd(struct seq_file *m, void *data) --{ -- seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, -- data, sizeof(struct vi_mqd), false); -- return 0; --} -- --static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) --{ -- seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, -- data, sizeof(struct vi_sdma_mqd), false); -- return 0; --} -- --#endif -- - struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, - struct kfd_dev *dev) - { - struct mqd_manager *mqd; - -- if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) -- return NULL; -+ BUG_ON(!dev); -+ BUG_ON(type >= KFD_MQD_TYPE_MAX); -+ -+ pr_debug("kfd: In func %s\n", __func__); - -- mqd = kzalloc(sizeof(*mqd), GFP_NOIO); -+ mqd = kzalloc(sizeof(struct mqd_manager), GFP_KERNEL); - if (!mqd) - return NULL; - -@@ -505,10 +258,6 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, - mqd->update_mqd = update_mqd; - mqd->destroy_mqd = destroy_mqd; - mqd->is_occupied = is_occupied; -- mqd->get_wave_state = get_wave_state; --#if defined(CONFIG_DEBUG_FS) -- mqd->debugfs_show_mqd = debugfs_show_mqd; --#endif - break; - case KFD_MQD_TYPE_HIQ: - mqd->init_mqd = init_mqd_hiq; -@@ -517,20 +266,8 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, - mqd->update_mqd = update_mqd_hiq; - mqd->destroy_mqd = destroy_mqd; - mqd->is_occupied = is_occupied; --#if defined(CONFIG_DEBUG_FS) -- mqd->debugfs_show_mqd = debugfs_show_mqd; --#endif - break; - case KFD_MQD_TYPE_SDMA: -- mqd->init_mqd = init_mqd_sdma; -- mqd->uninit_mqd = uninit_mqd_sdma; -- mqd->load_mqd = load_mqd_sdma; -- mqd->update_mqd = update_mqd_sdma; -- mqd->destroy_mqd = destroy_mqd_sdma; -- mqd->is_occupied = is_occupied_sdma; --#if 
defined(CONFIG_DEBUG_FS) -- mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; --#endif - break; - default: - kfree(mqd); -@@ -539,17 +276,3 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, - - return mqd; - } -- --struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, -- struct kfd_dev *dev) --{ -- struct mqd_manager *mqd; -- -- mqd = mqd_manager_init_vi(type, dev); -- if (!mqd) -- return NULL; -- if ((type == KFD_MQD_TYPE_CP) || (type == KFD_MQD_TYPE_COMPUTE)) -- mqd->update_mqd = update_mqd_tonga; -- return mqd; --} -- -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c -index 7cca7b4..7e92921 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c -@@ -26,6 +26,8 @@ - #include "kfd_device_queue_manager.h" - #include "kfd_kernel_queue.h" - #include "kfd_priv.h" -+#include "kfd_pm4_headers.h" -+#include "kfd_pm4_headers_vi.h" - #include "kfd_pm4_opcodes.h" - - static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, -@@ -33,45 +35,47 @@ static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, - { - unsigned int temp = *wptr + increment_bytes / sizeof(uint32_t); - -- WARN((temp * sizeof(uint32_t)) > buffer_size_bytes, -- "Runlist IB overflow"); -+ BUG_ON((temp * sizeof(uint32_t)) > buffer_size_bytes); - *wptr = temp; - } - -+static unsigned int build_pm4_header(unsigned int opcode, size_t packet_size) -+{ -+ union PM4_MES_TYPE_3_HEADER header; -+ -+ header.u32all = 0; -+ header.opcode = opcode; -+ header.count = packet_size/sizeof(uint32_t) - 2; -+ header.type = PM4_TYPE_3; -+ -+ return header.u32all; -+} -+ - static void pm_calc_rlib_size(struct packet_manager *pm, - unsigned int *rlib_size, - bool *over_subscription) - { -- unsigned int process_count, queue_count, compute_queue_count; -+ unsigned int process_count, queue_count; - unsigned int map_queue_size; -- unsigned int max_proc_per_quantum = 1; - -- struct kfd_dev *dev = pm->dqm->dev; -+ BUG_ON(!pm || !rlib_size || !over_subscription); - - process_count = pm->dqm->processes_count; - queue_count = pm->dqm->queue_count; -- compute_queue_count = queue_count - pm->dqm->sdma_queue_count; -- -- /* check if there is over subscription -- * Note: the arbitration between the number of VMIDs and -- * hws_max_conc_proc has been done in -- * kgd2kfd_device_init(). -- */ - -+ /* check if there is over subscription*/ - *over_subscription = false; -- -- if (dev->max_proc_per_quantum > 1) -- max_proc_per_quantum = dev->max_proc_per_quantum; -- -- if ((process_count > max_proc_per_quantum) || -- compute_queue_count > get_queues_num(pm->dqm)) { -+ if ((process_count > 1) || queue_count > get_queues_num(pm->dqm)) { - *over_subscription = true; -- pr_debug("Over subscribed runlist\n"); -+ pr_debug("kfd: over subscribed runlist\n"); - } - -- map_queue_size = pm->pmf->get_map_queues_packet_size(); -+ map_queue_size = -+ (pm->dqm->dev->device_info->asic_family == CHIP_CARRIZO) ? 
-+ sizeof(struct pm4_mes_map_queues) : -+ sizeof(struct pm4_map_queues); - /* calculate run list ib allocation size */ -- *rlib_size = process_count * pm->pmf->get_map_process_packet_size() + -+ *rlib_size = process_count * sizeof(struct pm4_map_process) + - queue_count * map_queue_size; - - /* -@@ -79,9 +83,9 @@ static void pm_calc_rlib_size(struct packet_manager *pm, - * when over subscription - */ - if (*over_subscription) -- *rlib_size += pm->pmf->get_runlist_packet_size(); -+ *rlib_size += sizeof(struct pm4_runlist); - -- pr_debug("runlist ib size %d\n", *rlib_size); -+ pr_debug("kfd: runlist ib size %d\n", *rlib_size); - } - - static int pm_allocate_runlist_ib(struct packet_manager *pm, -@@ -92,19 +96,18 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, - { - int retval; - -- if (WARN_ON(pm->allocated)) -- return -EINVAL; -+ BUG_ON(!pm); -+ BUG_ON(pm->allocated); -+ BUG_ON(is_over_subscription == NULL); - - pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription); - -- mutex_lock(&pm->lock); -- - retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size, - &pm->ib_buffer_obj); - -- if (retval) { -- pr_err("Failed to allocate runlist IB\n"); -- goto out; -+ if (retval != 0) { -+ pr_err("kfd: failed to allocate runlist IB\n"); -+ return retval; - } - - *(void **)rl_buffer = pm->ib_buffer_obj->cpu_ptr; -@@ -112,12 +115,198 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, - - memset(*rl_buffer, 0, *rl_buffer_size); - pm->allocated = true; -- --out: -- mutex_unlock(&pm->lock); - return retval; - } - -+static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer, -+ uint64_t ib, size_t ib_size_in_dwords, bool chain) -+{ -+ struct pm4_runlist *packet; -+ -+ BUG_ON(!pm || !buffer || !ib); -+ -+ packet = (struct pm4_runlist *)buffer; -+ -+ memset(buffer, 0, sizeof(struct pm4_runlist)); -+ packet->header.u32all = build_pm4_header(IT_RUN_LIST, -+ sizeof(struct pm4_runlist)); -+ -+ packet->bitfields4.ib_size = ib_size_in_dwords; -+ packet->bitfields4.chain = chain ? 1 : 0; -+ packet->bitfields4.offload_polling = 0; -+ packet->bitfields4.valid = 1; -+ packet->ordinal2 = lower_32_bits(ib); -+ packet->bitfields3.ib_base_hi = upper_32_bits(ib); -+ -+ return 0; -+} -+ -+static int pm_create_map_process(struct packet_manager *pm, uint32_t *buffer, -+ struct qcm_process_device *qpd) -+{ -+ struct pm4_map_process *packet; -+ struct queue *cur; -+ uint32_t num_queues; -+ -+ BUG_ON(!pm || !buffer || !qpd); -+ -+ packet = (struct pm4_map_process *)buffer; -+ -+ pr_debug("kfd: In func %s\n", __func__); -+ -+ memset(buffer, 0, sizeof(struct pm4_map_process)); -+ -+ packet->header.u32all = build_pm4_header(IT_MAP_PROCESS, -+ sizeof(struct pm4_map_process)); -+ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; -+ packet->bitfields2.process_quantum = 1; -+ packet->bitfields2.pasid = qpd->pqm->process->pasid; -+ packet->bitfields3.page_table_base = qpd->page_table_base; -+ packet->bitfields10.gds_size = qpd->gds_size; -+ packet->bitfields10.num_gws = qpd->num_gws; -+ packet->bitfields10.num_oac = qpd->num_oac; -+ num_queues = 0; -+ list_for_each_entry(cur, &qpd->queues_list, list) -+ num_queues++; -+ packet->bitfields10.num_queues = (qpd->is_debug) ? 
0 : num_queues; -+ -+ packet->sh_mem_config = qpd->sh_mem_config; -+ packet->sh_mem_bases = qpd->sh_mem_bases; -+ packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; -+ packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; -+ -+ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); -+ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); -+ -+ return 0; -+} -+ -+static int pm_create_map_queue_vi(struct packet_manager *pm, uint32_t *buffer, -+ struct queue *q, bool is_static) -+{ -+ struct pm4_mes_map_queues *packet; -+ bool use_static = is_static; -+ -+ BUG_ON(!pm || !buffer || !q); -+ -+ pr_debug("kfd: In func %s\n", __func__); -+ -+ packet = (struct pm4_mes_map_queues *)buffer; -+ memset(buffer, 0, sizeof(struct pm4_map_queues)); -+ -+ packet->header.u32all = build_pm4_header(IT_MAP_QUEUES, -+ sizeof(struct pm4_map_queues)); -+ packet->bitfields2.alloc_format = -+ alloc_format__mes_map_queues__one_per_pipe_vi; -+ packet->bitfields2.num_queues = 1; -+ packet->bitfields2.queue_sel = -+ queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; -+ -+ packet->bitfields2.engine_sel = -+ engine_sel__mes_map_queues__compute_vi; -+ packet->bitfields2.queue_type = -+ queue_type__mes_map_queues__normal_compute_vi; -+ -+ switch (q->properties.type) { -+ case KFD_QUEUE_TYPE_COMPUTE: -+ if (use_static) -+ packet->bitfields2.queue_type = -+ queue_type__mes_map_queues__normal_latency_static_queue_vi; -+ break; -+ case KFD_QUEUE_TYPE_DIQ: -+ packet->bitfields2.queue_type = -+ queue_type__mes_map_queues__debug_interface_queue_vi; -+ break; -+ case KFD_QUEUE_TYPE_SDMA: -+ packet->bitfields2.engine_sel = -+ engine_sel__mes_map_queues__sdma0_vi; -+ use_static = false; /* no static queues under SDMA */ -+ break; -+ default: -+ pr_err("kfd: in %s queue type %d\n", __func__, -+ q->properties.type); -+ BUG(); -+ break; -+ } -+ packet->bitfields3.doorbell_offset = -+ q->properties.doorbell_off; -+ -+ packet->mqd_addr_lo = -+ lower_32_bits(q->gart_mqd_addr); -+ -+ packet->mqd_addr_hi = -+ upper_32_bits(q->gart_mqd_addr); -+ -+ packet->wptr_addr_lo = -+ lower_32_bits((uint64_t)q->properties.write_ptr); -+ -+ packet->wptr_addr_hi = -+ upper_32_bits((uint64_t)q->properties.write_ptr); -+ -+ return 0; -+} -+ -+static int pm_create_map_queue(struct packet_manager *pm, uint32_t *buffer, -+ struct queue *q, bool is_static) -+{ -+ struct pm4_map_queues *packet; -+ bool use_static = is_static; -+ -+ BUG_ON(!pm || !buffer || !q); -+ -+ pr_debug("kfd: In func %s\n", __func__); -+ -+ packet = (struct pm4_map_queues *)buffer; -+ memset(buffer, 0, sizeof(struct pm4_map_queues)); -+ -+ packet->header.u32all = build_pm4_header(IT_MAP_QUEUES, -+ sizeof(struct pm4_map_queues)); -+ packet->bitfields2.alloc_format = -+ alloc_format__mes_map_queues__one_per_pipe; -+ packet->bitfields2.num_queues = 1; -+ packet->bitfields2.queue_sel = -+ queue_sel__mes_map_queues__map_to_hws_determined_queue_slots; -+ -+ packet->bitfields2.vidmem = (q->properties.is_interop) ? 
-+ vidmem__mes_map_queues__uses_video_memory : -+ vidmem__mes_map_queues__uses_no_video_memory; -+ -+ switch (q->properties.type) { -+ case KFD_QUEUE_TYPE_COMPUTE: -+ case KFD_QUEUE_TYPE_DIQ: -+ packet->bitfields2.engine_sel = -+ engine_sel__mes_map_queues__compute; -+ break; -+ case KFD_QUEUE_TYPE_SDMA: -+ packet->bitfields2.engine_sel = -+ engine_sel__mes_map_queues__sdma0; -+ use_static = false; /* no static queues under SDMA */ -+ break; -+ default: -+ BUG(); -+ break; -+ } -+ -+ packet->mes_map_queues_ordinals[0].bitfields3.doorbell_offset = -+ q->properties.doorbell_off; -+ -+ packet->mes_map_queues_ordinals[0].bitfields3.is_static = -+ (use_static) ? 1 : 0; -+ -+ packet->mes_map_queues_ordinals[0].mqd_addr_lo = -+ lower_32_bits(q->gart_mqd_addr); -+ -+ packet->mes_map_queues_ordinals[0].mqd_addr_hi = -+ upper_32_bits(q->gart_mqd_addr); -+ -+ packet->mes_map_queues_ordinals[0].wptr_addr_lo = -+ lower_32_bits((uint64_t)q->properties.write_ptr); -+ -+ packet->mes_map_queues_ordinals[0].wptr_addr_hi = -+ upper_32_bits((uint64_t)q->properties.write_ptr); -+ -+ return 0; -+} - - static int pm_create_runlist_ib(struct packet_manager *pm, - struct list_head *queues, -@@ -133,17 +322,19 @@ static int pm_create_runlist_ib(struct packet_manager *pm, - struct kernel_queue *kq; - bool is_over_subscription; - -+ BUG_ON(!pm || !queues || !rl_size_bytes || !rl_gpu_addr); -+ - rl_wptr = retval = proccesses_mapped = 0; - - retval = pm_allocate_runlist_ib(pm, &rl_buffer, rl_gpu_addr, - &alloc_size_bytes, &is_over_subscription); -- if (retval) -+ if (retval != 0) - return retval; - - *rl_size_bytes = alloc_size_bytes; -- pm->ib_size_bytes = alloc_size_bytes; - -- pr_debug("Building runlist ib process count: %d queues count %d\n", -+ pr_debug("kfd: In func %s\n", __func__); -+ pr_debug("kfd: building runlist ib process count: %d queues count %d\n", - pm->dqm->processes_count, pm->dqm->queue_count); - - /* build the run list ib packet */ -@@ -151,35 +342,42 @@ static int pm_create_runlist_ib(struct packet_manager *pm, - qpd = cur->qpd; - /* build map process packet */ - if (proccesses_mapped >= pm->dqm->processes_count) { -- pr_debug("Not enough space left in runlist IB\n"); -+ pr_debug("kfd: not enough space left in runlist IB\n"); - pm_release_ib(pm); - return -ENOMEM; - } - -- retval = pm->pmf->map_process(pm, &rl_buffer[rl_wptr], qpd); -- if (retval) -+ retval = pm_create_map_process(pm, &rl_buffer[rl_wptr], qpd); -+ if (retval != 0) - return retval; - - proccesses_mapped++; -- inc_wptr(&rl_wptr, pm->pmf->get_map_process_packet_size(), -+ inc_wptr(&rl_wptr, sizeof(struct pm4_map_process), - alloc_size_bytes); - - list_for_each_entry(kq, &qpd->priv_queue_list, list) { - if (!kq->queue->properties.is_active) - continue; - -- pr_debug("static_queue, mapping kernel q %d, is debug status %d\n", -+ pr_debug("kfd: static_queue, mapping kernel q %d, is debug status %d\n", - kq->queue->queue, qpd->is_debug); - -- retval = pm->pmf->map_queues(pm, -+ if (pm->dqm->dev->device_info->asic_family == -+ CHIP_CARRIZO) -+ retval = pm_create_map_queue_vi(pm, -+ &rl_buffer[rl_wptr], -+ kq->queue, -+ qpd->is_debug); -+ else -+ retval = pm_create_map_queue(pm, - &rl_buffer[rl_wptr], - kq->queue, - qpd->is_debug); -- if (retval) -+ if (retval != 0) - return retval; - - inc_wptr(&rl_wptr, -- pm->pmf->get_map_queues_packet_size(), -+ sizeof(struct pm4_map_queues), - alloc_size_bytes); - } - -@@ -187,74 +385,63 @@ static int pm_create_runlist_ib(struct packet_manager *pm, - if (!q->properties.is_active) - continue; - -- 
pr_debug("static_queue, mapping user queue %d, is debug status %d\n", -+ pr_debug("kfd: static_queue, mapping user queue %d, is debug status %d\n", - q->queue, qpd->is_debug); - -- retval = pm->pmf->map_queues(pm, -+ if (pm->dqm->dev->device_info->asic_family == -+ CHIP_CARRIZO) -+ retval = pm_create_map_queue_vi(pm, - &rl_buffer[rl_wptr], - q, - qpd->is_debug); -- if (retval) -+ else -+ retval = pm_create_map_queue(pm, -+ &rl_buffer[rl_wptr], -+ q, -+ qpd->is_debug); -+ -+ if (retval != 0) - return retval; - - inc_wptr(&rl_wptr, -- pm->pmf->get_map_queues_packet_size(), -+ sizeof(struct pm4_map_queues), - alloc_size_bytes); - } - } - -- pr_debug("Finished map process and queues to runlist\n"); -+ pr_debug("kfd: finished map process and queues to runlist\n"); - - if (is_over_subscription) -- retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr], -- *rl_gpu_addr, -- alloc_size_bytes / sizeof(uint32_t), -- true); -+ pm_create_runlist(pm, &rl_buffer[rl_wptr], *rl_gpu_addr, -+ alloc_size_bytes / sizeof(uint32_t), true); - - for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++) - pr_debug("0x%2X ", rl_buffer[i]); - pr_debug("\n"); - -- return retval; -+ return 0; - } - --int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, -- uint16_t fw_ver) -+int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) - { -+ BUG_ON(!dqm); -+ - pm->dqm = dqm; - mutex_init(&pm->lock); - pm->priv_queue = kernel_queue_init(dqm->dev, KFD_QUEUE_TYPE_HIQ); -- if (!pm->priv_queue) { -+ if (pm->priv_queue == NULL) { - mutex_destroy(&pm->lock); - return -ENOMEM; - } - pm->allocated = false; - -- switch (pm->dqm->dev->device_info->asic_family) { -- case CHIP_KAVERI: -- case CHIP_HAWAII: -- kfd_pm_func_init_cik(pm, fw_ver); -- break; -- case CHIP_CARRIZO: -- case CHIP_TONGA: -- case CHIP_FIJI: -- case CHIP_POLARIS10: -- case CHIP_POLARIS11: -- kfd_pm_func_init_vi(pm, fw_ver); -- break; -- case CHIP_VEGA10: -- case CHIP_RAVEN: -- kfd_pm_func_init_v9(pm, fw_ver); -- break; -- default: -- BUG(); -- } -- - return 0; - } - - void pm_uninit(struct packet_manager *pm) - { -+ BUG_ON(!pm); -+ - mutex_destroy(&pm->lock); - kernel_queue_uninit(pm->priv_queue); - } -@@ -262,30 +449,45 @@ void pm_uninit(struct packet_manager *pm) - int pm_send_set_resources(struct packet_manager *pm, - struct scheduling_resources *res) - { -- uint32_t *buffer, size; -- int retval = 0; -+ struct pm4_set_resources *packet; -+ -+ BUG_ON(!pm || !res); -+ -+ pr_debug("kfd: In func %s\n", __func__); - -- size = pm->pmf->get_set_resources_packet_size(); - mutex_lock(&pm->lock); - pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, -- size / sizeof(uint32_t), -- (unsigned int **)&buffer); -- if (!buffer) { -- pr_err("Failed to allocate buffer on kernel queue\n"); -- retval = -ENOMEM; -- goto out; -+ sizeof(*packet) / sizeof(uint32_t), -+ (unsigned int **)&packet); -+ if (packet == NULL) { -+ mutex_unlock(&pm->lock); -+ pr_err("kfd: failed to allocate buffer on kernel queue\n"); -+ return -ENOMEM; - } - -- retval = pm->pmf->set_resources(pm, buffer, res); -- if (!retval) -- pm->priv_queue->ops.submit_packet(pm->priv_queue); -- else -- pm->priv_queue->ops.rollback_packet(pm->priv_queue); -+ memset(packet, 0, sizeof(struct pm4_set_resources)); -+ packet->header.u32all = build_pm4_header(IT_SET_RESOURCES, -+ sizeof(struct pm4_set_resources)); -+ -+ packet->bitfields2.queue_type = -+ queue_type__mes_set_resources__hsa_interface_queue_hiq; -+ packet->bitfields2.vmid_mask = res->vmid_mask; -+ packet->bitfields2.unmap_latency = 
KFD_UNMAP_LATENCY; -+ packet->bitfields7.oac_mask = res->oac_mask; -+ packet->bitfields8.gds_heap_base = res->gds_heap_base; -+ packet->bitfields8.gds_heap_size = res->gds_heap_size; -+ -+ packet->gws_mask_lo = lower_32_bits(res->gws_mask); -+ packet->gws_mask_hi = upper_32_bits(res->gws_mask); -+ -+ packet->queue_mask_lo = lower_32_bits(res->queue_mask); -+ packet->queue_mask_hi = upper_32_bits(res->queue_mask); -+ -+ pm->priv_queue->ops.submit_packet(pm->priv_queue); - --out: - mutex_unlock(&pm->lock); - -- return retval; -+ return 0; - } - - int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) -@@ -295,25 +497,26 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) - size_t rl_ib_size, packet_size_dwords; - int retval; - -+ BUG_ON(!pm || !dqm_queues); -+ - retval = pm_create_runlist_ib(pm, dqm_queues, &rl_gpu_ib_addr, - &rl_ib_size); -- if (retval) -+ if (retval != 0) - goto fail_create_runlist_ib; - -- pr_debug("runlist IB address: 0x%llX\n", rl_gpu_ib_addr); -+ pr_debug("kfd: runlist IB address: 0x%llX\n", rl_gpu_ib_addr); - -- packet_size_dwords = pm->pmf->get_runlist_packet_size() / -- sizeof(uint32_t); -+ packet_size_dwords = sizeof(struct pm4_runlist) / sizeof(uint32_t); - mutex_lock(&pm->lock); - - retval = pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, - packet_size_dwords, &rl_buffer); -- if (retval) -+ if (retval != 0) - goto fail_acquire_packet_buffer; - -- retval = pm->pmf->runlist(pm, rl_buffer, rl_gpu_ib_addr, -- rl_ib_size / sizeof(uint32_t), false); -- if (retval) -+ retval = pm_create_runlist(pm, rl_buffer, rl_gpu_ib_addr, -+ rl_ib_size / sizeof(uint32_t), false); -+ if (retval != 0) - goto fail_create_runlist; - - pm->priv_queue->ops.submit_packet(pm->priv_queue); -@@ -327,72 +530,138 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) - fail_acquire_packet_buffer: - mutex_unlock(&pm->lock); - fail_create_runlist_ib: -- pm_release_ib(pm); -+ if (pm->allocated) -+ pm_release_ib(pm); - return retval; - } - - int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, - uint32_t fence_value) - { -- uint32_t *buffer, size; -- int retval = 0; -+ int retval; -+ struct pm4_query_status *packet; - -- if (WARN_ON(!fence_address)) -- return -EFAULT; -+ BUG_ON(!pm || !fence_address); - -- size = pm->pmf->get_query_status_packet_size(); - mutex_lock(&pm->lock); -- pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, -- size / sizeof(uint32_t), (unsigned int **)&buffer); -- if (!buffer) { -- pr_err("Failed to allocate buffer on kernel queue\n"); -- retval = -ENOMEM; -- goto out; -- } -+ retval = pm->priv_queue->ops.acquire_packet_buffer( -+ pm->priv_queue, -+ sizeof(struct pm4_query_status) / sizeof(uint32_t), -+ (unsigned int **)&packet); -+ if (retval != 0) -+ goto fail_acquire_packet_buffer; - -- retval = pm->pmf->query_status(pm, buffer, fence_address, fence_value); -- if (!retval) -- pm->priv_queue->ops.submit_packet(pm->priv_queue); -- else -- pm->priv_queue->ops.rollback_packet(pm->priv_queue); -+ packet->header.u32all = build_pm4_header(IT_QUERY_STATUS, -+ sizeof(struct pm4_query_status)); -+ -+ packet->bitfields2.context_id = 0; -+ packet->bitfields2.interrupt_sel = -+ interrupt_sel__mes_query_status__completion_status; -+ packet->bitfields2.command = -+ command__mes_query_status__fence_only_after_write_ack; -+ -+ packet->addr_hi = upper_32_bits((uint64_t)fence_address); -+ packet->addr_lo = lower_32_bits((uint64_t)fence_address); -+ packet->data_hi = 
upper_32_bits((uint64_t)fence_value); -+ packet->data_lo = lower_32_bits((uint64_t)fence_value); -+ -+ pm->priv_queue->ops.submit_packet(pm->priv_queue); -+ mutex_unlock(&pm->lock); -+ -+ return 0; - --out: -+fail_acquire_packet_buffer: - mutex_unlock(&pm->lock); - return retval; - } - - int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, -- enum kfd_unmap_queues_filter filter, -+ enum kfd_preempt_type_filter mode, - uint32_t filter_param, bool reset, - unsigned int sdma_engine) - { -- uint32_t *buffer, size; -- int retval = 0; -+ int retval; -+ uint32_t *buffer; -+ struct pm4_unmap_queues *packet; -+ -+ BUG_ON(!pm); - -- size = pm->pmf->get_unmap_queues_packet_size(); - mutex_lock(&pm->lock); -- pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, -- size / sizeof(uint32_t), (unsigned int **)&buffer); -- if (!buffer) { -- pr_err("Failed to allocate buffer on kernel queue\n"); -- retval = -ENOMEM; -- goto out; -+ retval = pm->priv_queue->ops.acquire_packet_buffer( -+ pm->priv_queue, -+ sizeof(struct pm4_unmap_queues) / sizeof(uint32_t), -+ &buffer); -+ if (retval != 0) -+ goto err_acquire_packet_buffer; -+ -+ packet = (struct pm4_unmap_queues *)buffer; -+ memset(buffer, 0, sizeof(struct pm4_unmap_queues)); -+ pr_debug("kfd: static_queue: unmapping queues: mode is %d , reset is %d , type is %d\n", -+ mode, reset, type); -+ packet->header.u32all = build_pm4_header(IT_UNMAP_QUEUES, -+ sizeof(struct pm4_unmap_queues)); -+ switch (type) { -+ case KFD_QUEUE_TYPE_COMPUTE: -+ case KFD_QUEUE_TYPE_DIQ: -+ packet->bitfields2.engine_sel = -+ engine_sel__mes_unmap_queues__compute; -+ break; -+ case KFD_QUEUE_TYPE_SDMA: -+ packet->bitfields2.engine_sel = -+ engine_sel__mes_unmap_queues__sdma0 + sdma_engine; -+ break; -+ default: -+ BUG(); -+ break; - } - -- retval = pm->pmf->unmap_queues(pm, buffer, type, filter, filter_param, -- reset, sdma_engine); -- if (!retval) -- pm->priv_queue->ops.submit_packet(pm->priv_queue); -+ if (reset) -+ packet->bitfields2.action = -+ action__mes_unmap_queues__reset_queues; - else -- pm->priv_queue->ops.rollback_packet(pm->priv_queue); -+ packet->bitfields2.action = -+ action__mes_unmap_queues__preempt_queues; -+ -+ switch (mode) { -+ case KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE: -+ packet->bitfields2.queue_sel = -+ queue_sel__mes_unmap_queues__perform_request_on_specified_queues; -+ packet->bitfields2.num_queues = 1; -+ packet->bitfields3b.doorbell_offset0 = filter_param; -+ break; -+ case KFD_PREEMPT_TYPE_FILTER_BY_PASID: -+ packet->bitfields2.queue_sel = -+ queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; -+ packet->bitfields3a.pasid = filter_param; -+ break; -+ case KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES: -+ packet->bitfields2.queue_sel = -+ queue_sel__mes_unmap_queues__perform_request_on_all_active_queues; -+ break; -+ case KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES: -+ /* in this case, we do not preempt static queues */ -+ packet->bitfields2.queue_sel = -+ queue_sel__mes_unmap_queues__perform_request_on_dynamic_queues_only; -+ break; -+ default: -+ BUG(); -+ break; -+ } -+ -+ pm->priv_queue->ops.submit_packet(pm->priv_queue); - --out: -+ mutex_unlock(&pm->lock); -+ return 0; -+ -+err_acquire_packet_buffer: - mutex_unlock(&pm->lock); - return retval; - } - - void pm_release_ib(struct packet_manager *pm) - { -+ BUG_ON(!pm); -+ - mutex_lock(&pm->lock); - if (pm->allocated) { - kfd_gtt_sa_free(pm->dqm->dev, pm->ib_buffer_obj); -@@ -400,18 +669,3 @@ void pm_release_ib(struct packet_manager *pm) - } - mutex_unlock(&pm->lock); - } -- --int 
pm_debugfs_runlist(struct seq_file *m, void *data)
--{
-- struct packet_manager *pm = data;
--
-- if (!pm->allocated) {
-- seq_puts(m, "  No active runlist\n");
-- return 0;
-- }
--
-- seq_hex_dump(m, "  ", DUMP_PREFIX_OFFSET, 32, 4,
-- pm->ib_buffer_obj->cpu_ptr, pm->ib_size_bytes, false);
--
-- return 0;
--}
-diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c
-index 1e06de0..6cfe7f1 100644
---- a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c
-+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c
-@@ -32,8 +32,7 @@ int kfd_pasid_init(void)
- {
- pasid_limit = KFD_MAX_NUM_OF_PROCESSES;
-
-- pasid_bitmap = kcalloc(BITS_TO_LONGS(pasid_limit), sizeof(long),
-- GFP_KERNEL);
-+ pasid_bitmap = kcalloc(BITS_TO_LONGS(pasid_limit), sizeof(long), GFP_KERNEL);
- if (!pasid_bitmap)
- return -ENOMEM;
-
-@@ -92,6 +91,6 @@ unsigned int kfd_pasid_alloc(void)
-
- void kfd_pasid_free(unsigned int pasid)
- {
-- if (!WARN_ON(pasid == 0 || pasid >= pasid_limit))
-- clear_bit(pasid, pasid_bitmap);
-+ BUG_ON(pasid == 0 || pasid >= pasid_limit);
-+ clear_bit(pasid, pasid_bitmap);
- }
-diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c b/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c
-deleted file mode 100644
-index 543ed83..0000000
---- a/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c
-+++ /dev/null
-@@ -1,513 +0,0 @@
--/*
-- * Copyright 2016 Advanced Micro Devices, Inc.
-- *
-- * Permission is hereby granted, free of charge, to any person obtaining a
-- * copy of this software and associated documentation files (the "Software"),
-- * to deal in the Software without restriction, including without limitation
-- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
-- * and/or sell copies of the Software, and to permit persons to whom the
-- * Software is furnished to do so, subject to the following conditions:
-- *
-- * The above copyright notice and this permission notice shall be included in
-- * all copies or substantial portions of the Software.
-- *
-- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
-- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-- * OTHER DEALINGS IN THE SOFTWARE.
-- */
--
--
--/* NOTE:
-- *
-- * This file contains logic to dynamically detect and enable PeerDirect
-- * support. PeerDirect support is delivered e.g. as part of OFED
-- * from Mellanox. Because we cannot rely on the corresponding OFED
-- * being installed we should:
-- * - copy PeerDirect definitions locally to avoid dependency on
-- *   the corresponding header file
-- * - try to dynamically detect the addresses of the PeerDirect
-- *   function pointers.
-- *
-- * If dynamic detection fails then PeerDirect support should be
-- * enabled using the standard PeerDirect bridge driver from:
-- * https://github.com/RadeonOpenCompute/ROCnRDMA
-- *
-- *
-- * The logic to support PeerDirect relies only on the official public
-- * API, to be as non-intrusive as possible.
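-- * (Editor's sketch of the flow, hedged: kfd_init_peer_direct()
-- * further below resolves ib_register_peer_memory_client() and
-- * ib_unregister_peer_memory_client() via symbol_request(); only if
-- * both symbols resolve does it register amd_mem_client with the IB
-- * core and record the returned handle in ib_reg_handle.)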
-- * -- **/ -- --#include <linux/device.h> --#include <linux/export.h> --#include <linux/pid.h> --#include <linux/err.h> --#include <linux/slab.h> --#include <linux/scatterlist.h> --#include <linux/module.h> -- --#include "kfd_priv.h" -- -- -- --/* ----------------------- PeerDirect interface ------------------------------*/ -- --/* -- * Copyright (c) 2013, Mellanox Technologies. All rights reserved. -- * -- * This software is available to you under a choice of one of two -- * licenses. You may choose to be licensed under the terms of the GNU -- * General Public License (GPL) Version 2, available from the file -- * COPYING in the main directory of this source tree, or the -- * OpenIB.org BSD license below: -- * -- * Redistribution and use in source and binary forms, with or -- * without modification, are permitted provided that the following -- * conditions are met: -- * -- * - Redistributions of source code must retain the above -- * copyright notice, this list of conditions and the following -- * disclaimer. -- * -- * - Redistributions in binary form must reproduce the above -- * copyright notice, this list of conditions and the following -- * disclaimer in the documentation and/or other materials -- * provided with the distribution. -- * -- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -- * SOFTWARE. -- */ --#define IB_PEER_MEMORY_NAME_MAX 64 --#define IB_PEER_MEMORY_VER_MAX 16 -- --struct peer_memory_client { -- char name[IB_PEER_MEMORY_NAME_MAX]; -- char version[IB_PEER_MEMORY_VER_MAX]; -- /* acquire return code: 1-mine, 0-not mine */ -- int (*acquire)(unsigned long addr, size_t size, -- void *peer_mem_private_data, -- char *peer_mem_name, -- void **client_context); -- int (*get_pages)(unsigned long addr, -- size_t size, int write, int force, -- struct sg_table *sg_head, -- void *client_context, void *core_context); -- int (*dma_map)(struct sg_table *sg_head, void *client_context, -- struct device *dma_device, int dmasync, int *nmap); -- int (*dma_unmap)(struct sg_table *sg_head, void *client_context, -- struct device *dma_device); -- void (*put_pages)(struct sg_table *sg_head, void *client_context); -- unsigned long (*get_page_size)(void *client_context); -- void (*release)(void *client_context); -- void* (*get_context_private_data)(u64 peer_id); -- void (*put_context_private_data)(void *context); --}; -- --typedef int (*invalidate_peer_memory)(void *reg_handle, -- void *core_context); -- --void *ib_register_peer_memory_client(struct peer_memory_client *peer_client, -- invalidate_peer_memory *invalidate_callback); --void ib_unregister_peer_memory_client(void *reg_handle); -- -- --/*------------------- PeerDirect bridge driver ------------------------------*/ -- --#define AMD_PEER_BRIDGE_DRIVER_VERSION "1.0" --#define AMD_PEER_BRIDGE_DRIVER_NAME "amdkfd" -- -- --static void* (*pfn_ib_register_peer_memory_client)(struct peer_memory_client -- *peer_client, -- invalidate_peer_memory -- *invalidate_callback); -- --static void (*pfn_ib_unregister_peer_memory_client)(void *reg_handle); -- --static const struct amd_rdma_interface *rdma_interface; -- --static 
invalidate_peer_memory ib_invalidate_callback;
--static void *ib_reg_handle;
--
--struct amd_mem_context {
-- uint64_t va;
-- uint64_t size;
-- struct pid *pid;
--
-- struct amd_p2p_info *p2p_info;
--
-- /* Flag that free callback was called */
-- int free_callback_called;
--
-- /* Context received from PeerDirect call */
-- void *core_context;
--};
--
--
--static void free_callback(void *client_priv)
--{
-- struct amd_mem_context *mem_context =
-- (struct amd_mem_context *)client_priv;
--
-- pr_debug("data 0x%p\n", mem_context);
--
-- if (!mem_context) {
-- pr_warn("Invalid client context\n");
-- return;
-- }
--
-- pr_debug("mem_context->core_context 0x%p\n", mem_context->core_context);
--
-- /* Call back IB stack asking to invalidate memory */
-- (*ib_invalidate_callback) (ib_reg_handle, mem_context->core_context);
--
-- /* amdkfd will free resources when we return from this callback.
-- * Set flag to inform that there is nothing to do on "put_pages", etc.
-- */
-- ACCESS_ONCE(mem_context->free_callback_called) = 1;
--}
--
--
--static int amd_acquire(unsigned long addr, size_t size,
-- void *peer_mem_private_data,
-- char *peer_mem_name, void **client_context)
--{
-- int ret;
-- struct amd_mem_context *mem_context;
-- struct pid *pid;
--
-- /* Get pointer to structure describing current process */
-- pid = get_task_pid(current, PIDTYPE_PID);
--
-- pr_debug("addr:0x%lx,size:0x%x, pid 0x%p\n",
-- addr, (unsigned int)size, pid);
--
-- /* Check if address is handled by AMD GPU driver */
-- ret = rdma_interface->is_gpu_address(addr, pid);
--
-- if (!ret) {
-- pr_debug("Not GPU Address\n");
-- /* This is not a GPU address */
-- return 0;
-- }
--
-- pr_debug("GPU address\n");
--
-- /* Initialize context used for operation with given address */
-- mem_context = kzalloc(sizeof(*mem_context), GFP_KERNEL);
--
-- if (!mem_context)
-- return 0; /* Error case handled as not GPU address */
--
-- mem_context->free_callback_called = 0;
-- mem_context->va = addr;
-- mem_context->size = size;
--
-- /* Save the PID. It is guaranteed that this function is called in
-- * the correct process context, as opposed to the other callbacks.
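-- * (Editor's note, hedged: the later get_pages/dma_map/put_pages
-- * callbacks may be invoked from IB-stack context rather than from
-- * the owning process, which is presumably why the struct pid saved
-- * here, not "current", is what rdma_interface->get_pages() is
-- * handed below.)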
-- */
-- mem_context->pid = pid;
--
-- pr_debug("Client context %p\n", mem_context);
--
-- /* Return pointer to allocated context */
-- *client_context = mem_context;
--
-- /* Return 1 to inform that this address will be handled
-- * by the AMD GPU driver
-- */
-- return 1;
--}
--
--static int amd_get_pages(unsigned long addr, size_t size, int write, int force,
-- struct sg_table *sg_head,
-- void *client_context, void *core_context)
--{
-- int ret;
-- struct amd_mem_context *mem_context =
-- (struct amd_mem_context *)client_context;
--
-- pr_debug("addr:0x%lx,size:0x%x, core_context:%p\n",
-- addr, (unsigned int)size, core_context);
--
-- if (!mem_context) {
-- pr_warn("Invalid client context");
-- return -EINVAL;
-- }
--
-- pr_debug("pid :0x%p\n", mem_context->pid);
--
--
-- if (addr != mem_context->va) {
-- pr_warn("Context address (0x%llx) is not the same\n",
-- mem_context->va);
-- return -EINVAL;
-- }
--
-- if (size != mem_context->size) {
-- pr_warn("Context size (0x%llx) is not the same\n",
-- mem_context->size);
-- return -EINVAL;
-- }
--
-- ret = rdma_interface->get_pages(addr,
-- size,
-- mem_context->pid,
-- &mem_context->p2p_info,
-- free_callback,
-- mem_context);
--
-- if (ret || !mem_context->p2p_info) {
-- pr_err("rdma::get_pages failure: %d\n", ret);
-- return ret;
-- }
--
-- mem_context->core_context = core_context;
--
-- /* Note: At this stage it is OK not to fill sg_table */
-- return 0;
--}
--
--
--static int amd_dma_map(struct sg_table *sg_head, void *client_context,
-- struct device *dma_device, int dmasync, int *nmap)
--{
-- /*
-- * NOTE/TODO:
-- * We could have potentially three cases for real memory
-- * location:
-- * - all memory in local (device) memory
-- * - all memory in the system (RAM)
-- * - memory is spread (s/g) between local and system.
-- *
-- * In the case of all memory in the system we could use the
-- * iommu driver to build DMA addresses, but not in the case
-- * of local memory, because currently the iommu driver doesn't
-- * deal with local/device memory addresses (it requires "struct
-- * page").
-- *
-- * Accordingly, this function assumes that iommu functionality
-- * is disabled, so that sg_table already
-- * contains DMA addresses.
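-- * (Editor's note: the assignment below is a shallow struct copy;
-- * sg_head ends up aliasing the scatterlist built earlier by
-- * rdma_interface->get_pages(), which is also why amd_dma_unmap()
-- * has nothing of its own to release.)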
-- *
-- */
-- struct amd_mem_context *mem_context =
-- (struct amd_mem_context *)client_context;
--
-- pr_debug("Context 0x%p, sg_head 0x%p\n",
-- client_context, sg_head);
--
-- pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n",
-- mem_context->pid,
-- mem_context->va,
-- mem_context->size);
--
-- if (!mem_context->p2p_info) {
-- pr_err("No sg_table was allocated\n");
-- return -EINVAL;
-- }
--
-- /* Copy information about previously allocated sg_table */
-- *sg_head = *mem_context->p2p_info->pages;
--
-- /* Return number of pages */
-- *nmap = mem_context->p2p_info->pages->nents;
--
-- return 0;
--}
--
--static int amd_dma_unmap(struct sg_table *sg_head, void *client_context,
-- struct device *dma_device)
--{
-- struct amd_mem_context *mem_context =
-- (struct amd_mem_context *)client_context;
--
-- pr_debug("Context 0x%p, sg_table 0x%p\n",
-- client_context, sg_head);
--
-- pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n",
-- mem_context->pid,
-- mem_context->va,
-- mem_context->size);
--
-- /* Assume success */
-- return 0;
--}
--static void amd_put_pages(struct sg_table *sg_head, void *client_context)
--{
-- int ret = 0;
-- struct amd_mem_context *mem_context =
-- (struct amd_mem_context *)client_context;
--
-- pr_debug("sg_head %p client_context: 0x%p\n",
-- sg_head, client_context);
-- pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n",
-- mem_context->pid,
-- mem_context->va,
-- mem_context->size);
--
-- pr_debug("mem_context->p2p_info %p\n",
-- mem_context->p2p_info);
--
-- if (ACCESS_ONCE(mem_context->free_callback_called)) {
-- pr_debug("Free callback was called\n");
-- return;
-- }
--
-- if (mem_context->p2p_info) {
-- ret = rdma_interface->put_pages(&mem_context->p2p_info);
-- mem_context->p2p_info = NULL;
--
-- if (ret)
-- pr_err("Failure: %d (callback status %d)\n",
-- ret, mem_context->free_callback_called);
-- } else
-- pr_err("Pointer to p2p info is null\n");
--}
--static unsigned long amd_get_page_size(void *client_context)
--{
-- unsigned long page_size;
-- int result;
-- struct amd_mem_context *mem_context =
-- (struct amd_mem_context *)client_context;
--
-- pr_debug("context: %p\n", client_context);
-- pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n",
-- mem_context->pid,
-- mem_context->va,
-- mem_context->size);
--
--
-- result = rdma_interface->get_page_size(
-- mem_context->va,
-- mem_context->size,
-- mem_context->pid,
-- &page_size);
--
-- if (result) {
-- pr_err("Could not get page size. %d\n", result);
-- /* If we failed to get the page size then we do not know what
-- * to do. Let's return some default value
-- */
-- return PAGE_SIZE;
-- }
--
-- return page_size;
--}
--
--static void amd_release(void *client_context)
--{
-- struct amd_mem_context *mem_context =
-- (struct amd_mem_context *)client_context;
--
-- pr_debug("context: 0x%p\n", client_context);
-- pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n",
-- mem_context->pid,
-- mem_context->va,
-- mem_context->size);
--
-- kfree(mem_context);
--}
--
--
--static struct peer_memory_client amd_mem_client = {
-- .acquire = amd_acquire,
-- .get_pages = amd_get_pages,
-- .dma_map = amd_dma_map,
-- .dma_unmap = amd_dma_unmap,
-- .put_pages = amd_put_pages,
-- .get_page_size = amd_get_page_size,
-- .release = amd_release,
-- .get_context_private_data = NULL,
-- .put_context_private_data = NULL,
--};
--
--/** Initialize PeerDirect interface with RDMA Network stack.
-- *
-- * Because the network stack could potentially be loaded later, we
-- * check for the presence of PeerDirect when an HSA process is created.
If PeerDirect was -- * already initialized we do nothing otherwise try to detect and register. -- */ --void kfd_init_peer_direct(void) --{ -- int result; -- -- if (pfn_ib_unregister_peer_memory_client) { -- pr_debug("PeerDirect support was already initialized\n"); -- return; -- } -- -- pr_debug("Try to initialize PeerDirect support\n"); -- -- pfn_ib_register_peer_memory_client = -- (void *(*)(struct peer_memory_client *, -- invalidate_peer_memory *)) -- symbol_request(ib_register_peer_memory_client); -- -- pfn_ib_unregister_peer_memory_client = (void (*)(void *)) -- symbol_request(ib_unregister_peer_memory_client); -- -- if (!pfn_ib_register_peer_memory_client || -- !pfn_ib_unregister_peer_memory_client) { -- pr_debug("PeerDirect interface was not detected\n"); -- /* Do cleanup */ -- kfd_close_peer_direct(); -- return; -- } -- -- result = amdkfd_query_rdma_interface(&rdma_interface); -- -- if (result < 0) { -- pr_err("Cannot get RDMA Interface (result = %d)\n", result); -- return; -- } -- -- strcpy(amd_mem_client.name, AMD_PEER_BRIDGE_DRIVER_NAME); -- strcpy(amd_mem_client.version, AMD_PEER_BRIDGE_DRIVER_VERSION); -- -- ib_reg_handle = pfn_ib_register_peer_memory_client(&amd_mem_client, -- &ib_invalidate_callback); -- -- if (!ib_reg_handle) { -- pr_err("Cannot register peer memory client\n"); -- /* Do cleanup */ -- kfd_close_peer_direct(); -- return; -- } -- -- pr_info("PeerDirect support was initialized successfully\n"); --} -- --/** -- * Close connection with PeerDirect interface with RDMA Network stack. -- * -- */ --void kfd_close_peer_direct(void) --{ -- if (pfn_ib_unregister_peer_memory_client) { -- if (ib_reg_handle) -- pfn_ib_unregister_peer_memory_client(ib_reg_handle); -- -- symbol_put(ib_unregister_peer_memory_client); -- } -- -- if (pfn_ib_register_peer_memory_client) -- symbol_put(ib_register_peer_memory_client); -- -- -- /* Reset pointers to be safe */ -- pfn_ib_unregister_peer_memory_client = NULL; -- pfn_ib_register_peer_memory_client = NULL; -- ib_reg_handle = NULL; --} -- -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h -index e50f73d..5b393f3 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h -@@ -28,19 +28,112 @@ - #define PM4_MES_HEADER_DEFINED - union PM4_MES_TYPE_3_HEADER { - struct { -- /* reserved */ -- uint32_t reserved1:8; -- /* IT opcode */ -- uint32_t opcode:8; -- /* number of DWORDs - 1 in the information body */ -- uint32_t count:14; -- /* packet identifier. It should be 3 for type 3 packets */ -- uint32_t type:2; -+ uint32_t reserved1:8; /* < reserved */ -+ uint32_t opcode:8; /* < IT opcode */ -+ uint32_t count:14; /* < number of DWORDs - 1 -+ * in the information body. -+ */ -+ uint32_t type:2; /* < packet identifier. 
-+ * It should be 3 for type 3 packets -+ */ - }; - uint32_t u32all; - }; - #endif /* PM4_MES_HEADER_DEFINED */ - -+/* --------------------MES_SET_RESOURCES-------------------- */ -+ -+#ifndef PM4_MES_SET_RESOURCES_DEFINED -+#define PM4_MES_SET_RESOURCES_DEFINED -+enum set_resources_queue_type_enum { -+ queue_type__mes_set_resources__kernel_interface_queue_kiq = 0, -+ queue_type__mes_set_resources__hsa_interface_queue_hiq = 1, -+ queue_type__mes_set_resources__hsa_debug_interface_queue = 4 -+}; -+ -+struct pm4_set_resources { -+ union { -+ union PM4_MES_TYPE_3_HEADER header; /* header */ -+ uint32_t ordinal1; -+ }; -+ -+ union { -+ struct { -+ uint32_t vmid_mask:16; -+ uint32_t unmap_latency:8; -+ uint32_t reserved1:5; -+ enum set_resources_queue_type_enum queue_type:3; -+ } bitfields2; -+ uint32_t ordinal2; -+ }; -+ -+ uint32_t queue_mask_lo; -+ uint32_t queue_mask_hi; -+ uint32_t gws_mask_lo; -+ uint32_t gws_mask_hi; -+ -+ union { -+ struct { -+ uint32_t oac_mask:16; -+ uint32_t reserved2:16; -+ } bitfields7; -+ uint32_t ordinal7; -+ }; -+ -+ union { -+ struct { -+ uint32_t gds_heap_base:6; -+ uint32_t reserved3:5; -+ uint32_t gds_heap_size:6; -+ uint32_t reserved4:15; -+ } bitfields8; -+ uint32_t ordinal8; -+ }; -+ -+}; -+#endif -+ -+/*--------------------MES_RUN_LIST-------------------- */ -+ -+#ifndef PM4_MES_RUN_LIST_DEFINED -+#define PM4_MES_RUN_LIST_DEFINED -+ -+struct pm4_runlist { -+ union { -+ union PM4_MES_TYPE_3_HEADER header; /* header */ -+ uint32_t ordinal1; -+ }; -+ -+ union { -+ struct { -+ uint32_t reserved1:2; -+ uint32_t ib_base_lo:30; -+ } bitfields2; -+ uint32_t ordinal2; -+ }; -+ -+ union { -+ struct { -+ uint32_t ib_base_hi:16; -+ uint32_t reserved2:16; -+ } bitfields3; -+ uint32_t ordinal3; -+ }; -+ -+ union { -+ struct { -+ uint32_t ib_size:20; -+ uint32_t chain:1; -+ uint32_t offload_polling:1; -+ uint32_t reserved3:1; -+ uint32_t valid:1; -+ uint32_t reserved4:8; -+ } bitfields4; -+ uint32_t ordinal4; -+ }; -+ -+}; -+#endif - - /*--------------------MES_MAP_PROCESS-------------------- */ - -@@ -93,58 +186,217 @@ struct pm4_map_process { - }; - #endif - --#ifndef PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH --#define PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH -+/*--------------------MES_MAP_QUEUES--------------------*/ -+ -+#ifndef PM4_MES_MAP_QUEUES_DEFINED -+#define PM4_MES_MAP_QUEUES_DEFINED -+enum map_queues_queue_sel_enum { -+ queue_sel__mes_map_queues__map_to_specified_queue_slots = 0, -+ queue_sel__mes_map_queues__map_to_hws_determined_queue_slots = 1, -+ queue_sel__mes_map_queues__enable_process_queues = 2 -+}; - --struct pm4_map_process_scratch_kv { -+enum map_queues_vidmem_enum { -+ vidmem__mes_map_queues__uses_no_video_memory = 0, -+ vidmem__mes_map_queues__uses_video_memory = 1 -+}; -+ -+enum map_queues_alloc_format_enum { -+ alloc_format__mes_map_queues__one_per_pipe = 0, -+ alloc_format__mes_map_queues__all_on_one_pipe = 1 -+}; -+ -+enum map_queues_engine_sel_enum { -+ engine_sel__mes_map_queues__compute = 0, -+ engine_sel__mes_map_queues__sdma0 = 2, -+ engine_sel__mes_map_queues__sdma1 = 3 -+}; -+ -+struct pm4_map_queues { - union { -- union PM4_MES_TYPE_3_HEADER header; /* header */ -- uint32_t ordinal1; -+ union PM4_MES_TYPE_3_HEADER header; /* header */ -+ uint32_t ordinal1; - }; - - union { - struct { -- uint32_t pasid:16; -- uint32_t reserved1:8; -- uint32_t diq_enable:1; -- uint32_t process_quantum:7; -+ uint32_t reserved1:4; -+ enum map_queues_queue_sel_enum queue_sel:2; -+ uint32_t reserved2:2; -+ uint32_t vmid:4; -+ uint32_t reserved3:4; -+ enum 
map_queues_vidmem_enum vidmem:2; -+ uint32_t reserved4:6; -+ enum map_queues_alloc_format_enum alloc_format:2; -+ enum map_queues_engine_sel_enum engine_sel:3; -+ uint32_t num_queues:3; - } bitfields2; - uint32_t ordinal2; - }; - -+ struct { -+ union { -+ struct { -+ uint32_t is_static:1; -+ uint32_t reserved5:1; -+ uint32_t doorbell_offset:21; -+ uint32_t reserved6:3; -+ uint32_t queue:6; -+ } bitfields3; -+ uint32_t ordinal3; -+ }; -+ -+ uint32_t mqd_addr_lo; -+ uint32_t mqd_addr_hi; -+ uint32_t wptr_addr_lo; -+ uint32_t wptr_addr_hi; -+ -+ } mes_map_queues_ordinals[1]; /* 1..N of these ordinal groups */ -+ -+}; -+#endif -+ -+/*--------------------MES_QUERY_STATUS--------------------*/ -+ -+#ifndef PM4_MES_QUERY_STATUS_DEFINED -+#define PM4_MES_QUERY_STATUS_DEFINED -+enum query_status_interrupt_sel_enum { -+ interrupt_sel__mes_query_status__completion_status = 0, -+ interrupt_sel__mes_query_status__process_status = 1, -+ interrupt_sel__mes_query_status__queue_status = 2 -+}; -+ -+enum query_status_command_enum { -+ command__mes_query_status__interrupt_only = 0, -+ command__mes_query_status__fence_only_immediate = 1, -+ command__mes_query_status__fence_only_after_write_ack = 2, -+ command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3 -+}; -+ -+enum query_status_engine_sel_enum { -+ engine_sel__mes_query_status__compute = 0, -+ engine_sel__mes_query_status__sdma0_queue = 2, -+ engine_sel__mes_query_status__sdma1_queue = 3 -+}; -+ -+struct pm4_query_status { -+ union { -+ union PM4_MES_TYPE_3_HEADER header; /* header */ -+ uint32_t ordinal1; -+ }; -+ - union { - struct { -- uint32_t page_table_base:28; -- uint32_t reserved2:4; -- } bitfields3; -+ uint32_t context_id:28; -+ enum query_status_interrupt_sel_enum interrupt_sel:2; -+ enum query_status_command_enum command:2; -+ } bitfields2; -+ uint32_t ordinal2; -+ }; -+ -+ union { -+ struct { -+ uint32_t pasid:16; -+ uint32_t reserved1:16; -+ } bitfields3a; -+ struct { -+ uint32_t reserved2:2; -+ uint32_t doorbell_offset:21; -+ uint32_t reserved3:3; -+ enum query_status_engine_sel_enum engine_sel:3; -+ uint32_t reserved4:3; -+ } bitfields3b; - uint32_t ordinal3; - }; - -- uint32_t reserved3; -- uint32_t sh_mem_bases; -- uint32_t sh_mem_config; -- uint32_t sh_mem_ape1_base; -- uint32_t sh_mem_ape1_limit; -- uint32_t sh_hidden_private_base_vmid; -- uint32_t reserved4; -- uint32_t reserved5; -- uint32_t gds_addr_lo; -- uint32_t gds_addr_hi; -+ uint32_t addr_lo; -+ uint32_t addr_hi; -+ uint32_t data_lo; -+ uint32_t data_hi; -+}; -+#endif -+ -+/*--------------------MES_UNMAP_QUEUES--------------------*/ -+ -+#ifndef PM4_MES_UNMAP_QUEUES_DEFINED -+#define PM4_MES_UNMAP_QUEUES_DEFINED -+enum unmap_queues_action_enum { -+ action__mes_unmap_queues__preempt_queues = 0, -+ action__mes_unmap_queues__reset_queues = 1, -+ action__mes_unmap_queues__disable_process_queues = 2 -+}; -+ -+enum unmap_queues_queue_sel_enum { -+ queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0, -+ queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1, -+ queue_sel__mes_unmap_queues__perform_request_on_all_active_queues = 2, -+ queue_sel__mes_unmap_queues__perform_request_on_dynamic_queues_only = 3 -+}; -+ -+enum unmap_queues_engine_sel_enum { -+ engine_sel__mes_unmap_queues__compute = 0, -+ engine_sel__mes_unmap_queues__sdma0 = 2, -+ engine_sel__mes_unmap_queues__sdma1 = 3 -+}; -+ -+struct pm4_unmap_queues { -+ union { -+ union PM4_MES_TYPE_3_HEADER header; /* header */ -+ uint32_t ordinal1; -+ }; -+ -+ union { -+ struct { -+ enum 
unmap_queues_action_enum action:2; -+ uint32_t reserved1:2; -+ enum unmap_queues_queue_sel_enum queue_sel:2; -+ uint32_t reserved2:20; -+ enum unmap_queues_engine_sel_enum engine_sel:3; -+ uint32_t num_queues:3; -+ } bitfields2; -+ uint32_t ordinal2; -+ }; -+ -+ union { -+ struct { -+ uint32_t pasid:16; -+ uint32_t reserved3:16; -+ } bitfields3a; -+ struct { -+ uint32_t reserved4:2; -+ uint32_t doorbell_offset0:21; -+ uint32_t reserved5:9; -+ } bitfields3b; -+ uint32_t ordinal3; -+ }; - - union { - struct { -- uint32_t num_gws:6; - uint32_t reserved6:2; -- uint32_t num_oac:4; -- uint32_t reserved7:4; -- uint32_t gds_size:6; -- uint32_t num_queues:10; -- } bitfields14; -- uint32_t ordinal14; -+ uint32_t doorbell_offset1:21; -+ uint32_t reserved7:9; -+ } bitfields4; -+ uint32_t ordinal4; -+ }; -+ -+ union { -+ struct { -+ uint32_t reserved8:2; -+ uint32_t doorbell_offset2:21; -+ uint32_t reserved9:9; -+ } bitfields5; -+ uint32_t ordinal5; -+ }; -+ -+ union { -+ struct { -+ uint32_t reserved10:2; -+ uint32_t doorbell_offset3:21; -+ uint32_t reserved11:9; -+ } bitfields6; -+ uint32_t ordinal6; - }; - -- uint32_t completion_signal_lo32; --uint32_t completion_signal_hi32; - }; - #endif - -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h -deleted file mode 100644 -index ddad9be..0000000 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h -+++ /dev/null -@@ -1,583 +0,0 @@ --/* -- * Copyright 2016 Advanced Micro Devices, Inc. -- * -- * Permission is hereby granted, free of charge, to any person obtaining a -- * copy of this software and associated documentation files (the "Software"), -- * to deal in the Software without restriction, including without limitation -- * the rights to use, copy, modify, merge, publish, distribute, sublicense, -- * and/or sell copies of the Software, and to permit persons to whom the -- * Software is furnished to do so, subject to the following conditions: -- * -- * The above copyright notice and this permission notice shall be included in -- * all copies or substantial portions of the Software. -- * -- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -- * OTHER DEALINGS IN THE SOFTWARE. -- * -- */ -- --#ifndef F32_MES_PM4_PACKETS_H --#define F32_MES_PM4_PACKETS_H -- --#ifndef PM4_MES_HEADER_DEFINED --#define PM4_MES_HEADER_DEFINED --union PM4_MES_TYPE_3_HEADER { -- struct { -- uint32_t reserved1 : 8; /* < reserved */ -- uint32_t opcode : 8; /* < IT opcode */ -- uint32_t count : 14;/* < number of DWORDs - 1 in the -- * information body. -- */ -- uint32_t type : 2; /* < packet identifier. 
-- * It should be 3 for type 3 packets -- */ -- }; -- uint32_t u32All; --}; --#endif /* PM4_MES_HEADER_DEFINED */ -- --/*--------------------MES_SET_RESOURCES--------------------*/ -- --#ifndef PM4_MES_SET_RESOURCES_DEFINED --#define PM4_MES_SET_RESOURCES_DEFINED --enum mes_set_resources_queue_type_enum { -- queue_type__mes_set_resources__kernel_interface_queue_kiq = 0, -- queue_type__mes_set_resources__hsa_interface_queue_hiq = 1, -- queue_type__mes_set_resources__hsa_debug_interface_queue = 4 --}; -- -- --struct pm4_mes_set_resources { -- union { -- union PM4_MES_TYPE_3_HEADER header; /* header */ -- uint32_t ordinal1; -- }; -- -- union { -- struct { -- uint32_t vmid_mask:16; -- uint32_t unmap_latency:8; -- uint32_t reserved1:5; -- enum mes_set_resources_queue_type_enum queue_type:3; -- } bitfields2; -- uint32_t ordinal2; -- }; -- -- uint32_t queue_mask_lo; -- uint32_t queue_mask_hi; -- uint32_t gws_mask_lo; -- uint32_t gws_mask_hi; -- -- union { -- struct { -- uint32_t oac_mask:16; -- uint32_t reserved2:16; -- } bitfields7; -- uint32_t ordinal7; -- }; -- -- union { -- struct { -- uint32_t gds_heap_base:6; -- uint32_t reserved3:5; -- uint32_t gds_heap_size:6; -- uint32_t reserved4:15; -- } bitfields8; -- uint32_t ordinal8; -- }; -- --}; --#endif -- --/*--------------------MES_RUN_LIST--------------------*/ -- --#ifndef PM4_MES_RUN_LIST_DEFINED --#define PM4_MES_RUN_LIST_DEFINED -- --struct pm4_mes_runlist { -- union { -- union PM4_MES_TYPE_3_HEADER header; /* header */ -- uint32_t ordinal1; -- }; -- -- union { -- struct { -- uint32_t reserved1:2; -- uint32_t ib_base_lo:30; -- } bitfields2; -- uint32_t ordinal2; -- }; -- -- uint32_t ib_base_hi; -- -- union { -- struct { -- uint32_t ib_size:20; -- uint32_t chain:1; -- uint32_t offload_polling:1; -- uint32_t reserved2:1; -- uint32_t valid:1; -- uint32_t process_cnt:4; -- uint32_t reserved3:4; -- } bitfields4; -- uint32_t ordinal4; -- }; -- --}; --#endif -- --/*--------------------MES_MAP_PROCESS--------------------*/ -- --#ifndef PM4_MES_MAP_PROCESS_DEFINED --#define PM4_MES_MAP_PROCESS_DEFINED -- --struct pm4_mes_map_process { -- union { -- union PM4_MES_TYPE_3_HEADER header; /* header */ -- uint32_t ordinal1; -- }; -- -- union { -- struct { -- uint32_t pasid:16; -- uint32_t reserved1:8; -- uint32_t diq_enable:1; -- uint32_t process_quantum:7; -- } bitfields2; -- uint32_t ordinal2; -- }; -- -- uint32_t vm_context_page_table_base_addr_lo32; -- -- uint32_t vm_context_page_table_base_addr_hi32; -- -- uint32_t sh_mem_bases; -- -- uint32_t sh_mem_config; -- -- uint32_t sq_shader_tba_lo; -- -- uint32_t sq_shader_tba_hi; -- -- uint32_t sq_shader_tma_lo; -- -- uint32_t sq_shader_tma_hi; -- -- uint32_t reserved6; -- -- uint32_t gds_addr_lo; -- -- uint32_t gds_addr_hi; -- -- union { -- struct { -- uint32_t num_gws:6; -- uint32_t reserved7:1; -- uint32_t sdma_enable:1; -- uint32_t num_oac:4; -- uint32_t reserved8:4; -- uint32_t gds_size:6; -- uint32_t num_queues:10; -- } bitfields14; -- uint32_t ordinal14; -- }; -- -- uint32_t completion_signal_lo; -- -- uint32_t completion_signal_hi; -- --}; -- --#endif -- --/*--------------------MES_MAP_PROCESS_VM--------------------*/ -- --#ifndef PM4_MES_MAP_PROCESS_VM_DEFINED --#define PM4_MES_MAP_PROCESS_VM_DEFINED -- --struct PM4_MES_MAP_PROCESS_VM { -- union { -- union PM4_MES_TYPE_3_HEADER header; /* header */ -- uint32_t ordinal1; -- }; -- -- uint32_t reserved1; -- -- uint32_t vm_context_cntl; -- -- uint32_t reserved2; -- -- uint32_t vm_context_page_table_end_addr_lo32; -- -- uint32_t 
vm_context_page_table_end_addr_hi32; -- -- uint32_t vm_context_page_table_start_addr_lo32; -- -- uint32_t vm_context_page_table_start_addr_hi32; -- -- uint32_t reserved3; -- -- uint32_t reserved4; -- -- uint32_t reserved5; -- -- uint32_t reserved6; -- -- uint32_t reserved7; -- -- uint32_t reserved8; -- -- uint32_t completion_signal_lo32; -- -- uint32_t completion_signal_hi32; -- --}; --#endif -- --/*--------------------MES_MAP_QUEUES--------------------*/ -- --#ifndef PM4_MES_MAP_QUEUES_VI_DEFINED --#define PM4_MES_MAP_QUEUES_VI_DEFINED --enum mes_map_queues_queue_sel_enum { -- queue_sel__mes_map_queues__map_to_specified_queue_slots_vi = 0, --queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi = 1 --}; -- --enum mes_map_queues_queue_type_enum { -- queue_type__mes_map_queues__normal_compute_vi = 0, -- queue_type__mes_map_queues__debug_interface_queue_vi = 1, -- queue_type__mes_map_queues__normal_latency_static_queue_vi = 2, --queue_type__mes_map_queues__low_latency_static_queue_vi = 3 --}; -- --enum mes_map_queues_alloc_format_enum { -- alloc_format__mes_map_queues__one_per_pipe_vi = 0, --alloc_format__mes_map_queues__all_on_one_pipe_vi = 1 --}; -- --enum mes_map_queues_engine_sel_enum { -- engine_sel__mes_map_queues__compute_vi = 0, -- engine_sel__mes_map_queues__sdma0_vi = 2, -- engine_sel__mes_map_queues__sdma1_vi = 3 --}; -- -- --struct pm4_mes_map_queues { -- union { -- union PM4_MES_TYPE_3_HEADER header; /* header */ -- uint32_t ordinal1; -- }; -- -- union { -- struct { -- uint32_t reserved1:4; -- enum mes_map_queues_queue_sel_enum queue_sel:2; -- uint32_t reserved2:15; -- enum mes_map_queues_queue_type_enum queue_type:3; -- enum mes_map_queues_alloc_format_enum alloc_format:2; -- enum mes_map_queues_engine_sel_enum engine_sel:3; -- uint32_t num_queues:3; -- } bitfields2; -- uint32_t ordinal2; -- }; -- -- union { -- struct { -- uint32_t reserved3:1; -- uint32_t check_disable:1; -- uint32_t doorbell_offset:26; -- uint32_t reserved4:4; -- } bitfields3; -- uint32_t ordinal3; -- }; -- -- uint32_t mqd_addr_lo; -- uint32_t mqd_addr_hi; -- uint32_t wptr_addr_lo; -- uint32_t wptr_addr_hi; --}; --#endif -- --/*--------------------MES_QUERY_STATUS--------------------*/ -- --#ifndef PM4_MES_QUERY_STATUS_DEFINED --#define PM4_MES_QUERY_STATUS_DEFINED --enum mes_query_status_interrupt_sel_enum { -- interrupt_sel__mes_query_status__completion_status = 0, -- interrupt_sel__mes_query_status__process_status = 1, -- interrupt_sel__mes_query_status__queue_status = 2 --}; -- --enum mes_query_status_command_enum { -- command__mes_query_status__interrupt_only = 0, -- command__mes_query_status__fence_only_immediate = 1, -- command__mes_query_status__fence_only_after_write_ack = 2, -- command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3 --}; -- --enum mes_query_status_engine_sel_enum { -- engine_sel__mes_query_status__compute = 0, -- engine_sel__mes_query_status__sdma0_queue = 2, -- engine_sel__mes_query_status__sdma1_queue = 3 --}; -- --struct pm4_mes_query_status { -- union { -- union PM4_MES_TYPE_3_HEADER header; /* header */ -- uint32_t ordinal1; -- }; -- -- union { -- struct { -- uint32_t context_id:28; -- enum mes_query_status_interrupt_sel_enum interrupt_sel:2; -- enum mes_query_status_command_enum command:2; -- } bitfields2; -- uint32_t ordinal2; -- }; -- -- union { -- struct { -- uint32_t pasid:16; -- uint32_t reserved1:16; -- } bitfields3a; -- struct { -- uint32_t reserved2:2; -- uint32_t doorbell_offset:26; -- enum mes_query_status_engine_sel_enum engine_sel:3; -- 
uint32_t reserved3:1; -- } bitfields3b; -- uint32_t ordinal3; -- }; -- -- uint32_t addr_lo; -- uint32_t addr_hi; -- uint32_t data_lo; -- uint32_t data_hi; --}; --#endif -- --/*--------------------MES_UNMAP_QUEUES--------------------*/ -- --#ifndef PM4_MES_UNMAP_QUEUES_DEFINED --#define PM4_MES_UNMAP_QUEUES_DEFINED --enum mes_unmap_queues_action_enum { -- action__mes_unmap_queues__preempt_queues = 0, -- action__mes_unmap_queues__reset_queues = 1, -- action__mes_unmap_queues__disable_process_queues = 2, -- action__mes_unmap_queues__reserved = 3 --}; -- --enum mes_unmap_queues_queue_sel_enum { -- queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0, -- queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1, -- queue_sel__mes_unmap_queues__unmap_all_queues = 2, -- queue_sel__mes_unmap_queues__unmap_all_non_static_queues = 3 --}; -- --enum mes_unmap_queues_engine_sel_enum { -- engine_sel__mes_unmap_queues__compute = 0, -- engine_sel__mes_unmap_queues__sdma0 = 2, -- engine_sel__mes_unmap_queues__sdmal = 3 --}; -- --struct pm4_mes_unmap_queues { -- union { -- union PM4_MES_TYPE_3_HEADER header; /* header */ -- uint32_t ordinal1; -- }; -- -- union { -- struct { -- enum mes_unmap_queues_action_enum action:2; -- uint32_t reserved1:2; -- enum mes_unmap_queues_queue_sel_enum queue_sel:2; -- uint32_t reserved2:20; -- enum mes_unmap_queues_engine_sel_enum engine_sel:3; -- uint32_t num_queues:3; -- } bitfields2; -- uint32_t ordinal2; -- }; -- -- union { -- struct { -- uint32_t pasid:16; -- uint32_t reserved3:16; -- } bitfields3a; -- struct { -- uint32_t reserved4:2; -- uint32_t doorbell_offset0:26; -- int32_t reserved5:4; -- } bitfields3b; -- uint32_t ordinal3; -- }; -- -- union { -- struct { -- uint32_t reserved6:2; -- uint32_t doorbell_offset1:26; -- uint32_t reserved7:4; -- } bitfields4; -- uint32_t ordinal4; -- }; -- -- union { -- struct { -- uint32_t reserved8:2; -- uint32_t doorbell_offset2:26; -- uint32_t reserved9:4; -- } bitfields5; -- uint32_t ordinal5; -- }; -- -- union { -- struct { -- uint32_t reserved10:2; -- uint32_t doorbell_offset3:26; -- uint32_t reserved11:4; -- } bitfields6; -- uint32_t ordinal6; -- }; --}; --#endif -- --#ifndef PM4_MEC_RELEASE_MEM_DEFINED --#define PM4_MEC_RELEASE_MEM_DEFINED -- --enum mec_release_mem_event_index_enum { -- event_index__mec_release_mem__end_of_pipe = 5, -- event_index__mec_release_mem__shader_done = 6 --}; -- --enum mec_release_mem_cache_policy_enum { -- cache_policy__mec_release_mem__lru = 0, -- cache_policy__mec_release_mem__stream = 1 --}; -- --enum mec_release_mem_pq_exe_status_enum { -- pq_exe_status__mec_release_mem__default = 0, -- pq_exe_status__mec_release_mem__phase_update = 1 --}; -- --enum mec_release_mem_dst_sel_enum { -- dst_sel__mec_release_mem__memory_controller = 0, -- dst_sel__mec_release_mem__tc_l2 = 1, -- dst_sel__mec_release_mem__queue_write_pointer_register = 2, -- dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit = 3 --}; -- --enum mec_release_mem_int_sel_enum { -- int_sel__mec_release_mem__none = 0, -- int_sel__mec_release_mem__send_interrupt_only = 1, -- int_sel__mec_release_mem__send_interrupt_after_write_confirm = 2, -- int_sel__mec_release_mem__send_data_after_write_confirm = 3, -- int_sel__mec_release_mem__unconditionally_send_int_ctxid = 4, -- int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare = 5, -- int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare = 6 --}; -- --enum mec_release_mem_data_sel_enum { -- 
data_sel__mec_release_mem__none = 0, -- data_sel__mec_release_mem__send_32_bit_low = 1, -- data_sel__mec_release_mem__send_64_bit_data = 2, -- data_sel__mec_release_mem__send_gpu_clock_counter = 3, -- data_sel__mec_release_mem__send_cp_perfcounter_hi_lo = 4, -- data_sel__mec_release_mem__store_gds_data_to_memory = 5 --}; -- --struct pm4_mec_release_mem { -- union { -- union PM4_MES_TYPE_3_HEADER header; /*header */ -- unsigned int ordinal1; -- }; -- -- union { -- struct { -- unsigned int event_type:6; -- unsigned int reserved1:2; -- enum mec_release_mem_event_index_enum event_index:4; -- unsigned int tcl1_vol_action_ena:1; -- unsigned int tc_vol_action_ena:1; -- unsigned int reserved2:1; -- unsigned int tc_wb_action_ena:1; -- unsigned int tcl1_action_ena:1; -- unsigned int tc_action_ena:1; -- uint32_t reserved3:1; -- uint32_t tc_nc_action_ena:1; -- uint32_t tc_wc_action_ena:1; -- uint32_t tc_md_action_ena:1; -- uint32_t reserved4:3; -- enum mec_release_mem_cache_policy_enum cache_policy:2; -- uint32_t reserved5:2; -- enum mec_release_mem_pq_exe_status_enum pq_exe_status:1; -- uint32_t reserved6:2; -- } bitfields2; -- unsigned int ordinal2; -- }; -- -- union { -- struct { -- uint32_t reserved7:16; -- enum mec_release_mem_dst_sel_enum dst_sel:2; -- uint32_t reserved8:6; -- enum mec_release_mem_int_sel_enum int_sel:3; -- uint32_t reserved9:2; -- enum mec_release_mem_data_sel_enum data_sel:3; -- } bitfields3; -- unsigned int ordinal3; -- }; -- -- union { -- struct { -- uint32_t reserved10:2; -- unsigned int address_lo_32b:30; -- } bitfields4; -- struct { -- uint32_t reserved11:3; -- uint32_t address_lo_64b:29; -- } bitfields4b; -- uint32_t reserved12; -- unsigned int ordinal4; -- }; -- -- union { -- uint32_t address_hi; -- uint32_t reserved13; -- uint32_t ordinal5; -- }; -- -- union { -- uint32_t data_lo; -- uint32_t cmp_data_lo; -- struct { -- uint32_t dw_offset:16; -- uint32_t num_dwords:16; -- } bitfields6c; -- uint32_t reserved14; -- uint32_t ordinal6; -- }; -- -- union { -- uint32_t data_hi; -- uint32_t cmp_data_hi; -- uint32_t reserved15; -- uint32_t reserved16; -- uint32_t ordinal7; -- }; -- -- uint32_t int_ctxid; -- --}; -- --#endif -- --enum { -- CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 --}; --#endif -- -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h -index 0b314a8..a0ff348 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h -@@ -77,6 +77,103 @@ struct pm4__indirect_buffer_pasid { - - #endif - -+/*--------------------_RELEASE_MEM-------------------- */ -+ -+#ifndef _PM4__RELEASE_MEM_DEFINED -+#define _PM4__RELEASE_MEM_DEFINED -+enum _RELEASE_MEM_event_index_enum { -+ event_index___release_mem__end_of_pipe = 5, -+ event_index___release_mem__shader_done = 6 -+}; -+ -+enum _RELEASE_MEM_cache_policy_enum { -+ cache_policy___release_mem__lru = 0, -+ cache_policy___release_mem__stream = 1, -+ cache_policy___release_mem__bypass = 2 -+}; -+ -+enum _RELEASE_MEM_dst_sel_enum { -+ dst_sel___release_mem__memory_controller = 0, -+ dst_sel___release_mem__tc_l2 = 1, -+ dst_sel___release_mem__queue_write_pointer_register = 2, -+ dst_sel___release_mem__queue_write_pointer_poll_mask_bit = 3 -+}; -+ -+enum _RELEASE_MEM_int_sel_enum { -+ int_sel___release_mem__none = 0, -+ int_sel___release_mem__send_interrupt_only = 1, -+ int_sel___release_mem__send_interrupt_after_write_confirm = 2, -+ int_sel___release_mem__send_data_after_write_confirm = 3 -+}; -+ -+enum 
_RELEASE_MEM_data_sel_enum { -+ data_sel___release_mem__none = 0, -+ data_sel___release_mem__send_32_bit_low = 1, -+ data_sel___release_mem__send_64_bit_data = 2, -+ data_sel___release_mem__send_gpu_clock_counter = 3, -+ data_sel___release_mem__send_cp_perfcounter_hi_lo = 4, -+ data_sel___release_mem__store_gds_data_to_memory = 5 -+}; -+ -+struct pm4__release_mem { -+ union { -+ union PM4_MES_TYPE_3_HEADER header; /*header */ -+ unsigned int ordinal1; -+ }; -+ -+ union { -+ struct { -+ unsigned int event_type:6; -+ unsigned int reserved1:2; -+ enum _RELEASE_MEM_event_index_enum event_index:4; -+ unsigned int tcl1_vol_action_ena:1; -+ unsigned int tc_vol_action_ena:1; -+ unsigned int reserved2:1; -+ unsigned int tc_wb_action_ena:1; -+ unsigned int tcl1_action_ena:1; -+ unsigned int tc_action_ena:1; -+ unsigned int reserved3:6; -+ unsigned int atc:1; -+ enum _RELEASE_MEM_cache_policy_enum cache_policy:2; -+ unsigned int reserved4:5; -+ } bitfields2; -+ unsigned int ordinal2; -+ }; -+ -+ union { -+ struct { -+ unsigned int reserved5:16; -+ enum _RELEASE_MEM_dst_sel_enum dst_sel:2; -+ unsigned int reserved6:6; -+ enum _RELEASE_MEM_int_sel_enum int_sel:3; -+ unsigned int reserved7:2; -+ enum _RELEASE_MEM_data_sel_enum data_sel:3; -+ } bitfields3; -+ unsigned int ordinal3; -+ }; -+ -+ union { -+ struct { -+ unsigned int reserved8:2; -+ unsigned int address_lo_32b:30; -+ } bitfields4; -+ struct { -+ unsigned int reserved9:3; -+ unsigned int address_lo_64b:29; -+ } bitfields5; -+ unsigned int ordinal4; -+ }; -+ -+ unsigned int address_hi; -+ -+ unsigned int data_lo; -+ -+ unsigned int data_hi; -+ -+}; -+#endif -+ -+ - /*--------------------_SET_CONFIG_REG-------------------- */ - - #ifndef _PM4__SET_CONFIG_REG_DEFINED -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h -index 7c8d9b3..08c7219 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h -@@ -30,12 +30,10 @@ union PM4_MES_TYPE_3_HEADER { - struct { - uint32_t reserved1 : 8; /* < reserved */ - uint32_t opcode : 8; /* < IT opcode */ -- uint32_t count : 14;/* < Number of DWORDS - 1 in the -- * information body -- */ -- uint32_t type : 2; /* < packet identifier -- * It should be 3 for type 3 packets -- */ -+ uint32_t count : 14;/* < number of DWORDs - 1 in the -+ information body. */ -+ uint32_t type : 2; /* < packet identifier. 
-+ It should be 3 for type 3 packets */ - }; - uint32_t u32All; - }; -@@ -126,10 +124,9 @@ struct pm4_mes_runlist { - uint32_t ib_size:20; - uint32_t chain:1; - uint32_t offload_polling:1; -- uint32_t reserved2:1; -+ uint32_t reserved3:1; - uint32_t valid:1; -- uint32_t process_cnt:4; -- uint32_t reserved3:4; -+ uint32_t reserved4:8; - } bitfields4; - uint32_t ordinal4; - }; -@@ -144,8 +141,8 @@ struct pm4_mes_runlist { - - struct pm4_mes_map_process { - union { -- union PM4_MES_TYPE_3_HEADER header; /* header */ -- uint32_t ordinal1; -+ union PM4_MES_TYPE_3_HEADER header; /* header */ -+ uint32_t ordinal1; - }; - - union { -@@ -156,48 +153,36 @@ struct pm4_mes_map_process { - uint32_t process_quantum:7; - } bitfields2; - uint32_t ordinal2; -- }; -+}; - - union { - struct { - uint32_t page_table_base:28; -- uint32_t reserved3:4; -+ uint32_t reserved2:4; - } bitfields3; - uint32_t ordinal3; - }; - -- uint32_t reserved; -- - uint32_t sh_mem_bases; -- uint32_t sh_mem_config; - uint32_t sh_mem_ape1_base; - uint32_t sh_mem_ape1_limit; -- -- uint32_t sh_hidden_private_base_vmid; -- -- uint32_t reserved2; -- uint32_t reserved3; -- -+ uint32_t sh_mem_config; - uint32_t gds_addr_lo; - uint32_t gds_addr_hi; - - union { - struct { - uint32_t num_gws:6; -- uint32_t reserved4:2; -+ uint32_t reserved3:2; - uint32_t num_oac:4; -- uint32_t reserved5:4; -+ uint32_t reserved4:4; - uint32_t gds_size:6; - uint32_t num_queues:10; - } bitfields10; - uint32_t ordinal10; - }; - -- uint32_t completion_signal_lo; -- uint32_t completion_signal_hi; -- - }; -- - #endif - - /*--------------------MES_MAP_QUEUES--------------------*/ -@@ -350,7 +335,7 @@ enum mes_unmap_queues_engine_sel_enum { - engine_sel__mes_unmap_queues__sdmal = 3 - }; - --struct pm4_mes_unmap_queues { -+struct PM4_MES_UNMAP_QUEUES { - union { - union PM4_MES_TYPE_3_HEADER header; /* header */ - uint32_t ordinal1; -@@ -410,101 +395,4 @@ struct pm4_mes_unmap_queues { - }; - #endif - --#ifndef PM4_MEC_RELEASE_MEM_DEFINED --#define PM4_MEC_RELEASE_MEM_DEFINED --enum RELEASE_MEM_event_index_enum { -- event_index___release_mem__end_of_pipe = 5, -- event_index___release_mem__shader_done = 6 --}; -- --enum RELEASE_MEM_cache_policy_enum { -- cache_policy___release_mem__lru = 0, -- cache_policy___release_mem__stream = 1, -- cache_policy___release_mem__bypass = 2 --}; -- --enum RELEASE_MEM_dst_sel_enum { -- dst_sel___release_mem__memory_controller = 0, -- dst_sel___release_mem__tc_l2 = 1, -- dst_sel___release_mem__queue_write_pointer_register = 2, -- dst_sel___release_mem__queue_write_pointer_poll_mask_bit = 3 --}; -- --enum RELEASE_MEM_int_sel_enum { -- int_sel___release_mem__none = 0, -- int_sel___release_mem__send_interrupt_only = 1, -- int_sel___release_mem__send_interrupt_after_write_confirm = 2, -- int_sel___release_mem__send_data_after_write_confirm = 3 --}; -- --enum RELEASE_MEM_data_sel_enum { -- data_sel___release_mem__none = 0, -- data_sel___release_mem__send_32_bit_low = 1, -- data_sel___release_mem__send_64_bit_data = 2, -- data_sel___release_mem__send_gpu_clock_counter = 3, -- data_sel___release_mem__send_cp_perfcounter_hi_lo = 4, -- data_sel___release_mem__store_gds_data_to_memory = 5 --}; -- --struct pm4_mec_release_mem { -- union { -- union PM4_MES_TYPE_3_HEADER header; /*header */ -- unsigned int ordinal1; -- }; -- -- union { -- struct { -- unsigned int event_type:6; -- unsigned int reserved1:2; -- enum RELEASE_MEM_event_index_enum event_index:4; -- unsigned int tcl1_vol_action_ena:1; -- unsigned int tc_vol_action_ena:1; -- unsigned int 
reserved2:1; -- unsigned int tc_wb_action_ena:1; -- unsigned int tcl1_action_ena:1; -- unsigned int tc_action_ena:1; -- unsigned int reserved3:6; -- unsigned int atc:1; -- enum RELEASE_MEM_cache_policy_enum cache_policy:2; -- unsigned int reserved4:5; -- } bitfields2; -- unsigned int ordinal2; -- }; -- -- union { -- struct { -- unsigned int reserved5:16; -- enum RELEASE_MEM_dst_sel_enum dst_sel:2; -- unsigned int reserved6:6; -- enum RELEASE_MEM_int_sel_enum int_sel:3; -- unsigned int reserved7:2; -- enum RELEASE_MEM_data_sel_enum data_sel:3; -- } bitfields3; -- unsigned int ordinal3; -- }; -- -- union { -- struct { -- unsigned int reserved8:2; -- unsigned int address_lo_32b:30; -- } bitfields4; -- struct { -- unsigned int reserved9:3; -- unsigned int address_lo_64b:29; -- } bitfields5; -- unsigned int ordinal4; -- }; -- -- unsigned int address_hi; -- -- unsigned int data_lo; -- -- unsigned int data_hi; --}; --#endif -- --enum { -- CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 --}; -- - #endif -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h -old mode 100755 -new mode 100644 -index 88fdfc9..4750cab ---- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h -@@ -30,49 +30,13 @@ - #include <linux/atomic.h> - #include <linux/workqueue.h> - #include <linux/spinlock.h> --#include <linux/idr.h> - #include <linux/kfd_ioctl.h> --#include <linux/pid.h> --#include <linux/interval_tree.h> --#include <linux/seq_file.h> --#include <linux/kref.h> --#include <linux/kfifo.h> - #include <kgd_kfd_interface.h> - --#include <drm/amd_rdma.h> --#include "amd_shared.h" -- - #define KFD_SYSFS_FILE_MODE 0444 - --/* GPU ID hash width in bits */ --#define KFD_GPU_ID_HASH_WIDTH 16 -- --/* Use upper bits of mmap offset to store KFD driver specific information. -- * BITS[63:62] - Encode MMAP type -- * BITS[61:46] - Encode gpu_id. To identify to which GPU the offset belongs to -- * BITS[45:40] - Reserved. Not Used. -- * BITS[39:0] - MMAP offset value. Used by TTM. -- * -- * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. 
Hence, these -- * defines are w.r.t to PAGE_SIZE -- */ --#define KFD_MMAP_TYPE_SHIFT (62 - PAGE_SHIFT) --#define KFD_MMAP_TYPE_MASK (0x3ULL << KFD_MMAP_TYPE_SHIFT) --#define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT) --#define KFD_MMAP_TYPE_EVENTS (0x2ULL << KFD_MMAP_TYPE_SHIFT) --#define KFD_MMAP_TYPE_MAP_BO (0x1ULL << KFD_MMAP_TYPE_SHIFT) --#define KFD_MMAP_TYPE_RESERVED_MEM (0x0ULL << KFD_MMAP_TYPE_SHIFT) -- --#define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT) --#define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \ -- << KFD_MMAP_GPU_ID_SHIFT) --#define KFD_MMAP_GPU_ID(gpu_id) ((((uint64_t)gpu_id) << KFD_MMAP_GPU_ID_SHIFT)\ -- & KFD_MMAP_GPU_ID_MASK) --#define KFD_MMAP_GPU_ID_GET(offset) ((offset & KFD_MMAP_GPU_ID_MASK) \ -- >> KFD_MMAP_GPU_ID_SHIFT) -- --#define KFD_MMAP_OFFSET_VALUE_MASK (0xFFFFFFFFFFULL >> PAGE_SHIFT) --#define KFD_MMAP_OFFSET_VALUE_GET(offset) (offset & KFD_MMAP_OFFSET_VALUE_MASK) -+#define KFD_MMAP_DOORBELL_MASK 0x8000000000000 -+#define KFD_MMAP_EVENTS_MASK 0x4000000000000 - - /* - * When working with cp scheduler we should assign the HIQ manually or via -@@ -84,6 +48,8 @@ - #define KFD_CIK_HIQ_PIPE 4 - #define KFD_CIK_HIQ_QUEUE 0 - -+/* GPU ID hash width in bits */ -+#define KFD_GPU_ID_HASH_WIDTH 16 - - /* Macro for allocating structures */ - #define kfd_alloc_struct(ptr_to_struct) \ -@@ -108,42 +74,12 @@ extern int max_num_of_queues_per_device; - /* Kernel module parameter to specify the scheduling policy */ - extern int sched_policy; - --extern int cwsr_enable; -- --/* -- * Kernel module parameter to specify the maximum process -- * number per HW scheduler -- */ --extern int hws_max_conc_proc; -- - /* - * Kernel module parameter to specify whether to send sigterm to HSA process on - * unhandled exception - */ - extern int send_sigterm; - --/* -- * This kernel module is used to simulate large bar machine on non-large bar -- * enabled machines. 
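Aside on the KFD_MMAP_* macros removed earlier in this hunk: they pack an mmap type and a gpu_id into the upper bits of the 64-bit mmap offset, counted in pages, exactly as the BITS[63:62]/BITS[61:46] comment describes. A minimal userspace sketch of the round trip; PAGE_SHIFT == 12 is an assumption here, the macro names mirror the deleted ones, and the program is illustrative only:

	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SHIFT 12                            /* assumption: 4 KiB pages */
	#define KFD_GPU_ID_HASH_WIDTH 16
	#define KFD_MMAP_TYPE_SHIFT (62 - PAGE_SHIFT)    /* BITS[63:62] of the byte offset */
	#define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT)
	#define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT)  /* BITS[61:46] of the byte offset */
	#define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \
			<< KFD_MMAP_GPU_ID_SHIFT)
	#define KFD_MMAP_GPU_ID(gpu_id) ((((uint64_t)(gpu_id)) << KFD_MMAP_GPU_ID_SHIFT) \
			& KFD_MMAP_GPU_ID_MASK)
	#define KFD_MMAP_GPU_ID_GET(offset) (((offset) & KFD_MMAP_GPU_ID_MASK) \
			>> KFD_MMAP_GPU_ID_SHIFT)

	int main(void)
	{
		uint32_t gpu_id = 0xbeef;
		/* what user space would pass to mmap() via vm_pgoff (units: pages) */
		uint64_t pgoff = KFD_MMAP_TYPE_DOORBELL | KFD_MMAP_GPU_ID(gpu_id);

		printf("pgoff = 0x%llx, decoded gpu_id = 0x%llx\n",
		       (unsigned long long)pgoff,
		       (unsigned long long)KFD_MMAP_GPU_ID_GET(pgoff));
		return 0;
	}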
-- */ --extern int debug_largebar; -- --/* -- * Ignore CRAT table during KFD initialization, can be used to work around -- * broken CRAT tables on some AMD systems -- */ --extern int ignore_crat; -- --/* -- * Set sh_mem_config.retry_disable on Vega10 -- */ --extern int vega10_noretry; -- --/* -- * Enable privileged mode for all CP queues including user queues -- */ --extern int priv_cp_queues; -- - /** - * enum kfd_sched_policy - * -@@ -176,28 +112,26 @@ enum cache_policy { - cache_policy_noncoherent - }; - --#define KFD_IS_SOC15(chip) ((chip) >= CHIP_VEGA10) -+enum asic_family_type { -+ CHIP_KAVERI = 0, -+ CHIP_CARRIZO -+}; - - struct kfd_event_interrupt_class { - bool (*interrupt_isr)(struct kfd_dev *dev, -- const uint32_t *ih_ring_entry, uint32_t *patched_ihre, -- bool *patched_flag); -+ const uint32_t *ih_ring_entry); - void (*interrupt_wq)(struct kfd_dev *dev, -- const uint32_t *ih_ring_entry); -+ const uint32_t *ih_ring_entry); - }; - - struct kfd_device_info { -- enum amd_asic_type asic_family; -+ unsigned int asic_family; - const struct kfd_event_interrupt_class *event_interrupt_class; - unsigned int max_pasid_bits; - unsigned int max_no_of_hqd; -- unsigned int doorbell_size; - size_t ih_ring_entry_size; - uint8_t num_of_watch_points; - uint16_t mqd_size_aligned; -- bool is_need_iommu_device; -- bool supports_cwsr; -- bool needs_pci_atomics; - }; - - struct kfd_mem_obj { -@@ -205,13 +139,6 @@ struct kfd_mem_obj { - uint32_t range_end; - uint64_t gpu_addr; - uint32_t *cpu_ptr; -- void *gtt_mem; --}; -- --struct kfd_vmid_info { -- uint32_t first_vmid_kfd; -- uint32_t last_vmid_kfd; -- uint32_t vmid_num_kfd; - }; - - struct kfd_dev { -@@ -238,12 +165,11 @@ struct kfd_dev { - */ - - struct kgd2kfd_shared_resources shared_resources; -- struct kfd_vmid_info vm_info; - - const struct kfd2kgd_calls *kfd2kgd; - struct mutex doorbell_mutex; -- unsigned long doorbell_available_index[DIV_ROUND_UP( -- KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, BITS_PER_LONG)]; -+ DECLARE_BITMAP(doorbell_available_index, -+ KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); - - void *gtt_mem; - uint64_t gtt_start_gpu_addr; -@@ -253,17 +179,18 @@ struct kfd_dev { - unsigned int gtt_sa_chunk_size; - unsigned int gtt_sa_num_of_chunks; - -- /* QCM Device instance */ -- struct device_queue_manager *dqm; -- -- bool init_complete; -- - /* Interrupts */ -- struct kfifo ih_fifo; -- struct workqueue_struct *ih_wq; -+ void *interrupt_ring; -+ size_t interrupt_ring_size; -+ atomic_t interrupt_ring_rptr; -+ atomic_t interrupt_ring_wptr; - struct work_struct interrupt_work; - spinlock_t interrupt_lock; - -+ /* QCM Device instance */ -+ struct device_queue_manager *dqm; -+ -+ bool init_complete; - /* - * Interrupts of interest to KFD are copied - * from the HW ring into a SW ring. 
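The kfd_dev hunk above trades the newer kfifo-based interrupt path (struct kfifo ih_fifo plus a dedicated workqueue) for the older open-coded ring: a raw buffer with atomic read and write pointers. A rough userspace C11 sketch of that single-producer, single-consumer scheme; the sizes and names are illustrative, not taken from the driver:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <string.h>

	#define RING_SIZE 1024        /* bytes, power of two */
	#define ENTRY_SIZE 16         /* one ih_ring_entry (illustrative) */

	static unsigned char ring[RING_SIZE];
	static atomic_uint rptr, wptr;        /* byte offsets, as in the restored kfd_dev */

	/* ISR side: copy one entry in, or drop the interrupt if the SW ring is full. */
	static bool enqueue_ih_entry(const void *entry)
	{
		unsigned int w = atomic_load_explicit(&wptr, memory_order_relaxed);
		unsigned int r = atomic_load_explicit(&rptr, memory_order_acquire);

		/* keep one slot free so "full" and "empty" stay distinguishable */
		if ((w + ENTRY_SIZE) % RING_SIZE == r)
			return false;

		memcpy(&ring[w], entry, ENTRY_SIZE);
		atomic_store_explicit(&wptr, (w + ENTRY_SIZE) % RING_SIZE,
				      memory_order_release);
		return true;
	}

	/* worker side: drain one entry, if any */
	static bool dequeue_ih_entry(void *entry)
	{
		unsigned int r = atomic_load_explicit(&rptr, memory_order_relaxed);
		unsigned int w = atomic_load_explicit(&wptr, memory_order_acquire);

		if (r == w)
			return false;         /* empty */

		memcpy(entry, &ring[r], ENTRY_SIZE);
		atomic_store_explicit(&rptr, (r + ENTRY_SIZE) % RING_SIZE,
				      memory_order_release);
		return true;
	}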
-@@ -271,32 +198,7 @@ struct kfd_dev { - bool interrupts_active; - - /* Debug manager */ -- struct kfd_dbgmgr *dbgmgr; -- -- /* MEC firmware version*/ -- uint16_t mec_fw_version; -- -- /* Maximum process number mapped to HW scheduler */ -- unsigned int max_proc_per_quantum; -- -- /* cwsr */ -- bool cwsr_enabled; -- struct page *cwsr_pages; -- uint32_t cwsr_size; -- uint32_t tma_offset; /*Offset for TMA from the start of cwsr_mem*/ -- -- /* IB usage */ -- uint32_t ib_size; --}; -- --struct kfd_ipc_obj; -- --struct kfd_bo { -- void *mem; -- struct interval_tree_node it; -- struct kfd_dev *dev; -- struct list_head cb_data_head; -- struct kfd_ipc_obj *kfd_ipc_obj; -+ struct kfd_dbgmgr *dbgmgr; - }; - - /* KGD2KFD callbacks */ -@@ -319,22 +221,27 @@ void kfd_chardev_exit(void); - struct device *kfd_chardev(void); - - /** -- * enum kfd_unmap_queues_filter -+ * enum kfd_preempt_type_filter - * -- * @KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: Preempts single queue. -+ * @KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE: Preempts single queue. - * -- * @KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: Preempts all queues in the -+ * @KFD_PRERMPT_TYPE_FILTER_ALL_QUEUES: Preempts all queues in the - * running queues list. - * -- * @KFD_UNMAP_QUEUES_FILTER_BY_PASID: Preempts queues that belongs to -+ * @KFD_PRERMPT_TYPE_FILTER_BY_PASID: Preempts queues that belongs to - * specific process. - * - */ --enum kfd_unmap_queues_filter { -- KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE, -- KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, -- KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, -- KFD_UNMAP_QUEUES_FILTER_BY_PASID -+enum kfd_preempt_type_filter { -+ KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE, -+ KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES, -+ KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES, -+ KFD_PREEMPT_TYPE_FILTER_BY_PASID -+}; -+ -+enum kfd_preempt_type { -+ KFD_PREEMPT_TYPE_WAVEFRONT, -+ KFD_PREEMPT_TYPE_WAVEFRONT_RESET - }; - - /** -@@ -360,11 +267,6 @@ enum kfd_queue_format { - KFD_QUEUE_FORMAT_AQL - }; - --enum KFD_QUEUE_PRIORITY { -- KFD_QUEUE_PRIORITY_MINIMUM = 0, -- KFD_QUEUE_PRIORITY_MAXIMUM = 15 --}; -- - /** - * struct queue_properties - * -@@ -392,13 +294,13 @@ enum KFD_QUEUE_PRIORITY { - * @write_ptr: Defines the number of dwords written to the ring buffer. - * - * @doorbell_ptr: This field aim is to notify the H/W of new packet written to -- * the queue ring buffer. This field should be similar to write_ptr and the -- * user should update this field after he updated the write_ptr. -+ * the queue ring buffer. This field should be similar to write_ptr and the user -+ * should update this field after he updated the write_ptr. - * - * @doorbell_off: The doorbell offset in the doorbell pci-bar. - * -- * @is_interop: Defines if this is a interop queue. Interop queue means that -- * the queue can access both graphics and compute resources. -+ * @is_interop: Defines if this is a interop queue. Interop queue means that the -+ * queue can access both graphics and compute resources. - * - * @is_active: Defines if the queue is active or not. 
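The doorbell_ptr and write_ptr comments above encode an ordering contract for user-mode submission: make the packet contents visible first, then publish the new write_ptr, and only then ring the doorbell so the hardware never reads a half-written ring. A hedged sketch of that sequence; the struct below is illustrative, not the driver's ABI, and the ring size is assumed to be a power of two:

	#include <stdint.h>

	struct user_queue {
		uint32_t *ring_base;              /* ring_base_address */
		uint32_t ring_dwords;             /* ring size in dwords, power of two */
		volatile uint32_t *write_ptr;     /* queue_properties.write_ptr */
		volatile uint32_t *doorbell_ptr;  /* queue_properties.doorbell_ptr */
	};

	static void submit(struct user_queue *q, const uint32_t *pkt, uint32_t ndw)
	{
		uint32_t w = *q->write_ptr;
		uint32_t i;

		for (i = 0; i < ndw; i++)
			q->ring_base[(w + i) & (q->ring_dwords - 1)] = pkt[i];

		__sync_synchronize();     /* packet contents visible before write_ptr */
		*q->write_ptr = w + ndw;

		__sync_synchronize();     /* write_ptr visible before the doorbell */
		*q->doorbell_ptr = w + ndw;
	}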
- * -@@ -419,10 +321,9 @@ struct queue_properties { - uint32_t queue_percent; - uint32_t *read_ptr; - uint32_t *write_ptr; -- void __iomem *doorbell_ptr; -+ uint32_t __iomem *doorbell_ptr; - uint32_t doorbell_off; - bool is_interop; -- bool is_evicted; /* true -> queue is evicted */ - bool is_active; - /* Not relevant for user mode queues in cp scheduling */ - unsigned int vmid; -@@ -435,12 +336,6 @@ struct queue_properties { - uint32_t eop_ring_buffer_size; - uint64_t ctx_save_restore_area_address; - uint32_t ctx_save_restore_area_size; -- uint32_t ctl_stack_size; -- uint64_t tba_addr; -- uint64_t tma_addr; -- /* Relevant for CU */ -- uint32_t cu_mask_count; /* Must be a multiple of 32 */ -- uint32_t *cu_mask; - }; - - /** -@@ -457,10 +352,9 @@ struct queue_properties { - * @properties: The queue properties. - * - * @mec: Used only in no cp scheduling mode and identifies to micro engine id -- * that the queue should be execute on. -+ * that the queue should be execute on. - * -- * @pipe: Used only in no cp scheduling mode and identifies the queue's pipe -- * id. -+ * @pipe: Used only in no cp scheduling mode and identifies the queue's pipe id. - * - * @queue: Used only in no cp scheduliong mode and identifies the queue's slot. - * -@@ -485,7 +379,6 @@ struct queue { - uint32_t queue; - - unsigned int sdma_id; -- unsigned int doorbell_id; - - struct kfd_process *process; - struct kfd_dev *device; -@@ -502,19 +395,6 @@ enum KFD_MQD_TYPE { - KFD_MQD_TYPE_MAX - }; - --enum KFD_PIPE_PRIORITY { -- KFD_PIPE_PRIORITY_CS_LOW = 0, -- KFD_PIPE_PRIORITY_CS_MEDIUM, -- KFD_PIPE_PRIORITY_CS_HIGH --}; -- --enum KFD_SPI_PRIORITY { -- KFD_SPI_PRIORITY_EXTRA_LOW = 0, -- KFD_SPI_PRIORITY_LOW, -- KFD_SPI_PRIORITY_MEDIUM, -- KFD_SPI_PRIORITY_HIGH --}; -- - struct scheduling_resources { - unsigned int vmid_mask; - enum kfd_queue_type type; -@@ -528,6 +408,7 @@ struct scheduling_resources { - struct process_queue_manager { - /* data */ - struct kfd_process *process; -+ unsigned int num_concurrent_processes; - struct list_head queues; - unsigned long *queue_slot_bitmap; - }; -@@ -543,13 +424,6 @@ struct qcm_process_device { - unsigned int queue_count; - unsigned int vmid; - bool is_debug; -- unsigned int evicted; /* eviction counter, 0=active */ -- -- /* This flag tells if we should reset all wavefronts on -- * process termination -- */ -- bool reset_wavefronts; -- - /* - * All the memory management data should be here too - */ -@@ -562,55 +436,6 @@ struct qcm_process_device { - uint32_t gds_size; - uint32_t num_gws; - uint32_t num_oac; -- uint32_t sh_hidden_private_base; -- -- /*cwsr memory*/ -- uint64_t cwsr_base; -- uint64_t tba_addr; -- uint64_t tma_addr; -- void *cwsr_kaddr; -- struct page *cwsr_pages; -- -- /* IB memory */ -- uint64_t ib_base; /* ib_base+ib_size must be below cwsr_base */ -- void *ib_kaddr; -- -- /*doorbell resources per process per device*/ -- unsigned long *doorbell_bitmap; --}; -- --/* KFD Memory Eviction */ --struct kfd_eviction_work { -- struct delayed_work dwork; -- struct dma_fence *quiesce_fence; --}; -- --/* Approx. wait time before attempting to restore evicted BOs */ --#define PROCESS_RESTORE_TIME_MS 100 --/* Approx. back off time if restore fails due to lack of memory */ --#define PROCESS_BACK_OFF_TIME_MS 100 --/* Approx. 
time before evicting the process again */ --#define PROCESS_ACTIVE_TIME_MS 10 -- --void kfd_evict_bo_worker(struct work_struct *work); --void kfd_restore_bo_worker(struct work_struct *work); --int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, -- struct dma_fence *fence); --int quiesce_process_mm(struct kfd_process *p); -- -- --/* 8 byte handle containing GPU ID in the most significant 4 bytes and -- * idr_handle in the least significant 4 bytes -- */ --#define MAKE_HANDLE(gpu_id, idr_handle) \ -- (((uint64_t)(gpu_id) << 32) + idr_handle) --#define GET_GPU_ID(handle) (handle >> 32) --#define GET_IDR_HANDLE(handle) (handle & 0xFFFFFFFF) -- --enum kfd_pdd_bound { -- PDD_UNBOUND = 0, -- PDD_BOUND, -- PDD_BOUND_SUSPENDED, - }; - - /* Data that is per-process-per device. */ -@@ -624,8 +449,6 @@ struct kfd_process_device { - /* The device that owns this data. */ - struct kfd_dev *dev; - -- /* The process that owns this kfd_process_device. */ -- struct kfd_process *process; - - /* per-process-per device QCM data structure */ - struct qcm_process_device qpd; -@@ -637,27 +460,14 @@ struct kfd_process_device { - uint64_t gpuvm_limit; - uint64_t scratch_base; - uint64_t scratch_limit; -- uint64_t dgpu_base; -- uint64_t dgpu_limit; -- -- uint64_t sh_hidden_private_base_vmid; -- -- /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) -- */ -- enum kfd_pdd_bound bound; - -- /* VM context for GPUVM allocations */ -- void *vm; -+ /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */ -+ bool bound; - -- /* GPUVM allocations storage */ -- struct idr alloc_idr; -- -- /* Flag used to tell the pdd has dequeued from the dqm. -- * This is used to prevent dev->dqm->ops.process_termination() from -- * being called twice when it is already called in IOMMU callback -- * function. -+ /* This flag tells if we should reset all -+ * wavefronts on process termination - */ -- bool already_dequeued; -+ bool reset_wavefronts; - }; - - #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) -@@ -670,15 +480,7 @@ struct kfd_process { - */ - struct hlist_node kfd_processes; - -- /* -- * Opaque pointer to mm_struct. We don't hold a reference to -- * it so it should never be dereferenced from here. This is -- * only used for looking up processes by their mm. -- */ -- void *mm; -- -- struct kref ref; -- struct work_struct release_work; -+ struct mm_struct *mm; - - struct mutex mutex; - -@@ -686,8 +488,6 @@ struct kfd_process { - * In any process, the thread that started main() is the lead - * thread and outlives the rest. - * It is here because amd_iommu_bind_pasid wants a task_struct. -- * It can also be used for safely getting a reference to the -- * mm_struct of the process. - */ - struct task_struct *lead_thread; - -@@ -707,8 +507,11 @@ struct kfd_process { - - struct process_queue_manager pqm; - -- unsigned long allocated_queue_bitmap[DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, -- BITS_PER_LONG)]; -+ /* The process's queues. */ -+ size_t queue_array_size; -+ -+ /* Size is queue_array_size, up to MAX_PROCESS_QUEUES. */ -+ struct kfd_queue **queues; - - /*Is the user space process 32 bit?*/ - bool is_32bit_user_mode; -@@ -717,29 +520,10 @@ struct kfd_process { - struct mutex event_mutex; - /* All events in process hashed by ID, linked on kfd_event.events. */ - DECLARE_HASHTABLE(events, 4); -- /* struct slot_page_header.event_pages */ -- struct list_head signal_event_pages; -+ struct list_head signal_event_pages; /* struct slot_page_header. 
-+ event_pages */ - u32 next_nonsignal_event_id; - size_t signal_event_count; -- bool signal_event_limit_reached; -- -- struct rb_root_cached bo_interval_tree; -- -- /* Information used for memory eviction */ -- void *process_info; -- /* Eviction fence that is attached to all the BOs of this process. The -- * fence will be triggered during eviction and new one will be created -- * during restore -- */ -- struct dma_fence *ef; -- -- /* Work items for evicting and restoring BOs */ -- struct kfd_eviction_work eviction_work; -- struct delayed_work restore_work; -- /* Approx. the last timestamp (in jiffies) when the process was -- * restored after an eviction -- */ -- unsigned long last_restore_timestamp; - }; - - /** -@@ -762,55 +546,21 @@ struct amdkfd_ioctl_desc { - - void kfd_process_create_wq(void); - void kfd_process_destroy_wq(void); --struct kfd_process *kfd_create_process(struct file *filep); --struct kfd_process *kfd_get_process(const struct task_struct *task); -+struct kfd_process *kfd_create_process(const struct task_struct *); -+struct kfd_process *kfd_get_process(const struct task_struct *); - struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); --struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm); --void kfd_unref_process(struct kfd_process *p); --void kfd_suspend_all_processes(void); --int kfd_resume_all_processes(void); - - struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, - struct kfd_process *p); --#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) --int kfd_bind_processes_to_device(struct kfd_dev *dev); --void kfd_unbind_processes_from_device(struct kfd_dev *dev); --#endif --void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid); -+void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid); - struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, - struct kfd_process *p); - struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, - struct kfd_process *p); - --int kfd_reserved_mem_mmap(struct kfd_process *process, -- struct vm_area_struct *vma); -- --/* KFD process API for creating and translating handles */ --int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, -- void *mem, uint64_t start, -- uint64_t length, -- struct kfd_ipc_obj *ipc_obj); --void *kfd_process_device_translate_handle(struct kfd_process_device *p, -- int handle); --struct kfd_bo *kfd_process_device_find_bo(struct kfd_process_device *pdd, -- int handle); --void *kfd_process_find_bo_from_interval(struct kfd_process *p, -- uint64_t start_addr, -- uint64_t last_addr); --void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, -- int handle); -- --void run_rdma_free_callback(struct kfd_bo *buf_obj); --struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid); -- --/* kfd dgpu memory */ --int kfd_unmap_memory_from_gpu(void *mem, struct kfd_process_device *pdd); -- - /* Process device data iterator */ --struct kfd_process_device *kfd_get_first_process_device_data( -- struct kfd_process *p); --struct kfd_process_device *kfd_get_next_process_device_data( -- struct kfd_process *p, -+struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p); -+struct kfd_process_device *kfd_get_next_process_device_data(struct kfd_process *p, - struct kfd_process_device *pdd); - bool kfd_has_process_device_data(struct kfd_process *p); - -@@ -823,20 +573,16 @@ unsigned int kfd_pasid_alloc(void); - void 
kfd_pasid_free(unsigned int pasid); - - /* Doorbells */ --size_t kfd_doorbell_process_slice(struct kfd_dev *kfd); --int kfd_doorbell_init(struct kfd_dev *kfd); --void kfd_doorbell_fini(struct kfd_dev *kfd); --int kfd_doorbell_mmap(struct kfd_dev *kfd, struct kfd_process *process, -- struct vm_area_struct *vma); --void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, -+void kfd_doorbell_init(struct kfd_dev *kfd); -+int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma); -+u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, - unsigned int *doorbell_off); - void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr); - u32 read_kernel_doorbell(u32 __iomem *db); --void write_kernel_doorbell(void __iomem *db, u32 value); --void write_kernel_doorbell64(void __iomem *db, u64 value); --unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd, -+void write_kernel_doorbell(u32 __iomem *db, u32 value); -+unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, - struct kfd_process *process, -- unsigned int doorbell_id); -+ unsigned int queue_id); - - /* GTT Sub-Allocator */ - -@@ -852,22 +598,16 @@ int kfd_topology_init(void); - void kfd_topology_shutdown(void); - int kfd_topology_add_device(struct kfd_dev *gpu); - int kfd_topology_remove_device(struct kfd_dev *gpu); --struct kfd_topology_device *kfd_topology_device_by_proximity_domain( -- uint32_t proximity_domain); - struct kfd_dev *kfd_device_by_id(uint32_t gpu_id); - struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev); --struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd); --int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev); --int kfd_numa_node_to_apic_id(int numa_node_id); -+struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx); - - /* Interrupts */ - int kfd_interrupt_init(struct kfd_dev *dev); - void kfd_interrupt_exit(struct kfd_dev *dev); - void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry); - bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry); --bool interrupt_is_wanted(struct kfd_dev *dev, -- const uint32_t *ih_ring_entry, -- uint32_t *patched_ihre, bool *flag); -+bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry); - - /* Power Management */ - void kgd2kfd_suspend(struct kfd_dev *kfd); -@@ -875,10 +615,10 @@ int kgd2kfd_resume(struct kfd_dev *kfd); - - /* amdkfd Apertures */ - int kfd_init_apertures(struct kfd_process *process); --int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd, -- uint64_t base, uint64_t limit); - - /* Queue Context Management */ -+struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd); -+ - int init_queue(struct queue **q, const struct queue_properties *properties); - void uninit_queue(struct queue *q); - void print_queue_properties(struct queue_properties *q); -@@ -888,20 +628,13 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, - struct kfd_dev *dev); - struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, - struct kfd_dev *dev); --struct mqd_manager *mqd_manager_init_cik_hawaii(enum KFD_MQD_TYPE type, -- struct kfd_dev *dev); - struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, - struct kfd_dev *dev); --struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, -- struct kfd_dev *dev); --struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, -- struct kfd_dev *dev); - struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); - void 
device_queue_manager_uninit(struct device_queue_manager *dqm); - struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, - enum kfd_queue_type type); - void kernel_queue_uninit(struct kernel_queue *kq); --int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid); - - /* Process Queue Manager */ - struct process_queue_node { -@@ -910,36 +643,32 @@ struct process_queue_node { - struct list_head process_queue_list; - }; - --void kfd_process_dequeue_from_device(struct kfd_process_device *pdd); --void kfd_process_dequeue_from_all_devices(struct kfd_process *p); - int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p); - void pqm_uninit(struct process_queue_manager *pqm); - int pqm_create_queue(struct process_queue_manager *pqm, - struct kfd_dev *dev, - struct file *f, - struct queue_properties *properties, -+ unsigned int flags, -+ enum kfd_queue_type type, - unsigned int *qid); - int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid); - int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, - struct queue_properties *p); --int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, -- struct queue_properties *p); - struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm, - unsigned int qid); --int pqm_get_wave_state(struct process_queue_manager *pqm, -- unsigned int qid, -- void __user *ctl_stack, -- u32 *ctl_stack_used_size, -- u32 *save_area_used_size); --int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm); --int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm); -+ -+int amdkfd_fence_wait_timeout(unsigned int *fence_addr, -+ unsigned int fence_value, -+ unsigned long timeout); - - /* Packet Manager */ - -+#define KFD_HIQ_TIMEOUT (500) -+ - #define KFD_FENCE_COMPLETED (100) - #define KFD_FENCE_INIT (10) -- --struct packet_manager_func; -+#define KFD_UNMAP_LATENCY (150) - - struct packet_manager { - struct device_queue_manager *dqm; -@@ -947,42 +676,9 @@ struct packet_manager { - struct mutex lock; - bool allocated; - struct kfd_mem_obj *ib_buffer_obj; -- unsigned int ib_size_bytes; -- -- struct packet_manager_funcs *pmf; --}; -- --struct packet_manager_funcs { -- /* Support different firmware versions for PM4 packets */ -- int (*map_process)(struct packet_manager *pm, uint32_t *buffer, -- struct qcm_process_device *qpd); -- int (*runlist)(struct packet_manager *pm, uint32_t *buffer, -- uint64_t ib, size_t ib_size_in_dwords, bool chain); -- int (*set_resources)(struct packet_manager *pm, uint32_t *buffer, -- struct scheduling_resources *res); -- int (*map_queues)(struct packet_manager *pm, uint32_t *buffer, -- struct queue *q, bool is_static); -- int (*unmap_queues)(struct packet_manager *pm, uint32_t *buffer, -- enum kfd_queue_type type, -- enum kfd_unmap_queues_filter mode, -- uint32_t filter_param, bool reset, -- unsigned int sdma_engine); -- int (*query_status)(struct packet_manager *pm, uint32_t *buffer, -- uint64_t fence_address, uint32_t fence_value); -- uint32_t (*release_mem)(uint64_t gpu_addr, uint32_t *buffer); -- -- uint32_t (*get_map_process_packet_size)(void); -- uint32_t (*get_runlist_packet_size)(void); -- uint32_t (*get_set_resources_packet_size)(void); -- uint32_t (*get_map_queues_packet_size)(void); -- uint32_t (*get_unmap_queues_packet_size)(void); -- uint32_t (*get_query_status_packet_size)(void); -- uint32_t (*get_release_mem_packet_size)(void); -- - }; - --int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, -- 
uint16_t fw_ver); -+int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); - void pm_uninit(struct packet_manager *pm); - int pm_send_set_resources(struct packet_manager *pm, - struct scheduling_resources *res); -@@ -991,55 +687,18 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, - uint32_t fence_value); - - int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, -- enum kfd_unmap_queues_filter mode, -+ enum kfd_preempt_type_filter mode, - uint32_t filter_param, bool reset, - unsigned int sdma_engine); - - void pm_release_ib(struct packet_manager *pm); - --/* Following PM funcs can be shared among CIK and VI */ --unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size); --int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer, -- uint64_t ib, size_t ib_size_in_dwords, bool chain); --int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, -- struct queue *q, bool is_static); --int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, -- struct scheduling_resources *res); --int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, -- enum kfd_queue_type type, -- enum kfd_unmap_queues_filter filter, -- uint32_t filter_param, bool reset, -- unsigned int sdma_engine); --int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer, -- uint64_t fence_address, uint32_t fence_value); --uint32_t pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer); -- --uint32_t pm_get_map_process_packet_size_vi(void); --uint32_t pm_get_runlist_packet_size_vi(void); --uint32_t pm_get_set_resources_packet_size_vi(void); --uint32_t pm_get_map_queues_packet_size_vi(void); --uint32_t pm_get_unmap_queues_packet_size_vi(void); --uint32_t pm_get_query_status_packet_size_vi(void); --uint32_t pm_get_release_mem_packet_size_vi(void); -- -- --void kfd_pm_func_init_vi(struct packet_manager *pm, uint16_t fw_ver); --void kfd_pm_func_init_cik(struct packet_manager *pm, uint16_t fw_ver); -- --void kfd_pm_func_init_v9(struct packet_manager *pm, uint16_t fw_ver); -- -- - uint64_t kfd_get_number_elems(struct kfd_dev *kfd); - phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, - struct kfd_process *process); --int amdkfd_fence_wait_timeout(unsigned int *fence_addr, -- unsigned int fence_value, -- unsigned long timeout_ms); - - /* Events */ - extern const struct kfd_event_interrupt_class event_interrupt_class_cik; --extern const struct kfd_event_interrupt_class event_interrupt_class_v9; -- - extern const struct kfd_device_global_init_class device_global_init_class_cik; - - enum kfd_event_wait_result { -@@ -1057,55 +716,18 @@ int kfd_wait_on_events(struct kfd_process *p, - enum kfd_event_wait_result *wait_result); - void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, - uint32_t valid_id_bits); --#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - void kfd_signal_iommu_event(struct kfd_dev *dev, - unsigned int pasid, unsigned long address, - bool is_write_requested, bool is_execute_requested); --#endif - void kfd_signal_hw_exception_event(unsigned int pasid); - int kfd_set_event(struct kfd_process *p, uint32_t event_id); - int kfd_reset_event(struct kfd_process *p, uint32_t event_id); - int kfd_event_create(struct file *devkfd, struct kfd_process *p, - uint32_t event_type, bool auto_reset, uint32_t node_id, - uint32_t *event_id, uint32_t *event_trigger_data, -- uint64_t *event_page_offset, uint32_t *event_slot_index, -- void *kern_addr); -+ uint64_t 
*event_page_offset, uint32_t *event_slot_index); - int kfd_event_destroy(struct kfd_process *p, uint32_t event_id); --void kfd_free_signal_page_dgpu(struct kfd_process *p, uint64_t handle); -- --void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, -- struct kfd_vm_fault_info *info); -- --void kfd_flush_tlb(struct kfd_dev *dev, uint32_t pasid); - - int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); - --#define KFD_SCRATCH_KV_FW_VER 413 -- --/* PeerDirect support */ --void kfd_init_peer_direct(void); --void kfd_close_peer_direct(void); -- --/* IPC Support */ --int kfd_ipc_init(void); -- --/* Debugfs */ --#if defined(CONFIG_DEBUG_FS) -- --void kfd_debugfs_init(void); --void kfd_debugfs_fini(void); --int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data); --int pqm_debugfs_mqds(struct seq_file *m, void *data); --int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data); --int device_queue_manager_debugfs_hqds(struct seq_file *m, void *data); --int kfd_debugfs_rls_by_device(struct seq_file *m, void *data); --int pm_debugfs_runlist(struct seq_file *m, void *data); -- --#else -- --static inline void kfd_debugfs_init(void) {} --static inline void kfd_debugfs_fini(void) {} -- --#endif -- - #endif -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c -index c798fa3..035bbc9 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c -@@ -24,16 +24,10 @@ - #include <linux/log2.h> - #include <linux/sched.h> - #include <linux/sched/mm.h> --#include <linux/sched/task.h> - #include <linux/slab.h> --#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - #include <linux/amd-iommu.h> --#endif - #include <linux/notifier.h> - #include <linux/compat.h> --#include <linux/mman.h> --#include <linux/highmem.h> --#include "kfd_ipc.h" - - struct mm_struct; - -@@ -41,6 +35,13 @@ struct mm_struct; - #include "kfd_dbgmgr.h" - - /* -+ * Initial size for the array of queues. -+ * The allocated size is doubled each time -+ * it is exceeded up to MAX_PROCESS_QUEUES. -+ */ -+#define INITIAL_QUEUE_ARRAY_SIZE 16 -+ -+/* - * List of struct kfd_process (field kfd_process). 
- * Unique/indexed by mm_struct* - */ -@@ -52,16 +53,13 @@ DEFINE_STATIC_SRCU(kfd_processes_srcu); - - static struct workqueue_struct *kfd_process_wq; - --#define MIN_IDR_ID 1 --#define MAX_IDR_ID 0 /*0 - for unlimited*/ -- --static struct kfd_process *find_process(const struct task_struct *thread, -- bool ref); --static void kfd_process_ref_release(struct kref *ref); --static struct kfd_process *create_process(const struct task_struct *thread, -- struct file *filep); --static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep); -+struct kfd_process_release_work { -+ struct work_struct kfd_work; -+ struct kfd_process *p; -+}; - -+static struct kfd_process *find_process(const struct task_struct *thread); -+static struct kfd_process *create_process(const struct task_struct *thread); - - void kfd_process_create_wq(void) - { -@@ -77,144 +75,22 @@ void kfd_process_destroy_wq(void) - } - } - --static void kfd_process_free_gpuvm(struct kgd_mem *mem, -- struct kfd_process_device *pdd) --{ -- kfd_unmap_memory_from_gpu(mem, pdd); -- pdd->dev->kfd2kgd->free_memory_of_gpu(pdd->dev->kgd, mem, pdd->vm); --} -- --/* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process -- * This function should be only called right after the process -- * is created and when kfd_processes_mutex is still being held -- * to avoid concurrency. Because of that exclusiveness, we do -- * not need to take p->mutex. -- */ --static int kfd_process_alloc_gpuvm(struct kfd_process *p, -- struct kfd_dev *kdev, uint64_t gpu_va, uint32_t size, -- void **kptr, struct kfd_process_device *pdd, uint32_t flags) --{ -- int err; -- void *mem = NULL; -- int handle; -- -- err = kdev->kfd2kgd->alloc_memory_of_gpu(kdev->kgd, gpu_va, size, -- pdd->vm, -- (struct kgd_mem **)&mem, NULL, flags); -- if (err) -- goto err_alloc_mem; -- -- err = kdev->kfd2kgd->map_memory_to_gpu( -- kdev->kgd, (struct kgd_mem *)mem, pdd->vm); -- if (err) -- goto err_map_mem; -- -- err = kdev->kfd2kgd->sync_memory(kdev->kgd, (struct kgd_mem *) mem, -- true); -- if (err) { -- pr_debug("Sync memory failed, wait interrupted by user signal\n"); -- goto sync_memory_failed; -- } -- -- kfd_flush_tlb(kdev, p->pasid); -- -- /* Create an obj handle so kfd_process_device_remove_obj_handle -- * will take care of the bo removal when the process finishes. -- * We do not need to take p->mutex, because the process is just -- * created and the ioctls have not had the chance to run. -- */ -- handle = kfd_process_device_create_obj_handle( -- pdd, mem, gpu_va, size, NULL); -- -- if (handle < 0) { -- err = handle; -- goto free_gpuvm; -- } -- -- if (kptr) { -- err = kdev->kfd2kgd->map_gtt_bo_to_kernel(kdev->kgd, -- (struct kgd_mem *)mem, kptr); -- if (err) { -- pr_debug("Map GTT BO to kernel failed\n"); -- goto free_obj_handle; -- } -- } -- -- return err; -- --free_obj_handle: -- kfd_process_device_remove_obj_handle(pdd, handle); --free_gpuvm: --sync_memory_failed: -- kfd_process_free_gpuvm(mem, pdd); -- return err; -- --err_map_mem: -- kdev->kfd2kgd->free_memory_of_gpu(kdev->kgd, mem, pdd->vm); --err_alloc_mem: -- *kptr = NULL; -- return err; --} -- --/* kfd_process_reserve_ib_mem - Reserve memory inside the process for IB usage -- * The memory reserved is for KFD to submit IB to AMDGPU from kernel. -- * If the memory is reserved successfully, ib_kaddr_assigned will have -- * the CPU/kernel address. Check ib_kaddr_assigned before accessing the -- * memory. 
-- */ --static int kfd_process_reserve_ib_mem(struct kfd_process *p) --{ -- int ret = 0; -- struct kfd_process_device *temp, *pdd = NULL; -- struct kfd_dev *kdev = NULL; -- struct qcm_process_device *qpd = NULL; -- void *kaddr; -- uint32_t flags = ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_NONPAGED | -- ALLOC_MEM_FLAGS_NO_SUBSTITUTE | -- ALLOC_MEM_FLAGS_EXECUTE_ACCESS; -- -- list_for_each_entry_safe(pdd, temp, &p->per_device_data, -- per_device_list) { -- kdev = pdd->dev; -- qpd = &pdd->qpd; -- if (!kdev->ib_size || qpd->ib_kaddr) -- continue; -- -- if (qpd->ib_base) { /* is dGPU */ -- ret = kfd_process_alloc_gpuvm(p, kdev, -- qpd->ib_base, kdev->ib_size, -- &kaddr, pdd, flags); -- if (!ret) -- qpd->ib_kaddr = kaddr; -- else -- /* In case of error, the kfd_bos for some pdds -- * which are already allocated successfully -- * will be freed in upper level function -- * i.e. create_process(). -- */ -- return ret; -- } else { -- /* FIXME: Support APU */ -- continue; -- } -- } -- -- return 0; --} -- --struct kfd_process *kfd_create_process(struct file *filep) -+struct kfd_process *kfd_create_process(const struct task_struct *thread) - { - struct kfd_process *process; - -- struct task_struct *thread = current; -+ BUG_ON(!kfd_process_wq); - -- if (!thread->mm) -+ if (thread->mm == NULL) - return ERR_PTR(-EINVAL); - - /* Only the pthreads threading model is supported. */ - if (thread->group_leader->mm != thread->mm) - return ERR_PTR(-EINVAL); - -+ /* Take mmap_sem because we call __mmu_notifier_register inside */ -+ down_write(&thread->mm->mmap_sem); -+ - /* - * take kfd processes mutex before starting of process creation - * so there won't be a case where two threads of the same process -@@ -223,14 +99,17 @@ struct kfd_process *kfd_create_process(struct file *filep) - mutex_lock(&kfd_processes_mutex); - - /* A prior open of /dev/kfd could have already created the process. */ -- process = find_process(thread, false); -+ process = find_process(thread); - if (process) -- pr_debug("Process already found\n"); -- else -- process = create_process(thread, filep); -+ pr_debug("kfd: process already found\n"); -+ -+ if (!process) -+ process = create_process(thread); - - mutex_unlock(&kfd_processes_mutex); - -+ up_write(&thread->mm->mmap_sem); -+ - return process; - } - -@@ -238,14 +117,14 @@ struct kfd_process *kfd_get_process(const struct task_struct *thread) - { - struct kfd_process *process; - -- if (!thread->mm) -+ if (thread->mm == NULL) - return ERR_PTR(-EINVAL); - - /* Only the pthreads threading model is supported. */ - if (thread->group_leader->mm != thread->mm) - return ERR_PTR(-EINVAL); - -- process = find_process(thread, false); -+ process = find_process(thread); - - return process; - } -@@ -262,158 +141,81 @@ static struct kfd_process *find_process_by_mm(const struct mm_struct *mm) - return NULL; - } - --static struct kfd_process *find_process(const struct task_struct *thread, -- bool ref) -+static struct kfd_process *find_process(const struct task_struct *thread) - { - struct kfd_process *p; - int idx; - - idx = srcu_read_lock(&kfd_processes_srcu); - p = find_process_by_mm(thread->mm); -- if (p && ref) -- kref_get(&p->ref); - srcu_read_unlock(&kfd_processes_srcu, idx); - - return p; - } - --void kfd_unref_process(struct kfd_process *p) --{ -- kref_put(&p->ref, kfd_process_ref_release); --} -- --/* This increments the process->ref counter. 
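The find_process(..., ref) variant deleted in this hunk is the canonical lookup-and-pin sequence: hold the SRCU read side so the object cannot be freed during the hash walk, take a reference before unlocking, and leave the caller responsible for dropping it. Kernel-style pseudocode mirroring the names above; a sketch, not compilable on its own:

	static struct kfd_process *lookup_and_ref(const struct mm_struct *mm)
	{
		struct kfd_process *p;
		int idx;

		idx = srcu_read_lock(&kfd_processes_srcu);
		p = find_process_by_mm(mm);     /* RCU-safe hash-table walk */
		if (p)
			kref_get(&p->ref);      /* pin before the read lock is dropped */
		srcu_read_unlock(&kfd_processes_srcu, idx);

		return p;                       /* caller pairs with kfd_unref_process() */
	}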
*/ --struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid) -+static void kfd_process_wq_release(struct work_struct *work) - { -- struct task_struct *task = NULL; -- struct kfd_process *p = NULL; -- -- if (!pid) -- task = current; -- else -- task = get_pid_task(pid, PIDTYPE_PID); -+ struct kfd_process_release_work *my_work; -+ struct kfd_process_device *pdd, *temp; -+ struct kfd_process *p; - -- if (task) -- p = find_process(task, true); -+ my_work = (struct kfd_process_release_work *) work; - -- return p; --} -+ p = my_work->p; - --static void kfd_process_free_outstanding_kfd_bos(struct kfd_process *p) --{ -- struct kfd_process_device *pdd, *peer_pdd; -- struct kfd_bo *buf_obj; -- int id; -- -- list_for_each_entry(pdd, &p->per_device_data, per_device_list) { -- /* -- * Remove all handles from idr and release appropriate -- * local memory object -- */ -- idr_for_each_entry(&pdd->alloc_idr, buf_obj, id) { -- list_for_each_entry(peer_pdd, &p->per_device_data, -- per_device_list) { -- peer_pdd->dev->kfd2kgd->unmap_memory_to_gpu( -- peer_pdd->dev->kgd, -- buf_obj->mem, peer_pdd->vm); -- } -- -- run_rdma_free_callback(buf_obj); -- pdd->dev->kfd2kgd->free_memory_of_gpu( -- pdd->dev->kgd, buf_obj->mem, pdd->vm); -- kfd_process_device_remove_obj_handle(pdd, id); -- } -- } --} -+ pr_debug("Releasing process (pasid %d) in workqueue\n", -+ p->pasid); - --static void kfd_process_destroy_pdds(struct kfd_process *p) --{ -- struct kfd_process_device *pdd, *temp; -+ mutex_lock(&p->mutex); - - list_for_each_entry_safe(pdd, temp, &p->per_device_data, -- per_device_list) { -- kfd_flush_tlb(pdd->dev, p->pasid); -- /* Destroy the GPUVM VM context */ -- if (pdd->vm) { -- dma_fence_put(p->ef); -- pdd->dev->kfd2kgd->destroy_process_vm( -- pdd->dev->kgd, pdd->vm); -- } -- list_del(&pdd->per_device_list); -+ per_device_list) { -+ pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", -+ pdd->dev->id, p->pasid); - -- if (pdd->qpd.cwsr_pages) { -- kunmap(pdd->qpd.cwsr_pages); -- __free_pages(pdd->qpd.cwsr_pages, -- get_order(pdd->dev->cwsr_size)); -- } -+ if (pdd->reset_wavefronts) -+ dbgdev_wave_reset_wavefronts(pdd->dev, p); - -- kfree(pdd->qpd.doorbell_bitmap); -- idr_destroy(&pdd->alloc_idr); -+ amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); -+ list_del(&pdd->per_device_list); - - kfree(pdd); - } --} -- --/* No process locking is needed in this function, because the process -- * is not findable any more. We must assume that no other thread is -- * using it any more, otherwise we couldn't safely free the process -- * structure in the end. 
-- */ --static void kfd_process_wq_release(struct work_struct *work) --{ -- struct kfd_process *p = container_of(work, struct kfd_process, -- release_work); --#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) -- struct kfd_process_device *pdd; -- -- pr_debug("Releasing process (pasid %d)\n", -- p->pasid); -- -- list_for_each_entry(pdd, &p->per_device_data, per_device_list) { -- pr_debug("Releasing pdd (topology id %d) for process (pasid %d)\n", -- pdd->dev->id, p->pasid); -- -- if (pdd->dev->device_info->is_need_iommu_device) { -- if (pdd->bound == PDD_BOUND) { -- amd_iommu_unbind_pasid(pdd->dev->pdev, -- p->pasid); -- pdd->bound = PDD_UNBOUND; -- } -- } -- } --#endif -- -- kfd_process_free_outstanding_kfd_bos(p); -- -- kfd_process_destroy_pdds(p); - - kfd_event_free_process(p); - - kfd_pasid_free(p->pasid); - -+ mutex_unlock(&p->mutex); -+ - mutex_destroy(&p->mutex); - -- put_task_struct(p->lead_thread); -+ kfree(p->queues); - - kfree(p); -+ -+ kfree(work); - } - --static void kfd_process_ref_release(struct kref *ref) -+static void kfd_process_destroy_delayed(struct rcu_head *rcu) - { -- struct kfd_process *p = container_of(ref, struct kfd_process, ref); -+ struct kfd_process_release_work *work; -+ struct kfd_process *p; - -- if (WARN_ON(!kfd_process_wq)) -- return; -+ BUG_ON(!kfd_process_wq); - -- INIT_WORK(&p->release_work, kfd_process_wq_release); -- queue_work(kfd_process_wq, &p->release_work); --} -+ p = container_of(rcu, struct kfd_process, rcu); -+ BUG_ON(atomic_read(&p->mm->mm_count) <= 0); - --static void kfd_process_destroy_delayed(struct rcu_head *rcu) --{ -- struct kfd_process *p = container_of(rcu, struct kfd_process, rcu); -+ mmdrop(p->mm); -+ -+ work = kmalloc(sizeof(struct kfd_process_release_work), GFP_ATOMIC); - -- kfd_unref_process(p); -+ if (work) { -+ INIT_WORK((struct work_struct *) work, kfd_process_wq_release); -+ work->p = p; -+ queue_work(kfd_process_wq, (struct work_struct *) work); -+ } - } - - static void kfd_process_notifier_release(struct mmu_notifier *mn, -@@ -421,19 +223,13 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, - { - struct kfd_process *p; - struct kfd_process_device *pdd = NULL; -- struct kfd_dev *dev = NULL; -- long status = -EFAULT; - - /* - * The kfd_process structure can not be free because the - * mmu_notifier srcu is read locked - */ - p = container_of(mn, struct kfd_process, mmu_notifier); -- if (WARN_ON(p->mm != mm)) -- return; -- -- cancel_delayed_work_sync(&p->eviction_work.dwork); -- cancel_delayed_work_sync(&p->restore_work); -+ BUG_ON(p->mm != mm); - - mutex_lock(&kfd_processes_mutex); - hash_del_rcu(&p->kfd_processes); -@@ -442,46 +238,33 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, - - mutex_lock(&p->mutex); - -- /* Iterate over all process device data structures and if the pdd is in -- * debug mode,we should first force unregistration, then we will be -- * able to destroy the queues -- */ -- list_for_each_entry(pdd, &p->per_device_data, per_device_list) { -- dev = pdd->dev; -- mutex_lock(kfd_get_dbgmgr_mutex()); -- -- if (dev && dev->dbgmgr && (dev->dbgmgr->pasid == p->pasid)) { -- -- status = kfd_dbgmgr_unregister(dev->dbgmgr, p); -- if (status == 0) { -- kfd_dbgmgr_destroy(dev->dbgmgr); -- dev->dbgmgr = NULL; -- } -- } -- mutex_unlock(kfd_get_dbgmgr_mutex()); -- } -- -- kfd_process_dequeue_from_all_devices(p); -- -- /* now we can uninit the pqm: */ -+ /* In case our notifier is called before IOMMU notifier */ - pqm_uninit(&p->pqm); - - /* Iterate over all 
process device data structure and check -- * if we should delete debug managers -+ * if we should delete debug managers and reset all wavefronts - */ - list_for_each_entry(pdd, &p->per_device_data, per_device_list) { - if ((pdd->dev->dbgmgr) && - (pdd->dev->dbgmgr->pasid == p->pasid)) - kfd_dbgmgr_destroy(pdd->dev->dbgmgr); - -+ if (pdd->reset_wavefronts) { -+ pr_warn("amdkfd: Resetting all wave fronts\n"); -+ dbgdev_wave_reset_wavefronts(pdd->dev, p); -+ pdd->reset_wavefronts = false; -+ } - } - -- /* Indicate to other users that MM is no longer valid */ -- p->mm = NULL; -- - mutex_unlock(&p->mutex); - -- mmu_notifier_unregister_no_release(&p->mmu_notifier, mm); -+ /* -+ * Because we drop mm_count inside kfd_process_destroy_delayed -+ * and because the mmu_notifier_unregister function also drop -+ * mm_count we need to take an extra count here. -+ */ -+ mmgrab(p->mm); -+ mmu_notifier_unregister_no_release(&p->mmu_notifier, p->mm); - mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed); - } - -@@ -489,68 +272,7 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = { - .release = kfd_process_notifier_release, - }; - --static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep) --{ -- int ret; -- unsigned long offset; -- struct kfd_process_device *temp, *pdd = NULL; -- struct kfd_dev *dev = NULL; -- struct qcm_process_device *qpd = NULL; -- void *kaddr; -- uint32_t flags = ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_NONPAGED | -- ALLOC_MEM_FLAGS_NO_SUBSTITUTE | -- ALLOC_MEM_FLAGS_READONLY | -- ALLOC_MEM_FLAGS_EXECUTE_ACCESS; -- -- list_for_each_entry_safe(pdd, temp, &p->per_device_data, -- per_device_list) { -- dev = pdd->dev; -- qpd = &pdd->qpd; -- if (!dev->cwsr_enabled || qpd->cwsr_kaddr) -- continue; -- if (qpd->cwsr_base) { -- /* cwsr_base is only set for DGPU */ -- ret = kfd_process_alloc_gpuvm(p, dev, qpd->cwsr_base, -- dev->cwsr_size, &kaddr, pdd, flags); -- if (!ret) { -- qpd->cwsr_kaddr = kaddr; -- qpd->tba_addr = qpd->cwsr_base; -- } else -- /* In case of error, the kfd_bos for some pdds -- * which are already allocated successfully -- * will be freed in upper level function -- * i.e. create_process(). -- */ -- return ret; -- } else { -- offset = (dev->id | -- KFD_MMAP_TYPE_RESERVED_MEM) << PAGE_SHIFT; -- qpd->tba_addr = (uint64_t)vm_mmap(filep, 0, -- dev->cwsr_size, PROT_READ | PROT_EXEC, -- MAP_SHARED, offset); -- -- if (IS_ERR_VALUE(qpd->tba_addr)) { -- pr_err("Failure to set tba address. 
error -%d.\n", -- (int)qpd->tba_addr); -- qpd->tba_addr = 0; -- qpd->cwsr_kaddr = NULL; -- return -ENOMEM; -- } -- } -- -- memcpy(qpd->cwsr_kaddr, kmap(dev->cwsr_pages), PAGE_SIZE); -- kunmap(dev->cwsr_pages); -- -- qpd->tma_addr = qpd->tba_addr + dev->tma_offset; -- pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n", -- qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr); -- } -- -- return 0; --} -- --static struct kfd_process *create_process(const struct task_struct *thread, -- struct file *filep) -+static struct kfd_process *create_process(const struct task_struct *thread) - { - struct kfd_process *process; - int err = -ENOMEM; -@@ -560,20 +282,22 @@ static struct kfd_process *create_process(const struct task_struct *thread, - if (!process) - goto err_alloc_process; - -- process->bo_interval_tree = RB_ROOT_CACHED; -+ process->queues = kmalloc_array(INITIAL_QUEUE_ARRAY_SIZE, -+ sizeof(process->queues[0]), GFP_KERNEL); -+ if (!process->queues) -+ goto err_alloc_queues; - - process->pasid = kfd_pasid_alloc(); - if (process->pasid == 0) - goto err_alloc_pasid; - -- kref_init(&process->ref); - mutex_init(&process->mutex); - - process->mm = thread->mm; - - /* register notifier */ - process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops; -- err = mmu_notifier_register(&process->mmu_notifier, process->mm); -+ err = __mmu_notifier_register(&process->mmu_notifier, process->mm); - if (err) - goto err_mmu_notifier; - -@@ -581,7 +305,8 @@ static struct kfd_process *create_process(const struct task_struct *thread, - (uintptr_t)process->mm); - - process->lead_thread = thread->group_leader; -- get_task_struct(process->lead_thread); -+ -+ process->queue_array_size = INITIAL_QUEUE_ARRAY_SIZE; - - INIT_LIST_HEAD(&process->per_device_data); - -@@ -597,28 +322,8 @@ static struct kfd_process *create_process(const struct task_struct *thread, - if (err != 0) - goto err_init_apertures; - -- err = kfd_process_reserve_ib_mem(process); -- if (err) -- goto err_reserve_ib_mem; -- err = kfd_process_init_cwsr(process, filep); -- if (err) -- goto err_init_cwsr; -- -- INIT_DELAYED_WORK(&process->eviction_work.dwork, kfd_evict_bo_worker); -- INIT_DELAYED_WORK(&process->restore_work, kfd_restore_bo_worker); -- process->last_restore_timestamp = get_jiffies_64(); -- -- /* If PeerDirect interface was not detected try to detect it again -- * in case if network driver was loaded later. 
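-- * Since create_process() runs for every new KFD process, the
-- * detection attempt below is effectively retried each time a
-- * process is created.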
-- */ -- kfd_init_peer_direct(); -- - return process; - --err_init_cwsr: --err_reserve_ib_mem: -- kfd_process_free_outstanding_kfd_bos(process); -- kfd_process_destroy_pdds(process); - err_init_apertures: - pqm_uninit(&process->pqm); - err_process_pqm_init: -@@ -629,36 +334,13 @@ static struct kfd_process *create_process(const struct task_struct *thread, - mutex_destroy(&process->mutex); - kfd_pasid_free(process->pasid); - err_alloc_pasid: -+ kfree(process->queues); -+err_alloc_queues: - kfree(process); - err_alloc_process: - return ERR_PTR(err); - } - --static int init_doorbell_bitmap(struct qcm_process_device *qpd, -- struct kfd_dev *dev) --{ -- unsigned int i; -- -- if (!KFD_IS_SOC15(dev->device_info->asic_family)) -- return 0; -- -- qpd->doorbell_bitmap = -- kzalloc(DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, -- BITS_PER_BYTE), GFP_KERNEL); -- if (!qpd->doorbell_bitmap) -- return -ENOMEM; -- -- /* Mask out any reserved doorbells */ -- for (i = 0; i < KFD_MAX_NUM_OF_QUEUES_PER_PROCESS; i++) -- if ((dev->shared_resources.reserved_doorbell_mask & i) == -- dev->shared_resources.reserved_doorbell_val) { -- set_bit(i, qpd->doorbell_bitmap); -- pr_debug("reserved doorbell 0x%03x\n", i); -- } -- -- return 0; --} -- - struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, - struct kfd_process *p) - { -@@ -666,9 +348,9 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, - - list_for_each_entry(pdd, &p->per_device_data, per_device_list) - if (pdd->dev == dev) -- return pdd; -+ break; - -- return NULL; -+ return pdd; - } - - struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, -@@ -677,41 +359,16 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, - struct kfd_process_device *pdd = NULL; - - pdd = kzalloc(sizeof(*pdd), GFP_KERNEL); -- if (!pdd) -- return NULL; -- -- pdd->dev = dev; -- INIT_LIST_HEAD(&pdd->qpd.queues_list); -- INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); -- pdd->qpd.dqm = dev->dqm; -- pdd->qpd.pqm = &p->pqm; -- pdd->qpd.evicted = 0; -- pdd->process = p; -- pdd->bound = PDD_UNBOUND; -- pdd->already_dequeued = false; -- list_add(&pdd->per_device_list, &p->per_device_data); -- -- /* Init idr used for memory handle translation */ -- idr_init(&pdd->alloc_idr); -- if (init_doorbell_bitmap(&pdd->qpd, dev)) { -- pr_err("Failed to init doorbell for process\n"); -- goto err_create_pdd; -+ if (pdd != NULL) { -+ pdd->dev = dev; -+ INIT_LIST_HEAD(&pdd->qpd.queues_list); -+ INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); -+ pdd->qpd.dqm = dev->dqm; -+ pdd->reset_wavefronts = false; -+ list_add(&pdd->per_device_list, &p->per_device_data); - } - -- /* Create the GPUVM context for this specific device */ -- if (dev->kfd2kgd->create_process_vm(dev->kgd, &pdd->vm, -- &p->process_info, &p->ef)) { -- pr_err("Failed to create process VM object\n"); -- goto err_create_pdd; -- } - return pdd; -- --err_create_pdd: -- kfree(pdd->qpd.doorbell_bitmap); -- idr_destroy(&pdd->alloc_idr); -- list_del(&pdd->per_device_list); -- kfree(pdd); -- return NULL; - } - - /* -@@ -725,6 +382,7 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, - struct kfd_process *p) - { - struct kfd_process_device *pdd; -+ int err; - - pdd = kfd_get_process_device_data(dev, p); - if (!pdd) { -@@ -732,89 +390,24 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, - return ERR_PTR(-ENOMEM); - } - -- if (pdd->bound == PDD_BOUND) -+ if (pdd->bound) - return pdd; - -- if (pdd->bound == 
PDD_BOUND_SUSPENDED) { -- pr_err("Binding PDD_BOUND_SUSPENDED pdd is unexpected!\n"); -- return ERR_PTR(-EINVAL); -- } -+ err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread); -+ if (err < 0) -+ return ERR_PTR(err); - --#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) -- if (dev->device_info->is_need_iommu_device) { -- int err = amd_iommu_bind_pasid(dev->pdev, p->pasid, -- p->lead_thread); -- if (err < 0) -- return ERR_PTR(err); -- } --#endif -- -- pdd->bound = PDD_BOUND; -+ pdd->bound = true; - - return pdd; - } - --#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) --int kfd_bind_processes_to_device(struct kfd_dev *dev) -+void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid) - { -- struct kfd_process_device *pdd; - struct kfd_process *p; -- unsigned int temp; -- int err = 0; -- -- int idx = srcu_read_lock(&kfd_processes_srcu); -- -- hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { -- mutex_lock(&p->mutex); -- pdd = kfd_get_process_device_data(dev, p); -- if (pdd->bound != PDD_BOUND_SUSPENDED) { -- mutex_unlock(&p->mutex); -- continue; -- } -- -- err = amd_iommu_bind_pasid(dev->pdev, p->pasid, -- p->lead_thread); -- if (err < 0) { -- pr_err("Unexpected pasid %d binding failure\n", -- p->pasid); -- mutex_unlock(&p->mutex); -- break; -- } -- -- pdd->bound = PDD_BOUND; -- mutex_unlock(&p->mutex); -- } -- -- srcu_read_unlock(&kfd_processes_srcu, idx); -- -- return err; --} -- --void kfd_unbind_processes_from_device(struct kfd_dev *dev) --{ - struct kfd_process_device *pdd; -- struct kfd_process *p; -- unsigned int temp; -- -- int idx = srcu_read_lock(&kfd_processes_srcu); -- -- -- hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { -- mutex_lock(&p->mutex); -- pdd = kfd_get_process_device_data(dev, p); - -- if (pdd->bound == PDD_BOUND) -- pdd->bound = PDD_BOUND_SUSPENDED; -- mutex_unlock(&p->mutex); -- } -- -- srcu_read_unlock(&kfd_processes_srcu, idx); --} -- --void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid) --{ -- struct kfd_process *p; -- struct kfd_process_device *pdd; -+ BUG_ON(dev == NULL); - - /* - * Look for the process that matches the pasid. If there is no such -@@ -827,43 +420,43 @@ void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid) - - pr_debug("Unbinding process %d from IOMMU\n", pasid); - -- mutex_lock(kfd_get_dbgmgr_mutex()); -+ if ((dev->dbgmgr) && (dev->dbgmgr->pasid == p->pasid)) -+ kfd_dbgmgr_destroy(dev->dbgmgr); - -- if (dev->dbgmgr && (dev->dbgmgr->pasid == p->pasid)) { -+ pqm_uninit(&p->pqm); - -- if (kfd_dbgmgr_unregister(dev->dbgmgr, p) == 0) { -- kfd_dbgmgr_destroy(dev->dbgmgr); -- dev->dbgmgr = NULL; -- } -- } -+ pdd = kfd_get_process_device_data(dev, p); - -- mutex_unlock(kfd_get_dbgmgr_mutex()); -+ if (!pdd) { -+ mutex_unlock(&p->mutex); -+ return; -+ } - -- mutex_lock(&p->mutex); -+ if (pdd->reset_wavefronts) { -+ dbgdev_wave_reset_wavefronts(pdd->dev, p); -+ pdd->reset_wavefronts = false; -+ } - -- pdd = kfd_get_process_device_data(dev, p); -- if (pdd) -- /* For GPU relying on IOMMU, we need to dequeue here -- * when PASID is still bound. -- */ -- kfd_process_dequeue_from_device(pdd); -+ /* -+ * Just mark pdd as unbound, because we still need it -+ * to call amd_iommu_unbind_pasid() in when the -+ * process exits. -+ * We don't call amd_iommu_unbind_pasid() here -+ * because the IOMMU called us. 
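-+ * The deferred unbind happens in kfd_process_wq_release(), which
-+ * calls amd_iommu_unbind_pasid() for each remaining pdd.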
-+ */ -+ pdd->bound = false; - - mutex_unlock(&p->mutex); -- -- kfd_unref_process(p); - } --#endif /* CONFIG_AMD_IOMMU_V2 */ - --struct kfd_process_device *kfd_get_first_process_device_data( -- struct kfd_process *p) -+struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p) - { - return list_first_entry(&p->per_device_data, - struct kfd_process_device, - per_device_list); - } - --struct kfd_process_device *kfd_get_next_process_device_data( -- struct kfd_process *p, -+struct kfd_process_device *kfd_get_next_process_device_data(struct kfd_process *p, - struct kfd_process_device *pdd) - { - if (list_is_last(&pdd->per_device_list, &p->per_device_data)) -@@ -876,272 +469,22 @@ bool kfd_has_process_device_data(struct kfd_process *p) - return !(list_empty(&p->per_device_data)); - } - --/* Create specific handle mapped to mem from process local memory idr -- * Assumes that the process lock is held. -- */ --int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, -- void *mem, uint64_t start, -- uint64_t length, -- struct kfd_ipc_obj *ipc_obj) --{ -- int handle; -- struct kfd_bo *buf_obj; -- struct kfd_process *p; -- -- p = pdd->process; -- -- buf_obj = kzalloc(sizeof(*buf_obj), GFP_KERNEL); -- -- if (!buf_obj) -- return -ENOMEM; -- -- buf_obj->it.start = start; -- buf_obj->it.last = start + length - 1; -- interval_tree_insert(&buf_obj->it, &p->bo_interval_tree); -- -- buf_obj->mem = mem; -- buf_obj->dev = pdd->dev; -- buf_obj->kfd_ipc_obj = ipc_obj; -- -- INIT_LIST_HEAD(&buf_obj->cb_data_head); -- -- idr_preload(GFP_KERNEL); -- -- handle = idr_alloc(&pdd->alloc_idr, buf_obj, MIN_IDR_ID, MAX_IDR_ID, -- GFP_NOWAIT); -- -- idr_preload_end(); -- -- if (handle < 0) -- kfree(buf_obj); -- -- return handle; --} -- --struct kfd_bo *kfd_process_device_find_bo(struct kfd_process_device *pdd, -- int handle) --{ -- if (handle < 0) -- return NULL; -- -- return (struct kfd_bo *)idr_find(&pdd->alloc_idr, handle); --} -- --/* Translate specific handle from process local memory idr -- * Assumes that the process lock is held. -- */ --void *kfd_process_device_translate_handle(struct kfd_process_device *pdd, -- int handle) --{ -- struct kfd_bo *buf_obj; -- -- buf_obj = kfd_process_device_find_bo(pdd, handle); -- -- return buf_obj->mem; --} -- --void *kfd_process_find_bo_from_interval(struct kfd_process *p, -- uint64_t start_addr, -- uint64_t last_addr) --{ -- struct interval_tree_node *it_node; -- struct kfd_bo *buf_obj; -- -- it_node = interval_tree_iter_first(&p->bo_interval_tree, -- start_addr, last_addr); -- if (!it_node) { -- pr_err("0x%llx-0x%llx does not relate to an existing buffer\n", -- start_addr, last_addr); -- return NULL; -- } -- -- if (interval_tree_iter_next(it_node, start_addr, last_addr)) { -- pr_err("0x%llx-0x%llx spans more than a single BO\n", -- start_addr, last_addr); -- return NULL; -- } -- -- buf_obj = container_of(it_node, struct kfd_bo, it); -- -- return buf_obj; --} -- --/* Remove specific handle from process local memory idr -- * Assumes that the process lock is held. 
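-- * Counterpart of kfd_process_device_create_obj_handle(): it drops any
-- * attached IPC object reference, removes the handle from the
-- * per-device IDR and the BO from the process interval tree, then
-- * frees the kfd_bo wrapper.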
-- */ --void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, -- int handle) --{ -- struct kfd_bo *buf_obj; -- struct kfd_process *p; -- -- p = pdd->process; -- -- if (handle < 0) -- return; -- -- buf_obj = kfd_process_device_find_bo(pdd, handle); -- -- if (buf_obj->kfd_ipc_obj) -- ipc_obj_put(&buf_obj->kfd_ipc_obj); -- -- idr_remove(&pdd->alloc_idr, handle); -- -- interval_tree_remove(&buf_obj->it, &p->bo_interval_tree); -- -- kfree(buf_obj); --} -- --/* This increments the process->ref counter. */ -+/* This returns with process->mutex locked. */ - struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) - { -- struct kfd_process *p, *ret_p = NULL; -+ struct kfd_process *p; - unsigned int temp; - - int idx = srcu_read_lock(&kfd_processes_srcu); - - hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { - if (p->pasid == pasid) { -- kref_get(&p->ref); -- ret_p = p; -+ mutex_lock(&p->mutex); - break; - } - } - - srcu_read_unlock(&kfd_processes_srcu, idx); - -- return ret_p; --} -- --void kfd_suspend_all_processes(void) --{ -- struct kfd_process *p; -- unsigned int temp; -- int idx = srcu_read_lock(&kfd_processes_srcu); -- -- hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { -- if (cancel_delayed_work_sync(&p->eviction_work.dwork)) -- dma_fence_put(p->eviction_work.quiesce_fence); -- cancel_delayed_work_sync(&p->restore_work); -- -- if (quiesce_process_mm(p)) -- pr_err("Failed to suspend process %d\n", p->pasid); -- dma_fence_signal(p->ef); -- dma_fence_put(p->ef); -- p->ef = NULL; -- } -- srcu_read_unlock(&kfd_processes_srcu, idx); --} -- --int kfd_resume_all_processes(void) --{ -- struct kfd_process *p; -- unsigned int temp; -- int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu); -- -- hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { -- if (!schedule_delayed_work(&p->restore_work, 0)) { -- pr_err("Restore process %d failed during resume\n", -- p->pasid); -- ret = -EFAULT; -- } -- } -- srcu_read_unlock(&kfd_processes_srcu, idx); -- return ret; --} -- --/* This increments the process->ref counter. 
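-- * As with the by-pid and by-pasid lookups, the caller owns the extra
-- * reference and must release it with kfd_unref_process().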
*/ --struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm) --{ -- struct kfd_process *p; -- -- int idx = srcu_read_lock(&kfd_processes_srcu); -- -- p = find_process_by_mm(mm); -- if (p) -- kref_get(&p->ref); -- -- srcu_read_unlock(&kfd_processes_srcu, idx); -- - return p; - } -- --int kfd_reserved_mem_mmap(struct kfd_process *process, -- struct vm_area_struct *vma) --{ -- unsigned long pfn, i; -- int ret = 0; -- struct kfd_dev *dev = kfd_device_by_id(vma->vm_pgoff); -- struct kfd_process_device *temp, *pdd = NULL; -- struct qcm_process_device *qpd = NULL; -- -- if (!dev) -- return -EINVAL; -- if (((vma->vm_end - vma->vm_start) != dev->cwsr_size) || -- (vma->vm_start & (PAGE_SIZE - 1)) || -- (vma->vm_end & (PAGE_SIZE - 1))) { -- pr_err("KFD only support page aligned memory map and correct size.\n"); -- return -EINVAL; -- } -- -- pr_debug("kfd reserved mem mmap been called.\n"); -- -- list_for_each_entry_safe(pdd, temp, &process->per_device_data, -- per_device_list) { -- if (dev == pdd->dev) { -- qpd = &pdd->qpd; -- break; -- } -- } -- if (!qpd) -- return -EINVAL; -- -- qpd->cwsr_pages = alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, -- get_order(dev->cwsr_size)); -- if (!qpd->cwsr_pages) { -- pr_err("amdkfd: error alloc CWSR isa memory per process.\n"); -- return -ENOMEM; -- } -- qpd->cwsr_kaddr = kmap(qpd->cwsr_pages); -- -- vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND -- | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP; -- for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); ++i) { -- pfn = page_to_pfn(&qpd->cwsr_pages[i]); -- /* mapping the page to user process */ -- ret = remap_pfn_range(vma, vma->vm_start + (i << PAGE_SHIFT), -- pfn, PAGE_SIZE, vma->vm_page_prot); -- if (ret) -- break; -- } -- return ret; --} -- --#if defined(CONFIG_DEBUG_FS) -- --int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data) --{ -- struct kfd_process *p; -- unsigned int temp; -- int r = 0; -- -- int idx = srcu_read_lock(&kfd_processes_srcu); -- -- hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { -- seq_printf(m, "Process %d PASID %d:\n", -- p->lead_thread->tgid, p->pasid); -- -- mutex_lock(&p->mutex); -- r = pqm_debugfs_mqds(m, &p->pqm); -- mutex_unlock(&p->mutex); -- -- if (r != 0) -- break; -- } -- -- srcu_read_unlock(&kfd_processes_srcu, idx); -- -- return r; --} -- --#endif -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c -index a87fcab..46f497e 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c -@@ -32,9 +32,12 @@ static inline struct process_queue_node *get_queue_by_qid( - { - struct process_queue_node *pqn; - -+ BUG_ON(!pqm); -+ - list_for_each_entry(pqn, &pqm->queues, process_queue_list) { -- if ((pqn->q && pqn->q->properties.queue_id == qid) || -- (pqn->kq && pqn->kq->queue->properties.queue_id == qid)) -+ if (pqn->q && pqn->q->properties.queue_id == qid) -+ return pqn; -+ if (pqn->kq && pqn->kq->queue->properties.queue_id == qid) - return pqn; - } - -@@ -46,13 +49,17 @@ static int find_available_queue_slot(struct process_queue_manager *pqm, - { - unsigned long found; - -+ BUG_ON(!pqm || !qid); -+ -+ pr_debug("kfd: in %s\n", __func__); -+ - found = find_first_zero_bit(pqm->queue_slot_bitmap, - KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); - -- pr_debug("The new slot id %lu\n", found); -+ pr_debug("kfd: the new slot id %lu\n", found); - - if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) { -- pr_info("Cannot open 
more queues for process with pasid %d\n", -+ pr_info("amdkfd: Can not open more queues for process with pasid %d\n", - pqm->process->pasid); - return -ENOMEM; - } -@@ -63,33 +70,15 @@ static int find_available_queue_slot(struct process_queue_manager *pqm, - return 0; - } - --void kfd_process_dequeue_from_device(struct kfd_process_device *pdd) --{ -- struct kfd_dev *dev = pdd->dev; -- int retval; -- -- if (pdd->already_dequeued) -- return; -- -- retval = dev->dqm->ops.process_termination(dev->dqm, &pdd->qpd); -- pdd->already_dequeued = true; --} -- --void kfd_process_dequeue_from_all_devices(struct kfd_process *p) --{ -- struct kfd_process_device *pdd; -- -- list_for_each_entry(pdd, &p->per_device_data, per_device_list) -- kfd_process_dequeue_from_device(pdd); --} -- - int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p) - { -+ BUG_ON(!pqm); -+ - INIT_LIST_HEAD(&pqm->queues); - pqm->queue_slot_bitmap = - kzalloc(DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, - BITS_PER_BYTE), GFP_KERNEL); -- if (!pqm->queue_slot_bitmap) -+ if (pqm->queue_slot_bitmap == NULL) - return -ENOMEM; - pqm->process = p; - -@@ -98,14 +87,25 @@ int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p) - - void pqm_uninit(struct process_queue_manager *pqm) - { -+ int retval; - struct process_queue_node *pqn, *next; - -+ BUG_ON(!pqm); -+ -+ pr_debug("In func %s\n", __func__); -+ - list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) { -- uninit_queue(pqn->q); -- list_del(&pqn->process_queue_list); -- kfree(pqn); -+ retval = pqm_destroy_queue( -+ pqm, -+ (pqn->q != NULL) ? -+ pqn->q->properties.queue_id : -+ pqn->kq->queue->properties.queue_id); -+ -+ if (retval != 0) { -+ pr_err("kfd: failed to destroy queue\n"); -+ return; -+ } - } -- - kfree(pqm->queue_slot_bitmap); - pqm->queue_slot_bitmap = NULL; - } -@@ -117,39 +117,54 @@ static int create_cp_queue(struct process_queue_manager *pqm, - { - int retval; - -+ retval = 0; -+ - /* Doorbell initialized in user space*/ - q_properties->doorbell_ptr = NULL; - -+ q_properties->doorbell_off = -+ kfd_queue_id_to_doorbell(dev, pqm->process, qid); -+ - /* let DQM handle it*/ - q_properties->vmid = 0; - q_properties->queue_id = qid; - - retval = init_queue(q, q_properties); - if (retval != 0) -- return retval; -+ goto err_init_queue; - - (*q)->device = dev; - (*q)->process = pqm->process; - -- pr_debug("PQM After init queue"); -+ pr_debug("kfd: PQM After init queue"); - - return retval; -+ -+err_init_queue: -+ return retval; - } - - int pqm_create_queue(struct process_queue_manager *pqm, - struct kfd_dev *dev, - struct file *f, - struct queue_properties *properties, -+ unsigned int flags, -+ enum kfd_queue_type type, - unsigned int *qid) - { - int retval; - struct kfd_process_device *pdd; -+ struct queue_properties q_properties; - struct queue *q; - struct process_queue_node *pqn; - struct kernel_queue *kq; -- enum kfd_queue_type type = properties->type; -- unsigned int max_queues = 127; /* HWS limit */ -+ int num_queues = 0; -+ struct queue *cur; -+ -+ BUG_ON(!pqm || !dev || !properties || !qid); - -+ memset(&q_properties, 0, sizeof(struct queue_properties)); -+ memcpy(&q_properties, properties, sizeof(struct queue_properties)); - q = NULL; - kq = NULL; - -@@ -165,21 +180,24 @@ int pqm_create_queue(struct process_queue_manager *pqm, - * If we are just about to create DIQ, the is_debug flag is not set yet - * Hence we also check the type as well - */ -- if ((pdd->qpd.is_debug) || (type == KFD_QUEUE_TYPE_DIQ)) -- max_queues = 
dev->device_info->max_no_of_hqd/2; -- -- if (pdd->qpd.queue_count >= max_queues) -- return -ENOSPC; -+ if ((pdd->qpd.is_debug) || -+ (type == KFD_QUEUE_TYPE_DIQ)) { -+ list_for_each_entry(cur, &pdd->qpd.queues_list, list) -+ num_queues++; -+ if (num_queues >= dev->device_info->max_no_of_hqd/2) -+ return (-ENOSPC); -+ } - - retval = find_available_queue_slot(pqm, qid); - if (retval != 0) - return retval; - -- if (list_empty(&pdd->qpd.queues_list) && -- list_empty(&pdd->qpd.priv_queue_list)) -+ if (list_empty(&pqm->queues)) { -+ pdd->qpd.pqm = pqm; - dev->dqm->ops.register_process(dev->dqm, &pdd->qpd); -+ } - -- pqn = kzalloc(sizeof(*pqn), GFP_KERNEL); -+ pqn = kzalloc(sizeof(struct process_queue_node), GFP_KERNEL); - if (!pqn) { - retval = -ENOMEM; - goto err_allocate_pqn; -@@ -187,35 +205,18 @@ int pqm_create_queue(struct process_queue_manager *pqm, - - switch (type) { - case KFD_QUEUE_TYPE_SDMA: -- if (dev->dqm->sdma_queue_count >= CIK_SDMA_QUEUES) { -- pr_err("Over-subscription is not allowed for SDMA\n"); -- retval = -EPERM; -- goto err_create_queue; -- } -- -- retval = create_cp_queue(pqm, dev, &q, properties, f, *qid); -- if (retval != 0) -- goto err_create_queue; -- pqn->q = q; -- pqn->kq = NULL; -- retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd, -- &q->properties.vmid); -- pr_debug("DQM returned %d for create_queue\n", retval); -- print_queue(q); -- break; - - case KFD_QUEUE_TYPE_COMPUTE: - /* check if there is over subscription */ -- if ((dev->dqm->sched_policy == -- KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) && -- ((dev->dqm->processes_count >= dev->vm_info.vmid_num_kfd) || -+ if ((sched_policy == KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) && -+ ((dev->dqm->processes_count >= VMID_PER_DEVICE) || - (dev->dqm->queue_count >= get_queues_num(dev->dqm)))) { -- pr_err("Over-subscription is not allowed in radeon_kfd.sched_policy == 1\n"); -+ pr_err("kfd: over-subscription is not allowed in radeon_kfd.sched_policy == 1\n"); - retval = -EPERM; - goto err_create_queue; - } - -- retval = create_cp_queue(pqm, dev, &q, properties, f, *qid); -+ retval = create_cp_queue(pqm, dev, &q, &q_properties, f, *qid); - if (retval != 0) - goto err_create_queue; - pqn->q = q; -@@ -227,7 +228,7 @@ int pqm_create_queue(struct process_queue_manager *pqm, - break; - case KFD_QUEUE_TYPE_DIQ: - kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_DIQ); -- if (!kq) { -+ if (kq == NULL) { - retval = -ENOMEM; - goto err_create_queue; - } -@@ -238,31 +239,23 @@ int pqm_create_queue(struct process_queue_manager *pqm, - kq, &pdd->qpd); - break; - default: -- WARN(1, "Invalid queue type %d", type); -- retval = -EINVAL; -+ BUG(); -+ break; - } - - if (retval != 0) { -- pr_err("DQM create queue failed\n"); -+ pr_debug("Error dqm create queue\n"); - goto err_create_queue; - } - -- if (q) -- /* Return the doorbell offset within the doorbell page -- * to the caller so it can be passed up to user mode -- * (in bytes). 
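-- * The DQM stores doorbell_off in dword units; multiplying by
-- * sizeof(uint32_t) converts it to bytes, and masking with the process
-- * doorbell slice size keeps the offset relative to this process's
-- * doorbell page.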
-- */ -- properties->doorbell_off = -- (q->properties.doorbell_off * sizeof(uint32_t)) & -- (kfd_doorbell_process_slice(dev) - 1); -- -- pr_debug("PQM After DQM create queue\n"); -+ pr_debug("kfd: PQM After DQM create queue\n"); - - list_add(&pqn->process_queue_list, &pqm->queues); - - if (q) { -- pr_debug("PQM done creating queue\n"); -- print_queue_properties(&q->properties); -+ *properties = q->properties; -+ pr_debug("kfd: PQM done creating queue\n"); -+ print_queue_properties(properties); - } - - return retval; -@@ -272,8 +265,7 @@ int pqm_create_queue(struct process_queue_manager *pqm, - err_allocate_pqn: - /* check if queues list is empty unregister process from device */ - clear_bit(*qid, pqm->queue_slot_bitmap); -- if (list_empty(&pdd->qpd.queues_list) && -- list_empty(&pdd->qpd.priv_queue_list)) -+ if (list_empty(&pqm->queues)) - dev->dqm->ops.unregister_process(dev->dqm, &pdd->qpd); - return retval; - } -@@ -288,11 +280,14 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) - - dqm = NULL; - -+ BUG_ON(!pqm); - retval = 0; - -+ pr_debug("kfd: In Func %s\n", __func__); -+ - pqn = get_queue_by_qid(pqm, qid); -- if (!pqn) { -- pr_err("Queue id does not match any known queue\n"); -+ if (pqn == NULL) { -+ pr_err("kfd: queue id does not match any known queue\n"); - return -EINVAL; - } - -@@ -301,8 +296,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) - dev = pqn->kq->dev; - if (pqn->q) - dev = pqn->q->device; -- if (WARN_ON(!dev)) -- return -ENODEV; -+ BUG_ON(!dev); - - pdd = kfd_get_process_device_data(dev, pqm->process); - if (!pdd) { -@@ -319,9 +313,10 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) - - if (pqn->q) { - dqm = pqn->q->device->dqm; -- kfree(pqn->q->properties.cu_mask); -- pqn->q->properties.cu_mask = NULL; - retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); -+ if (retval != 0) -+ return retval; -+ - uninit_queue(pqn->q); - } - -@@ -329,8 +324,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) - kfree(pqn); - clear_bit(qid, pqm->queue_slot_bitmap); - -- if (list_empty(&pdd->qpd.queues_list) && -- list_empty(&pdd->qpd.priv_queue_list)) -+ if (list_empty(&pqm->queues)) - dqm->ops.unregister_process(dqm, &pdd->qpd); - - return retval; -@@ -342,9 +336,12 @@ int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, - int retval; - struct process_queue_node *pqn; - -+ BUG_ON(!pqm); -+ - pqn = get_queue_by_qid(pqm, qid); - if (!pqn) { -- pr_debug("No queue %d exists for update operation\n", qid); -+ pr_debug("amdkfd: No queue %d exists for update operation\n", -+ qid); - return -EFAULT; - } - -@@ -361,40 +358,14 @@ int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, - return 0; - } - --int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, -- struct queue_properties *p) --{ -- int retval; -- struct process_queue_node *pqn; -- -- pqn = get_queue_by_qid(pqm, qid); -- if (!pqn) { -- pr_debug("No queue %d exists for update operation\n", qid); -- return -EFAULT; -- } -- -- /* Free the old CU mask memory if it is already allocated, then -- * allocate memory for the new CU mask. 
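-- * (Strictly, no allocation happens here: the stale buffer is freed
-- * and the queue takes ownership of the caller-supplied p->cu_mask
-- * pointer.)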
-- */ -- kfree(pqn->q->properties.cu_mask); -- -- pqn->q->properties.cu_mask_count = p->cu_mask_count; -- pqn->q->properties.cu_mask = p->cu_mask; -- -- retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm, -- pqn->q); -- if (retval != 0) -- return retval; -- -- return 0; --} -- - struct kernel_queue *pqm_get_kernel_queue( - struct process_queue_manager *pqm, - unsigned int qid) - { - struct process_queue_node *pqn; - -+ BUG_ON(!pqm); -+ - pqn = get_queue_by_qid(pqm, qid); - if (pqn && pqn->kq) - return pqn->kq; -@@ -402,89 +373,4 @@ struct kernel_queue *pqm_get_kernel_queue( - return NULL; - } - --int pqm_get_wave_state(struct process_queue_manager *pqm, -- unsigned int qid, -- void __user *ctl_stack, -- u32 *ctl_stack_used_size, -- u32 *save_area_used_size) --{ -- struct process_queue_node *pqn; -- -- pqn = get_queue_by_qid(pqm, qid); -- if (!pqn) { -- pr_debug("amdkfd: No queue %d exists for operation\n", -- qid); -- return -EFAULT; -- } -- -- return pqn->q->device->dqm->ops.get_wave_state(pqn->q->device->dqm, -- pqn->q, -- ctl_stack, -- ctl_stack_used_size, -- save_area_used_size); --} -- --#if defined(CONFIG_DEBUG_FS) -- --int pqm_debugfs_mqds(struct seq_file *m, void *data) --{ -- struct process_queue_manager *pqm = data; -- struct process_queue_node *pqn; -- struct queue *q; -- enum KFD_MQD_TYPE mqd_type; -- struct mqd_manager *mqd_manager; -- int r = 0; -- -- list_for_each_entry(pqn, &pqm->queues, process_queue_list) { -- if (pqn->q) { -- q = pqn->q; -- switch (q->properties.type) { -- case KFD_QUEUE_TYPE_SDMA: -- seq_printf(m, " SDMA queue on device %x\n", -- q->device->id); -- mqd_type = KFD_MQD_TYPE_SDMA; -- break; -- case KFD_QUEUE_TYPE_COMPUTE: -- seq_printf(m, " Compute queue on device %x\n", -- q->device->id); -- mqd_type = KFD_MQD_TYPE_CP; -- break; -- default: -- seq_printf(m, -- " Bad user queue type %d on device %x\n", -- q->properties.type, q->device->id); -- continue; -- } -- mqd_manager = q->device->dqm->ops.get_mqd_manager( -- q->device->dqm, mqd_type); -- } else if (pqn->kq) { -- q = pqn->kq->queue; -- mqd_manager = pqn->kq->mqd; -- switch (q->properties.type) { -- case KFD_QUEUE_TYPE_DIQ: -- seq_printf(m, " DIQ on device %x\n", -- pqn->kq->dev->id); -- mqd_type = KFD_MQD_TYPE_HIQ; -- break; -- default: -- seq_printf(m, -- " Bad kernel queue type %d on device %x\n", -- q->properties.type, -- pqn->kq->dev->id); -- continue; -- } -- } else { -- seq_printf(m, -- " Weird: Queue node with neither kernel nor user queue\n"); -- continue; -- } -- -- r = mqd_manager->debugfs_show_mqd(m, q->mqd); -- if (r != 0) -- break; -- } -- -- return r; --} - --#endif -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c -index a5315d4..0ab1970 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c -@@ -65,15 +65,17 @@ void print_queue(struct queue *q) - - int init_queue(struct queue **q, const struct queue_properties *properties) - { -- struct queue *tmp_q; -+ struct queue *tmp; - -- tmp_q = kzalloc(sizeof(*tmp_q), GFP_KERNEL); -- if (!tmp_q) -+ BUG_ON(!q); -+ -+ tmp = kzalloc(sizeof(struct queue), GFP_KERNEL); -+ if (!tmp) - return -ENOMEM; - -- memcpy(&tmp_q->properties, properties, sizeof(*properties)); -+ memcpy(&tmp->properties, properties, sizeof(struct queue_properties)); - -- *q = tmp_q; -+ *q = tmp; - return 0; - } - -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c b/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c -deleted file mode 100644 -index 2f5cdb9..0000000 ---- 
a/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c -+++ /dev/null -@@ -1,294 +0,0 @@ --/* -- * Copyright 2015 Advanced Micro Devices, Inc. -- * -- * Permission is hereby granted, free of charge, to any person obtaining a -- * copy of this software and associated documentation files (the "Software"), -- * to deal in the Software without restriction, including without limitation -- * the rights to use, copy, modify, merge, publish, distribute, sublicense, -- * and/or sell copies of the Software, and to permit persons to whom the -- * Software is furnished to do so, subject to the following conditions: -- * -- * The above copyright notice and this permission notice shall be included in -- * all copies or substantial portions of the Software. -- * -- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -- * OTHER DEALINGS IN THE SOFTWARE. -- */ -- --#include <linux/device.h> --#include <linux/export.h> --#include <linux/pid.h> --#include <linux/err.h> --#include <linux/slab.h> --#include "kfd_priv.h" -- -- --struct rdma_cb { -- struct list_head node; -- struct amd_p2p_info amd_p2p_data; -- void (*free_callback)(void *client_priv); -- void *client_priv; --}; -- --/** -- * This function makes the pages underlying a range of GPU virtual memory -- * accessible for DMA operations from another PCIe device -- * -- * \param address - The start address in the Unified Virtual Address -- * space in the specified process -- * \param length - The length of requested mapping -- * \param pid - Pointer to structure pid to which address belongs. -- * Could be NULL for current process address space. -- * \param p2p_data - On return: Pointer to structure describing -- * underlying pages/locations -- * \param free_callback - Pointer to callback which will be called when access -- * to such memory must be stopped immediately: Memory -- * was freed, GECC events, etc. -- * Client should immediately stop any transfer -- * operations and returned as soon as possible. -- * After return all resources associated with address -- * will be release and no access will be allowed. 
-- * \param client_priv - Pointer to be passed as parameter on -- * 'free_callback; -- * -- * \return 0 if operation was successful -- */ --static int get_pages(uint64_t address, uint64_t length, struct pid *pid, -- struct amd_p2p_info **amd_p2p_data, -- void (*free_callback)(void *client_priv), -- void *client_priv) --{ -- struct kfd_bo *buf_obj; -- struct kgd_mem *mem; -- struct sg_table *sg_table_tmp; -- struct kfd_dev *dev; -- uint64_t last = address + length - 1; -- uint64_t offset; -- struct kfd_process *p; -- struct rdma_cb *rdma_cb_data; -- int ret = 0; -- -- p = kfd_lookup_process_by_pid(pid); -- if (!p) { -- pr_err("Could not find the process\n"); -- return -EINVAL; -- } -- mutex_lock(&p->mutex); -- -- buf_obj = kfd_process_find_bo_from_interval(p, address, last); -- if (!buf_obj) { -- pr_err("Cannot find a kfd_bo for the range\n"); -- ret = -EINVAL; -- goto out; -- } -- -- rdma_cb_data = kmalloc(sizeof(*rdma_cb_data), GFP_KERNEL); -- if (!rdma_cb_data) { -- *amd_p2p_data = NULL; -- ret = -ENOMEM; -- goto out; -- } -- -- mem = buf_obj->mem; -- dev = buf_obj->dev; -- offset = address - buf_obj->it.start; -- -- ret = dev->kfd2kgd->pin_get_sg_table_bo(dev->kgd, mem, -- offset, length, &sg_table_tmp); -- -- if (ret) { -- pr_err("pin_get_sg_table_bo failed.\n"); -- *amd_p2p_data = NULL; -- goto free_mem; -- } -- -- rdma_cb_data->amd_p2p_data.va = address; -- rdma_cb_data->amd_p2p_data.size = length; -- rdma_cb_data->amd_p2p_data.pid = pid; -- rdma_cb_data->amd_p2p_data.priv = buf_obj; -- rdma_cb_data->amd_p2p_data.pages = sg_table_tmp; -- -- rdma_cb_data->free_callback = free_callback; -- rdma_cb_data->client_priv = client_priv; -- -- list_add(&rdma_cb_data->node, &buf_obj->cb_data_head); -- -- *amd_p2p_data = &rdma_cb_data->amd_p2p_data; -- -- goto out; -- --free_mem: -- kfree(rdma_cb_data); --out: -- mutex_unlock(&p->mutex); -- kfd_unref_process(p); -- -- return ret; --} -- --static int put_pages_helper(struct amd_p2p_info *p2p_data) --{ -- struct kfd_bo *buf_obj; -- struct kfd_dev *dev; -- struct sg_table *sg_table_tmp; -- struct rdma_cb *rdma_cb_data; -- -- if (!p2p_data) { -- pr_err("amd_p2p_info pointer is invalid.\n"); -- return -EINVAL; -- } -- -- rdma_cb_data = container_of(p2p_data, struct rdma_cb, amd_p2p_data); -- -- buf_obj = p2p_data->priv; -- dev = buf_obj->dev; -- sg_table_tmp = p2p_data->pages; -- -- list_del(&rdma_cb_data->node); -- kfree(rdma_cb_data); -- -- dev->kfd2kgd->unpin_put_sg_table_bo(buf_obj->mem, sg_table_tmp); -- -- -- return 0; --} -- --void run_rdma_free_callback(struct kfd_bo *buf_obj) --{ -- struct rdma_cb *tmp, *rdma_cb_data; -- -- list_for_each_entry_safe(rdma_cb_data, tmp, -- &buf_obj->cb_data_head, node) { -- if (rdma_cb_data->free_callback) -- rdma_cb_data->free_callback( -- rdma_cb_data->client_priv); -- -- put_pages_helper(&rdma_cb_data->amd_p2p_data); -- } --} -- --/** -- * -- * This function release resources previously allocated by get_pages() call. -- * -- * \param p_p2p_data - A pointer to pointer to amd_p2p_info entries -- * allocated by get_pages() call. 
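-- * On success the referenced pointer is set to NULL, so the entry
-- * cannot be accidentally reused after release.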
-- * -- * \return 0 if operation was successful -- */ --static int put_pages(struct amd_p2p_info **p_p2p_data) --{ -- struct kfd_process *p = NULL; -- int ret = 0; -- -- if (!(*p_p2p_data)) { -- pr_err("amd_p2p_info pointer is invalid.\n"); -- return -EINVAL; -- } -- -- p = kfd_lookup_process_by_pid((*p_p2p_data)->pid); -- if (!p) { -- pr_err("Could not find the process\n"); -- return -EINVAL; -- } -- -- ret = put_pages_helper(*p_p2p_data); -- -- if (!ret) -- *p_p2p_data = NULL; -- -- kfd_unref_process(p); -- -- return ret; --} -- --/** -- * Check if given address belongs to GPU address space. -- * -- * \param address - Address to check -- * \param pid - Process to which given address belongs. -- * Could be NULL if current one. -- * -- * \return 0 - This is not GPU address managed by AMD driver -- * 1 - This is GPU address managed by AMD driver -- */ --static int is_gpu_address(uint64_t address, struct pid *pid) --{ -- struct kfd_bo *buf_obj; -- struct kfd_process *p; -- -- p = kfd_lookup_process_by_pid(pid); -- if (!p) { -- pr_debug("Could not find the process\n"); -- return 0; -- } -- -- buf_obj = kfd_process_find_bo_from_interval(p, address, address); -- -- kfd_unref_process(p); -- if (!buf_obj) -- return 0; -- -- return 1; --} -- --/** -- * Return the single page size to be used when building scatter/gather table -- * for given range. -- * -- * \param address - Address -- * \param length - Range length -- * \param pid - Process id structure. Could be NULL if current one. -- * \param page_size - On return: Page size -- * -- * \return 0 if operation was successful -- */ --static int get_page_size(uint64_t address, uint64_t length, struct pid *pid, -- unsigned long *page_size) --{ -- /* -- * As local memory is always consecutive, we can assume the local -- * memory page size to be arbitrary. -- * Currently we assume the local memory page size to be the same -- * as system memory, which is 4KB. -- */ -- *page_size = PAGE_SIZE; -- -- return 0; --} -- -- --/** -- * Singleton object: rdma interface function pointers -- */ --static const struct amd_rdma_interface rdma_ops = { -- .get_pages = get_pages, -- .put_pages = put_pages, -- .is_gpu_address = is_gpu_address, -- .get_page_size = get_page_size, --}; -- --/** -- * amdkfd_query_rdma_interface - Return interface (function pointers table) for -- * rdma interface -- * -- * -- * \param interace - OUT: Pointer to interface -- * -- * \return 0 if operation was successful. 
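-- *
-- * A minimal client sketch, for illustration only; gpu_va, size,
-- * task_pid, my_free_cb and my_priv are placeholder names, not part
-- * of this file:
-- *
-- *   const struct amd_rdma_interface *ops;
-- *   struct amd_p2p_info *p2p = NULL;
-- *
-- *   if (amdkfd_query_rdma_interface(&ops) == 0 &&
-- *       ops->is_gpu_address(gpu_va, task_pid) &&
-- *       ops->get_pages(gpu_va, size, task_pid, &p2p,
-- *                      my_free_cb, my_priv) == 0) {
-- *           ... perform DMA against the sg_table in p2p->pages ...
-- *           ops->put_pages(&p2p);
-- *   }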
-- */ --int amdkfd_query_rdma_interface(const struct amd_rdma_interface **ops) --{ -- *ops = &rdma_ops; -- -- return 0; --} --EXPORT_SYMBOL(amdkfd_query_rdma_interface); -- -- -- -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c -index d08e3de..1e50647 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c -@@ -28,32 +28,27 @@ - #include <linux/hash.h> - #include <linux/cpufreq.h> - #include <linux/log2.h> --#include <linux/dmi.h> --#include <linux/atomic.h> - - #include "kfd_priv.h" - #include "kfd_crat.h" - #include "kfd_topology.h" --#include "kfd_device_queue_manager.h" - --/* topology_device_list - Master list of all topology devices */ - static struct list_head topology_device_list; -+static int topology_crat_parsed; - static struct kfd_system_properties sys_props; - - static DECLARE_RWSEM(topology_lock); --static atomic_t topology_crat_proximity_domain; - --struct kfd_topology_device *kfd_topology_device_by_proximity_domain( -- uint32_t proximity_domain) -+struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) - { - struct kfd_topology_device *top_dev; -- struct kfd_topology_device *device = NULL; -+ struct kfd_dev *device = NULL; - - down_read(&topology_lock); - - list_for_each_entry(top_dev, &topology_device_list, list) -- if (top_dev->proximity_domain == proximity_domain) { -- device = top_dev; -+ if (top_dev->gpu_id == gpu_id) { -+ device = top_dev->gpu; - break; - } - -@@ -62,7 +57,7 @@ struct kfd_topology_device *kfd_topology_device_by_proximity_domain( - return device; - } - --struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) -+struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) - { - struct kfd_topology_device *top_dev; - struct kfd_dev *device = NULL; -@@ -70,7 +65,7 @@ struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) - down_read(&topology_lock); - - list_for_each_entry(top_dev, &topology_device_list, list) -- if (top_dev->gpu_id == gpu_id) { -+ if (top_dev->gpu->pdev == pdev) { - device = top_dev->gpu; - break; - } -@@ -80,49 +75,300 @@ struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) - return device; - } - --struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) -+static int kfd_topology_get_crat_acpi(void *crat_image, size_t *size) - { -- struct kfd_topology_device *top_dev; -- struct kfd_dev *device = NULL; -+ struct acpi_table_header *crat_table; -+ acpi_status status; - -- down_read(&topology_lock); -+ if (!size) -+ return -EINVAL; - -- list_for_each_entry(top_dev, &topology_device_list, list) -- if (top_dev->gpu && top_dev->gpu->pdev == pdev) { -- device = top_dev->gpu; -+ /* -+ * Fetch the CRAT table from ACPI -+ */ -+ status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); -+ if (status == AE_NOT_FOUND) { -+ pr_warn("CRAT table not found\n"); -+ return -ENODATA; -+ } else if (ACPI_FAILURE(status)) { -+ const char *err = acpi_format_exception(status); -+ -+ pr_err("CRAT table error: %s\n", err); -+ return -EINVAL; -+ } -+ -+ if (*size >= crat_table->length && crat_image != NULL) -+ memcpy(crat_image, crat_table, crat_table->length); -+ -+ *size = crat_table->length; -+ -+ return 0; -+} -+ -+static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, -+ struct crat_subtype_computeunit *cu) -+{ -+ BUG_ON(!dev); -+ BUG_ON(!cu); -+ -+ dev->node_props.cpu_cores_count = cu->num_cpu_cores; -+ dev->node_props.cpu_core_id_base = cu->processor_id_low; -+ if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) -+ dev->node_props.capability |= 
HSA_CAP_ATS_PRESENT; -+ -+ pr_info("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, -+ cu->processor_id_low); -+} -+ -+static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, -+ struct crat_subtype_computeunit *cu) -+{ -+ BUG_ON(!dev); -+ BUG_ON(!cu); -+ -+ dev->node_props.simd_id_base = cu->processor_id_low; -+ dev->node_props.simd_count = cu->num_simd_cores; -+ dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; -+ dev->node_props.max_waves_per_simd = cu->max_waves_simd; -+ dev->node_props.wave_front_size = cu->wave_front_size; -+ dev->node_props.mem_banks_count = cu->num_banks; -+ dev->node_props.array_count = cu->num_arrays; -+ dev->node_props.cu_per_simd_array = cu->num_cu_per_array; -+ dev->node_props.simd_per_cu = cu->num_simd_per_cu; -+ dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; -+ if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) -+ dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; -+ pr_info("CU GPU: simds=%d id_base=%d\n", cu->num_simd_cores, -+ cu->processor_id_low); -+} -+ -+/* kfd_parse_subtype_cu is called when the topology mutex is already acquired */ -+static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu) -+{ -+ struct kfd_topology_device *dev; -+ int i = 0; -+ -+ BUG_ON(!cu); -+ -+ pr_info("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", -+ cu->proximity_domain, cu->hsa_capability); -+ list_for_each_entry(dev, &topology_device_list, list) { -+ if (cu->proximity_domain == i) { -+ if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) -+ kfd_populated_cu_info_cpu(dev, cu); -+ -+ if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) -+ kfd_populated_cu_info_gpu(dev, cu); - break; - } -+ i++; -+ } - -- up_read(&topology_lock); -+ return 0; -+} - -- return device; -+/* -+ * kfd_parse_subtype_mem is called when the topology mutex is -+ * already acquired -+ */ -+static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem) -+{ -+ struct kfd_mem_properties *props; -+ struct kfd_topology_device *dev; -+ int i = 0; -+ -+ BUG_ON(!mem); -+ -+ pr_info("Found memory entry in CRAT table with proximity_domain=%d\n", -+ mem->promixity_domain); -+ list_for_each_entry(dev, &topology_device_list, list) { -+ if (mem->promixity_domain == i) { -+ props = kfd_alloc_struct(props); -+ if (props == NULL) -+ return -ENOMEM; -+ -+ if (dev->node_props.cpu_cores_count == 0) -+ props->heap_type = HSA_MEM_HEAP_TYPE_FB_PRIVATE; -+ else -+ props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; -+ -+ if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) -+ props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; -+ if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) -+ props->flags |= HSA_MEM_FLAGS_NON_VOLATILE; -+ -+ props->size_in_bytes = -+ ((uint64_t)mem->length_high << 32) + -+ mem->length_low; -+ props->width = mem->width; -+ -+ dev->mem_bank_count++; -+ list_add_tail(&props->list, &dev->mem_props); -+ -+ break; -+ } -+ i++; -+ } -+ -+ return 0; - } - --struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd) -+/* -+ * kfd_parse_subtype_cache is called when the topology mutex -+ * is already acquired -+ */ -+static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache) - { -- struct kfd_topology_device *top_dev; -- struct kfd_dev *device = NULL; -+ struct kfd_cache_properties *props; -+ struct kfd_topology_device *dev; -+ uint32_t id; - -- down_read(&topology_lock); -+ BUG_ON(!cache); -+ -+ id = cache->processor_id_low; -+ -+ pr_info("Found cache entry in CRAT table with processor_id=%d\n", id); -+ list_for_each_entry(dev, &topology_device_list, list) -+ if (id == 
dev->node_props.cpu_core_id_base || -+ id == dev->node_props.simd_id_base) { -+ props = kfd_alloc_struct(props); -+ if (props == NULL) -+ return -ENOMEM; -+ -+ props->processor_id_low = id; -+ props->cache_level = cache->cache_level; -+ props->cache_size = cache->cache_size; -+ props->cacheline_size = cache->cache_line_size; -+ props->cachelines_per_tag = cache->lines_per_tag; -+ props->cache_assoc = cache->associativity; -+ props->cache_latency = cache->cache_latency; -+ -+ if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) -+ props->cache_type |= HSA_CACHE_TYPE_DATA; -+ if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) -+ props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; -+ if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) -+ props->cache_type |= HSA_CACHE_TYPE_CPU; -+ if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) -+ props->cache_type |= HSA_CACHE_TYPE_HSACU; -+ -+ dev->cache_count++; -+ dev->node_props.caches_count++; -+ list_add_tail(&props->list, &dev->cache_props); - -- list_for_each_entry(top_dev, &topology_device_list, list) -- if (top_dev->gpu && top_dev->gpu->kgd == kgd) { -- device = top_dev->gpu; - break; - } - -- up_read(&topology_lock); -+ return 0; -+} - -- return device; -+/* -+ * kfd_parse_subtype_iolink is called when the topology mutex -+ * is already acquired -+ */ -+static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink) -+{ -+ struct kfd_iolink_properties *props; -+ struct kfd_topology_device *dev; -+ uint32_t i = 0; -+ uint32_t id_from; -+ uint32_t id_to; -+ -+ BUG_ON(!iolink); -+ -+ id_from = iolink->proximity_domain_from; -+ id_to = iolink->proximity_domain_to; -+ -+ pr_info("Found IO link entry in CRAT table with id_from=%d\n", id_from); -+ list_for_each_entry(dev, &topology_device_list, list) { -+ if (id_from == i) { -+ props = kfd_alloc_struct(props); -+ if (props == NULL) -+ return -ENOMEM; -+ -+ props->node_from = id_from; -+ props->node_to = id_to; -+ props->ver_maj = iolink->version_major; -+ props->ver_min = iolink->version_minor; -+ -+ /* -+ * weight factor (derived from CDIR), currently always 1 -+ */ -+ props->weight = 1; -+ -+ props->min_latency = iolink->minimum_latency; -+ props->max_latency = iolink->maximum_latency; -+ props->min_bandwidth = iolink->minimum_bandwidth_mbs; -+ props->max_bandwidth = iolink->maximum_bandwidth_mbs; -+ props->rec_transfer_size = -+ iolink->recommended_transfer_size; -+ -+ dev->io_link_count++; -+ dev->node_props.io_links_count++; -+ list_add_tail(&props->list, &dev->io_link_props); -+ -+ break; -+ } -+ i++; -+ } -+ -+ return 0; -+} -+ -+static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr) -+{ -+ struct crat_subtype_computeunit *cu; -+ struct crat_subtype_memory *mem; -+ struct crat_subtype_cache *cache; -+ struct crat_subtype_iolink *iolink; -+ int ret = 0; -+ -+ BUG_ON(!sub_type_hdr); -+ -+ switch (sub_type_hdr->type) { -+ case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: -+ cu = (struct crat_subtype_computeunit *)sub_type_hdr; -+ ret = kfd_parse_subtype_cu(cu); -+ break; -+ case CRAT_SUBTYPE_MEMORY_AFFINITY: -+ mem = (struct crat_subtype_memory *)sub_type_hdr; -+ ret = kfd_parse_subtype_mem(mem); -+ break; -+ case CRAT_SUBTYPE_CACHE_AFFINITY: -+ cache = (struct crat_subtype_cache *)sub_type_hdr; -+ ret = kfd_parse_subtype_cache(cache); -+ break; -+ case CRAT_SUBTYPE_TLB_AFFINITY: -+ /* -+ * For now, nothing to do here -+ */ -+ pr_info("Found TLB entry in CRAT table (not processing)\n"); -+ break; -+ case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: -+ /* -+ * For now, nothing to do here -+ */ -+ pr_info("Found 
CCOMPUTE entry in CRAT table (not processing)\n");
-+ break;
-+ case CRAT_SUBTYPE_IOLINK_AFFINITY:
-+ iolink = (struct crat_subtype_iolink *)sub_type_hdr;
-+ ret = kfd_parse_subtype_iolink(iolink);
-+ break;
-+ default:
-+ pr_warn("Unknown subtype (%d) in CRAT\n",
-+ sub_type_hdr->type);
-+ }
-+
-+ return ret;
- }
- 
--/* Called with write topology_lock acquired */
- static void kfd_release_topology_device(struct kfd_topology_device *dev)
- {
- struct kfd_mem_properties *mem;
- struct kfd_cache_properties *cache;
- struct kfd_iolink_properties *iolink;
-- struct kfd_perf_properties *perf;
-+
-+ BUG_ON(!dev);
- 
- list_del(&dev->list);
- 
-@@ -147,40 +393,30 @@ static void kfd_release_topology_device(struct kfd_topology_device *dev)
- kfree(iolink);
- }
- 
-- while (dev->perf_props.next != &dev->perf_props) {
-- perf = container_of(dev->perf_props.next,
-- struct kfd_perf_properties, list);
-- list_del(&perf->list);
-- kfree(perf);
-- }
--
- kfree(dev);
-+
-+ sys_props.num_devices--;
- }
- 
--void kfd_release_topology_device_list(struct list_head *device_list)
-+static void kfd_release_live_view(void)
- {
- struct kfd_topology_device *dev;
- 
-- while (!list_empty(device_list)) {
-- dev = list_first_entry(device_list,
-- struct kfd_topology_device, list);
-+ while (topology_device_list.next != &topology_device_list) {
-+ dev = container_of(topology_device_list.next,
-+ struct kfd_topology_device, list);
- kfd_release_topology_device(dev);
-- }
- }
- 
--static void kfd_release_live_view(void)
--{
-- kfd_release_topology_device_list(&topology_device_list);
- memset(&sys_props, 0, sizeof(sys_props));
- }
- 
--struct kfd_topology_device *kfd_create_topology_device(
-- struct list_head *device_list)
-+static struct kfd_topology_device *kfd_create_topology_device(void)
- {
- struct kfd_topology_device *dev;
- 
- dev = kfd_alloc_struct(dev);
-- if (!dev) {
-+ if (dev == NULL) {
- pr_err("No memory to allocate a topology device");
- return NULL;
- }
-@@ -188,13 +424,66 @@ struct kfd_topology_device *kfd_create_topology_device(
- INIT_LIST_HEAD(&dev->mem_props);
- INIT_LIST_HEAD(&dev->cache_props);
- INIT_LIST_HEAD(&dev->io_link_props);
-- INIT_LIST_HEAD(&dev->perf_props);
- 
-- list_add_tail(&dev->list, device_list);
-+ list_add_tail(&dev->list, &topology_device_list);
-+ sys_props.num_devices++;
- 
- return dev;
- }
- 
-+static int kfd_parse_crat_table(void *crat_image)
-+{
-+ struct kfd_topology_device *top_dev;
-+ struct crat_subtype_generic *sub_type_hdr;
-+ uint16_t node_id;
-+ int ret;
-+ struct crat_header *crat_table = (struct crat_header *)crat_image;
-+ uint16_t num_nodes;
-+ uint32_t image_len;
-+
-+ if (!crat_image)
-+ return -EINVAL;
-+
-+ num_nodes = crat_table->num_domains;
-+ image_len = crat_table->length;
-+
-+ pr_info("Parsing CRAT table with %d nodes\n", num_nodes);
-+
-+ for (node_id = 0; node_id < num_nodes; node_id++) {
-+ top_dev = kfd_create_topology_device();
-+ if (!top_dev) {
-+ kfd_release_live_view();
-+ return -ENOMEM;
-+ }
-+ }
-+
-+ sys_props.platform_id =
-+ (*((uint64_t *)crat_table->oem_id)) & CRAT_OEMID_64BIT_MASK;
-+ sys_props.platform_oem = *((uint64_t *)crat_table->oem_table_id);
-+ sys_props.platform_rev = crat_table->revision;
-+
-+ sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);
-+ while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) <
-+ ((char *)crat_image) + image_len) {
-+ if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) {
-+ ret = kfd_parse_subtype(sub_type_hdr);
-+ if (ret != 0) {
-+ kfd_release_live_view();
-+ return ret;
-+ }
-+ }
-+
-+ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
-+ sub_type_hdr->length);
-+ }
-+
-+ sys_props.generation_count++;
-+ topology_crat_parsed = 1;
-+
-+ return 0;
-+}
-+
-+
- #define sysfs_show_gen_prop(buffer, fmt, ...) \
- snprintf(buffer, PAGE_SIZE, "%s"fmt, buffer, __VA_ARGS__)
- #define sysfs_show_32bit_prop(buffer, name, value) \
-@@ -203,8 +492,6 @@ struct kfd_topology_device *kfd_create_topology_device(
- sysfs_show_gen_prop(buffer, "%s %llu\n", name, value)
- #define sysfs_show_32bit_val(buffer, value) \
- sysfs_show_gen_prop(buffer, "%u\n", value)
--#define sysfs_show_64bit_val(buffer, value) \
-- sysfs_show_gen_prop(buffer, "%llu\n", value)
- #define sysfs_show_str_val(buffer, value) \
- sysfs_show_gen_prop(buffer, "%s\n", value)
- 
-@@ -232,17 +519,11 @@ static ssize_t sysprops_show(struct kobject *kobj, struct attribute *attr,
- return ret;
- }
- 
--static void kfd_topology_kobj_release(struct kobject *kobj)
--{
-- kfree(kobj);
--}
--
- static const struct sysfs_ops sysprops_ops = {
- .show = sysprops_show,
- };
- 
- static struct kobj_type sysprops_type = {
-- .release = kfd_topology_kobj_release,
- .sysfs_ops = &sysprops_ops,
- };
- 
-@@ -278,7 +559,6 @@ static const struct sysfs_ops iolink_ops = {
- };
- 
- static struct kobj_type iolink_type = {
-- .release = kfd_topology_kobj_release,
- .sysfs_ops = &iolink_ops,
- };
- 
-@@ -287,23 +567,11 @@ static ssize_t mem_show(struct kobject *kobj, struct attribute *attr,
- {
- ssize_t ret;
- struct kfd_mem_properties *mem;
-- uint64_t used_mem;
- 
- /* Making sure that the buffer is an empty string */
- buffer[0] = 0;
- 
-- if (strcmp(attr->name, "used_memory") == 0) {
-- mem = container_of(attr, struct kfd_mem_properties,
-- attr_used);
-- if (mem->gpu) {
-- used_mem = mem->gpu->kfd2kgd->get_vram_usage(mem->gpu->kgd);
-- return sysfs_show_64bit_val(buffer, used_mem);
-- }
-- /* TODO: Report APU/CPU-allocated memory; For now return 0 */
-- return 0;
-- }
--
-- mem = container_of(attr, struct kfd_mem_properties, attr_props);
-+ mem = container_of(attr, struct kfd_mem_properties, attr);
- sysfs_show_32bit_prop(buffer, "heap_type", mem->heap_type);
- sysfs_show_64bit_prop(buffer, "size_in_bytes", mem->size_in_bytes);
- sysfs_show_32bit_prop(buffer, "flags", mem->flags);
-@@ -318,7 +586,6 @@ static const struct sysfs_ops mem_ops = {
- };
- 
- static struct kobj_type mem_type = {
-- .release = kfd_topology_kobj_release,
- .sysfs_ops = &mem_ops,
- };
- 
-@@ -326,7 +593,7 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr,
- char *buffer)
- {
- ssize_t ret;
-- uint32_t i, j;
-+ uint32_t i;
- struct kfd_cache_properties *cache;
- 
- /* Making sure that the buffer is an empty string */
-@@ -344,18 +611,12 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr,
- sysfs_show_32bit_prop(buffer, "latency", cache->cache_latency);
- sysfs_show_32bit_prop(buffer, "type", cache->cache_type);
- snprintf(buffer, PAGE_SIZE, "%ssibling_map ", buffer);
-- for (i = 0; i < CRAT_SIBLINGMAP_SIZE; i++)
-- for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++) {
-- /* Check each bit */
-- if (cache->sibling_map[i] & (1 << j))
-- ret = snprintf(buffer, PAGE_SIZE,
-- "%s%d%s", buffer, 1, ",");
-- else
-- ret = snprintf(buffer, PAGE_SIZE,
-- "%s%d%s", buffer, 0, ",");
-- }
-- /* Replace the last "," with end of line */
-- *(buffer + strlen(buffer) - 1) = 0xA;
-+ for (i = 0; i < KFD_TOPOLOGY_CPU_SIBLINGS; i++)
-+ ret = snprintf(buffer, PAGE_SIZE, "%s%d%s",
-+ buffer, cache->sibling_map[i],
-+ (i == KFD_TOPOLOGY_CPU_SIBLINGS-1) ?
-+ "\n" : ",");
-+
- return ret;
- }
- 
-@@ -364,43 +625,9 @@ static const struct sysfs_ops cache_ops = {
- };
- 
- static struct kobj_type cache_type = {
-- .release = kfd_topology_kobj_release,
- .sysfs_ops = &cache_ops,
- };
- 
--/****** Sysfs of Performance Counters ******/
--
--struct kfd_perf_attr {
-- struct kobj_attribute attr;
-- uint32_t data;
--};
--
--static ssize_t perf_show(struct kobject *kobj, struct kobj_attribute *attrs,
-- char *buf)
--{
-- struct kfd_perf_attr *attr;
--
-- buf[0] = 0;
-- attr = container_of(attrs, struct kfd_perf_attr, attr);
-- if (!attr->data) /* invalid data for PMC */
-- return 0;
-- else
-- return sysfs_show_32bit_val(buf, attr->data);
--}
--
--#define KFD_PERF_DESC(_name, _data) \
--{ \
-- .attr = __ATTR(_name, 0444, perf_show, NULL), \
-- .data = _data, \
--}
--
--static struct kfd_perf_attr perf_attr_iommu[] = {
-- KFD_PERF_DESC(max_concurrent, 0),
-- KFD_PERF_DESC(num_counters, 0),
-- KFD_PERF_DESC(counter_ids, 0),
--};
--/****************************************/
--
- static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
- char *buffer)
- {
-@@ -408,7 +635,6 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
- char public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE];
- uint32_t i;
- uint32_t log_max_watch_addr;
-- struct kfd_local_mem_info local_mem_info;
- 
- /* Making sure that the buffer is an empty string */
- buffer[0] = 0;
-@@ -438,8 +664,18 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
- dev->node_props.cpu_cores_count);
- sysfs_show_32bit_prop(buffer, "simd_count",
- dev->node_props.simd_count);
-- sysfs_show_32bit_prop(buffer, "mem_banks_count",
-- dev->node_props.mem_banks_count);
-+
-+ if (dev->mem_bank_count < dev->node_props.mem_banks_count) {
-+ pr_info_once("kfd: mem_banks_count truncated from %d to %d\n",
-+ dev->node_props.mem_banks_count,
-+ dev->mem_bank_count);
-+ sysfs_show_32bit_prop(buffer, "mem_banks_count",
-+ dev->mem_bank_count);
-+ } else {
-+ sysfs_show_32bit_prop(buffer, "mem_banks_count",
-+ dev->node_props.mem_banks_count);
-+ }
-+
- sysfs_show_32bit_prop(buffer, "caches_count",
- dev->node_props.caches_count);
- sysfs_show_32bit_prop(buffer, "io_links_count",
-@@ -487,28 +723,17 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
- HSA_CAP_WATCH_POINTS_TOTALBITS_MASK);
- }
- 
-- if (dev->gpu->device_info->asic_family == CHIP_TONGA)
-- dev->node_props.capability |=
-- HSA_CAP_AQL_QUEUE_DOUBLE_MAP;
--
- sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute",
-- dev->node_props.max_engine_clk_fcompute);
-+ dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz(
-+ dev->gpu->kgd));
- 
-- /*
-- * If the ASIC is CZ, set local memory size to 0 to disable
-- * local memory support
-- */
-- if (dev->gpu->device_info->asic_family != CHIP_CARRIZO) {
-- dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd,
-- &local_mem_info);
-- sysfs_show_64bit_prop(buffer, "local_mem_size",
-- local_mem_info.local_mem_size_private +
-- local_mem_info.local_mem_size_public);
-- } else
-- sysfs_show_64bit_prop(buffer, "local_mem_size", 0ULL);
-+ sysfs_show_64bit_prop(buffer, "local_mem_size",
-+ (unsigned long long int) 0);
- 
- sysfs_show_32bit_prop(buffer, "fw_version",
-- dev->gpu->mec_fw_version);
-+ dev->gpu->kfd2kgd->get_fw_version(
-+ dev->gpu->kgd,
-+ KGD_ENGINE_MEC1));
- sysfs_show_32bit_prop(buffer, "capability",
- dev->node_props.capability);
- }
-@@ -522,7 +747,6 @@ static const struct sysfs_ops node_ops = {
- };
- 
- static struct kobj_type node_type = {
-- .release = kfd_topology_kobj_release,
- .sysfs_ops = &node_ops,
- };
- 
-@@ -538,7 +762,8 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev)
- struct kfd_iolink_properties *iolink;
- struct kfd_cache_properties *cache;
- struct kfd_mem_properties *mem;
-- struct kfd_perf_properties *perf;
-+
-+ BUG_ON(!dev);
- 
- if (dev->kobj_iolink) {
- list_for_each_entry(iolink, &dev->io_link_props, list)
-@@ -567,12 +792,7 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev)
- if (dev->kobj_mem) {
- list_for_each_entry(mem, &dev->mem_props, list)
- if (mem->kobj) {
-- /* TODO: Remove when CPU/APU supported */
-- if (dev->node_props.cpu_cores_count == 0)
-- sysfs_remove_file(mem->kobj,
-- &mem->attr_used);
-- kfd_remove_sysfs_file(mem->kobj,
-- &mem->attr_props);
-+ kfd_remove_sysfs_file(mem->kobj, &mem->attr);
- mem->kobj = NULL;
- }
- kobject_del(dev->kobj_mem);
-@@ -580,16 +800,6 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev)
- dev->kobj_mem = NULL;
- }
- 
-- if (dev->kobj_perf) {
-- list_for_each_entry(perf, &dev->perf_props, list) {
-- kfree(perf->attr_group);
-- perf->attr_group = NULL;
-- }
-- kobject_del(dev->kobj_perf);
-- kobject_put(dev->kobj_perf);
-- dev->kobj_perf = NULL;
-- }
--
- if (dev->kobj_node) {
- sysfs_remove_file(dev->kobj_node, &dev->attr_gpuid);
- sysfs_remove_file(dev->kobj_node, &dev->attr_name);
-@@ -606,18 +816,15 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
- struct kfd_iolink_properties *iolink;
- struct kfd_cache_properties *cache;
- struct kfd_mem_properties *mem;
-- struct kfd_perf_properties *perf;
-- uint32_t num_attrs;
-- struct attribute **attrs;
- int ret;
- uint32_t i;
- 
-- if (WARN_ON(dev->kobj_node))
-- return -EEXIST;
-+ BUG_ON(!dev);
- 
- /*
- * Creating the sysfs folders
- */
-+ BUG_ON(dev->kobj_node);
- dev->kobj_node = kfd_alloc_struct(dev->kobj_node);
- if (!dev->kobj_node)
- return -ENOMEM;
-@@ -639,10 +846,6 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
- if (!dev->kobj_iolink)
- return -ENOMEM;
- 
-- dev->kobj_perf = kobject_create_and_add("perf", dev->kobj_node);
-- if (!dev->kobj_perf)
-- return -ENOMEM;
--
- /*
- * Creating sysfs files for node properties
- */
-@@ -675,23 +878,12 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
- if (ret < 0)
- return ret;
- 
-- mem->attr_props.name = "properties";
-- mem->attr_props.mode = KFD_SYSFS_FILE_MODE;
-- sysfs_attr_init(&mem->attr_props);
-- ret = sysfs_create_file(mem->kobj, &mem->attr_props);
-+ mem->attr.name = "properties";
-+ mem->attr.mode = KFD_SYSFS_FILE_MODE;
-+ sysfs_attr_init(&mem->attr);
-+ ret = sysfs_create_file(mem->kobj, &mem->attr);
- if (ret < 0)
- return ret;
--
-- /* TODO: Support APU/CPU memory usage */
-- if (dev->node_props.cpu_cores_count == 0) {
-- mem->attr_used.name = "used_memory";
-- mem->attr_used.mode = KFD_SYSFS_FILE_MODE;
-- sysfs_attr_init(&mem->attr_used);
-- ret = sysfs_create_file(mem->kobj, &mem->attr_used);
-- if (ret < 0)
-- return ret;
-- }
--
- i++;
- }
- 
-@@ -731,38 +923,11 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
- if (ret < 0)
- return ret;
- i++;
-- }
--
-- /* All hardware blocks have the same number of attributes. */
-- num_attrs = sizeof(perf_attr_iommu)/sizeof(struct kfd_perf_attr);
-- list_for_each_entry(perf, &dev->perf_props, list) {
-- perf->attr_group = kzalloc(sizeof(struct kfd_perf_attr)
-- * num_attrs + sizeof(struct attribute_group),
-- GFP_KERNEL);
-- if (!perf->attr_group)
-- return -ENOMEM;
--
-- attrs = (struct attribute **)(perf->attr_group + 1);
-- if (!strcmp(perf->block_name, "iommu")) {
-- /* Information of IOMMU's num_counters and counter_ids is shown
-- * under /sys/bus/event_source/devices/amd_iommu. We don't
-- * duplicate here.
-- */
-- perf_attr_iommu[0].data = perf->max_concurrent;
-- for (i = 0; i < num_attrs; i++)
-- attrs[i] = &perf_attr_iommu[i].attr.attr;
-- }
-- perf->attr_group->name = perf->block_name;
-- perf->attr_group->attrs = attrs;
-- ret = sysfs_create_group(dev->kobj_perf, perf->attr_group);
-- if (ret < 0)
-- return ret;
-- }
-+}
- 
- return 0;
- }
- 
--/* Called with write topology lock acquired */
- static int kfd_build_sysfs_node_tree(void)
- {
- struct kfd_topology_device *dev;
-@@ -779,7 +944,6 @@ static int kfd_build_sysfs_node_tree(void)
- return 0;
- }
- 
--/* Called with write topology lock acquired */
- static void kfd_remove_sysfs_node_tree(void)
- {
- struct kfd_topology_device *dev;
-@@ -793,7 +957,7 @@ static int kfd_topology_update_sysfs(void)
- int ret;
- 
- pr_info("Creating topology SYSFS entries\n");
-- if (!sys_props.kobj_topology) {
-+ if (sys_props.kobj_topology == NULL) {
- sys_props.kobj_topology =
- kfd_alloc_struct(sys_props.kobj_topology);
- if (!sys_props.kobj_topology)
-@@ -851,251 +1015,75 @@ static void kfd_topology_release_sysfs(void)
- }
- }
- 
--/* Called with write topology_lock acquired */
--static void kfd_topology_update_device_list(struct list_head *temp_list,
-- struct list_head *master_list)
--{
-- while (!list_empty(temp_list)) {
-- list_move_tail(temp_list->next, master_list);
-- sys_props.num_devices++;
-- }
--}
--
--static void kfd_debug_print_topology(void)
--{
-- struct kfd_topology_device *dev;
--
-- down_read(&topology_lock);
--
-- dev = list_last_entry(&topology_device_list,
-- struct kfd_topology_device, list);
-- if (dev) {
-- if (dev->node_props.cpu_cores_count &&
-- dev->node_props.simd_count) {
-- pr_info("Topology: Add APU node [0x%0x:0x%0x]\n",
-- dev->node_props.device_id,
-- dev->node_props.vendor_id);
-- } else if (dev->node_props.cpu_cores_count)
-- pr_info("Topology: Add CPU node\n");
-- else if (dev->node_props.simd_count)
-- pr_info("Topology: Add dGPU node [0x%0x:0x%0x]\n",
-- dev->node_props.device_id,
-- dev->node_props.vendor_id);
-- }
-- up_read(&topology_lock);
--}
--
--/* Helper function for intializing platform_xx members of kfd_system_properties
-- */
--static void kfd_update_system_properties(void)
--{
-- struct kfd_topology_device *dev;
--
-- down_read(&topology_lock);
-- dev = list_last_entry(&topology_device_list,
-- struct kfd_topology_device, list);
-- if (dev) {
-- sys_props.platform_id =
-- (*((uint64_t *)dev->oem_id)) & CRAT_OEMID_64BIT_MASK;
-- sys_props.platform_oem = *((uint64_t *)dev->oem_table_id);
-- sys_props.platform_rev = dev->oem_revision;
-- }
-- up_read(&topology_lock);
--}
--
--static void find_system_memory(const struct dmi_header *dm,
-- void *private)
--{
-- struct kfd_mem_properties *mem;
-- u16 mem_width, mem_clock;
-- struct kfd_topology_device *kdev =
-- (struct kfd_topology_device *)private;
-- const u8 *dmi_data = (const u8 *)(dm + 1);
--
-- if (dm->type == DMI_ENTRY_MEM_DEVICE && dm->length >= 0x15) {
-- mem_width = (u16)(*(const u16 *)(dmi_data + 0x6));
-- mem_clock = (u16)(*(const u16 *)(dmi_data + 0x11));
-- list_for_each_entry(mem, &kdev->mem_props, list) {
-- if (mem_width != 0xFFFF && mem_width != 0)
-- mem->width = mem_width;
-- if (mem_clock != 0)
-- mem->mem_clk_max = mem_clock;
-- }
-- }
--}
--
--/*
-- * Performance counters information is not part of CRAT but we would like to
-- * put them in the sysfs under topology directory for Thunk to get the data.
-- * This function is called before updating the sysfs.
-- */
--static int kfd_add_perf_to_topology(struct kfd_topology_device *kdev)
--{
--#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
-- struct kfd_perf_properties *props;
--
-- if (amd_iommu_pc_supported()) {
-- props = kfd_alloc_struct(props);
-- if (!props)
-- return -ENOMEM;
-- strcpy(props->block_name, "iommu");
-- props->max_concurrent = amd_iommu_pc_get_max_banks(0) *
-- amd_iommu_pc_get_max_counters(0); /* assume one iommu */
-- list_add_tail(&props->list, &kdev->perf_props);
-- }
--#endif
--
-- return 0;
--}
--
--/* kfd_add_non_crat_information - Add information that is not currently
-- * defined in CRAT but is necessary for KFD topology
-- * @dev - topology device to which addition info is added
-- */
--static void kfd_add_non_crat_information(struct kfd_topology_device *kdev)
--{
-- /* Check if CPU only node. */
-- if (!kdev->gpu) {
-- /* Add system memory information */
-- dmi_walk(find_system_memory, kdev);
-- }
-- /* TODO: For GPU node, rearrange code from kfd_topology_add_device */
--}
--
--#ifdef CONFIG_ACPI
--/* kfd_is_acpi_crat_invalid - CRAT from ACPI is valid only for AMD APU devices.
-- * Ignore CRAT for all other devices. AMD APU is identified if both CPU
-- * and GPU cores are present.
-- * @device_list - topology device list created by parsing ACPI CRAT table.
-- * @return - TRUE if invalid, FALSE is valid.
-- */
--static bool kfd_is_acpi_crat_invalid(struct list_head *device_list)
--{
-- struct kfd_topology_device *dev;
--
-- list_for_each_entry(dev, device_list, list) {
-- if (dev->node_props.cpu_cores_count &&
-- dev->node_props.simd_count)
-- return false;
-- }
-- pr_info("Ignoring ACPI CRAT on non-APU system\n");
-- return true;
--}
--#endif
--
- int kfd_topology_init(void)
- {
- void *crat_image = NULL;
- size_t image_size = 0;
- int ret;
-- struct list_head temp_topology_device_list;
-- int cpu_only_node = 0;
-- struct kfd_topology_device *kdev;
-- int proximity_domain;
--
-- /* topology_device_list - Master list of all topology devices
-- * temp_topology_device_list - temporary list created while parsing CRAT
-- * or VCRAT. Once parsing is complete the contents of list is moved to
-- * topology_device_list
-- */
- 
-- /* Initialize the head for the both the lists */
-+ /*
-+ * Initialize the head for the topology device list
-+ */
- INIT_LIST_HEAD(&topology_device_list);
-- INIT_LIST_HEAD(&temp_topology_device_list);
- init_rwsem(&topology_lock);
-+ topology_crat_parsed = 0;
- 
- memset(&sys_props, 0, sizeof(sys_props));
- 
-- /* Proximity domains in ACPI CRAT tables start counting at
-- * 0. The same should be true for virtual CRAT tables created
-- * at this stage. GPUs added later in kfd_topology_add_device
-- * use a counter.
-- */
-- proximity_domain = 0;
--
- /*
-- * Get the CRAT image from the ACPI. If ACPI doesn't have one
-- * or if ACPI CRAT is invalid create a virtual CRAT.
-- * NOTE: The current implementation expects all AMD APUs to have
-- * CRAT. If no CRAT is available, it is assumed to be a CPU
-+ * Get the CRAT image from the ACPI
- */
--#ifdef CONFIG_ACPI
-- ret = kfd_create_crat_image_acpi(&crat_image, &image_size);
-- if (ret == 0) {
-- ret = kfd_parse_crat_table(crat_image,
-- &temp_topology_device_list,
-- proximity_domain);
-- if (ret ||
-- kfd_is_acpi_crat_invalid(&temp_topology_device_list)) {
--
-- kfd_release_topology_device_list(
-- &temp_topology_device_list);
-- kfd_destroy_crat_image(crat_image);
-- crat_image = NULL;
-- }
-- }
--#endif
-- if (!crat_image) {
-- ret = kfd_create_crat_image_virtual(&crat_image, &image_size,
-- COMPUTE_UNIT_CPU, NULL,
-- proximity_domain);
-- cpu_only_node = 1;
-- if (ret) {
-- pr_err("Error creating VCRAT table for CPU\n");
-- return ret;
-- }
--
-- ret = kfd_parse_crat_table(crat_image,
-- &temp_topology_device_list,
-- proximity_domain);
-- if (ret) {
-- pr_err("Error parsing VCRAT table for CPU\n");
-+ ret = kfd_topology_get_crat_acpi(crat_image, &image_size);
-+ if (ret == 0 && image_size > 0) {
-+ pr_info("Found CRAT image with size=%zd\n", image_size);
-+ crat_image = kmalloc(image_size, GFP_KERNEL);
-+ if (!crat_image) {
-+ ret = -ENOMEM;
-+ pr_err("No memory for allocating CRAT image\n");
- goto err;
- }
--
-- kdev = list_first_entry(&temp_topology_device_list,
-- struct kfd_topology_device, list);
-- kfd_add_perf_to_topology(kdev);
--
-- down_write(&topology_lock);
-- kfd_topology_update_device_list(&temp_topology_device_list,
-- &topology_device_list);
-- atomic_set(&topology_crat_proximity_domain, sys_props.num_devices-1);
-- ret = kfd_topology_update_sysfs();
-- up_write(&topology_lock);
--
-- if (ret == 0) {
-- sys_props.generation_count++;
-- kfd_update_system_properties();
-- kfd_debug_print_topology();
-- pr_info("Finished initializing topology\n");
-- } else
-- pr_err("Failed to update topology in sysfs ret=%d\n", ret);
--
-- /* For nodes with GPU, this information gets added
-- * when GPU is detected (kfd_topology_add_device).
-- */
-- if (cpu_only_node) {
-- /* Add additional information to CPU only node created above */
-- down_write(&topology_lock);
-- kdev = list_first_entry(&topology_device_list,
-- struct kfd_topology_device, list);
-- up_write(&topology_lock);
-- kfd_add_non_crat_information(kdev);
-+ ret = kfd_topology_get_crat_acpi(crat_image, &image_size);
-+
-+ if (ret == 0) {
-+ down_write(&topology_lock);
-+ ret = kfd_parse_crat_table(crat_image);
-+ if (ret == 0)
-+ ret = kfd_topology_update_sysfs();
-+ up_write(&topology_lock);
-+ } else {
-+ pr_err("Couldn't get CRAT table size from ACPI\n");
-+ }
-+ kfree(crat_image);
-+ } else if (ret == -ENODATA) {
-+ ret = 0;
-+ } else {
-+ pr_err("Couldn't get CRAT table size from ACPI\n");
- }
- 
- err:
-- kfd_destroy_crat_image(crat_image);
-+ pr_info("Finished initializing topology ret=%d\n", ret);
- return ret;
- }
- 
- void kfd_topology_shutdown(void)
- {
-- down_write(&topology_lock);
- kfd_topology_release_sysfs();
- kfd_release_live_view();
-- up_write(&topology_lock);
-+}
-+
-+static void kfd_debug_print_topology(void)
-+{
-+ struct kfd_topology_device *dev;
-+ uint32_t i = 0;
-+
-+ pr_info("DEBUG PRINT OF TOPOLOGY:");
-+ list_for_each_entry(dev, &topology_device_list, list) {
-+ pr_info("Node: %d\n", i);
-+ pr_info("\tGPU assigned: %s\n", (dev->gpu ? "yes" : "no"));
-+ pr_info("\tCPU count: %d\n", dev->node_props.cpu_cores_count);
-+ pr_info("\tSIMD count: %d", dev->node_props.simd_count);
-+ i++;
-+ }
- }
- 
- static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu)
-@@ -1104,15 +1092,11 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu)
- uint32_t buf[7];
- uint64_t local_mem_size;
- int i;
-- struct kfd_local_mem_info local_mem_info;
- 
- if (!gpu)
- return 0;
- 
-- gpu->kfd2kgd->get_local_mem_info(gpu->kgd, &local_mem_info);
--
-- local_mem_size = local_mem_info.local_mem_size_private +
-- local_mem_info.local_mem_size_public;
-+ local_mem_size = gpu->kfd2kgd->get_vmem_size(gpu->kgd);
- 
- buf[0] = gpu->pdev->devfn;
- buf[1] = gpu->pdev->subsystem_vendor;
-@@ -1127,32 +1111,20 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu)
- 
- return hashout;
- }
--/* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If
-- * the GPU device is not already present in the topology device
-- * list then return NULL. This means a new topology device has to
-- * be created for this GPU.
-- * TODO: Rather than assiging @gpu to first topology device withtout
-- * gpu attached, it will better to have more stringent check.
-- */
-+
- static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu)
- {
- struct kfd_topology_device *dev;
- struct kfd_topology_device *out_dev = NULL;
-- struct kfd_mem_properties *mem;
- 
-- down_write(&topology_lock);
-+ BUG_ON(!gpu);
-+
- list_for_each_entry(dev, &topology_device_list, list)
-- if (!dev->gpu && (dev->node_props.simd_count > 0)) {
-+ if (dev->gpu == NULL && dev->node_props.simd_count > 0) {
- dev->gpu = gpu;
- out_dev = dev;
--
-- /* Assign mem->gpu */
-- list_for_each_entry(mem, &dev->mem_props, list)
-- mem->gpu = dev->gpu;
--
- break;
- }
-- up_write(&topology_lock);
- 
- return out_dev;
- }
-@@ -1165,202 +1137,88 @@ static void kfd_notify_gpu_change(uint32_t gpu_id, int arrival)
- */
- }
- 
--/* kfd_fill_mem_clk_max_info - Since CRAT doesn't have memory clock info,
-- * patch this after CRAT parsing.
-- */
--static void kfd_fill_mem_clk_max_info(struct kfd_topology_device *dev)
--{
-- struct kfd_mem_properties *mem;
-- struct kfd_local_mem_info local_mem_info;
--
-- if (!dev)
-- return;
--
-- /* Currently, amdgpu driver (amdgpu_mc) deals only with GPUs with
-- * single bank of VRAM local memory.
-- * for dGPUs - VCRAT reports only one bank of Local Memory
-- * for APUs - If CRAT from ACPI reports more than one bank, then
-- * all the banks will report the same mem_clk_max information
-- */
-- dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd,
-- &local_mem_info);
--
-- list_for_each_entry(mem, &dev->mem_props, list)
-- mem->mem_clk_max = local_mem_info.mem_clk_max;
--}
--
--static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev)
--{
-- struct kfd_iolink_properties *link;
--
-- if (!dev || !dev->gpu)
-- return;
--
-- /* GPU only creates direck links so apply flags setting to all */
-- if (dev->gpu->device_info->asic_family == CHIP_HAWAII)
-- list_for_each_entry(link, &dev->io_link_props, list)
-- link->flags = CRAT_IOLINK_FLAGS_ENABLED |
-- CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT |
-- CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT;
--}
--
- int kfd_topology_add_device(struct kfd_dev *gpu)
- {
- uint32_t gpu_id;
- struct kfd_topology_device *dev;
-- struct kfd_cu_info cu_info;
-- int res = 0;
-- struct list_head temp_topology_device_list;
-- void *crat_image = NULL;
-- size_t image_size = 0;
-- int proximity_domain;
-+ int res;
- 
-- INIT_LIST_HEAD(&temp_topology_device_list);
-+ BUG_ON(!gpu);
- 
- gpu_id = kfd_generate_gpu_id(gpu);
- 
-- pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id);
--
-- proximity_domain = atomic_inc_return(&
-- topology_crat_proximity_domain);
-+ pr_debug("kfd: Adding new GPU (ID: 0x%x) to topology\n", gpu_id);
- 
-- /* Check to see if this gpu device exists in the topology_device_list.
-- * If so, assign the gpu to that device,
-- * else create a Virtual CRAT for this gpu device and then parse that
-- * CRAT to create a new topology device. Once created assign the gpu to
-- * that topology device
-+ down_write(&topology_lock);
-+ /*
-+ * Try to assign the GPU to existing topology device (generated from
-+ * CRAT table
- */
- dev = kfd_assign_gpu(gpu);
- if (!dev) {
-- res = kfd_create_crat_image_virtual(&crat_image, &image_size,
-- COMPUTE_UNIT_GPU,
-- gpu, proximity_domain);
-- if (res) {
-- pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n",
-- gpu_id);
-- return res;
-- }
-- res = kfd_parse_crat_table(crat_image,
-- &temp_topology_device_list, proximity_domain);
-- if (res) {
-- pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n",
-- gpu_id);
-+ pr_info("GPU was not found in the current topology. Extending.\n");
-+ kfd_debug_print_topology();
-+ dev = kfd_create_topology_device();
-+ if (!dev) {
-+ res = -ENOMEM;
- goto err;
- }
-+ dev->gpu = gpu;
- 
-- down_write(&topology_lock);
-- kfd_topology_update_device_list(&temp_topology_device_list,
-- &topology_device_list);
-+ /*
-+ * TODO: Make a call to retrieve topology information from the
-+ * GPU vBIOS
-+ */
- 
-- /* Update the SYSFS tree, since we added another topology
-- * device
-+ /*
-+ * Update the SYSFS tree, since we added another topology device
- */
-- res = kfd_topology_update_sysfs();
-- up_write(&topology_lock);
--
-- if (res == 0)
-- sys_props.generation_count++;
-- else
-- pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n",
-- gpu_id, res);
-- dev = kfd_assign_gpu(gpu);
-- if (!dev) {
-- pr_err("Could not assign GPU\n");
-- res = -ENODEV;
-- goto err;
-- }
-+ if (kfd_topology_update_sysfs() < 0)
-+ kfd_topology_release_sysfs();
-+
- }
- 
- dev->gpu_id = gpu_id;
- gpu->id = gpu_id;
--
-- /* TODO: Move the following lines to function
-- * kfd_add_non_crat_information
-- */
--
-- /* Fill-in additional information that is not available in CRAT but
-- * needed for the topology
-- */
--
-- dev->gpu->kfd2kgd->get_cu_info(dev->gpu->kgd, &cu_info);
-- dev->node_props.simd_arrays_per_engine =
-- cu_info.num_shader_arrays_per_engine;
--
- dev->node_props.vendor_id = gpu->pdev->vendor;
- dev->node_props.device_id = gpu->pdev->device;
-- dev->node_props.location_id = PCI_DEVID(gpu->pdev->bus->number,
-- gpu->pdev->devfn);
-- dev->node_props.max_engine_clk_fcompute =
-- dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz(dev->gpu->kgd);
-- dev->node_props.max_engine_clk_ccompute =
-- cpufreq_quick_get_max(0) / 1000;
--
-- kfd_fill_mem_clk_max_info(dev);
-- kfd_fill_iolink_non_crat_info(dev);
--
-- switch (dev->gpu->device_info->asic_family) {
-- case CHIP_KAVERI:
-- case CHIP_HAWAII:
-- case CHIP_TONGA:
-- dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_PRE_1_0 <<
-- HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
-- HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
-- break;
-- case CHIP_CARRIZO:
-- case CHIP_FIJI:
-- case CHIP_POLARIS10:
-- case CHIP_POLARIS11:
-- pr_debug("Adding doorbell packet type capability\n");
-- dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_1_0 <<
-- HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
-- HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
-- break;
-- case CHIP_VEGA10:
-- case CHIP_RAVEN:
-- dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 <<
-- HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
-- HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
-- break;
-- default:
-- BUG();
-- }
--
-- /* Fix errors in CZ CRAT.
-- * simd_count: Carrizo CRAT reports wrong simd_count, probably because
-- * it doesn't consider masked out CUs
-- * max_waves_per_simd: Carrizo reports wrong max_waves_per_simd.
-- * capability flag: Carrizo CRAT doesn't report IOMMU flags.
-+ dev->node_props.location_id = (gpu->pdev->bus->number << 24) +
-+ (gpu->pdev->devfn & 0xffffff);
-+ /*
-+ * TODO: Retrieve max engine clock values from KGD
- */
-+
- if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) {
-- dev->node_props.simd_count =
-- cu_info.simd_per_cu * cu_info.cu_active_number;
-- dev->node_props.max_waves_per_simd = 10;
-- dev->node_props.capability |= HSA_CAP_ATS_PRESENT;
-+ dev->node_props.capability |= HSA_CAP_DOORBELL_PACKET_TYPE;
-+ pr_info("amdkfd: adding doorbell packet type capability\n");
- }
- 
-- kfd_debug_print_topology();
-+ res = 0;
- 
-- if (!res)
-- kfd_notify_gpu_change(gpu_id, 1);
- err:
-- kfd_destroy_crat_image(crat_image);
-+ up_write(&topology_lock);
-+
-+ if (res == 0)
-+ kfd_notify_gpu_change(gpu_id, 1);
-+
- return res;
- }
- 
- int kfd_topology_remove_device(struct kfd_dev *gpu)
- {
-- struct kfd_topology_device *dev, *tmp;
-+ struct kfd_topology_device *dev;
- uint32_t gpu_id;
- int res = -ENODEV;
- 
-+ BUG_ON(!gpu);
-+
- down_write(&topology_lock);
- 
-- list_for_each_entry_safe(dev, tmp, &topology_device_list, list)
-+ list_for_each_entry(dev, &topology_device_list, list)
- if (dev->gpu == gpu) {
- gpu_id = dev->gpu_id;
- kfd_remove_sysfs_node_entry(dev);
- kfd_release_topology_device(dev);
-- sys_props.num_devices--;
- res = 0;
- if (kfd_topology_update_sysfs() < 0)
- kfd_topology_release_sysfs();
-@@ -1375,26 +1233,22 @@ int kfd_topology_remove_device(struct kfd_dev *gpu)
- return res;
- }
- 
--/* kfd_topology_enum_kfd_devices - Enumerate through all devices in KFD
-- * topology. If GPU device is found @idx, then valid kfd_dev pointer is
-- * returned through @kdev
-- * Return - 0: On success (@kdev will be NULL for non GPU nodes)
-- * -1: If end of list
-+/*
-+ * When idx is out of bounds, the function will return NULL
- */
--int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev)
-+struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx)
- {
- 
- struct kfd_topology_device *top_dev;
-+ struct kfd_dev *device = NULL;
- uint8_t device_idx = 0;
- 
-- *kdev = NULL;
- down_read(&topology_lock);
- 
- list_for_each_entry(top_dev, &topology_device_list, list) {
- if (device_idx == idx) {
-- *kdev = top_dev->gpu;
-- up_read(&topology_lock);
-- return 0;
-+ device = top_dev->gpu;
-+ break;
- }
- 
- device_idx++;
-@@ -1402,89 +1256,6 @@ int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev)
- 
- up_read(&topology_lock);
- 
-- return -1;
--
--}
--
--static int kfd_cpumask_to_apic_id(const struct cpumask *cpumask)
--{
-- int first_cpu_of_numa_node;
--
-- if (!cpumask || (cpumask == cpu_none_mask))
-- return -1;
-- first_cpu_of_numa_node = cpumask_first(cpumask);
-- if (first_cpu_of_numa_node >= nr_cpu_ids)
-- return -1;
--#ifdef CONFIG_X86_64
-- return cpu_data(first_cpu_of_numa_node).apicid;
--#else
-- return first_cpu_of_numa_node;
--#endif
--}
--
--/* kfd_numa_node_to_apic_id - Returns the APIC ID of the first logical processor
-- * of the given NUMA node (numa_node_id)
-- * Return -1 on failure
-- */
--int kfd_numa_node_to_apic_id(int numa_node_id)
--{
-- if (numa_node_id == -1) {
-- pr_warn("Invalid NUMA Node. Use online CPU mask\n");
-- return kfd_cpumask_to_apic_id(cpu_online_mask);
-- }
-- return kfd_cpumask_to_apic_id(cpumask_of_node(numa_node_id));
--}
--
--#if defined(CONFIG_DEBUG_FS)
--
--int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data)
--{
-- struct kfd_topology_device *dev;
-- unsigned int i = 0;
-- int r = 0;
--
-- down_read(&topology_lock);
--
-- list_for_each_entry(dev, &topology_device_list, list) {
-- if (!dev->gpu) {
-- i++;
-- continue;
-- }
--
-- seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id);
-- r = device_queue_manager_debugfs_hqds(m, dev->gpu->dqm);
-- if (r != 0)
-- break;
-- }
--
-- up_read(&topology_lock);
--
-- return r;
--}
--
--int kfd_debugfs_rls_by_device(struct seq_file *m, void *data)
--{
-- struct kfd_topology_device *dev;
-- unsigned int i = 0;
-- int r = 0;
--
-- down_read(&topology_lock);
--
-- list_for_each_entry(dev, &topology_device_list, list) {
-- if (!dev->gpu) {
-- i++;
-- continue;
-- }
--
-- seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id);
-- r = pm_debugfs_runlist(m, &dev->gpu->dqm->packets);
-- if (r != 0)
-- break;
-- }
--
-- up_read(&topology_lock);
-+ return device;
- 
-- return r;
- }
--
--#endif
-diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
-index f22d420..c3ddb9b 100644
---- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
-+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
-@@ -39,17 +39,8 @@
- #define HSA_CAP_WATCH_POINTS_SUPPORTED 0x00000080
- #define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00
- #define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8
--#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK 0x00003000
--#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT 12
--#define HSA_CAP_RESERVED 0xffffc000
--
--#define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0
--#define HSA_CAP_DOORBELL_TYPE_1_0 0x1
--#define HSA_CAP_DOORBELL_TYPE_2_0 0x2
--#define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00
--#define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8
-+#define HSA_CAP_RESERVED 0xfffff000
- #define HSA_CAP_DOORBELL_PACKET_TYPE 0x00001000
--#define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000
- 
- struct kfd_node_properties {
- uint32_t cpu_cores_count;
-@@ -97,11 +88,11 @@ struct kfd_mem_properties {
- uint32_t width;
- uint32_t mem_clk_max;
- struct kobject *kobj;
-- struct kfd_dev *gpu;
-- struct attribute attr_props;
-- struct attribute attr_used;
-+ struct attribute attr;
- };
- 
-+#define KFD_TOPOLOGY_CPU_SIBLINGS 256
-+
- #define HSA_CACHE_TYPE_DATA 0x00000001
- #define HSA_CACHE_TYPE_INSTRUCTION 0x00000002
- #define HSA_CACHE_TYPE_CPU 0x00000004
-@@ -118,7 +109,7 @@ struct kfd_cache_properties {
- uint32_t cache_assoc;
- uint32_t cache_latency;
- uint32_t cache_type;
-- uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE];
-+ uint8_t sibling_map[KFD_TOPOLOGY_CPU_SIBLINGS];
- struct kobject *kobj;
- struct attribute attr;
- };
-@@ -141,36 +132,24 @@ struct kfd_iolink_properties {
- struct attribute attr;
- };
- 
--struct kfd_perf_properties {
-- struct list_head list;
-- char block_name[16];
-- uint32_t max_concurrent;
-- struct attribute_group *attr_group;
--};
--
- struct kfd_topology_device {
- struct list_head list;
- uint32_t gpu_id;
-- uint32_t proximity_domain;
- struct kfd_node_properties node_props;
-+ uint32_t mem_bank_count;
- struct list_head mem_props;
- uint32_t cache_count;
- struct list_head cache_props;
- uint32_t io_link_count;
- struct list_head io_link_props;
-- struct list_head perf_props;
- struct kfd_dev *gpu;
- struct kobject *kobj_node;
- struct kobject *kobj_mem;
- struct kobject *kobj_cache;
- struct kobject *kobj_iolink;
-- struct kobject *kobj_perf;
- struct attribute attr_gpuid;
- struct attribute attr_name;
- struct attribute attr_props;
-- uint8_t oem_id[CRAT_OEMID_LENGTH];
-- uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH];
-- uint32_t oem_revision;
- };
- 
- struct kfd_system_properties {
-@@ -185,14 +164,6 @@ struct kfd_system_properties {
- struct attribute attr_props;
- };
- 
--struct kfd_topology_device *kfd_create_topology_device(
-- struct list_head *device_list);
--void kfd_release_topology_device_list(struct list_head *device_list);
- 
--#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
--extern bool amd_iommu_pc_supported(void);
--extern u8 amd_iommu_pc_get_max_banks(u16 devid);
--extern u8 amd_iommu_pc_get_max_counters(u16 devid);
--#endif
- 
- #endif /* __KFD_TOPOLOGY_H__ */
-diff --git a/drivers/gpu/drm/amd/amdkfd/soc15_int.h b/drivers/gpu/drm/amd/amdkfd/soc15_int.h
-deleted file mode 100644
-index e00d03d..0000000
---- a/drivers/gpu/drm/amd/amdkfd/soc15_int.h
-+++ /dev/null
-@@ -1,84 +0,0 @@
--/*
-- * Copyright 2016 Advanced Micro Devices, Inc.
-- *
-- * Permission is hereby granted, free of charge, to any person obtaining a
-- * copy of this software and associated documentation files (the "Software"),
-- * to deal in the Software without restriction, including without limitation
-- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
-- * and/or sell copies of the Software, and to permit persons to whom the
-- * Software is furnished to do so, subject to the following conditions:
-- *
-- * The above copyright notice and this permission notice shall be included in
-- * all copies or substantial portions of the Software.
-- *
-- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
-- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-- * OTHER DEALINGS IN THE SOFTWARE.
-- */
--
--#ifndef HSA_SOC15_INT_H_INCLUDED
--#define HSA_SOC15_INT_H_INCLUDED
--/*
-- * vega10+ IH clients
-- */
--enum soc15_ih_client_id {
-- SOC15_IH_CLIENTID_IH = 0x00,
-- SOC15_IH_CLIENTID_ACP = 0x01,
-- SOC15_IH_CLIENTID_ATHUB = 0x02,
-- SOC15_IH_CLIENTID_BIF = 0x03,
-- SOC15_IH_CLIENTID_DCE = 0x04,
-- SOC15_IH_CLIENTID_ISP = 0x05,
-- SOC15_IH_CLIENTID_PCIE0 = 0x06,
-- SOC15_IH_CLIENTID_RLC = 0x07,
-- SOC15_IH_CLIENTID_SDMA0 = 0x08,
-- SOC15_IH_CLIENTID_SDMA1 = 0x09,
-- SOC15_IH_CLIENTID_SE0SH = 0x0a,
-- SOC15_IH_CLIENTID_SE1SH = 0x0b,
-- SOC15_IH_CLIENTID_SE2SH = 0x0c,
-- SOC15_IH_CLIENTID_SE3SH = 0x0d,
-- SOC15_IH_CLIENTID_SYSHUB = 0x0e,
-- SOC15_IH_CLIENTID_THM = 0x0f,
-- SOC15_IH_CLIENTID_UVD = 0x10,
-- SOC15_IH_CLIENTID_VCE0 = 0x11,
-- SOC15_IH_CLIENTID_VMC = 0x12,
-- SOC15_IH_CLIENTID_XDMA = 0x13,
-- SOC15_IH_CLIENTID_GRBM_CP = 0x14,
-- SOC15_IH_CLIENTID_ATS = 0x15,
-- SOC15_IH_CLIENTID_ROM_SMUIO = 0x16,
-- SOC15_IH_CLIENTID_DF = 0x17,
-- SOC15_IH_CLIENTID_VCE1 = 0x18,
-- SOC15_IH_CLIENTID_PWR = 0x19,
-- SOC15_IH_CLIENTID_UTCL2 = 0x1b,
-- SOC15_IH_CLIENTID_EA = 0x1c,
-- SOC15_IH_CLIENTID_UTCL2LOG = 0x1d,
-- SOC15_IH_CLIENTID_MP0 = 0x1e,
-- SOC15_IH_CLIENTID_MP1 = 0x1f,
--
-- SOC15_IH_CLIENTID_MAX
--};
--
--
--#define SOC15_INTSRC_CP_END_OF_PIPE 181
--#define SOC15_INTSRC_CP_BAD_OPCODE 183
--#define SOC15_INTSRC_SQ_INTERRUPT_MSG 239
--#define SOC15_INTSRC_VMC_FAULT 0
--#define SOC15_INTSRC_SDMA_TRAP 224
--
--
--#define SOC15_CLIENT_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) & 0xff)
--#define SOC15_SOURCE_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 8 & 0xff)
--#define SOC15_RING_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 16 & 0xff)
--#define SOC15_VMID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 24 & 0xf)
--#define SOC15_VMID_TYPE_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 31 & 0x1)
--#define SOC15_PASID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[3]) & 0xffff)
--#define SOC15_CONTEXT_ID0_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[4]))
--#define SOC15_CONTEXT_ID1_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[5]))
--#define SOC15_CONTEXT_ID2_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[6]))
--#define SOC15_CONTEXT_ID3_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[7]))
--
--#endif
--
-diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
-old mode 100755
-new mode 100644
-index b6cf2d5..36f3766
---- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
-+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
-@@ -30,7 +30,6 @@
- 
- #include <linux/types.h>
- #include <linux/bitmap.h>
--#include <linux/dma-buf.h>
- 
- struct pci_dev;
- 
-@@ -41,46 +40,6 @@ struct kfd_dev;
- struct kgd_dev;
- 
- struct kgd_mem;
--struct kfd_process_device;
--struct amdgpu_bo;
--
--enum kfd_preempt_type {
-- KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN = 0,
-- KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
--};
--
--struct kfd_vm_fault_info {
-- uint64_t page_addr;
-- uint32_t vmid;
-- uint32_t mc_id;
-- uint32_t status;
-- bool prot_valid;
-- bool prot_read;
-- bool prot_write;
-- bool prot_exec;
--};
--
--struct kfd_cu_info {
-- uint32_t num_shader_engines;
-- uint32_t num_shader_arrays_per_engine;
-- uint32_t num_cu_per_sh;
-- uint32_t cu_active_number;
-- uint32_t cu_ao_mask;
-- uint32_t simd_per_cu;
-- uint32_t max_waves_per_simd;
-- uint32_t wave_front_size;
-- uint32_t max_scratch_slots_per_cu;
-- uint32_t lds_size;
-- uint32_t cu_bitmap[4][4];
--};
--
--/* For getting GPU local memory information from KGD */
--struct kfd_local_mem_info {
-- uint64_t local_mem_size_private;
-- uint64_t local_mem_size_public;
-- uint32_t vram_width;
-- uint32_t mem_clk_max;
--};
- 
- enum kgd_memory_pool {
- KGD_POOL_SYSTEM_CACHEABLE = 1,
-@@ -113,21 +72,6 @@ struct kgd2kfd_shared_resources {
- /* Bit n == 1 means Queue n is available for KFD */
- DECLARE_BITMAP(queue_bitmap, KGD_MAX_QUEUES);
- 
-- /* Doorbell assignments (SOC15 and later chips only). Only
-- * specific doorbells are routed to each SDMA engine. Others
-- * are routed to IH and VCN. They are not usable by the CP.
-- *
-- * Any doorbell number D that satisfies the following condition
-- * is reserved: (D & reserved_doorbell_mask) == reserved_doorbell_val
-- *
-- * KFD currently uses 1024 (= 0x3ff) doorbells per process. If
-- * doorbells 0x0f0-0x0f7 and 0x2f-0x2f7 are reserved, that means
-- * mask would be set to 0x1f8 and val set to 0x0f0.
-- */
-- unsigned int sdma_doorbell[2][2];
-- unsigned int reserved_doorbell_mask;
-- unsigned int reserved_doorbell_val;
--
- /* Base address of doorbell aperture. */
- phys_addr_t doorbell_physical_address;
- 
-@@ -136,41 +80,8 @@ struct kgd2kfd_shared_resources {
- 
- /* Number of bytes at start of aperture reserved for KGD. */
- size_t doorbell_start_offset;
--
-- /* GPUVM address space size in bytes */
-- uint64_t gpuvm_size;
- };
- 
--struct tile_config {
-- uint32_t *tile_config_ptr;
-- uint32_t *macro_tile_config_ptr;
-- uint32_t num_tile_configs;
-- uint32_t num_macro_tile_configs;
--
-- uint32_t gb_addr_config;
-- uint32_t num_banks;
-- uint32_t num_ranks;
--};
--
--/*
-- * Allocation flag domains currently only VRAM and GTT domain supported
-- */
--#define ALLOC_MEM_FLAGS_VRAM (1 << 0)
--#define ALLOC_MEM_FLAGS_GTT (1 << 1)
--#define ALLOC_MEM_FLAGS_USERPTR (1 << 2)
--#define ALLOC_MEM_FLAGS_DOORBELL (1 << 3)
--
--/*
-- * Allocation flags attributes/access options.
-- */
--#define ALLOC_MEM_FLAGS_NONPAGED (1 << 31)
--#define ALLOC_MEM_FLAGS_READONLY (1 << 30)
--#define ALLOC_MEM_FLAGS_PUBLIC (1 << 29)
--#define ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28)
--#define ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27)
--#define ALLOC_MEM_FLAGS_EXECUTE_ACCESS (1 << 26)
--#define ALLOC_MEM_FLAGS_COHERENT (1 << 25)
--
- /**
- * struct kfd2kgd_calls
- *
-@@ -179,7 +90,7 @@ struct tile_config {
- *
- * @free_gtt_mem: Frees a buffer that was allocated on the gart aperture
- *
-- * @get_local_mem_info: Retrieves information about GPU local memory
-+ * @get_vmem_size: Retrieves (physical) size of VRAM
- *
- * @get_gpu_clock_counter: Retrieves GPU clock counter
- *
-@@ -201,12 +112,6 @@ struct tile_config {
- * @hqd_sdma_load: Loads the SDMA mqd structure to a H/W SDMA hqd slot.
- * used only for no HWS mode.
- *
-- * @hqd_dump: Dumps CPC HQD registers to an array of address-value pairs.
-- * Array is allocated with kmalloc, needs to be freed with kfree by caller.
-- *
-- * @hqd_sdma_dump: Dumps SDMA HQD registers to an array of address-value pairs.
-- * Array is allocated with kmalloc, needs to be freed with kfree by caller.
-- *
- * @hqd_is_occupies: Checks if a hqd slot is occupied.
- *
- * @hqd_destroy: Destructs and preempts the queue assigned to that hqd slot.
-@@ -216,34 +121,8 @@ struct tile_config {
- * @hqd_sdma_destroy: Destructs and preempts the SDMA queue assigned to that
- * SDMA hqd slot.
- *
-- * @map_memory_to_gpu: Allocates and pins BO, PD and all related PTs
-- *
-- * @unmap_memory_to_gpu: Releases and unpins BO, PD and all related PTs
-- *
- * @get_fw_version: Returns FW versions from the header
- *
-- * @set_num_of_requests: Sets number of Peripheral Page Request (PPR) sent to
-- * IOMMU when address translation failed
-- *
-- * @get_cu_info: Retrieves activated cu info
-- *
-- * @get_dmabuf_info: Returns information about a dmabuf if it was
-- * created by the GPU driver
-- *
-- * @import_dmabuf: Imports a DMA buffer, creating a new kgd_mem object
-- * Supports only DMA buffers created by GPU driver on the same GPU
-- *
-- * @export_dmabuf: Emports a KFD BO for sharing with other process
-- *
-- * @submit_ib: Submits an IB to the engine specified by inserting the IB to
-- * the corresonded ring (ring type).
-- *
-- * @restore_process_bos: Restore all BOs that belongs to the process
-- *
-- * @copy_mem_to_mem: Copies size bytes from source BO to destination BO
-- *
-- * @get_vram_usage: Returns current VRAM usage
-- *
- * This structure contains function pointers to services that the kgd driver
- * provides to amdkfd driver.
- *
-@@ -255,23 +134,11 @@ struct kfd2kgd_calls {
- 
- void (*free_gtt_mem)(struct kgd_dev *kgd, void *mem_obj);
- 
-- void(*get_local_mem_info)(struct kgd_dev *kgd,
-- struct kfd_local_mem_info *mem_info);
-+ uint64_t (*get_vmem_size)(struct kgd_dev *kgd);
- uint64_t (*get_gpu_clock_counter)(struct kgd_dev *kgd);
- 
- uint32_t (*get_max_engine_clock_in_mhz)(struct kgd_dev *kgd);
- 
-- int (*create_process_vm)(struct kgd_dev *kgd, void **vm,
-- void **process_info, struct dma_fence **ef);
-- void (*destroy_process_vm)(struct kgd_dev *kgd, void *vm);
--
-- int (*create_process_gpumem)(struct kgd_dev *kgd, uint64_t va, size_t size, void *vm, struct kgd_mem **mem);
-- void (*destroy_process_gpumem)(struct kgd_dev *kgd, struct kgd_mem *mem);
--
-- uint32_t (*get_process_page_dir)(void *vm);
--
-- int (*open_graphic_handle)(struct kgd_dev *kgd, uint64_t va, void *vm, int fd, uint32_t handle, struct kgd_mem **mem);
--
- /* Register access functions */
- void (*program_sh_mem_settings)(struct kgd_dev *kgd, uint32_t vmid,
- uint32_t sh_mem_config, uint32_t sh_mem_ape1_base,
-@@ -284,28 +151,16 @@ struct kfd2kgd_calls {
- uint32_t hpd_size, uint64_t hpd_gpu_addr);
- 
- int (*init_interrupts)(struct kgd_dev *kgd, uint32_t pipe_id);
--
- 
- int (*hqd_load)(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
-- uint32_t queue_id, uint32_t __user *wptr,
-- uint32_t wptr_shift, uint32_t wptr_mask,
-- struct mm_struct *mm);
--
-- int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd,
-- uint32_t __user *wptr, struct mm_struct *mm);
--
-- int (*hqd_dump)(struct kgd_dev *kgd,
-- uint32_t pipe_id, uint32_t queue_id,
-- uint32_t (**dump)[2], uint32_t *n_regs);
-+ uint32_t queue_id, uint32_t __user *wptr);
- 
-- int (*hqd_sdma_dump)(struct kgd_dev *kgd,
-- uint32_t engine_id, uint32_t queue_id,
-- uint32_t (**dump)[2], uint32_t *n_regs);
-+ int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd);
- 
- bool (*hqd_is_occupied)(struct kgd_dev *kgd, uint64_t queue_address,
- uint32_t pipe_id, uint32_t queue_id);
- 
-- int (*hqd_destroy)(struct kgd_dev *kgd, void *mqd, uint32_t reset_type,
-+ int (*hqd_destroy)(struct kgd_dev *kgd, uint32_t reset_type,
- unsigned int timeout, uint32_t pipe_id,
- uint32_t queue_id);
- 
-@@ -313,7 +168,7 @@ struct kfd2kgd_calls {
- 
- int (*hqd_sdma_destroy)(struct kgd_dev *kgd, void *mqd,
- unsigned int timeout);
--
-+ 
- int (*address_watch_disable)(struct kgd_dev *kgd);
- int (*address_watch_execute)(struct kgd_dev *kgd,
- unsigned int watch_point_id,
-@@ -332,72 +187,11 @@ struct kfd2kgd_calls {
- uint16_t (*get_atc_vmid_pasid_mapping_pasid)(
- struct kgd_dev *kgd,
- uint8_t vmid);
-- uint32_t (*read_vmid_from_vmfault_reg)(struct kgd_dev *kgd);
- void (*write_vmid_invalidate_request)(struct kgd_dev *kgd,
- uint8_t vmid);
- 
-- int (*invalidate_tlbs)(struct kgd_dev *kgd, uint16_t pasid);
--
-- int (*sync_memory)(struct kgd_dev *kgd, struct kgd_mem *mem, bool intr);
--
-- int (*alloc_memory_of_gpu)(struct kgd_dev *kgd, uint64_t va,
-- uint64_t size, void *vm,
-- struct kgd_mem **mem, uint64_t *offset,
-- uint32_t flags);
-- int (*free_memory_of_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem,
-- void *vm);
-- int (*map_memory_to_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem,
-- void *vm);
-- int (*unmap_memory_to_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem,
-- void *vm);
--
- uint16_t (*get_fw_version)(struct kgd_dev *kgd,
- enum kgd_engine_type type);
--
-- void (*set_num_of_requests)(struct kgd_dev *kgd,
-- uint8_t num_of_requests);
-- int (*alloc_memory_of_scratch)(struct kgd_dev *kgd,
-- uint64_t va, uint32_t vmid);
-- int (*write_config_static_mem)(struct kgd_dev *kgd, bool swizzle_enable,
-- uint8_t element_size, uint8_t index_stride, uint8_t mtype);
-- void (*get_cu_info)(struct kgd_dev *kgd,
-- struct kfd_cu_info *cu_info);
-- int (*mmap_bo)(struct kgd_dev *kgd, struct vm_area_struct *vma);
-- int (*map_gtt_bo_to_kernel)(struct kgd_dev *kgd,
-- struct kgd_mem *mem, void **kptr);
-- void (*set_vm_context_page_table_base)(struct kgd_dev *kgd, uint32_t vmid,
-- uint32_t page_table_base);
--
-- int (*pin_get_sg_table_bo)(struct kgd_dev *kgd,
-- struct kgd_mem *mem, uint64_t offset,
-- uint64_t size, struct sg_table **ret_sg);
-- void (*unpin_put_sg_table_bo)(struct kgd_mem *mem,
-- struct sg_table *sg);
--
-- int (*get_dmabuf_info)(struct kgd_dev *kgd, int dma_buf_fd,
-- struct kgd_dev **dma_buf_kgd, uint64_t *bo_size,
-- void *metadata_buffer, size_t buffer_size,
-- uint32_t *metadata_size, uint32_t *flags);
-- int (*import_dmabuf)(struct kgd_dev *kgd, struct dma_buf *dmabuf,
-- uint64_t va, void *vm, struct kgd_mem **mem,
-- uint64_t *size, uint64_t *mmap_offset);
-- int (*export_dmabuf)(struct kgd_dev *kgd, void *vm, struct kgd_mem *mem,
-- struct dma_buf **dmabuf);
--
-- int (*get_vm_fault_info)(struct kgd_dev *kgd,
-- struct kfd_vm_fault_info *info);
-- int (*submit_ib)(struct kgd_dev *kgd, enum kgd_engine_type engine,
-- uint32_t vmid, uint64_t gpu_addr,
-- uint32_t *ib_cmd, uint32_t ib_len);
-- int (*get_tile_config)(struct kgd_dev *kgd,
-- struct tile_config *config);
--
-- int (*restore_process_bos)(void *process_info, struct dma_fence **ef);
-- int (*copy_mem_to_mem)(struct kgd_dev *kgd, struct kgd_mem *src_mem,
-- uint64_t src_offset, struct kgd_mem *dst_mem,
-- uint64_t dest_offset, uint64_t size,
-- struct dma_fence **f, uint64_t *actual_size);
-- uint64_t (*get_vram_usage)(struct kgd_dev *kgd);
- };
- 
- /**
-@@ -416,13 +210,6 @@ struct kfd2kgd_calls {
- *
- * @resume: Notifies amdkfd about a resume action done to a kgd device
- *
-- * @quiesce_mm: Quiesce all user queue access to specified MM address space
-- *
-- * @resume_mm: Resume user queue access to specified MM address space
-- *
-- * @schedule_evict_and_restore_process: Schedules work queue that will prepare
-- * for safe eviction of KFD BOs that belong to the specified process.
-- *
- * This structure contains function callback pointers so the kgd driver
- * will notify to the amdkfd about certain status changes.
- *
-@@ -437,13 +224,9 @@ struct kgd2kfd_calls {
- void (*interrupt)(struct kfd_dev *kfd, const void *ih_ring_entry);
- void (*suspend)(struct kfd_dev *kfd);
- int (*resume)(struct kfd_dev *kfd);
-- int (*quiesce_mm)(struct kfd_dev *kfd, struct mm_struct *mm);
-- int (*resume_mm)(struct kfd_dev *kfd, struct mm_struct *mm);
-- int (*schedule_evict_and_restore_process)(struct mm_struct *mm,
-- struct dma_fence *fence);
- };
- 
- int kgd2kfd_init(unsigned interface_version,
- const struct kgd2kfd_calls **g2f);
- 
---#endif /* KGD_KFD_INTERFACE_H_INCLUDED */
--+#endif /* KGD_KFD_INTERFACE_H_INCLUDED */
-diff --git a/drivers/gpu/drm/drm_pci.c b/drivers/gpu/drm/drm_pci.c
-index 1235c98..7e5a1fe 100644
---- a/drivers/gpu/drm/drm_pci.c
-+++ b/drivers/gpu/drm/drm_pci.c
-@@ -149,6 +149,7 @@ int drm_pci_set_busid(struct drm_device *dev, struct drm_master *master)
- master->unique_len = strlen(master->unique);
- return 0;
- }
-+EXPORT_SYMBOL(drm_pci_set_busid);
- 
- static int drm_pci_irq_by_busid(struct drm_device *dev, struct drm_irq_busid *p)
- {
-diff --git a/drivers/gpu/drm/radeon/radeon_kfd.c b/drivers/gpu/drm/radeon/radeon_kfd.c
-index c7d2e7a..a2ab6dc 100755
---- a/drivers/gpu/drm/radeon/radeon_kfd.c
-+++ b/drivers/gpu/drm/radeon/radeon_kfd.c
-@@ -75,15 +75,12 @@ static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id,
- uint32_t hpd_size, uint64_t hpd_gpu_addr);
- static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id);
- static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
-- uint32_t queue_id, uint32_t __user *wptr,
-- uint32_t wptr_shift, uint32_t wptr_mask,
-- struct mm_struct *mm);
--static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
-- uint32_t __user *wptr, struct mm_struct *mm);
-+ uint32_t queue_id, uint32_t __user *wptr);
-+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd);
- static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
- uint32_t pipe_id, uint32_t queue_id);
- 
--static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t reset_type,
-+static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type,
- unsigned int timeout, uint32_t pipe_id,
- uint32_t queue_id);
- static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd);
-@@ -110,6 +107,7 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid);
- static const struct kfd2kgd_calls kfd2kgd = {
- .init_gtt_mem_allocation = alloc_gtt_mem,
- .free_gtt_mem = free_gtt_mem,
-+ .get_vmem_size = get_vmem_size,
- .get_gpu_clock_counter = get_gpu_clock_counter,
- .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz,
- .program_sh_mem_settings = kgd_program_sh_mem_settings,
-@@ -484,9 +482,7 @@ static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd)
- }
- 
- static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
-- uint32_t queue_id, uint32_t __user *wptr,
-- uint32_t wptr_shift, uint32_t wptr_mask,
-- struct mm_struct *mm)
-+ uint32_t queue_id, uint32_t __user *wptr)
- {
- uint32_t wptr_shadow, is_wptr_shadow_valid;
- struct cik_mqd *m;
-@@ -562,8 +558,7 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
- return 0;
- }
- 
--static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
-- uint32_t __user *wptr, struct mm_struct *mm)
-+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd)
- {
- struct cik_sdma_rlc_registers *m;
- uint32_t sdma_base_addr;
-@@ -641,7 +636,7 @@ static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd)
- return false;
- }
- 
--static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t reset_type,
-+static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type,
- unsigned int timeout, uint32_t pipe_id,
- uint32_t queue_id)
- {
-diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h
-index 3053049..f08273c 100644
---- a/include/drm/drm_drv.h
-+++ b/include/drm/drm_drv.h
-@@ -174,6 +174,8 @@ struct drm_driver {
- * to finalize the device and then freeing the struct themselves.
- */
- void (*release) (struct drm_device *);
-+
-+ int (*set_busid)(struct drm_device *dev, struct drm_master *master);
- 
- /**
- * @get_vblank_counter:
-diff --git a/include/drm/drm_pci.h b/include/drm/drm_pci.h
-index 6745990..4d5daa8 100644
---- a/include/drm/drm_pci.h
-+++ b/include/drm/drm_pci.h
-@@ -49,6 +49,7 @@ void drm_legacy_pci_exit(struct drm_driver *driver, struct pci_driver *pdriver);
- int drm_get_pci_dev(struct pci_dev *pdev,
- const struct pci_device_id *ent,
- struct drm_driver *driver);
-+int drm_pci_set_busid(struct drm_device *dev, struct drm_master *master);
- #else
- static inline int drm_get_pci_dev(struct pci_dev *pdev,
- const struct pci_device_id *ent,
-@@ -56,6 +57,12 @@ static inline int drm_get_pci_dev(struct pci_dev *pdev,
- {
- return -ENOSYS;
- }
-+
-+static inline int drm_pci_set_busid(struct drm_device *dev,
-+ struct drm_master *master)
-+{
-+ return -ENOSYS;
-+}
- #endif
- 
- #define DRM_PCIE_SPEED_25 1
--- 
-2.7.4
-