From 3538bdf4c8b2d8f1f93fd806656bad0c82c6e60f Mon Sep 17 00:00:00 2001
From: Chaudhary Amit Kumar
Date: Thu, 18 Oct 2018 18:06:09 +0530
Subject: [PATCH 1368/4131] drm/amdkfd: revert kfd part to a previous state

Revert the following files to commit "2ba6b00 drm/amd/powerplay: add profile mode for vega10.":
- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd*
- drivers/gpu/drm/amd/amdkfd/*
- drivers/gpu/drm/amd/include/kgd_kfd_interface.h
- include/uapi/linux/kfd_ioctl.h
- drivers/gpu/drm/radeon/radeon_kfd*

Porting the upstream kfd patches to the 4.13 all-open branch produces many
conflicts that are hard to resolve cleanly. Instead, revert the kfd part to
the commit where we first started porting the dkms patches on 4.12 hybrid,
and then re-apply all kfd patches sequentially on top of it.

Change-Id: I75eda45f41ced2f4c444ded126e2b80b53d15f2a
Signed-off-by: Le.Ma
Acked-by: Junwei Zhang
Signed-off-by: kalyan.alle
Signed-off-by: Chaudhary Amit Kumar
---
 drivers/gpu/drm/amd/amdgpu/Makefile | 4 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu.h | 97 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 354 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 184 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c | 196 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 542 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 642 +----
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 1227 ----------
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 2578 --------------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 246 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 50 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 13 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 82 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 133 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 106 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 22 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 5 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c | 21 +-
 drivers/gpu/drm/amd/amdgpu/vid.h | 6 -
 drivers/gpu/drm/amd/amdkfd/Kconfig | 3 +-
 drivers/gpu/drm/amd/amdkfd/Makefile | 23 +-
 drivers/gpu/drm/amd/amdkfd/backport/Makefile | 7 -
 drivers/gpu/drm/amd/amdkfd/backport/backport.h | 6 -
 drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 73 +-
 drivers/gpu/drm/amd/amdkfd/cik_int.h | 24 +-
 drivers/gpu/drm/amd/amdkfd/cik_regs.h | 3 +-
 .../gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h | 1384 -----------
 .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 1388 -----------
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1361 +----------
 drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 1304 ----------
 drivers/gpu/drm/amd/amdkfd/kfd_crat.h | 42 +-
 drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c | 219 +-
 drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h | 32 -
 drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c | 24 +-
 drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h | 27 +-
 drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c | 75 -
 drivers/gpu/drm/amd/amdkfd/kfd_device.c | 890 +------
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 1222 +++-------
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 56 +-
 .../drm/amd/amdkfd/kfd_device_queue_manager_cik.c | 80 +-
 .../drm/amd/amdkfd/kfd_device_queue_manager_v9.c | 90 -
 .../drm/amd/amdkfd/kfd_device_queue_manager_vi.c | 123 +-
 drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 106 +-
 drivers/gpu/drm/amd/amdkfd/kfd_events.c | 253 +-
 drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 124 +-
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 133 -
 drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 91 +-
 drivers/gpu/drm/amd/amdkfd/kfd_ipc.c | 275 ---
 drivers/gpu/drm/amd/amdkfd/kfd_ipc.h | 51 -
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 149 +-
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h | 17 +-
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c | 128 -
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c | 377 ---
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c | 361 ---
 drivers/gpu/drm/amd/amdkfd/kfd_module.c | 61 +-
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 54 -
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h | 18 +-
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 240 +-
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 528 ----
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 329 +--
 drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 556 +++--
 drivers/gpu/drm/amd/amdkfd/kfd_pasid.c | 7 +-
 drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c | 513 ----
 drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h | 330 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h | 583 -----
 drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h | 97 +
 drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h | 140 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 546 +----
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 919 +------
 .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 290 +--
 drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 12 +-
 drivers/gpu/drm/amd/amdkfd/kfd_rdma.c | 294 ---
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 1177 ++++-----
 drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 41 +-
 drivers/gpu/drm/amd/amdkfd/soc15_int.h | 84 -
 drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 231 +-
 drivers/gpu/drm/drm_pci.c | 1 +
 drivers/gpu/drm/radeon/radeon_kfd.c | 19 +-
 include/drm/drm_drv.h | 2 +
 include/drm/drm_pci.h | 7 +
 82 files changed, 3407 insertions(+), 20703 deletions(-)
 delete mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
 mode change 100755 => 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
 mode change 100755 => 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
 delete mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
 delete mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
 mode change 100755 => 100644 drivers/gpu/drm/amd/amdkfd/Makefile
 delete mode 100644 drivers/gpu/drm/amd/amdkfd/backport/Makefile
 delete mode 100644 drivers/gpu/drm/amd/amdkfd/backport/backport.h
 delete mode 100644 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h
 delete mode 100644 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
 delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_crat.c
 delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
 delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
 delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
 delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_ipc.c
 delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_ipc.h
 delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
 delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
 delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c
 delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
 mode change 100755 => 100644 drivers/gpu/drm/amd/amdkfd/kfd_priv.h
 delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_rdma.c
 delete mode 100644 drivers/gpu/drm/amd/amdkfd/soc15_int.h
 mode change 100755 => 100644 drivers/gpu/drm/amd/include/kgd_kfd_interface.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
index 6b373d0..bc6f49e 100755
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -32,7 +32,7 @@ amdgpu-y +=
amdgpu_device.o amdgpu_kms.o \ amdgpu_prime.o amdgpu_vm.o amdgpu_ib.o amdgpu_pll.o \ amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \ amdgpu_gtt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o amdgpu_atomfirmware.o \ - amdgpu_queue_mgr.o amdgpu_vf_error.o amdgpu_sem.o amdgpu_amdkfd_fence.o + amdgpu_queue_mgr.o amdgpu_vf_error.o amdgpu_sem.o # add asic specific block amdgpu-$(CONFIG_DRM_AMDGPU_CIK)+= cik.o cik_ih.o kv_smc.o kv_dpm.o \ @@ -110,8 +110,6 @@ amdgpu-y += \ amdgpu_amdkfd.o \ amdgpu_amdkfd_gfx_v7.o \ amdgpu_amdkfd_gfx_v8.o \ - amdgpu_amdkfd_gfx_v9.o \ - amdgpu_amdkfd_gpuvm.o # add cgs amdgpu-y += amdgpu_cgs.o diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index bcf95e7..b07c90e 100755 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -67,7 +67,6 @@ #include "amdgpu_vce.h" #include "amdgpu_vcn.h" #include "amdgpu_dm.h" -#include "amdgpu_mn.h" #include "gpu_scheduler.h" #include "amdgpu_virt.h" @@ -125,7 +124,6 @@ extern int amdgpu_cntl_sb_buf_per_se; extern int amdgpu_param_buf_per_se; extern int amdgpu_job_hang_limit; extern int amdgpu_lbpw; -extern int amdgpu_compute_multipipe; #ifdef CONFIG_DRM_AMDGPU_SI extern int amdgpu_si_support; @@ -184,8 +182,7 @@ struct amdgpu_cs_parser; struct amdgpu_job; struct amdgpu_irq_src; struct amdgpu_fpriv; -struct kfd_vm_fault_info; -struct amdgpu_bo_va_mapping; +struct kfd_process_device; enum amdgpu_cp_irq { AMDGPU_CP_IRQ_GFX_EOP = 0, @@ -300,25 +297,14 @@ struct amdgpu_buffer_funcs { /* provided by hw blocks that can write ptes, e.g., sdma */ struct amdgpu_vm_pte_funcs { - /* number of dw to reserve per operation */ - unsigned copy_pte_num_dw; - /* copy pte entries from GART */ void (*copy_pte)(struct amdgpu_ib *ib, uint64_t pe, uint64_t src, unsigned count); - /* write pte one entry at a time with addr mapping */ void (*write_pte)(struct amdgpu_ib *ib, uint64_t pe, uint64_t value, unsigned count, uint32_t incr); - - /* maximum nums of PTEs/PDEs in a single operation */ - uint32_t set_max_nums_pte_pde; - - /* number of dw to reserve per operation */ - unsigned set_pte_pde_num_dw; - /* for linear pte/pde updates without addr mapping */ void (*set_pte_pde)(struct amdgpu_ib *ib, uint64_t pe, @@ -397,15 +383,7 @@ struct amdgpu_clock { */ #define AMDGPU_GEM_DOMAIN_MAX 0x3 - -struct amdgpu_gem_object { - struct drm_gem_object base; - struct list_head list; - struct amdgpu_bo *bo; -}; - -struct kgd_mem; -#define gem_to_amdgpu_bo(gobj) container_of((gobj), struct amdgpu_gem_object, base)->bo +#define gem_to_amdgpu_bo(gobj) container_of((gobj), struct amdgpu_bo, gem_base) void amdgpu_gem_object_free(struct drm_gem_object *obj); int amdgpu_gem_object_open(struct drm_gem_object *obj, @@ -421,8 +399,6 @@ amdgpu_gem_prime_import_sg_table(struct drm_device *dev, struct dma_buf *amdgpu_gem_prime_export(struct drm_device *dev, struct drm_gem_object *gobj, int flags); -struct drm_gem_object * -amdgpu_gem_prime_foreign_bo(struct amdgpu_device *adev, struct amdgpu_bo *bo); int amdgpu_gem_prime_pin(struct drm_gem_object *obj); void amdgpu_gem_prime_unpin(struct drm_gem_object *obj); struct reservation_object *amdgpu_gem_prime_res_obj(struct drm_gem_object *); @@ -484,10 +460,9 @@ struct amdgpu_sa_bo { */ void amdgpu_gem_force_release(struct amdgpu_device *adev); int amdgpu_gem_object_create(struct amdgpu_device *adev, unsigned long size, - int alignment, u32 initial_domain, - u64 flags, bool kernel, - struct reservation_object *resv, - struct drm_gem_object **obj); + int alignment, 
u32 initial_domain, + u64 flags, bool kernel, + struct drm_gem_object **obj); int amdgpu_mode_dumb_create(struct drm_file *file_priv, struct drm_device *dev, @@ -545,9 +520,6 @@ struct amdgpu_mc { u64 private_aperture_end; /* protects concurrent invalidation */ spinlock_t invalidate_lock; - - struct kfd_vm_fault_info *vm_fault_info; - atomic_t vm_fault_info_updated; }; /* @@ -730,7 +702,7 @@ int amdgpu_queue_mgr_fini(struct amdgpu_device *adev, struct amdgpu_queue_mgr *mgr); int amdgpu_queue_mgr_map(struct amdgpu_device *adev, struct amdgpu_queue_mgr *mgr, - u32 hw_ip, u32 instance, u32 ring, + int hw_ip, int instance, int ring, struct amdgpu_ring **out_ring); /* @@ -966,7 +938,6 @@ struct amdgpu_gfx_config { }; struct amdgpu_cu_info { - uint32_t simd_per_cu; uint32_t max_waves_per_simd; uint32_t wave_front_size; uint32_t max_scratch_slots_per_cu; @@ -1094,7 +1065,6 @@ struct amdgpu_cs_parser { /* buffer objects */ struct ww_acquire_ctx ticket; struct amdgpu_bo_list *bo_list; - struct amdgpu_mn *mn; struct amdgpu_bo_list_entry vm_pd; struct list_head validated; struct dma_fence *fence; @@ -1236,6 +1206,20 @@ void amdgpu_benchmark(struct amdgpu_device *adev, int test_number); void amdgpu_test_moves(struct amdgpu_device *adev); /* + * MMU Notifier + */ +#if defined(CONFIG_MMU_NOTIFIER) +int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr); +void amdgpu_mn_unregister(struct amdgpu_bo *bo); +#else +static inline int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr) +{ + return -ENODEV; +} +static inline void amdgpu_mn_unregister(struct amdgpu_bo *bo) {} +#endif + +/* * Debugfs */ struct amdgpu_debugfs { @@ -1435,7 +1419,10 @@ struct amdgpu_direct_gma { }; #if defined(CONFIG_ZONE_DEVICE) && \ - (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) || defined(OS_NAME_RHEL_7_3) || defined(OS_NAME_SLE)) + (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) || \ + defined(OS_NAME_RHEL_7_3) || \ + defined(OS_NAME_RHEL_7_4) || \ + defined(OS_NAME_SLE)) #define CONFIG_ENABLE_SSG #endif @@ -1603,14 +1590,18 @@ struct amdgpu_device { /* sdma */ struct amdgpu_sdma sdma; - /* uvd */ - struct amdgpu_uvd uvd; + union { + struct { + /* uvd */ + struct amdgpu_uvd uvd; - /* vce */ - struct amdgpu_vce vce; + /* vce */ + struct amdgpu_vce vce; + }; - /* vcn */ - struct amdgpu_vcn vcn; + /* vcn */ + struct amdgpu_vcn vcn; + }; /* firmwares */ struct amdgpu_firmware firmware; @@ -1655,7 +1646,6 @@ struct amdgpu_device { /* record hw reset is performed */ bool has_hw_reset; u8 reset_magic[AMDGPU_RESET_MAGIC_NUM]; - spinlock_t tlb_invalidation_lock; /* record last mm index being written through WREG32*/ unsigned long last_mm_index; @@ -1861,6 +1851,18 @@ void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes, u64 num_vis_bytes); void amdgpu_ttm_placement_from_domain(struct amdgpu_bo *abo, u32 domain); bool amdgpu_ttm_bo_is_amdgpu_bo(struct ttm_buffer_object *bo); +int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages); +int amdgpu_ttm_tt_set_userptr(struct ttm_tt *ttm, uint64_t addr, + uint32_t flags); +bool amdgpu_ttm_tt_has_userptr(struct ttm_tt *ttm); +struct mm_struct *amdgpu_ttm_tt_get_usermm(struct ttm_tt *ttm); +bool amdgpu_ttm_tt_affect_userptr(struct ttm_tt *ttm, unsigned long start, + unsigned long end); +bool amdgpu_ttm_tt_userptr_invalidated(struct ttm_tt *ttm, + int *last_invalidated); +bool amdgpu_ttm_tt_is_readonly(struct ttm_tt *ttm); +uint64_t amdgpu_ttm_tt_pte_flags(struct amdgpu_device *adev, struct ttm_tt *ttm, + struct ttm_mem_reg *mem); void 
amdgpu_vram_location(struct amdgpu_device *adev, struct amdgpu_mc *mc, u64 base); void amdgpu_gart_location(struct amdgpu_device *adev, struct amdgpu_mc *mc); void amdgpu_ttm_set_active_vram_size(struct amdgpu_device *adev, u64 size); @@ -1943,9 +1945,10 @@ static inline int amdgpu_acpi_init(struct amdgpu_device *adev) { return 0; } static inline void amdgpu_acpi_fini(struct amdgpu_device *adev) { } #endif -int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser, - uint64_t addr, struct amdgpu_bo **bo, - struct amdgpu_bo_va_mapping **mapping); +struct amdgpu_bo_va_mapping * +amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser, + uint64_t addr, struct amdgpu_bo **bo); +int amdgpu_cs_sysvm_access_required(struct amdgpu_cs_parser *parser); #if defined(CONFIG_DRM_AMD_DC) int amdgpu_dm_display_resume(struct amdgpu_device *adev ); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index ec8141f..ef56352 100755 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -20,29 +20,23 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#undef pr_fmt -#define pr_fmt(fmt) "kfd2kgd: " fmt - #include "amdgpu_amdkfd.h" -#include +#include "amd_shared.h" #include #include "amdgpu.h" #include "amdgpu_gfx.h" #include -#define AMDKFD_SKIP_UNCOMPILED_CODE 1 - +const struct kfd2kgd_calls *kfd2kgd; const struct kgd2kfd_calls *kgd2kfd; -bool (*kgd2kfd_init_p)(unsigned int, const struct kgd2kfd_calls**); - -unsigned int global_compute_vmid_bitmap = 0xFF00; +bool (*kgd2kfd_init_p)(unsigned, const struct kgd2kfd_calls**); int amdgpu_amdkfd_init(void) { int ret; #if defined(CONFIG_HSA_AMD_MODULE) - int (*kgd2kfd_init_p)(unsigned int, const struct kgd2kfd_calls**); + int (*kgd2kfd_init_p)(unsigned, const struct kgd2kfd_calls**); kgd2kfd_init_p = symbol_request(kgd2kfd_init); @@ -63,68 +57,56 @@ int amdgpu_amdkfd_init(void) #else ret = -ENOENT; #endif - amdgpu_amdkfd_gpuvm_init_mem_limits(); - return ret; -} -void amdgpu_amdkfd_fini(void) -{ - if (kgd2kfd) { - kgd2kfd->exit(); - symbol_put(kgd2kfd_init); - } + return ret; } -void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev) +bool amdgpu_amdkfd_load_interface(struct amdgpu_device *adev) { - const struct kfd2kgd_calls *kfd2kgd; - - if (!kgd2kfd) - return; - switch (adev->asic_type) { #ifdef CONFIG_DRM_AMDGPU_CIK case CHIP_KAVERI: - case CHIP_HAWAII: kfd2kgd = amdgpu_amdkfd_gfx_7_get_functions(); break; #endif case CHIP_CARRIZO: - case CHIP_TONGA: - case CHIP_FIJI: - case CHIP_POLARIS10: - case CHIP_POLARIS11: kfd2kgd = amdgpu_amdkfd_gfx_8_0_get_functions(); break; - case CHIP_VEGA10: - case CHIP_RAVEN: - kfd2kgd = amdgpu_amdkfd_gfx_9_0_get_functions(); - break; default: - dev_info(adev->dev, "kfd not supported on this ASIC\n"); - return; + return false; + } + + return true; +} + +void amdgpu_amdkfd_fini(void) +{ + if (kgd2kfd) { + kgd2kfd->exit(); + symbol_put(kgd2kfd_init); } +} - adev->kfd = kgd2kfd->probe((struct kgd_dev *)adev, - adev->pdev, kfd2kgd); +void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev) +{ + if (kgd2kfd) + adev->kfd = kgd2kfd->probe((struct kgd_dev *)adev, + adev->pdev, kfd2kgd); } void amdgpu_amdkfd_device_init(struct amdgpu_device *adev) { int i; int last_valid_bit; - if (adev->kfd) { struct kgd2kfd_shared_resources gpu_resources = { - .compute_vmid_bitmap = global_compute_vmid_bitmap, + .compute_vmid_bitmap = 0xFF00, .num_pipe_per_mec = adev->gfx.mec.num_pipe_per_mec, - .num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe, - .gpuvm_size = 
(uint64_t)amdgpu_vm_size << 30 + .num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe }; /* this is going to have a few of the MSBs set that we need to - * clear - */ + * clear */ bitmap_complement(gpu_resources.queue_bitmap, adev->gfx.mec.queue_bitmap, KGD_MAX_QUEUES); @@ -138,8 +120,7 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev) gpu_resources.queue_bitmap); /* According to linux/bitmap.h we shouldn't use bitmap_clear if - * nbits is not compile time constant - */ + * nbits is not compile time constant */ last_valid_bit = 1 /* only first MEC can have compute queues */ * adev->gfx.mec.num_pipe_per_mec * adev->gfx.mec.num_queue_per_pipe; @@ -150,28 +131,6 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev) &gpu_resources.doorbell_physical_address, &gpu_resources.doorbell_aperture_size, &gpu_resources.doorbell_start_offset); - if (adev->asic_type >= CHIP_VEGA10) { - /* On SOC15 the BIF is involved in routing - * doorbells using the low 12 bits of the - * address. Communicate the assignments to - * KFD. KFD uses two doorbell pages per - * process in case of 64-bit doorbells so we - * can use each doorbell assignment twice. - */ - gpu_resources.sdma_doorbell[0][0] = - AMDGPU_DOORBELL64_sDMA_ENGINE0; - gpu_resources.sdma_doorbell[0][1] = - AMDGPU_DOORBELL64_sDMA_ENGINE0 + 0x200; - gpu_resources.sdma_doorbell[1][0] = - AMDGPU_DOORBELL64_sDMA_ENGINE1; - gpu_resources.sdma_doorbell[1][1] = - AMDGPU_DOORBELL64_sDMA_ENGINE1 + 0x200; - /* Doorbells 0x0f0-0ff and 0x2f0-2ff are reserved for - * SDMA, IH and VCN. So don't use them for the CP. - */ - gpu_resources.reserved_doorbell_mask = 0x1f0; - gpu_resources.reserved_doorbell_val = 0x0f0; - } kgd2kfd->device_init(adev->kfd, &gpu_resources); } @@ -208,81 +167,24 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev) return r; } -int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine, - uint32_t vmid, uint64_t gpu_addr, - uint32_t *ib_cmd, uint32_t ib_len) -{ - struct amdgpu_device *adev = (struct amdgpu_device *)kgd; - struct amdgpu_job *job; - struct amdgpu_ib *ib; - struct amdgpu_ring *ring; - struct dma_fence *f = NULL; - int ret; - - switch (engine) { - case KGD_ENGINE_MEC1: - ring = &adev->gfx.compute_ring[0]; - break; - case KGD_ENGINE_SDMA1: - ring = &adev->sdma.instance[0].ring; - break; - case KGD_ENGINE_SDMA2: - ring = &adev->sdma.instance[1].ring; - break; - default: - pr_err("Invalid engine in IB submission: %d\n", engine); - ret = -EINVAL; - goto err; - } - - ret = amdgpu_job_alloc(adev, 1, &job, NULL); - if (ret) - goto err; - - ib = &job->ibs[0]; - memset(ib, 0, sizeof(struct amdgpu_ib)); - - ib->gpu_addr = gpu_addr; - ib->ptr = ib_cmd; - ib->length_dw = ib_len; - /* This works for NO_HWS. 
TODO: need to handle without knowing VMID */ - job->vm_id = vmid; - - ret = amdgpu_ib_schedule(ring, 1, ib, job, &f); - if (ret) { - DRM_ERROR("amdgpu: failed to schedule IB.\n"); - goto err_ib_sched; - } - - ret = dma_fence_wait(f, false); - -err_ib_sched: - dma_fence_put(f); - amdgpu_job_free(job); -err: - return ret; -} - -u32 pool_to_domain(enum kgd_memory_pool p) -{ - switch (p) { - case KGD_POOL_FRAMEBUFFER: return AMDGPU_GEM_DOMAIN_VRAM; - default: return AMDGPU_GEM_DOMAIN_GTT; - } -} - int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, void **mem_obj, uint64_t *gpu_addr, void **cpu_ptr) { struct amdgpu_device *adev = (struct amdgpu_device *)kgd; - struct amdgpu_bo *bo = NULL; + struct kgd_mem **mem = (struct kgd_mem **) mem_obj; int r; - uint64_t gpu_addr_tmp = 0; - void *cpu_ptr_tmp = NULL; + + BUG_ON(kgd == NULL); + BUG_ON(gpu_addr == NULL); + BUG_ON(cpu_ptr == NULL); + + *mem = kmalloc(sizeof(struct kgd_mem), GFP_KERNEL); + if ((*mem) == NULL) + return -ENOMEM; r = amdgpu_bo_create(adev, size, PAGE_SIZE, true, AMDGPU_GEM_DOMAIN_GTT, - AMDGPU_GEM_CREATE_CPU_GTT_USWC, NULL, NULL, 0, &bo); + AMDGPU_GEM_CREATE_CPU_GTT_USWC, NULL, NULL, 0, &(*mem)->bo); if (r) { dev_err(adev->dev, "failed to allocate BO for amdkfd (%d)\n", r); @@ -290,87 +192,64 @@ int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, } /* map the buffer */ - r = amdgpu_bo_reserve(bo, true); + r = amdgpu_bo_reserve((*mem)->bo, true); if (r) { dev_err(adev->dev, "(%d) failed to reserve bo for amdkfd\n", r); goto allocate_mem_reserve_bo_failed; } - r = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT, - &gpu_addr_tmp); + r = amdgpu_bo_pin((*mem)->bo, AMDGPU_GEM_DOMAIN_GTT, + &(*mem)->gpu_addr); if (r) { dev_err(adev->dev, "(%d) failed to pin bo for amdkfd\n", r); goto allocate_mem_pin_bo_failed; } + *gpu_addr = (*mem)->gpu_addr; - r = amdgpu_bo_kmap(bo, &cpu_ptr_tmp); + r = amdgpu_bo_kmap((*mem)->bo, &(*mem)->cpu_ptr); if (r) { dev_err(adev->dev, "(%d) failed to map bo to kernel for amdkfd\n", r); goto allocate_mem_kmap_bo_failed; } + *cpu_ptr = (*mem)->cpu_ptr; - *mem_obj = bo; - *gpu_addr = gpu_addr_tmp; - *cpu_ptr = cpu_ptr_tmp; - - amdgpu_bo_unreserve(bo); + amdgpu_bo_unreserve((*mem)->bo); return 0; allocate_mem_kmap_bo_failed: - amdgpu_bo_unpin(bo); + amdgpu_bo_unpin((*mem)->bo); allocate_mem_pin_bo_failed: - amdgpu_bo_unreserve(bo); + amdgpu_bo_unreserve((*mem)->bo); allocate_mem_reserve_bo_failed: - amdgpu_bo_unref(&bo); + amdgpu_bo_unref(&(*mem)->bo); return r; } void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj) { - struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj; + struct kgd_mem *mem = (struct kgd_mem *) mem_obj; - amdgpu_bo_reserve(bo, true); - amdgpu_bo_kunmap(bo); - amdgpu_bo_unpin(bo); - amdgpu_bo_unreserve(bo); - amdgpu_bo_unref(&(bo)); + BUG_ON(mem == NULL); + + amdgpu_bo_reserve(mem->bo, true); + amdgpu_bo_kunmap(mem->bo); + amdgpu_bo_unpin(mem->bo); + amdgpu_bo_unreserve(mem->bo); + amdgpu_bo_unref(&(mem->bo)); + kfree(mem); } -void get_local_mem_info(struct kgd_dev *kgd, - struct kfd_local_mem_info *mem_info) +uint64_t get_vmem_size(struct kgd_dev *kgd) { - uint64_t address_mask; - resource_size_t aper_limit; - struct amdgpu_device *adev = (struct amdgpu_device *)kgd; + struct amdgpu_device *adev = + (struct amdgpu_device *)kgd; - address_mask = adev->dev->dma_mask ? 
~*adev->dev->dma_mask : - ~((1ULL << 32) - 1); - aper_limit = adev->mc.aper_base + adev->mc.aper_size; - - memset(mem_info, 0, sizeof(*mem_info)); - if (!(adev->mc.aper_base & address_mask || - aper_limit & address_mask)) { - mem_info->local_mem_size_public = adev->mc.visible_vram_size; - mem_info->local_mem_size_private = adev->mc.real_vram_size - - adev->mc.visible_vram_size; - } else { - mem_info->local_mem_size_public = 0; - mem_info->local_mem_size_private = adev->mc.real_vram_size; - } - mem_info->vram_width = adev->mc.vram_width; + BUG_ON(kgd == NULL); - pr_debug("Address base: 0x%llx limit 0x%llx public 0x%llx private 0x%llx\n", - adev->mc.aper_base, aper_limit, - mem_info->local_mem_size_public, - mem_info->local_mem_size_private); - - if (amdgpu_sriov_vf(adev)) - mem_info->mem_clk_max = adev->clock.default_mclk / 100; - else - mem_info->mem_clk_max = amdgpu_dpm_get_mclk(adev, false) / 100; + return adev->mc.real_vram_size; } uint64_t get_gpu_clock_counter(struct kgd_dev *kgd) @@ -385,113 +264,6 @@ uint64_t get_gpu_clock_counter(struct kgd_dev *kgd) uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd) { struct amdgpu_device *adev = (struct amdgpu_device *)kgd; - - /* the sclk is in quantas of 10kHz */ - if (amdgpu_sriov_vf(adev)) - return adev->clock.default_sclk / 100; - - return amdgpu_dpm_get_sclk(adev, false) / 100; -} - -void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info) -{ - struct amdgpu_device *adev = (struct amdgpu_device *)kgd; - struct amdgpu_cu_info acu_info = adev->gfx.cu_info; - - memset(cu_info, 0, sizeof(*cu_info)); - if (sizeof(cu_info->cu_bitmap) != sizeof(acu_info.bitmap)) - return; - - cu_info->cu_active_number = acu_info.number; - cu_info->cu_ao_mask = acu_info.ao_cu_mask; - memcpy(&cu_info->cu_bitmap[0], &acu_info.bitmap[0], - sizeof(acu_info.bitmap)); - cu_info->num_shader_engines = adev->gfx.config.max_shader_engines; - cu_info->num_shader_arrays_per_engine = adev->gfx.config.max_sh_per_se; - cu_info->num_cu_per_sh = adev->gfx.config.max_cu_per_sh; - cu_info->simd_per_cu = acu_info.simd_per_cu; - cu_info->max_waves_per_simd = acu_info.max_waves_per_simd; - cu_info->wave_front_size = acu_info.wave_front_size; - cu_info->max_scratch_slots_per_cu = acu_info.max_scratch_slots_per_cu; - cu_info->lds_size = acu_info.lds_size; -} - -int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd, - struct kgd_dev **dma_buf_kgd, - uint64_t *bo_size, void *metadata_buffer, - size_t buffer_size, uint32_t *metadata_size, - uint32_t *flags) -{ - struct amdgpu_device *adev = (struct amdgpu_device *)kgd; - struct dma_buf *dma_buf; - struct drm_gem_object *obj; - struct amdgpu_bo *bo; - uint64_t metadata_flags; - int r = -EINVAL; - - dma_buf = dma_buf_get(dma_buf_fd); - if (IS_ERR(dma_buf)) - return PTR_ERR(dma_buf); - - if (dma_buf->ops != &drm_gem_prime_dmabuf_ops) - /* Can't handle non-graphics buffers */ - goto out_put; - - obj = dma_buf->priv; - if (obj->dev->driver != adev->ddev->driver) - /* Can't handle buffers from different drivers */ - goto out_put; - - adev = obj->dev->dev_private; - bo = gem_to_amdgpu_bo(obj); - if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | - AMDGPU_GEM_DOMAIN_GTT | - AMDGPU_GEM_DOMAIN_DGMA))) - /* Only VRAM, GTT and DGMA BOs are supported */ - goto out_put; - - r = 0; - if (dma_buf_kgd) - *dma_buf_kgd = (struct kgd_dev *)adev; - if (bo_size) - *bo_size = amdgpu_bo_size(bo); - if (metadata_size) - *metadata_size = bo->metadata_size; - if (metadata_buffer) - r = amdgpu_bo_get_metadata(bo, metadata_buffer, 
buffer_size, - metadata_size, &metadata_flags); - if (flags) { - /* If the preferred domain is DGMA, set flags to VRAM because - * KFD doesn't support allocating DGMA memory - */ - *flags = (bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | - AMDGPU_GEM_DOMAIN_DGMA)) ? - ALLOC_MEM_FLAGS_VRAM : ALLOC_MEM_FLAGS_GTT; - - if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) - *flags |= ALLOC_MEM_FLAGS_PUBLIC; - } - -out_put: - dma_buf_put(dma_buf); - return r; -} - -uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd) -{ - struct amdgpu_device *adev = (struct amdgpu_device *)kgd; - uint64_t usage = - amdgpu_vram_mgr_usage(&adev->mman.bdev.man[TTM_PL_VRAM]); - return usage; -} - -bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, - u32 vmid) -{ - if (adev->kfd) { - if ((1 << vmid) & global_compute_vmid_bitmap) - return true; - } - - return false; + /* The sclk is in quantas of 10kHz */ + return adev->pm.dpm.dyn_state.max_clock_voltage_on_ac.sclk / 100; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index b259ba7..8e8c10e 100755 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -27,109 +27,21 @@ #include #include -#include -#include #include -#include "amdgpu.h" - -extern const struct kgd2kfd_calls *kgd2kfd; struct amdgpu_device; -struct kfd_bo_va_list { - struct list_head bo_list; - struct amdgpu_bo_va *bo_va; - void *kgd_dev; - bool is_mapped; - bool map_fail; - uint64_t va; - uint64_t pte_flags; -}; - struct kgd_mem { - struct mutex lock; struct amdgpu_bo *bo; - struct list_head bo_va_list; - /* protected by amdkfd_process_info.lock */ - struct ttm_validate_buffer validate_list; - struct ttm_validate_buffer resv_list; - uint32_t domain; - unsigned int mapped_to_gpu_memory; - void *kptr; - uint64_t va; - - uint32_t mapping_flags; - - atomic_t invalid; - struct amdkfd_process_info *process_info; - struct page **user_pages; - - struct amdgpu_sync sync; - - /* flags bitfield */ - bool coherent : 1; - bool no_substitute : 1; - bool aql_queue : 1; -}; - -/* KFD Memory Eviction */ -struct amdgpu_amdkfd_fence { - struct dma_fence base; - void *mm; - spinlock_t lock; - char timeline_name[TASK_COMM_LEN]; -}; - -struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context, - void *mm); -bool amd_kfd_fence_check_mm(struct dma_fence *f, void *mm); -struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f); - -struct amdkfd_process_info { - /* List head of all VMs that belong to a KFD process */ - struct list_head vm_list_head; - /* List head for all KFD BOs that belong to a KFD process. */ - struct list_head kfd_bo_list; - /* List of userptr BOs that are valid or invalid */ - struct list_head userptr_valid_list; - struct list_head userptr_inval_list; - /* Lock to protect kfd_bo_list */ - struct mutex lock; - - /* Number of VMs */ - unsigned int n_vms; - /* Eviction Fence */ - struct amdgpu_amdkfd_fence *eviction_fence; - - /* MMU-notifier related fields */ - atomic_t evicted_bos; - struct delayed_work work; - struct pid *pid; -}; - -/* struct amdkfd_vm - - * For Memory Eviction KGD requires a mechanism to keep track of all KFD BOs - * belonging to a KFD process. All the VMs belonging to the same process point - * to the same amdkfd_process_info. - */ -struct amdkfd_vm { - /* Keep base as the first parameter for pointer compatibility between - * amdkfd_vm and amdgpu_vm. 
- */ - struct amdgpu_vm base; - - /* List node in amdkfd_process_info.vm_list_head*/ - struct list_head vm_list_node; - - struct amdgpu_device *adev; - /* Points to the KFD process VM info*/ - struct amdkfd_process_info *process_info; + uint64_t gpu_addr; + void *cpu_ptr; }; int amdgpu_amdkfd_init(void); void amdgpu_amdkfd_fini(void); +bool amdgpu_amdkfd_load_interface(struct amdgpu_device *adev); void amdgpu_amdkfd_suspend(struct amdgpu_device *adev); int amdgpu_amdkfd_resume(struct amdgpu_device *adev); @@ -139,105 +51,17 @@ void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev); void amdgpu_amdkfd_device_init(struct amdgpu_device *adev); void amdgpu_amdkfd_device_fini(struct amdgpu_device *adev); -int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, struct mm_struct *mm); -int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine, - uint32_t vmid, uint64_t gpu_addr, - uint32_t *ib_cmd, uint32_t ib_len); -int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info, - struct dma_fence **ef); struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void); struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void); -struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void); -int amdgpu_amdkfd_copy_mem_to_mem(struct kgd_dev *kgd, struct kgd_mem *src_mem, - uint64_t src_offset, struct kgd_mem *dst_mem, - uint64_t dest_offset, uint64_t size, struct dma_fence **f, - uint64_t *actual_size); - -bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, - u32 vmid); /* Shared API */ -int map_bo(struct amdgpu_device *rdev, uint64_t va, void *vm, - struct amdgpu_bo *bo, struct amdgpu_bo_va **bo_va); int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, void **mem_obj, uint64_t *gpu_addr, void **cpu_ptr); void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj); -void get_local_mem_info(struct kgd_dev *kgd, - struct kfd_local_mem_info *mem_info); +uint64_t get_vmem_size(struct kgd_dev *kgd); uint64_t get_gpu_clock_counter(struct kgd_dev *kgd); uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd); -void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info); -int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd, - struct kgd_dev **dmabuf_kgd, - uint64_t *bo_size, void *metadata_buffer, - size_t buffer_size, uint32_t *metadata_size, - uint32_t *flags); -uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd); - -#define read_user_wptr(mmptr, wptr, dst) \ - ({ \ - bool valid = false; \ - if ((mmptr) && (wptr)) { \ - if ((mmptr) == current->mm) { \ - valid = !get_user((dst), (wptr)); \ - } else if (current->mm == NULL) { \ - use_mm(mmptr); \ - valid = !get_user((dst), (wptr)); \ - unuse_mm(mmptr); \ - } \ - } \ - valid; \ - }) - -/* GPUVM API */ -int amdgpu_amdkfd_gpuvm_sync_memory( - struct kgd_dev *kgd, struct kgd_mem *mem, bool intr); -int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( - struct kgd_dev *kgd, uint64_t va, uint64_t size, - void *vm, struct kgd_mem **mem, - uint64_t *offset, uint32_t flags); -int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( - struct kgd_dev *kgd, struct kgd_mem *mem, void *vm); -int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( - struct kgd_dev *kgd, struct kgd_mem *mem, void *vm); -int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( - struct kgd_dev *kgd, struct kgd_mem *mem, void *vm); -int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm, - void **process_info, - struct dma_fence **ef); -void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm); - -uint32_t 
amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm); - -int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd, - struct kfd_vm_fault_info *info); - -int amdgpu_amdkfd_gpuvm_mmap_bo( - struct kgd_dev *kgd, struct vm_area_struct *vma); - -int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd, - struct kgd_mem *mem, void **kptr); - -int amdgpu_amdkfd_gpuvm_pin_get_sg_table(struct kgd_dev *kgd, - struct kgd_mem *mem, uint64_t offset, - uint64_t size, struct sg_table **ret_sg); -void amdgpu_amdkfd_gpuvm_unpin_put_sg_table( - struct kgd_mem *mem, struct sg_table *sg); -int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd, - struct dma_buf *dmabuf, - uint64_t va, void *vm, - struct kgd_mem **mem, uint64_t *size, - uint64_t *mmap_offset); -int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_dev *kgd, void *vm, - struct kgd_mem *mem, - struct dma_buf **dmabuf); -int amdgpu_amdkfd_gpuvm_evict_mem(struct kgd_mem *mem, struct mm_struct *mm); -int amdgpu_amdkfd_gpuvm_restore_mem(struct kgd_mem *mem, struct mm_struct *mm); - -void amdgpu_amdkfd_gpuvm_init_mem_limits(void); -void amdgpu_amdkfd_unreserve_system_memory_limit(struct amdgpu_bo *bo); #endif /* AMDGPU_AMDKFD_H_INCLUDED */ - diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c deleted file mode 100644 index 3961937..0000000 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Copyright 2016 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include -#include -#include -#include -#include -#include "amdgpu_amdkfd.h" - -const struct dma_fence_ops amd_kfd_fence_ops; -static atomic_t fence_seq = ATOMIC_INIT(0); - -static int amd_kfd_fence_signal(struct dma_fence *f); - -/* Eviction Fence - * Fence helper functions to deal with KFD memory eviction. - * Big Idea - Since KFD submissions are done by user queues, a BO cannot be - * evicted unless all the user queues for that process are evicted. - * - * All the BOs in a process share an eviction fence. When process X wants - * to map VRAM memory but TTM can't find enough space, TTM will attempt to - * evict BOs from its LRU list. TTM checks if the BO is valuable to evict - * by calling ttm_bo_driver->eviction_valuable(). - * - * ttm_bo_driver->eviction_valuable() - will return false if the BO belongs - * to process X. Otherwise, it will return true to indicate BO can be - * evicted by TTM. 
- * - * If ttm_bo_driver->eviction_valuable returns true, then TTM will continue - * the evcition process for that BO by calling ttm_bo_evict --> amdgpu_bo_move - * --> amdgpu_copy_buffer(). This sets up job in GPU scheduler. - * - * GPU Scheduler (amd_sched_main) - sets up a cb (fence_add_callback) to - * nofity when the BO is free to move. fence_add_callback --> enable_signaling - * --> amdgpu_amdkfd_fence.enable_signaling - * - * amdgpu_amdkfd_fence.enable_signaling - Start a work item that will quiesce - * user queues and signal fence. The work item will also start another delayed - * work item to restore BOs - */ - -struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context, - void *mm) -{ - struct amdgpu_amdkfd_fence *fence = NULL; - - fence = kzalloc(sizeof(*fence), GFP_KERNEL); - if (fence == NULL) - return NULL; - - /* mm_struct mm is used as void pointer to identify the parent - * KFD process. Don't dereference it. Fence and any threads using - * mm is guranteed to be released before process termination. - */ - fence->mm = mm; - get_task_comm(fence->timeline_name, current); - spin_lock_init(&fence->lock); - - dma_fence_init(&fence->base, &amd_kfd_fence_ops, &fence->lock, - context, atomic_inc_return(&fence_seq)); - - return fence; -} - -struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f) -{ - struct amdgpu_amdkfd_fence *fence; - - if (!f) - return NULL; - - fence = container_of(f, struct amdgpu_amdkfd_fence, base); - if (fence && f->ops == &amd_kfd_fence_ops) - return fence; - - return NULL; -} - -static const char *amd_kfd_fence_get_driver_name(struct dma_fence *f) -{ - return "amdgpu_amdkfd_fence"; -} - -static const char *amd_kfd_fence_get_timeline_name(struct dma_fence *f) -{ - struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f); - - return fence->timeline_name; -} - -/** - * amd_kfd_fence_enable_signaling - This gets called when TTM wants to evict - * a KFD BO and schedules a job to move the BO. - * If fence is already signaled return true. - * If fence is not signaled schedule a evict KFD process work item. - */ -static bool amd_kfd_fence_enable_signaling(struct dma_fence *f) -{ - struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f); - - if (!fence) - return false; - - if (dma_fence_is_signaled(f)) - return true; - - if (!kgd2kfd->schedule_evict_and_restore_process( - (struct mm_struct *)fence->mm, f)) - return true; - - return false; -} - -static int amd_kfd_fence_signal(struct dma_fence *f) -{ - unsigned long flags; - int ret; - - spin_lock_irqsave(f->lock, flags); - /* Set enabled bit so cb will called */ - set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &f->flags); - ret = dma_fence_signal_locked(f); - spin_unlock_irqrestore(f->lock, flags); - - return ret; -} - -/** - * amd_kfd_fence_release - callback that fence can be freed - * - * @fence: fence - * - * This function is called when the reference count becomes zero. - * It just RCU schedules freeing up the fence. -*/ -static void amd_kfd_fence_release(struct dma_fence *f) -{ - struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f); - /* Unconditionally signal the fence. The process is getting - * terminated. - */ - if (WARN_ON(!fence)) - return; /* Not an amdgpu_amdkfd_fence */ - - amd_kfd_fence_signal(f); - kfree_rcu(f, rcu); -} - -/** - * amd_kfd_fence_check_mm - Check if @mm is same as that of the fence @f - * if same return TRUE else return FALSE. 
- * - * @f: [IN] fence - * @mm: [IN] mm that needs to be verified -*/ -bool amd_kfd_fence_check_mm(struct dma_fence *f, void *mm) -{ - struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f); - - if (!fence) - return false; - else if (fence->mm == mm) - return true; - - return false; -} - -const struct dma_fence_ops amd_kfd_fence_ops = { - .get_driver_name = amd_kfd_fence_get_driver_name, - .get_timeline_name = amd_kfd_fence_get_timeline_name, - .enable_signaling = amd_kfd_fence_enable_signaling, - .signaled = NULL, - .wait = dma_fence_default_wait, - .release = amd_kfd_fence_release, -}; - diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c old mode 100755 new mode 100644 index 6964ece..f6acf48 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c @@ -20,9 +20,6 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#undef pr_fmt -#define pr_fmt(fmt) "kfd2kgd: " fmt - #include #include #include @@ -42,14 +39,6 @@ #include "gmc/gmc_7_1_sh_mask.h" #include "cik_structs.h" -#define AMDKFD_SKIP_UNCOMPILED_CODE 1 - -enum hqd_dequeue_request_type { - NO_ACTION = 0, - DRAIN_PIPE, - RESET_WAVES -}; - enum { MAX_TRAPID = 8, /* 3 bits in the bitfield. */ MAX_WATCH_ADDRESSES = 4 @@ -66,8 +55,8 @@ enum { enum { ADDRESS_WATCH_REG_CNTL_ATC_BIT = 0x10000000UL, ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK = 0x00FFFFFF, - ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENTION = 0x03000000, - /* extend the mask to 26 bits in order to match the low address field */ + ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENSION = 0x03000000, + /* extend the mask to 26 bits to match the low address field */ ADDRESS_WATCH_REG_ADDLOW_SHIFT = 6, ADDRESS_WATCH_REG_ADDHIGH_MASK = 0xFFFF }; @@ -92,42 +81,30 @@ union TCP_WATCH_CNTL_BITS { float f32All; }; -static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, - int fd, uint32_t handle, struct kgd_mem **mem); - -static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); - /* * Register access functions */ static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, - uint32_t sh_mem_config, uint32_t sh_mem_ape1_base, - uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases); + uint32_t sh_mem_config, uint32_t sh_mem_ape1_base, + uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases); + static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, - unsigned int vmid); + unsigned int vmid); + static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, - uint32_t hpd_size, uint64_t hpd_gpu_addr); + uint32_t hpd_size, uint64_t hpd_gpu_addr); static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - uint32_t queue_id, uint32_t __user *wptr, - uint32_t wptr_shift, uint32_t wptr_mask, - struct mm_struct *mm); -static int kgd_hqd_dump(struct kgd_dev *kgd, - uint32_t pipe_id, uint32_t queue_id, - uint32_t (**dump)[2], uint32_t *n_regs); -static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, - uint32_t __user *wptr, struct mm_struct *mm); -static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, - uint32_t engine_id, uint32_t queue_id, - uint32_t (**dump)[2], uint32_t *n_regs); + uint32_t queue_id, uint32_t __user *wptr); +static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd); static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, - uint32_t pipe_id, uint32_t queue_id); -static bool kgd_hqd_sdma_is_occupied(struct kgd_dev 
*kgd, void *mqd); -static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, - enum kfd_preempt_type reset_type, + uint32_t pipe_id, uint32_t queue_id); + +static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, unsigned int utimeout, uint32_t pipe_id, uint32_t queue_id); +static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, unsigned int utimeout); static int kgd_address_watch_disable(struct kgd_dev *kgd); @@ -147,60 +124,21 @@ static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, uint8_t vmid); static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, uint8_t vmid); static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); -static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid); -static void set_num_of_requests(struct kgd_dev *dev, uint8_t num_of_req); -static int alloc_memory_of_scratch(struct kgd_dev *kgd, - uint64_t va, uint32_t vmid); -static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, - uint8_t element_size, uint8_t index_stride, uint8_t mtype); -static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, - uint32_t page_table_base); -static uint32_t read_vmid_from_vmfault_reg(struct kgd_dev *kgd); - -/* Because of REG_GET_FIELD() being used, we put this function in the - * asic specific file. - */ -static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, - struct tile_config *config) -{ - struct amdgpu_device *adev = (struct amdgpu_device *)kgd; - - config->gb_addr_config = adev->gfx.config.gb_addr_config; - config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, - MC_ARB_RAMCFG, NOOFBANK); - config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, - MC_ARB_RAMCFG, NOOFRANKS); - config->tile_config_ptr = adev->gfx.config.tile_mode_array; - config->num_tile_configs = - ARRAY_SIZE(adev->gfx.config.tile_mode_array); - config->macro_tile_config_ptr = - adev->gfx.config.macrotile_mode_array; - config->num_macro_tile_configs = - ARRAY_SIZE(adev->gfx.config.macrotile_mode_array); - - - return 0; -} +static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); static const struct kfd2kgd_calls kfd2kgd = { .init_gtt_mem_allocation = alloc_gtt_mem, .free_gtt_mem = free_gtt_mem, - .get_local_mem_info = get_local_mem_info, + .get_vmem_size = get_vmem_size, .get_gpu_clock_counter = get_gpu_clock_counter, .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, - .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm, - .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm, - .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir, - .open_graphic_handle = open_graphic_handle, .program_sh_mem_settings = kgd_program_sh_mem_settings, .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, .init_pipeline = kgd_init_pipeline, .init_interrupts = kgd_init_interrupts, .hqd_load = kgd_hqd_load, .hqd_sdma_load = kgd_hqd_sdma_load, - .hqd_dump = kgd_hqd_dump, - .hqd_sdma_dump = kgd_hqd_sdma_dump, .hqd_is_occupied = kgd_hqd_is_occupied, .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, .hqd_destroy = kgd_hqd_destroy, @@ -209,50 +147,17 @@ static const struct kfd2kgd_calls kfd2kgd = { .address_watch_execute = kgd_address_watch_execute, .wave_control_execute = kgd_wave_control_execute, .address_watch_get_offset = kgd_address_watch_get_offset, - .get_atc_vmid_pasid_mapping_pasid = - get_atc_vmid_pasid_mapping_pasid, - .get_atc_vmid_pasid_mapping_valid = - 
get_atc_vmid_pasid_mapping_valid, - .read_vmid_from_vmfault_reg = read_vmid_from_vmfault_reg, + .get_atc_vmid_pasid_mapping_pasid = get_atc_vmid_pasid_mapping_pasid, + .get_atc_vmid_pasid_mapping_valid = get_atc_vmid_pasid_mapping_valid, .write_vmid_invalidate_request = write_vmid_invalidate_request, - .invalidate_tlbs = invalidate_tlbs, - .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory, - .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, - .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, - .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, - .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu, - .get_fw_version = get_fw_version, - .set_num_of_requests = set_num_of_requests, - .get_cu_info = get_cu_info, - .alloc_memory_of_scratch = alloc_memory_of_scratch, - .write_config_static_mem = write_config_static_mem, - .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo, - .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel, - .set_vm_context_page_table_base = set_vm_context_page_table_base, - .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table, - .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table, - .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info, - .import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf, - .export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf, - .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info, - .submit_ib = amdgpu_amdkfd_submit_ib, - .get_tile_config = amdgpu_amdkfd_get_tile_config, - .restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos, - .copy_mem_to_mem = amdgpu_amdkfd_copy_mem_to_mem, - .get_vram_usage = amdgpu_amdkfd_get_vram_usage + .get_fw_version = get_fw_version }; -struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions() +struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void) { return (struct kfd2kgd_calls *)&kfd2kgd; } -static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, - int fd, uint32_t handle, struct kgd_mem **mem) -{ - return 0; -} - static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) { return (struct amdgpu_device *)kgd; @@ -281,7 +186,7 @@ static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, { struct amdgpu_device *adev = get_amdgpu_device(kgd); - uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; + uint32_t mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); lock_srbm(kgd, mec, pipe, queue_id, 0); @@ -317,12 +222,12 @@ static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, /* * We have to assume that there is no outstanding mapping. - * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because a - * mapping is in progress or because a mapping finished and the SW - * cleared it. So the protocol is to always wait & clear. + * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because + * a mapping is in progress or because a mapping finished and the + * SW cleared it. So the protocol is to always wait & clear. */ - uint32_t pasid_mapping = (pasid == 0) ? 0 : - (uint32_t)pasid | ATC_VMID0_PASID_MAPPING__VALID_MASK; + uint32_t pasid_mapping = (pasid == 0) ? 
0 : (uint32_t)pasid | + ATC_VMID0_PASID_MAPPING__VALID_MASK; WREG32(mmATC_VMID0_PASID_MAPPING + vmid, pasid_mapping); @@ -368,7 +273,8 @@ static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m) retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET + m->sdma_queue_id * KFD_CIK_SDMA_QUEUE_OFFSET; - pr_debug("sdma base address: 0x%x\n", retval); + + pr_debug("kfd: sdma base address: 0x%x\n", retval); return retval; } @@ -384,138 +290,42 @@ static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) } static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - uint32_t queue_id, uint32_t __user *wptr, - uint32_t wptr_shift, uint32_t wptr_mask, - struct mm_struct *mm) + uint32_t queue_id, uint32_t __user *wptr) { struct amdgpu_device *adev = get_amdgpu_device(kgd); + uint32_t wptr_shadow, is_wptr_shadow_valid; struct cik_mqd *m; - uint32_t *mqd_hqd; - uint32_t reg, wptr_val, data; - bool valid_wptr = false; m = get_mqd(mqd); - acquire_queue(kgd, pipe_id, queue_id); - - /* HQD registers extend from CP_MQD_BASE_ADDR to CP_MQD_CONTROL. */ - mqd_hqd = &m->cp_mqd_base_addr_lo; - - for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_MQD_CONTROL; reg++) - WREG32(reg, mqd_hqd[reg - mmCP_MQD_BASE_ADDR]); - - /* Copy userspace write pointer value to register. - * Activate doorbell logic to monitor subsequent changes. - */ - data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control, - CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); - WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, data); - - /* read_user_ptr may take the mm->mmap_sem. - * release srbm_mutex to avoid circular dependency between - * srbm_mutex->mm_sem->reservation_ww_class_mutex->srbm_mutex. - */ - release_queue(kgd); - valid_wptr = read_user_wptr(mm, wptr, wptr_val); + is_wptr_shadow_valid = !get_user(wptr_shadow, wptr); + if (is_wptr_shadow_valid) + m->cp_hqd_pq_wptr = wptr_shadow; acquire_queue(kgd, pipe_id, queue_id); - if (valid_wptr) - WREG32(mmCP_HQD_PQ_WPTR, (wptr_val << wptr_shift) & wptr_mask); - - data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); - WREG32(mmCP_HQD_ACTIVE, data); - + gfx_v7_0_mqd_commit(adev, m); release_queue(kgd); return 0; } -static int kgd_hqd_dump(struct kgd_dev *kgd, - uint32_t pipe_id, uint32_t queue_id, - uint32_t (**dump)[2], uint32_t *n_regs) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - uint32_t i = 0, reg; -#define HQD_N_REGS (35+4) -#define DUMP_REG(addr) do { \ - if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ - break; \ - (*dump)[i][0] = (addr) << 2; \ - (*dump)[i++][1] = RREG32(addr); \ - } while (0) - - *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); - if (*dump == NULL) - return -ENOMEM; - - acquire_queue(kgd, pipe_id, queue_id); - - DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE0); - DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE1); - DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE2); - DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE3); - - for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_MQD_CONTROL; reg++) - DUMP_REG(reg); - - release_queue(kgd); - - WARN_ON_ONCE(i != HQD_N_REGS); - *n_regs = i; - - return 0; -} - -static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, - uint32_t __user *wptr, struct mm_struct *mm) +static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) { struct amdgpu_device *adev = get_amdgpu_device(kgd); struct cik_sdma_rlc_registers *m; unsigned long end_jiffies; uint32_t sdma_base_addr; - uint32_t data; m = get_sdma_mqd(mqd); sdma_base_addr = get_sdma_base_addr(m); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, - m->sdma_rlc_rb_cntl & 
(~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); - - while (true) { - temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); - if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) - break; - if (timeout == 0) - return -ETIME; - msleep(10); - timeout -= 10; - } - if (m->sdma_engine_id) { - data = RREG32(mmSDMA1_GFX_CONTEXT_CNTL); - data = REG_SET_FIELD(data, SDMA1_GFX_CONTEXT_CNTL, - RESUME_CTX, 0); - WREG32(mmSDMA1_GFX_CONTEXT_CNTL, data); - } else { - data = RREG32(mmSDMA0_GFX_CONTEXT_CNTL); - data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, - RESUME_CTX, 0); - WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data); - } - - data = REG_SET_FIELD(m->sdma_rlc_doorbell, SDMA0_RLC0_DOORBELL, - ENABLE, 1); - WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdma_rlc_rb_rptr); - if (read_user_wptr(mm, wptr, data)) - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, data); - else - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, - m->sdma_rlc_rb_rptr); - + WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, + m->sdma_rlc_virtual_addr); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, + m->sdma_rlc_rb_base); WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, m->sdma_rlc_virtual_addr); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdma_rlc_rb_base); WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, m->sdma_rlc_rb_base_hi); @@ -523,35 +333,11 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, m->sdma_rlc_rb_rptr_addr_lo); WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, m->sdma_rlc_rb_rptr_addr_hi); - data = REG_SET_FIELD(m->sdma_rlc_rb_cntl, SDMA0_RLC0_RB_CNTL, - RB_ENABLE, 1); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data); - return 0; -} + WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, + m->sdma_rlc_doorbell); -static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, - uint32_t engine_id, uint32_t queue_id, - uint32_t (**dump)[2], uint32_t *n_regs) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - uint32_t sdma_offset = engine_id * SDMA1_REGISTER_OFFSET + - queue_id * KFD_CIK_SDMA_QUEUE_OFFSET; - uint32_t i = 0, reg; -#undef HQD_N_REGS -#define HQD_N_REGS (19+4) - - *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); - if (*dump == NULL) - return -ENOMEM; - - for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) - DUMP_REG(sdma_offset + reg); - for (reg = mmSDMA0_RLC0_VIRTUAL_ADDR; reg <= mmSDMA0_RLC0_WATERMARK; - reg++) - DUMP_REG(sdma_offset + reg); - - WARN_ON_ONCE(i != HQD_N_REGS); - *n_regs = i; + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, + m->sdma_rlc_rb_cntl); return 0; } @@ -596,99 +382,30 @@ static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) return false; } -static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, - enum kfd_preempt_type reset_type, +static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, unsigned int utimeout, uint32_t pipe_id, uint32_t queue_id) { struct amdgpu_device *adev = get_amdgpu_device(kgd); uint32_t temp; - enum hqd_dequeue_request_type type; - unsigned long flags, end_jiffies; - int retry; + int timeout = utimeout; acquire_queue(kgd, pipe_id, queue_id); WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, 0); - switch (reset_type) { - case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: - type = DRAIN_PIPE; - break; - case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: - type = RESET_WAVES; - break; - default: - type = DRAIN_PIPE; - break; - } - - /* Workaround: If IQ timer is active and the wait time is close to or - * equal to 0, dequeueing is not safe. 
Wait until either the wait time - * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is - * cleared before continuing. Also, ensure wait times are set to at - * least 0x3. - */ - local_irq_save(flags); - preempt_disable(); - retry = 5000; /* wait for 500 usecs at maximum */ - while (true) { - temp = RREG32(mmCP_HQD_IQ_TIMER); - if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) { - pr_debug("HW is processing IQ\n"); - goto loop; - } - if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) { - if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE) - == 3) /* SEM-rearm is safe */ - break; - /* Wait time 3 is safe for CP, but our MMIO read/write - * time is close to 1 microsecond, so check for 10 to - * leave more buffer room - */ - if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME) - >= 10) - break; - pr_debug("IQ timer is active\n"); - } else - break; -loop: - if (!retry) { - pr_err("CP HQD IQ timer status time out\n"); - break; - } - ndelay(100); - --retry; - } - retry = 1000; - while (true) { - temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST); - if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK)) - break; - pr_debug("Dequeue request is pending\n"); - - if (!retry) { - pr_err("CP HQD dequeue request time out\n"); - break; - } - ndelay(100); - --retry; - } - local_irq_restore(flags); - preempt_enable(); - - WREG32(mmCP_HQD_DEQUEUE_REQUEST, type); + WREG32(mmCP_HQD_DEQUEUE_REQUEST, reset_type); - end_jiffies = (utimeout * HZ / 1000) + jiffies; while (true) { temp = RREG32(mmCP_HQD_ACTIVE); - if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) + if (temp & CP_HQD_ACTIVE__ACTIVE_MASK) break; - if (time_after(jiffies, end_jiffies)) { - pr_err("cp queue preemption time out\n"); + if (timeout <= 0) { + pr_err("kfd: cp queue preemption time out.\n"); release_queue(kgd); return -ETIME; } - usleep_range(500, 1000); + msleep(20); + timeout -= 20; } release_queue(kgd); @@ -702,7 +419,7 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, struct cik_sdma_rlc_registers *m; uint32_t sdma_base_addr; uint32_t temp; - unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; + int timeout = utimeout; m = get_sdma_mqd(mqd); sdma_base_addr = get_sdma_base_addr(m); @@ -713,19 +430,18 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, while (true) { temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); - if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) + if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT) break; - if (time_after(jiffies, end_jiffies)) + if (timeout <= 0) return -ETIME; - usleep_range(500, 1000); + msleep(20); + timeout -= 20; } WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, - RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | - SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); - - m->sdma_rlc_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, 0); return 0; } @@ -744,9 +460,8 @@ static int kgd_address_watch_disable(struct kgd_dev *kgd) /* Turning off this address until we set all the registers */ for (i = 0; i < MAX_WATCH_ADDRESSES; i++) - WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX - + ADDRESS_WATCH_REG_CNTL], - cntl.u32All); + WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX + + ADDRESS_WATCH_REG_CNTL], cntl.u32All); return 0; } @@ -764,24 +479,20 @@ static int kgd_address_watch_execute(struct kgd_dev *kgd, /* Turning off this watch point until we set all the registers */ 
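Both the jiffies-based wait being removed above and the msleep() countdown being restored implement the same shape: poll a status register until a bit changes, giving up after roughly utimeout milliseconds. A minimal user-space sketch of the countdown variant, assuming a hypothetical read_reg() in place of RREG32 and nanosleep() in place of msleep(); the driver returns -ETIME on expiry, the sketch returns -1.

#include <stdbool.h>
#include <stdint.h>
#include <time.h>

#define BUSY_MASK 0x1u                    /* assumed position of the busy bit */

static uint32_t read_reg(void)            /* stand-in for RREG32(mmCP_HQD_ACTIVE) */
{
	static int calls;
	return ++calls < 3 ? BUSY_MASK : 0;  /* pretend HW goes idle on the 3rd poll */
}

/* Poll with a fixed 20 ms sleep and a millisecond countdown, the same
 * countdown shape as the restored kgd_hqd_destroy()/kgd_hqd_sdma_destroy()
 * loops. */
static int wait_until_idle(int timeout_ms)
{
	while (read_reg() & BUSY_MASK) {
		if (timeout_ms <= 0)
			return -1;        /* the driver returns -ETIME here */
		nanosleep(&(struct timespec){ .tv_nsec = 20 * 1000 * 1000 }, NULL);
		timeout_ms -= 20;
	}
	return 0;
}

int main(void)
{
	return wait_until_idle(100) ? 1 : 0;
}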
cntl.bitfields.valid = 0; - WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX - + ADDRESS_WATCH_REG_CNTL], - cntl.u32All); + WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + + ADDRESS_WATCH_REG_CNTL], cntl.u32All); - WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX - + ADDRESS_WATCH_REG_ADDR_HI], - addr_hi); + WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + + ADDRESS_WATCH_REG_ADDR_HI], addr_hi); - WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX - + ADDRESS_WATCH_REG_ADDR_LO], - addr_lo); + WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + + ADDRESS_WATCH_REG_ADDR_LO], addr_lo); /* Enable the watch point */ cntl.bitfields.valid = 1; - WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX - + ADDRESS_WATCH_REG_CNTL], - cntl.u32All); + WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + + ADDRESS_WATCH_REG_CNTL], cntl.u32All); return 0; } @@ -835,7 +546,7 @@ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, struct amdgpu_device *adev = (struct amdgpu_device *) kgd; reg = RREG32(mmATC_VMID0_PASID_MAPPING + vmid); - return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; + return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; } static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) @@ -845,90 +556,52 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); } -static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid) -{ - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - int vmid; - - for (vmid = 0; vmid < 16; vmid++) { - if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) - continue; - if (RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & - ATC_VMID0_PASID_MAPPING__VALID_MASK) { - if ((RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & - ATC_VMID0_PASID_MAPPING__PASID_MASK) == pasid) { - WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); - break; - } - } - } - - return 0; -} - -static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, - uint8_t element_size, uint8_t index_stride, uint8_t mtype) -{ - uint32_t reg; - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - - reg = swizzle_enable << SH_STATIC_MEM_CONFIG__SWIZZLE_ENABLE__SHIFT | - element_size << SH_STATIC_MEM_CONFIG__ELEMENT_SIZE__SHIFT | - index_stride << SH_STATIC_MEM_CONFIG__INDEX_STRIDE__SHIFT | - mtype << SH_STATIC_MEM_CONFIG__PRIVATE_MTYPE__SHIFT; - - WREG32(mmSH_STATIC_MEM_CONFIG, reg); - return 0; -} -static int alloc_memory_of_scratch(struct kgd_dev *kgd, - uint64_t va, uint32_t vmid) -{ - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - - lock_srbm(kgd, 0, 0, 0, vmid); - WREG32(mmSH_HIDDEN_PRIVATE_BASE_VMID, va); - unlock_srbm(kgd); - - return 0; -} - - static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) { struct amdgpu_device *adev = (struct amdgpu_device *) kgd; const union amdgpu_firmware_header *hdr; + BUG_ON(kgd == NULL); + switch (type) { case KGD_ENGINE_PFP: - hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data; + hdr = (const union amdgpu_firmware_header *) + adev->gfx.pfp_fw->data; break; case KGD_ENGINE_ME: - hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data; + hdr = (const union amdgpu_firmware_header *) + adev->gfx.me_fw->data; break; case KGD_ENGINE_CE: - hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data; + hdr = (const union amdgpu_firmware_header *) + adev->gfx.ce_fw->data; break; case KGD_ENGINE_MEC1: - hdr = (const union amdgpu_firmware_header 
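The ATC_VMID0_PASID_MAPPING accesses in the hunks above pack a PASID and a valid flag into one 32-bit register and mask it again on the read side. A standalone sketch of that packing and unpacking, assuming a 16-bit PASID field in the low bits and bit 31 as the valid bit (the exact field layout comes from the ASIC headers):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Assumed layout: PASID in bits 15:0, VALID in bit 31. */
#define PASID_MASK 0x0000ffffu
#define VALID_MASK 0x80000000u

static uint32_t encode_mapping(uint16_t pasid)
{
	/* pasid 0 means "no mapping", mirroring the ternary in the patch */
	return pasid == 0 ? 0 : ((uint32_t)pasid | VALID_MASK);
}

static bool mapping_valid(uint32_t reg)
{
	return (reg & VALID_MASK) != 0;
}

static uint16_t mapping_pasid(uint32_t reg)
{
	return (uint16_t)(reg & PASID_MASK);
}

int main(void)
{
	uint32_t reg = encode_mapping(0x1234);

	assert(mapping_valid(reg));
	assert(mapping_pasid(reg) == 0x1234);
	assert(encode_mapping(0) == 0);
	return 0;
}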
*)adev->gfx.mec_fw->data; + hdr = (const union amdgpu_firmware_header *) + adev->gfx.mec_fw->data; break; case KGD_ENGINE_MEC2: - hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data; + hdr = (const union amdgpu_firmware_header *) + adev->gfx.mec2_fw->data; break; case KGD_ENGINE_RLC: - hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data; + hdr = (const union amdgpu_firmware_header *) + adev->gfx.rlc_fw->data; break; case KGD_ENGINE_SDMA1: - hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data; + hdr = (const union amdgpu_firmware_header *) + adev->sdma.instance[0].fw->data; break; case KGD_ENGINE_SDMA2: - hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data; + hdr = (const union amdgpu_firmware_header *) + adev->sdma.instance[1].fw->data; break; default: @@ -942,42 +615,3 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) return hdr->common.ucode_version; } -static void set_num_of_requests(struct kgd_dev *dev, uint8_t num_of_req) -{ - uint32_t value; - struct amdgpu_device *adev = get_amdgpu_device(dev); - - value = RREG32(mmATC_ATS_DEBUG); - value &= ~ATC_ATS_DEBUG__NUM_REQUESTS_AT_ERR_MASK; - value |= (num_of_req << ATC_ATS_DEBUG__NUM_REQUESTS_AT_ERR__SHIFT); - - WREG32(mmATC_ATS_DEBUG, value); -} - -static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, - uint32_t page_table_base) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - /* TODO: Don't use hardcoded VMIDs */ - if (vmid < 8 || vmid > 15) { - pr_err("trying to set page table base for wrong VMID\n"); - return; - } - WREG32(mmVM_CONTEXT8_PAGE_TABLE_BASE_ADDR + vmid - 8, page_table_base); -} - - /** - * read_vmid_from_vmfault_reg - read vmid from register - * - * adev: amdgpu_device pointer - * @vmid: vmid pointer - * read vmid from register (CIK). - */ -static uint32_t read_vmid_from_vmfault_reg(struct kgd_dev *kgd) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - - uint32_t status = RREG32(mmVM_CONTEXT1_PROTECTION_FAULT_STATUS); - - return REG_GET_FIELD(status, VM_CONTEXT1_PROTECTION_FAULT_STATUS, VMID); -} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c old mode 100755 new mode 100644 index 2ff10e9..133d066 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c @@ -20,9 +20,6 @@ * OTHER DEALINGS IN THE SOFTWARE. 
*/ -#undef pr_fmt -#define pr_fmt(fmt) "kfd2kgd: " fmt - #include #include #include @@ -31,7 +28,7 @@ #include "amdgpu.h" #include "amdgpu_amdkfd.h" #include "amdgpu_ucode.h" -#include "amdgpu_amdkfd_gfx_v8.h" +#include "gfx_v8_0.h" #include "gca/gfx_8_0_sh_mask.h" #include "gca/gfx_8_0_d.h" #include "gca/gfx_8_0_enum.h" @@ -42,31 +39,7 @@ #include "vi_structs.h" #include "vid.h" -enum hqd_dequeue_request_type { - NO_ACTION = 0, - DRAIN_PIPE, - RESET_WAVES, - SAVE_WAVES -}; - -static const uint32_t watchRegs[MAX_WATCH_ADDRESSES * ADDRESS_WATCH_REG_MAX] = { - mmTCP_WATCH0_ADDR_H, mmTCP_WATCH0_ADDR_L, mmTCP_WATCH0_CNTL, - mmTCP_WATCH1_ADDR_H, mmTCP_WATCH1_ADDR_L, mmTCP_WATCH1_CNTL, - mmTCP_WATCH2_ADDR_H, mmTCP_WATCH2_ADDR_L, mmTCP_WATCH2_CNTL, - mmTCP_WATCH3_ADDR_H, mmTCP_WATCH3_ADDR_L, mmTCP_WATCH3_CNTL -}; - - -struct vi_sdma_mqd; - -static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, - void *vm, struct kgd_mem **mem); -static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem); - -static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, - int fd, uint32_t handle, struct kgd_mem **mem); - -static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); +struct cik_sdma_rlc_registers; /* * Register access functions @@ -82,26 +55,17 @@ static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, uint32_t hpd_size, uint64_t hpd_gpu_addr); static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - uint32_t queue_id, uint32_t __user *wptr, - uint32_t wptr_shift, uint32_t wptr_mask, - struct mm_struct *mm); -static int kgd_hqd_dump(struct kgd_dev *kgd, - uint32_t pipe_id, uint32_t queue_id, - uint32_t (**dump)[2], uint32_t *n_regs); -static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, - uint32_t __user *wptr, struct mm_struct *mm); -static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, - uint32_t engine_id, uint32_t queue_id, - uint32_t (**dump)[2], uint32_t *n_regs); + uint32_t queue_id, uint32_t __user *wptr); +static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd); static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, uint32_t pipe_id, uint32_t queue_id); static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); -static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, - enum kfd_preempt_type reset_type, +static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, unsigned int utimeout, uint32_t pipe_id, uint32_t queue_id); static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, unsigned int utimeout); +static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); static int kgd_address_watch_disable(struct kgd_dev *kgd); static int kgd_address_watch_execute(struct kgd_dev *kgd, unsigned int watch_point_id, @@ -120,61 +84,20 @@ static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, uint8_t vmid); static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); -static void set_num_of_requests(struct kgd_dev *kgd, - uint8_t num_of_requests); -static int alloc_memory_of_scratch(struct kgd_dev *kgd, - uint64_t va, uint32_t vmid); -static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, - uint8_t element_size, uint8_t index_stride, uint8_t mtype); -static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, - 
uint32_t page_table_base); -static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid); - -/* Because of REG_GET_FIELD() being used, we put this function in the - * asic specific file. - */ -static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, - struct tile_config *config) -{ - struct amdgpu_device *adev = (struct amdgpu_device *)kgd; - - config->gb_addr_config = adev->gfx.config.gb_addr_config; - config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, - MC_ARB_RAMCFG, NOOFBANK); - config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, - MC_ARB_RAMCFG, NOOFRANKS); - - config->tile_config_ptr = adev->gfx.config.tile_mode_array; - config->num_tile_configs = - ARRAY_SIZE(adev->gfx.config.tile_mode_array); - config->macro_tile_config_ptr = - adev->gfx.config.macrotile_mode_array; - config->num_macro_tile_configs = - ARRAY_SIZE(adev->gfx.config.macrotile_mode_array); - - return 0; -} +static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); static const struct kfd2kgd_calls kfd2kgd = { .init_gtt_mem_allocation = alloc_gtt_mem, .free_gtt_mem = free_gtt_mem, - .get_local_mem_info = get_local_mem_info, + .get_vmem_size = get_vmem_size, .get_gpu_clock_counter = get_gpu_clock_counter, .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, - .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm, - .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm, - .create_process_gpumem = create_process_gpumem, - .destroy_process_gpumem = destroy_process_gpumem, - .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir, - .open_graphic_handle = open_graphic_handle, .program_sh_mem_settings = kgd_program_sh_mem_settings, .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, .init_pipeline = kgd_init_pipeline, .init_interrupts = kgd_init_interrupts, .hqd_load = kgd_hqd_load, .hqd_sdma_load = kgd_hqd_sdma_load, - .hqd_dump = kgd_hqd_dump, - .hqd_sdma_dump = kgd_hqd_sdma_dump, .hqd_is_occupied = kgd_hqd_is_occupied, .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, .hqd_destroy = kgd_hqd_destroy, @@ -188,56 +111,14 @@ static const struct kfd2kgd_calls kfd2kgd = { .get_atc_vmid_pasid_mapping_valid = get_atc_vmid_pasid_mapping_valid, .write_vmid_invalidate_request = write_vmid_invalidate_request, - .invalidate_tlbs = invalidate_tlbs, - .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory, - .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, - .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, - .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, - .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu, - .get_fw_version = get_fw_version, - .set_num_of_requests = set_num_of_requests, - .get_cu_info = get_cu_info, - .alloc_memory_of_scratch = alloc_memory_of_scratch, - .write_config_static_mem = write_config_static_mem, - .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo, - .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel, - .set_vm_context_page_table_base = set_vm_context_page_table_base, - .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table, - .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table, - .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info, - .import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf, - .export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf, - .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info, - .submit_ib = amdgpu_amdkfd_submit_ib, - .get_tile_config = amdgpu_amdkfd_get_tile_config, - .restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos, - 
.copy_mem_to_mem = amdgpu_amdkfd_copy_mem_to_mem, - .get_vram_usage = amdgpu_amdkfd_get_vram_usage + .get_fw_version = get_fw_version }; -struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions() +struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void) { return (struct kfd2kgd_calls *)&kfd2kgd; } -static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, - void *vm, struct kgd_mem **mem) -{ - return 0; -} - -/* Destroys the GPU allocation and frees the kgd_mem structure */ -static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem) -{ - -} - -static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, - int fd, uint32_t handle, struct kgd_mem **mem) -{ - return 0; -} - static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) { return (struct amdgpu_device *)kgd; @@ -266,7 +147,7 @@ static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, { struct amdgpu_device *adev = get_amdgpu_device(kgd); - uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; + uint32_t mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); lock_srbm(kgd, mec, pipe, queue_id, 0); @@ -335,28 +216,21 @@ static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) uint32_t mec; uint32_t pipe; - mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; + mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); lock_srbm(kgd, mec, pipe, 0, 0); - WREG32(mmCPC_INT_CNTL, CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | - CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); + WREG32(mmCPC_INT_CNTL, CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK); unlock_srbm(kgd); return 0; } -static inline uint32_t get_sdma_base_addr(struct vi_sdma_mqd *m) +static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m) { - uint32_t retval; - - retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET + - m->sdma_queue_id * KFD_VI_SDMA_QUEUE_OFFSET; - pr_debug("sdma base address: 0x%x\n", retval); - - return retval; + return 0; } static inline struct vi_mqd *get_mqd(void *mqd) @@ -364,224 +238,33 @@ static inline struct vi_mqd *get_mqd(void *mqd) return (struct vi_mqd *)mqd; } -static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) +static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) { - return (struct vi_sdma_mqd *)mqd; + return (struct cik_sdma_rlc_registers *)mqd; } static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - uint32_t queue_id, uint32_t __user *wptr, - uint32_t wptr_shift, uint32_t wptr_mask, - struct mm_struct *mm) + uint32_t queue_id, uint32_t __user *wptr) { - struct amdgpu_device *adev = get_amdgpu_device(kgd); struct vi_mqd *m; - uint32_t *mqd_hqd; - uint32_t reg, wptr_val, data; - bool valid_wptr = false; + uint32_t shadow_wptr, valid_wptr; + struct amdgpu_device *adev = get_amdgpu_device(kgd); m = get_mqd(mqd); - acquire_queue(kgd, pipe_id, queue_id); - - /* HIQ is set during driver init period with vmid set to 0. For SRIOV - * world switching support let the RLC know about the HIQ. - * - * Workaround: This causes reboots on CZ. Disable this on CZ, which - * doesn't support SRIOV anyway. 
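The acquire_queue() and kgd_init_interrupts() hunks above split a flat pipe index into an ME number and a pipe within that ME; the +1 selects the compute micro engines, which start at 1. A standalone sketch of that decomposition, assuming 4 pipes per MEC (the driver reads adev->gfx.mec.num_pipe_per_mec); it mirrors the deleted form, while the restored lines pre-increment pipe_id before dividing.

#include <stdio.h>

#define PIPES_PER_MEC 4u   /* assumed; queried from adev in the driver */

struct mec_pipe {
	unsigned int mec;  /* 1-based: compute MECs start at 1, ME 0 is graphics */
	unsigned int pipe; /* pipe index within that MEC */
};

static struct mec_pipe decompose_pipe_id(unsigned int pipe_id)
{
	return (struct mec_pipe){
		.mec  = pipe_id / PIPES_PER_MEC + 1,
		.pipe = pipe_id % PIPES_PER_MEC,
	};
}

int main(void)
{
	for (unsigned int id = 0; id < 8; id++) {
		struct mec_pipe mp = decompose_pipe_id(id);

		printf("pipe_id %u -> mec %u, pipe %u\n", id, mp.mec, mp.pipe);
	}
	return 0;
}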
- */ - if (m->cp_hqd_vmid == 0 && - adev->asic_type != CHIP_CARRIZO) { - uint32_t value, mec, pipe; - - mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; - pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); - - pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n", - mec, pipe, queue_id); - value = RREG32(mmRLC_CP_SCHEDULERS); - value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1, - ((mec << 5) | (pipe << 3) | queue_id | 0x80)); - WREG32(mmRLC_CP_SCHEDULERS, value); - } - - /* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */ - mqd_hqd = &m->cp_mqd_base_addr_lo; + valid_wptr = copy_from_user(&shadow_wptr, wptr, sizeof(shadow_wptr)); + if (valid_wptr == 0) + m->cp_hqd_pq_wptr = shadow_wptr; - for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_HQD_EOP_CONTROL; reg++) - WREG32(reg, mqd_hqd[reg - mmCP_MQD_BASE_ADDR]); - - /* Tonga errata: EOP RPTR/WPTR should be left unmodified. - * This is safe since EOP RPTR==WPTR for any inactive HQD - * on ASICs that do not support context-save. - * EOP writes/reads can start anywhere in the ring. - */ - if (get_amdgpu_device(kgd)->asic_type != CHIP_TONGA) { - WREG32(mmCP_HQD_EOP_RPTR, m->cp_hqd_eop_rptr); - WREG32(mmCP_HQD_EOP_WPTR, m->cp_hqd_eop_wptr); - WREG32(mmCP_HQD_EOP_WPTR_MEM, m->cp_hqd_eop_wptr_mem); - } - - for (reg = mmCP_HQD_EOP_EVENTS; reg <= mmCP_HQD_ERROR; reg++) - WREG32(reg, mqd_hqd[reg - mmCP_MQD_BASE_ADDR]); - - /* Copy userspace write pointer value to register. - * Activate doorbell logic to monitor subsequent changes. - */ - data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control, - CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); - WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, data); - - /* read_user_ptr may take the mm->mmap_sem. - * release srbm_mutex to avoid circular dependency between - * srbm_mutex->mm_sem->reservation_ww_class_mutex->srbm_mutex. 
- */ - release_queue(kgd); - valid_wptr = read_user_wptr(mm, wptr, wptr_val); acquire_queue(kgd, pipe_id, queue_id); - if (valid_wptr) - WREG32(mmCP_HQD_PQ_WPTR, (wptr_val << wptr_shift) & wptr_mask); - - data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); - WREG32(mmCP_HQD_ACTIVE, data); - + gfx_v8_0_mqd_commit(adev, mqd); release_queue(kgd); return 0; } -static int kgd_hqd_dump(struct kgd_dev *kgd, - uint32_t pipe_id, uint32_t queue_id, - uint32_t (**dump)[2], uint32_t *n_regs) +static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) { - struct amdgpu_device *adev = get_amdgpu_device(kgd); - uint32_t i = 0, reg; -#define HQD_N_REGS (54+4) -#define DUMP_REG(addr) do { \ - if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ - break; \ - (*dump)[i][0] = (addr) << 2; \ - (*dump)[i++][1] = RREG32(addr); \ - } while (0) - - *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); - if (*dump == NULL) - return -ENOMEM; - - acquire_queue(kgd, pipe_id, queue_id); - - DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE0); - DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE1); - DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE2); - DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE3); - - for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_HQD_EOP_DONES; reg++) - DUMP_REG(reg); - - release_queue(kgd); - - WARN_ON_ONCE(i != HQD_N_REGS); - *n_regs = i; - - return 0; -} - -static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, - uint32_t __user *wptr, struct mm_struct *mm) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - struct vi_sdma_mqd *m; - uint32_t sdma_base_addr; - uint32_t temp, timeout = 2000; - uint32_t data; - - m = get_sdma_mqd(mqd); - sdma_base_addr = get_sdma_base_addr(m); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, - m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); - - while (true) { - temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); - if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) - break; - if (timeout == 0) - return -ETIME; - msleep(10); - timeout -= 10; - } - if (m->sdma_engine_id) { - data = RREG32(mmSDMA1_GFX_CONTEXT_CNTL); - data = REG_SET_FIELD(data, SDMA1_GFX_CONTEXT_CNTL, - RESUME_CTX, 0); - WREG32(mmSDMA1_GFX_CONTEXT_CNTL, data); - } else { - data = RREG32(mmSDMA0_GFX_CONTEXT_CNTL); - data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, - RESUME_CTX, 0); - WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data); - } - - data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL, - ENABLE, 1); - WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr); - - if (read_user_wptr(mm, wptr, data)) - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, data); - else - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, - m->sdmax_rlcx_rb_rptr); - - WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, - m->sdmax_rlcx_virtual_addr); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, - m->sdmax_rlcx_rb_base_hi); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, - m->sdmax_rlcx_rb_rptr_addr_lo); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, - m->sdmax_rlcx_rb_rptr_addr_hi); - - data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL, - RB_ENABLE, 1); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data); - - return 0; -} - -static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, - uint32_t engine_id, uint32_t queue_id, - uint32_t (**dump)[2], uint32_t *n_regs) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - uint32_t sdma_offset = engine_id 
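The deleted kgd_hqd_dump()/kgd_hqd_sdma_dump() helpers above all follow the same collector shape: allocate an N-by-2 table, then record (byte address, value) pairs over a register range via the DUMP_REG() macro. A standalone sketch of that collector, with a hypothetical read_reg() in place of RREG32; the addr << 2 converts a dword register offset to a byte address, as in the macro.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint32_t read_reg(uint32_t dword_addr)   /* stand-in for RREG32 */
{
	return dword_addr ^ 0xdeadbeefu;        /* fake register contents */
}

/* Dump registers [first, last] as (byte address, value) pairs. */
static int dump_range(uint32_t first, uint32_t last,
		      uint32_t (**dump)[2], uint32_t *n_regs)
{
	uint32_t n = last - first + 1, i;

	*dump = malloc(n * sizeof(**dump));
	if (!*dump)
		return -1;                      /* the driver returns -ENOMEM */

	for (i = 0; i < n; i++) {
		(*dump)[i][0] = (first + i) << 2;
		(*dump)[i][1] = read_reg(first + i);
	}
	*n_regs = n;
	return 0;
}

int main(void)
{
	uint32_t (*dump)[2], n, i;

	if (dump_range(0x3200, 0x3203, &dump, &n))
		return 1;
	for (i = 0; i < n; i++)
		printf("0x%05x = 0x%08x\n", (unsigned)dump[i][0], (unsigned)dump[i][1]);
	free(dump);
	return 0;
}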
* SDMA1_REGISTER_OFFSET + - queue_id * KFD_VI_SDMA_QUEUE_OFFSET; - uint32_t i = 0, reg; -#undef HQD_N_REGS -#define HQD_N_REGS (19+4+2+3+7) - - *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); - if (*dump == NULL) - return -ENOMEM; - - for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) - DUMP_REG(sdma_offset + reg); - for (reg = mmSDMA0_RLC0_VIRTUAL_ADDR; reg <= mmSDMA0_RLC0_WATERMARK; - reg++) - DUMP_REG(sdma_offset + reg); - for (reg = mmSDMA0_RLC0_CSA_ADDR_LO; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; - reg++) - DUMP_REG(sdma_offset + reg); - for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; reg <= mmSDMA0_RLC0_DUMMY_REG; - reg++) - DUMP_REG(sdma_offset + reg); - for (reg = mmSDMA0_RLC0_MIDCMD_DATA0; reg <= mmSDMA0_RLC0_MIDCMD_CNTL; - reg++) - DUMP_REG(sdma_offset + reg); - - WARN_ON_ONCE(i != HQD_N_REGS); - *n_regs = i; - return 0; } @@ -610,7 +293,7 @@ static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) { struct amdgpu_device *adev = get_amdgpu_device(kgd); - struct vi_sdma_mqd *m; + struct cik_sdma_rlc_registers *m; uint32_t sdma_base_addr; uint32_t sdma_rlc_rb_cntl; @@ -625,102 +308,29 @@ static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) return false; } -static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, - enum kfd_preempt_type reset_type, +static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, unsigned int utimeout, uint32_t pipe_id, uint32_t queue_id) { struct amdgpu_device *adev = get_amdgpu_device(kgd); uint32_t temp; - enum hqd_dequeue_request_type type; - unsigned long flags, end_jiffies; - int retry; - struct vi_mqd *m = get_mqd(mqd); + int timeout = utimeout; acquire_queue(kgd, pipe_id, queue_id); - if (m->cp_hqd_vmid == 0) - WREG32_FIELD(RLC_CP_SCHEDULERS, scheduler1, 0); - - switch (reset_type) { - case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: - type = DRAIN_PIPE; - break; - case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: - type = RESET_WAVES; - break; - default: - type = DRAIN_PIPE; - break; - } + WREG32(mmCP_HQD_DEQUEUE_REQUEST, reset_type); - /* Workaround: If IQ timer is active and the wait time is close to or - * equal to 0, dequeueing is not safe. Wait until either the wait time - * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is - * cleared before continuing. Also, ensure wait times are set to at - * least 0x3. 
- */ - local_irq_save(flags); - preempt_disable(); - retry = 5000; /* wait for 500 usecs at maximum */ - while (true) { - temp = RREG32(mmCP_HQD_IQ_TIMER); - if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) { - pr_debug("HW is processing IQ\n"); - goto loop; - } - if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) { - if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE) - == 3) /* SEM-rearm is safe */ - break; - /* Wait time 3 is safe for CP, but our MMIO read/write - * time is close to 1 microsecond, so check for 10 to - * leave more buffer room - */ - if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME) - >= 10) - break; - pr_debug("IQ timer is active\n"); - } else - break; -loop: - if (!retry) { - pr_err("CP HQD IQ timer status time out\n"); - break; - } - ndelay(100); - --retry; - } - retry = 1000; - while (true) { - temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST); - if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK)) - break; - pr_debug("Dequeue request is pending\n"); - - if (!retry) { - pr_err("CP HQD dequeue request time out\n"); - break; - } - ndelay(100); - --retry; - } - local_irq_restore(flags); - preempt_enable(); - - WREG32(mmCP_HQD_DEQUEUE_REQUEST, type); - - end_jiffies = (utimeout * HZ / 1000) + jiffies; while (true) { temp = RREG32(mmCP_HQD_ACTIVE); - if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) + if (temp & CP_HQD_ACTIVE__ACTIVE_MASK) break; - if (time_after(jiffies, end_jiffies)) { - pr_err("cp queue preemption time out.\n"); + if (timeout <= 0) { + pr_err("kfd: cp queue preemption time out.\n"); release_queue(kgd); return -ETIME; } - usleep_range(500, 1000); + msleep(20); + timeout -= 20; } release_queue(kgd); @@ -731,10 +341,10 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, unsigned int utimeout) { struct amdgpu_device *adev = get_amdgpu_device(kgd); - struct vi_sdma_mqd *m; + struct cik_sdma_rlc_registers *m; uint32_t sdma_base_addr; uint32_t temp; - unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; + int timeout = utimeout; m = get_sdma_mqd(mqd); sdma_base_addr = get_sdma_base_addr(m); @@ -745,19 +355,18 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, while (true) { temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); - if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) + if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT) break; - if (time_after(jiffies, end_jiffies)) + if (timeout <= 0) return -ETIME; - usleep_range(500, 1000); + msleep(20); + timeout -= 20; } WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, - RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | - SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); - - m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, 0); return 0; } @@ -779,7 +388,7 @@ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, struct amdgpu_device *adev = (struct amdgpu_device *) kgd; reg = RREG32(mmATC_VMID0_PASID_MAPPING + vmid); - return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; + return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; } static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) @@ -789,83 +398,8 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); } -/* - * FIXME: Poliars test failed with this package, FIJI works fine - * From the CP spec it does not official 
support the invalidation - * with the specified pasid in the package, so disable it for V8 - * - */ -#ifdef V8_SUPPORT_IT_OFFICIAL -static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid) -{ - signed long r; - struct dma_fence *f; - struct amdgpu_ring *ring = &adev->gfx.kiq.ring; - - mutex_lock(&adev->gfx.kiq.ring_mutex); - amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs package*/ - amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0)); - amdgpu_ring_write(ring, - PACKET3_INVALIDATE_TLBS_DST_SEL(1) | - PACKET3_INVALIDATE_TLBS_PASID(pasid)); - amdgpu_fence_emit(ring, &f); - amdgpu_ring_commit(ring); - mutex_unlock(&adev->gfx.kiq.ring_mutex); - - r = dma_fence_wait(f, false); - if (r) - DRM_ERROR("wait for kiq fence error: %ld.\n", r); - dma_fence_put(f); - - return r; -} -#endif -static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid) -{ - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - int vmid; - -#ifdef V8_SUPPORT_IT_OFFICIAL - struct amdgpu_ring *ring = &adev->gfx.kiq.ring; - - if (ring->ready) - return invalidate_tlbs_with_kiq(adev, pasid); -#endif - - for (vmid = 0; vmid < 16; vmid++) { - if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) - continue; - if (RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & - ATC_VMID0_PASID_MAPPING__VALID_MASK) { - if ((RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & - ATC_VMID0_PASID_MAPPING__PASID_MASK) == pasid) { - WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); - break; - } - } - } - - return 0; -} - static int kgd_address_watch_disable(struct kgd_dev *kgd) { - struct amdgpu_device *adev = get_amdgpu_device(kgd); - union TCP_WATCH_CNTL_BITS cntl; - unsigned int i; - - cntl.u32All = 0; - - cntl.bitfields.valid = 0; - cntl.bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK; - cntl.bitfields.atc = 1; - - /* Turning off this address until we set all the registers */ - for (i = 0; i < MAX_WATCH_ADDRESSES; i++) - WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX - + ADDRESS_WATCH_REG_CNTL], - cntl.u32All); - return 0; } @@ -875,32 +409,6 @@ static int kgd_address_watch_execute(struct kgd_dev *kgd, uint32_t addr_hi, uint32_t addr_lo) { - struct amdgpu_device *adev = get_amdgpu_device(kgd); - union TCP_WATCH_CNTL_BITS cntl; - - cntl.u32All = cntl_val; - - /* Turning off this watch point until we set all the registers */ - cntl.bitfields.valid = 0; - WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX - + ADDRESS_WATCH_REG_CNTL], - cntl.u32All); - - WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX - + ADDRESS_WATCH_REG_ADDR_HI], - addr_hi); - - WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX - + ADDRESS_WATCH_REG_ADDR_LO], - addr_lo); - - /* Enable the watch point */ - cntl.bitfields.valid = 1; - - WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX - + ADDRESS_WATCH_REG_CNTL], - cntl.u32All); - return 0; } @@ -933,32 +441,6 @@ static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, unsigned int watch_point_id, unsigned int reg_offset) { - return watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + reg_offset]; -} - -static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, - uint8_t element_size, uint8_t index_stride, uint8_t mtype) -{ - uint32_t reg; - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - - reg = swizzle_enable << SH_STATIC_MEM_CONFIG__SWIZZLE_ENABLE__SHIFT | - element_size << SH_STATIC_MEM_CONFIG__ELEMENT_SIZE__SHIFT | - index_stride << SH_STATIC_MEM_CONFIG__INDEX_STRIDE__SHIFT | - mtype << SH_STATIC_MEM_CONFIG__PRIVATE_MTYPE__SHIFT; - - 
WREG32(mmSH_STATIC_MEM_CONFIG, reg); - return 0; -} -static int alloc_memory_of_scratch(struct kgd_dev *kgd, - uint64_t va, uint32_t vmid) -{ - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - - lock_srbm(kgd, 0, 0, 0, vmid); - WREG32(mmSH_HIDDEN_PRIVATE_BASE_VMID, va); - unlock_srbm(kgd); - return 0; } @@ -967,45 +449,47 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) struct amdgpu_device *adev = (struct amdgpu_device *) kgd; const union amdgpu_firmware_header *hdr; + BUG_ON(kgd == NULL); + switch (type) { case KGD_ENGINE_PFP: hdr = (const union amdgpu_firmware_header *) - adev->gfx.pfp_fw->data; + adev->gfx.pfp_fw->data; break; case KGD_ENGINE_ME: hdr = (const union amdgpu_firmware_header *) - adev->gfx.me_fw->data; + adev->gfx.me_fw->data; break; case KGD_ENGINE_CE: hdr = (const union amdgpu_firmware_header *) - adev->gfx.ce_fw->data; + adev->gfx.ce_fw->data; break; case KGD_ENGINE_MEC1: hdr = (const union amdgpu_firmware_header *) - adev->gfx.mec_fw->data; + adev->gfx.mec_fw->data; break; case KGD_ENGINE_MEC2: hdr = (const union amdgpu_firmware_header *) - adev->gfx.mec2_fw->data; + adev->gfx.mec2_fw->data; break; case KGD_ENGINE_RLC: hdr = (const union amdgpu_firmware_header *) - adev->gfx.rlc_fw->data; + adev->gfx.rlc_fw->data; break; case KGD_ENGINE_SDMA1: hdr = (const union amdgpu_firmware_header *) - adev->sdma.instance[0].fw->data; + adev->sdma.instance[0].fw->data; break; case KGD_ENGINE_SDMA2: hdr = (const union amdgpu_firmware_header *) - adev->sdma.instance[1].fw->data; + adev->sdma.instance[1].fw->data; break; default: @@ -1018,21 +502,3 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) /* Only 12 bit in use*/ return hdr->common.ucode_version; } - -static void set_num_of_requests(struct kgd_dev *kgd, - uint8_t num_of_requests) -{ - pr_debug("This is a stub\n"); -} - -static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, - uint32_t page_table_base) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - /* TODO: Don't use hardcoded VMIDs */ - if (vmid < 8 || vmid > 15) { - pr_err("trying to set page table base for wrong VMID\n"); - return; - } - WREG32(mmVM_CONTEXT8_PAGE_TABLE_BASE_ADDR + vmid - 8, page_table_base); -} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c deleted file mode 100644 index edbae19..0000000 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +++ /dev/null @@ -1,1227 +0,0 @@ -/* - * Copyright 2014 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ -#undef pr_fmt -#define pr_fmt(fmt) "kfd2kgd: " fmt - -#include -#include -#include -#include -#include -#include "amdgpu.h" -#include "amdgpu_amdkfd.h" -#include "amdgpu_ucode.h" -#include "amdgpu_amdkfd_gfx_v8.h" -#include "vega10/soc15ip.h" -#include "vega10/GC/gc_9_0_offset.h" -#include "vega10/GC/gc_9_0_sh_mask.h" -#include "vega10/vega10_enum.h" -#include "vega10/SDMA0/sdma0_4_0_offset.h" -#include "vega10/SDMA0/sdma0_4_0_sh_mask.h" -#include "vega10/SDMA1/sdma1_4_0_offset.h" -#include "vega10/SDMA1/sdma1_4_0_sh_mask.h" -#include "vega10/ATHUB/athub_1_0_offset.h" -#include "vega10/ATHUB/athub_1_0_sh_mask.h" -#include "vega10/OSSSYS/osssys_4_0_offset.h" -#include "vega10/OSSSYS/osssys_4_0_sh_mask.h" -#include "soc15_common.h" -#include "v9_structs.h" -#include "soc15.h" -#include "soc15d.h" - -/* HACK: MMHUB and GC both have VM-related register with the same - * names but different offsets. Define the MMHUB register we need here - * with a prefix. A proper solution would be to move the functions - * programming these registers into gfx_v9_0.c and mmhub_v1_0.c - * respectively. - */ -#define mmMMHUB_VM_INVALIDATE_ENG16_REQ 0x06f3 -#define mmMMHUB_VM_INVALIDATE_ENG16_REQ_BASE_IDX 0 - -#define mmMMHUB_VM_INVALIDATE_ENG16_ACK 0x0705 -#define mmMMHUB_VM_INVALIDATE_ENG16_ACK_BASE_IDX 0 - -#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32 0x072b -#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32_BASE_IDX 0 -#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32 0x072c -#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32_BASE_IDX 0 - -#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32 0x074b -#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32_BASE_IDX 0 -#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32 0x074c -#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32_BASE_IDX 0 - -#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32 0x076b -#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32_BASE_IDX 0 -#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32 0x076c -#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32_BASE_IDX 0 - -#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32 0x0727 -#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32_BASE_IDX 0 -#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32 0x0728 -#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32_BASE_IDX 0 - -enum hqd_dequeue_request_type { - NO_ACTION = 0, - DRAIN_PIPE, - RESET_WAVES, - SAVE_WAVES -}; - -static const uint32_t watchRegs[MAX_WATCH_ADDRESSES * ADDRESS_WATCH_REG_MAX] = { - mmTCP_WATCH0_ADDR_H, mmTCP_WATCH0_ADDR_L, mmTCP_WATCH0_CNTL, - mmTCP_WATCH1_ADDR_H, mmTCP_WATCH1_ADDR_L, mmTCP_WATCH1_CNTL, - mmTCP_WATCH2_ADDR_H, mmTCP_WATCH2_ADDR_L, mmTCP_WATCH2_CNTL, - mmTCP_WATCH3_ADDR_H, mmTCP_WATCH3_ADDR_L, mmTCP_WATCH3_CNTL -}; - - -static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, - void *vm, struct kgd_mem **mem); -static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem); - -static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, - int fd, uint32_t handle, struct kgd_mem **mem); - -static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); - -/* - * Register access functions - */ - -static void kgd_program_sh_mem_settings(struct 
kgd_dev *kgd, uint32_t vmid, - uint32_t sh_mem_config, - uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit, - uint32_t sh_mem_bases); -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, - unsigned int vmid); -static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, - uint32_t hpd_size, uint64_t hpd_gpu_addr); -static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); -static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - uint32_t queue_id, uint32_t __user *wptr, - uint32_t wptr_shift, uint32_t wptr_mask, - struct mm_struct *mm); -static int kgd_hqd_dump(struct kgd_dev *kgd, - uint32_t pipe_id, uint32_t queue_id, - uint32_t (**dump)[2], uint32_t *n_regs); -static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, - uint32_t __user *wptr, struct mm_struct *mm); -static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, - uint32_t engine_id, uint32_t queue_id, - uint32_t (**dump)[2], uint32_t *n_regs); -static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, - uint32_t pipe_id, uint32_t queue_id); -static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); -static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, - enum kfd_preempt_type reset_type, - unsigned int utimeout, uint32_t pipe_id, - uint32_t queue_id); -static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - unsigned int utimeout); -static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); -static uint32_t get_watch_base_addr(void); -static int kgd_address_watch_disable(struct kgd_dev *kgd); -static int kgd_address_watch_execute(struct kgd_dev *kgd, - unsigned int watch_point_id, - uint32_t cntl_val, - uint32_t addr_hi, - uint32_t addr_lo); -static int kgd_wave_control_execute(struct kgd_dev *kgd, - uint32_t gfx_index_val, - uint32_t sq_cmd); -static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, - unsigned int watch_point_id, - unsigned int reg_offset); - -static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, - uint8_t vmid); -static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, - uint8_t vmid); -static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); -static void set_num_of_requests(struct kgd_dev *kgd, - uint8_t num_of_requests); -static int alloc_memory_of_scratch(struct kgd_dev *kgd, - uint64_t va, uint32_t vmid); -static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, - uint8_t element_size, uint8_t index_stride, uint8_t mtype); -static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, - uint32_t page_table_base); -static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid); - -/* Because of REG_GET_FIELD() being used, we put this function in the - * asic specific file. - */ -static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, - struct tile_config *config) -{ - struct amdgpu_device *adev = (struct amdgpu_device *)kgd; - - config->gb_addr_config = adev->gfx.config.gb_addr_config; -#if 0 -/* TODO - confirm REG_GET_FIELD x2, should be OK as is... 
but - * MC_ARB_RAMCFG register doesn't exist on Vega10 - initial amdgpu - * changes commented out related code, doing the same here for now but - * need to sync with Ken et al - */ - config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, - MC_ARB_RAMCFG, NOOFBANK); - config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, - MC_ARB_RAMCFG, NOOFRANKS); -#endif - - config->tile_config_ptr = adev->gfx.config.tile_mode_array; - config->num_tile_configs = - ARRAY_SIZE(adev->gfx.config.tile_mode_array); - config->macro_tile_config_ptr = - adev->gfx.config.macrotile_mode_array; - config->num_macro_tile_configs = - ARRAY_SIZE(adev->gfx.config.macrotile_mode_array); - - return 0; -} - -static const struct kfd2kgd_calls kfd2kgd = { - .init_gtt_mem_allocation = alloc_gtt_mem, - .free_gtt_mem = free_gtt_mem, - .get_local_mem_info = get_local_mem_info, - .get_gpu_clock_counter = get_gpu_clock_counter, - .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, - .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm, - .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm, - .create_process_gpumem = create_process_gpumem, - .destroy_process_gpumem = destroy_process_gpumem, - .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir, - .open_graphic_handle = open_graphic_handle, - .program_sh_mem_settings = kgd_program_sh_mem_settings, - .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, - .init_pipeline = kgd_init_pipeline, - .init_interrupts = kgd_init_interrupts, - .hqd_load = kgd_hqd_load, - .hqd_sdma_load = kgd_hqd_sdma_load, - .hqd_dump = kgd_hqd_dump, - .hqd_sdma_dump = kgd_hqd_sdma_dump, - .hqd_is_occupied = kgd_hqd_is_occupied, - .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, - .hqd_destroy = kgd_hqd_destroy, - .hqd_sdma_destroy = kgd_hqd_sdma_destroy, - .address_watch_disable = kgd_address_watch_disable, - .address_watch_execute = kgd_address_watch_execute, - .wave_control_execute = kgd_wave_control_execute, - .address_watch_get_offset = kgd_address_watch_get_offset, - .get_atc_vmid_pasid_mapping_pasid = - get_atc_vmid_pasid_mapping_pasid, - .get_atc_vmid_pasid_mapping_valid = - get_atc_vmid_pasid_mapping_valid, - .write_vmid_invalidate_request = write_vmid_invalidate_request, - .invalidate_tlbs = invalidate_tlbs, - .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory, - .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, - .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, - .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, - .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu, - .get_fw_version = get_fw_version, - .set_num_of_requests = set_num_of_requests, - .get_cu_info = get_cu_info, - .alloc_memory_of_scratch = alloc_memory_of_scratch, - .write_config_static_mem = write_config_static_mem, - .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo, - .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel, - .set_vm_context_page_table_base = set_vm_context_page_table_base, - .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table, - .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table, - .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info, - .import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf, - .export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf, - .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info, - .submit_ib = amdgpu_amdkfd_submit_ib, - .get_tile_config = amdgpu_amdkfd_get_tile_config, - .restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos, - .copy_mem_to_mem = 
amdgpu_amdkfd_copy_mem_to_mem, - .get_vram_usage = amdgpu_amdkfd_get_vram_usage -}; - -struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions() -{ - return (struct kfd2kgd_calls *)&kfd2kgd; -} - -static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, - void *vm, struct kgd_mem **mem) -{ - return 0; -} - -/* Destroys the GPU allocation and frees the kgd_mem structure */ -static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem) -{ - -} - -static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, - int fd, uint32_t handle, struct kgd_mem **mem) -{ - return 0; -} - -static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) -{ - return (struct amdgpu_device *)kgd; -} - -static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe, - uint32_t queue, uint32_t vmid) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - - mutex_lock(&adev->srbm_mutex); - soc15_grbm_select(adev, mec, pipe, queue, vmid); -} - -static void unlock_srbm(struct kgd_dev *kgd) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - - soc15_grbm_select(adev, 0, 0, 0, 0); - mutex_unlock(&adev->srbm_mutex); -} - -static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, - uint32_t queue_id) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - - uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; - uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); - - lock_srbm(kgd, mec, pipe, queue_id, 0); -} - -static uint32_t get_queue_mask(struct amdgpu_device *adev, - uint32_t pipe_id, uint32_t queue_id) -{ - unsigned int bit = (pipe_id * adev->gfx.mec.num_pipe_per_mec + - queue_id) & 31; - - return ((uint32_t)1) << bit; -} - -static void release_queue(struct kgd_dev *kgd) -{ - unlock_srbm(kgd); -} - -static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, - uint32_t sh_mem_config, - uint32_t sh_mem_ape1_base, - uint32_t sh_mem_ape1_limit, - uint32_t sh_mem_bases) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - - lock_srbm(kgd, 0, 0, 0, vmid); - - WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config); - WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases); - /* APE1 no longer exists on GFX9 */ - - unlock_srbm(kgd); -} - -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, - unsigned int vmid) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - - /* - * We have to assume that there is no outstanding mapping. - * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because - * a mapping is in progress or because a mapping finished - * and the SW cleared it. - * So the protocol is to always wait & clear. - */ - uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | - ATC_VMID0_PASID_MAPPING__VALID_MASK; - - /* - * need to do this twice, once for gfx and once for mmhub - * for ATC add 16 to VMID for mmhub, for IH different registers. - * ATC_VMID0..15 registers are separate from ATC_VMID16..31. 
- */ - - WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid, - pasid_mapping); - - while (!(RREG32(SOC15_REG_OFFSET( - ATHUB, 0, - mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & - (1U << vmid))) - cpu_relax(); - - WREG32(SOC15_REG_OFFSET(ATHUB, 0, - mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), - 1U << vmid); - - /* Mapping vmid to pasid also for IH block */ - WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid, - pasid_mapping); - - WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid, - pasid_mapping); - - while (!(RREG32(SOC15_REG_OFFSET( - ATHUB, 0, - mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & - (1U << (vmid + 16)))) - cpu_relax(); - - WREG32(SOC15_REG_OFFSET(ATHUB, 0, - mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), - 1U << (vmid + 16)); - - /* Mapping vmid to pasid also for IH block */ - WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid, - pasid_mapping); - return 0; -} - -static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, - uint32_t hpd_size, uint64_t hpd_gpu_addr) -{ - /* amdgpu owns the per-pipe state */ - return 0; -} - -/* TODO - RING0 form of field is obsolete, seems to date back to SI - * but still works - */ - -static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - uint32_t mec; - uint32_t pipe; - - mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; - pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); - - lock_srbm(kgd, mec, pipe, 0, 0); - - WREG32(SOC15_REG_OFFSET(GC, 0, mmCPC_INT_CNTL), - CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | - CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); - - unlock_srbm(kgd); - - return 0; -} - -static uint32_t get_sdma_base_addr(unsigned int engine_id, - unsigned int queue_id) -{ - static const uint32_t base[2] = { - SOC15_REG_OFFSET(SDMA0, 0, - mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL, - SOC15_REG_OFFSET(SDMA1, 0, - mmSDMA1_RLC0_RB_CNTL) - mmSDMA1_RLC0_RB_CNTL - }; - uint32_t retval; - - retval = base[engine_id] + queue_id * (mmSDMA0_RLC1_RB_CNTL - - mmSDMA0_RLC0_RB_CNTL); - - pr_debug("sdma base address: 0x%x\n", retval); - - return retval; -} - -static uint32_t get_watch_base_addr(void) -{ - uint32_t retval = SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) - - mmTCP_WATCH0_ADDR_H; - - pr_debug("kfd: reg watch base address: 0x%x\n", retval); - - return retval; -} - -static inline struct v9_mqd *get_mqd(void *mqd) -{ - return (struct v9_mqd *)mqd; -} - -static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) -{ - return (struct v9_sdma_mqd *)mqd; -} - -static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - uint32_t queue_id, uint32_t __user *wptr, - uint32_t wptr_shift, uint32_t wptr_mask, - struct mm_struct *mm) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - struct v9_mqd *m; - uint32_t *mqd_hqd; - uint32_t reg, hqd_base, data; - - m = get_mqd(mqd); - - acquire_queue(kgd, pipe_id, queue_id); - - /* HIQ is set during driver init period with vmid set to 0*/ - if (m->cp_hqd_vmid == 0) { - uint32_t value, mec, pipe; - - mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; - pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); - - pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n", - mec, pipe, queue_id); - value = RREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS)); - value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1, - ((mec << 5) | (pipe << 3) | queue_id | 0x80)); - WREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS), value); - } - - /* HQD registers extend from 
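The deleted kgd_set_pasid_vmid_mapping() body above writes the mapping, spins until the per-VMID bit shows up in ATC_VMID_PASID_MAPPING_UPDATE_STATUS, and then writes that same bit back to acknowledge it (once for VMIDs 0..15 and once more at vmid+16 for the MMHUB copy). A standalone sketch of that wait-then-write-1-to-clear handshake, with hypothetical register stand-ins for the RREG32/WREG32 accesses:

#include <stdint.h>

static uint32_t status_reg;                 /* stand-in for UPDATE_STATUS */

static uint32_t read_status(void)           /* RREG32(...UPDATE_STATUS) */
{
	return status_reg;
}

static void write_status(uint32_t v)        /* WREG32: write-1-to-clear */
{
	status_reg &= ~v;
}

static void hw_completes(unsigned int vmid) /* pretend HW latched the mapping */
{
	status_reg |= 1u << vmid;
}

/* Wait for the per-VMID "mapping updated" bit, then acknowledge it. */
static void wait_and_clear(unsigned int vmid)
{
	while (!(read_status() & (1u << vmid)))
		;                           /* the driver calls cpu_relax() here */
	write_status(1u << vmid);
}

int main(void)
{
	hw_completes(5);    /* simulate completion before polling */
	wait_and_clear(5);
	return status_reg == 0 ? 0 : 1;
}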
CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */ - mqd_hqd = &m->cp_mqd_base_addr_lo; - hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); - - for (reg = hqd_base; - reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) - WREG32(reg, mqd_hqd[reg - hqd_base]); - - - /* Activate doorbell logic before triggering WPTR poll. */ - data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control, - CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); - WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data); - - if (wptr) { - /* Don't read wptr with get_user because the user - * context may not be accessible (if this function - * runs in a work queue). Instead trigger a one-shot - * polling read from memory in the CP. This assumes - * that wptr is GPU-accessible in the queue's VMID via - * ATC or SVM. WPTR==RPTR before starting the poll so - * the CP starts fetching new commands from the right - * place. - * - * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit - * tricky. Assume that the queue didn't overflow. The - * number of valid bits in the 32-bit RPTR depends on - * the queue size. The remaining bits are taken from - * the saved 64-bit WPTR. If the WPTR wrapped, add the - * queue size. - */ - uint32_t queue_size = - 2 << REG_GET_FIELD(m->cp_hqd_pq_control, - CP_HQD_PQ_CONTROL, QUEUE_SIZE); - uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1); - - if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr) - guessed_wptr += queue_size; - guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1); - guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32; - - WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO), - lower_32_bits(guessed_wptr)); - WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI), - upper_32_bits(guessed_wptr)); - WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR), - lower_32_bits((uint64_t)wptr)); - WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI), - upper_32_bits((uint64_t)wptr)); - WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1), - get_queue_mask(adev, pipe_id, queue_id)); - } - - /* Start the EOP fetcher */ - WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR), - REG_SET_FIELD(m->cp_hqd_eop_rptr, - CP_HQD_EOP_RPTR, INIT_FETCHER, 1)); - - data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); - WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data); - - release_queue(kgd); - - return 0; -} - -static int kgd_hqd_dump(struct kgd_dev *kgd, - uint32_t pipe_id, uint32_t queue_id, - uint32_t (**dump)[2], uint32_t *n_regs) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - uint32_t i = 0, reg; -#define HQD_N_REGS 56 -#define DUMP_REG(addr) do { \ - if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ - break; \ - (*dump)[i][0] = (addr) << 2; \ - (*dump)[i++][1] = RREG32(addr); \ - } while (0) - - *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); - if (*dump == NULL) - return -ENOMEM; - - acquire_queue(kgd, pipe_id, queue_id); - - for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); - reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) - DUMP_REG(reg); - - release_queue(kgd); - - WARN_ON_ONCE(i != HQD_N_REGS); - *n_regs = i; - - return 0; -} - -static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, - uint32_t __user *wptr, struct mm_struct *mm) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - struct v9_sdma_mqd *m; - uint32_t sdma_base_addr, sdmax_gfx_context_cntl; - uint32_t temp, timeout = 2000; - uint32_t data; - uint64_t data64; - uint64_t __user *wptr64 = (uint64_t __user *)wptr; - - m = 
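The guessed_wptr block in the deleted code above reconstructs a 64-bit write pointer from the saved 32-bit read pointer: the low bits (the ring offset) come from the RPTR, the upper bits from the saved 64-bit WPTR, and one queue size is added when the saved WPTR's ring offset is below the RPTR's (the "wrapped" case in the original comment). The same arithmetic as a standalone, checkable sketch; queue_size is a power of two, as produced by 2 << QUEUE_SIZE in the MQD.

#include <assert.h>
#include <stdint.h>

/* Mirror of the guessed_wptr logic in the deleted gfx v9 kgd_hqd_load(). */
static uint64_t guess_wptr(uint32_t rptr, uint32_t saved_wptr_lo,
			   uint32_t saved_wptr_hi, uint32_t queue_size)
{
	uint64_t guessed = rptr & (queue_size - 1);     /* RPTR's ring offset */

	if ((saved_wptr_lo & (queue_size - 1)) < guessed)
		guessed += queue_size;                  /* WPTR wrapped */
	guessed += saved_wptr_lo & ~(queue_size - 1);   /* WPTR's lap base */
	guessed += (uint64_t)saved_wptr_hi << 32;       /* upper 32 bits */

	return guessed;
}

int main(void)
{
	/* Same lap: the guess is the WPTR lap base plus the RPTR offset. */
	assert(guess_wptr(0x100, 0x1a0, 0, 0x400) == 0x100);
	/* Wrapped: WPTR's in-ring offset (0x10) is below RPTR's (0x3f0),
	 * so one queue size is added on top of the lap base. */
	assert(guess_wptr(0x3f0, 0x410, 0, 0x400) == 0xbf0);
	/* The saved high word is carried over unchanged. */
	assert(guess_wptr(0x10, 0x10, 1, 0x400) == 0x100000010ull);
	return 0;
}

In every case the result keeps the RPTR's offset within the ring (result & (queue_size - 1) equals rptr & (queue_size - 1)), which is what lets the CP start fetching from the right place once the WPTR poll takes over.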
get_sdma_mqd(mqd); - sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id, - m->sdma_queue_id); - sdmax_gfx_context_cntl = m->sdma_engine_id ? - SOC15_REG_OFFSET(SDMA1, 0, mmSDMA1_GFX_CONTEXT_CNTL) : - SOC15_REG_OFFSET(SDMA0, 0, mmSDMA0_GFX_CONTEXT_CNTL); - - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, - m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); - - while (true) { - temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); - if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) - break; - if (timeout == 0) - return -ETIME; - msleep(10); - timeout -= 10; - } - data = RREG32(sdmax_gfx_context_cntl); - data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, - RESUME_CTX, 0); - WREG32(sdmax_gfx_context_cntl, data); - - WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL_OFFSET, - m->sdmax_rlcx_doorbell_offset); - - data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL, - ENABLE, 1); - WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI, - m->sdmax_rlcx_rb_rptr_hi); - - WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1); - if (read_user_wptr(mm, wptr64, data64)) { - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, - lower_32_bits(data64)); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI, - upper_32_bits(data64)); - } else { - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, - m->sdmax_rlcx_rb_rptr); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI, - m->sdmax_rlcx_rb_rptr_hi); - } - WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0); - - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, - m->sdmax_rlcx_rb_base_hi); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, - m->sdmax_rlcx_rb_rptr_addr_lo); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, - m->sdmax_rlcx_rb_rptr_addr_hi); - - data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL, - RB_ENABLE, 1); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data); - - return 0; -} - -static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, - uint32_t engine_id, uint32_t queue_id, - uint32_t (**dump)[2], uint32_t *n_regs) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - uint32_t sdma_base_addr = get_sdma_base_addr(engine_id, queue_id); - uint32_t i = 0, reg; -#undef HQD_N_REGS -#define HQD_N_REGS (19+6+7+10) - - *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); - if (*dump == NULL) - return -ENOMEM; - - for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) - DUMP_REG(sdma_base_addr + reg); - for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++) - DUMP_REG(sdma_base_addr + reg); - for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; - reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++) - DUMP_REG(sdma_base_addr + reg); - for (reg = mmSDMA0_RLC0_MIDCMD_DATA0; - reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++) - DUMP_REG(sdma_base_addr + reg); - - WARN_ON_ONCE(i != HQD_N_REGS); - *n_regs = i; - - return 0; -} - -static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, - uint32_t pipe_id, uint32_t queue_id) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - uint32_t act; - bool retval = false; - uint32_t low, high; - - acquire_queue(kgd, pipe_id, queue_id); - act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); - if (act) { - low = lower_32_bits(queue_address >> 8); - high = upper_32_bits(queue_address >> 8); - - if (low == RREG32(SOC15_REG_OFFSET(GC, 0, 
mmCP_HQD_PQ_BASE)) && - high == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE_HI))) - retval = true; - } - release_queue(kgd); - return retval; -} - -static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - struct v9_sdma_mqd *m; - uint32_t sdma_base_addr; - uint32_t sdma_rlc_rb_cntl; - - m = get_sdma_mqd(mqd); - sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id, - m->sdma_queue_id); - - sdma_rlc_rb_cntl = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL); - - if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK) - return true; - - return false; -} - -static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, - enum kfd_preempt_type reset_type, - unsigned int utimeout, uint32_t pipe_id, - uint32_t queue_id) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - enum hqd_dequeue_request_type type; - unsigned long end_jiffies; - uint32_t temp; - struct v9_mqd *m = get_mqd(mqd); - -#if 0 - unsigned long flags; - int retry; -#endif - - acquire_queue(kgd, pipe_id, queue_id); - - if (m->cp_hqd_vmid == 0) - WREG32_FIELD15(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0); - - switch (reset_type) { - case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: - type = DRAIN_PIPE; - break; - case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: - type = RESET_WAVES; - break; - default: - type = DRAIN_PIPE; - break; - } - -#if 0 /* Is this still needed? */ - /* Workaround: If IQ timer is active and the wait time is close to or - * equal to 0, dequeueing is not safe. Wait until either the wait time - * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is - * cleared before continuing. Also, ensure wait times are set to at - * least 0x3. - */ - local_irq_save(flags); - preempt_disable(); - retry = 5000; /* wait for 500 usecs at maximum */ - while (true) { - temp = RREG32(mmCP_HQD_IQ_TIMER); - if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) { - pr_debug("HW is processing IQ\n"); - goto loop; - } - if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) { - if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE) - == 3) /* SEM-rearm is safe */ - break; - /* Wait time 3 is safe for CP, but our MMIO read/write - * time is close to 1 microsecond, so check for 10 to - * leave more buffer room - */ - if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME) - >= 10) - break; - pr_debug("IQ timer is active\n"); - } else - break; -loop: - if (!retry) { - pr_err("CP HQD IQ timer status time out\n"); - break; - } - ndelay(100); - --retry; - } - retry = 1000; - while (true) { - temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST); - if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK)) - break; - pr_debug("Dequeue request is pending\n"); - - if (!retry) { - pr_err("CP HQD dequeue request time out\n"); - break; - } - ndelay(100); - --retry; - } - local_irq_restore(flags); - preempt_enable(); -#endif - - WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type); - - end_jiffies = (utimeout * HZ / 1000) + jiffies; - while (true) { - temp = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); - if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) - break; - if (time_after(jiffies, end_jiffies)) { - pr_err("cp queue preemption time out.\n"); - release_queue(kgd); - return -ETIME; - } - usleep_range(500, 1000); - } - - release_queue(kgd); - return 0; -} - -static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - unsigned int utimeout) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - struct v9_sdma_mqd *m; - uint32_t sdma_base_addr; - uint32_t temp; - unsigned long 
end_jiffies = (utimeout * HZ / 1000) + jiffies; - - m = get_sdma_mqd(mqd); - sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id, - m->sdma_queue_id); - - temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL); - temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK; - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, temp); - - while (true) { - temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); - if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) - break; - if (time_after(jiffies, end_jiffies)) - return -ETIME; - usleep_range(500, 1000); - } - - WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, - RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | - SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); - - m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); - m->sdmax_rlcx_rb_rptr_hi = - RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI); - - return 0; -} - -static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, - uint8_t vmid) -{ - uint32_t reg; - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - - reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) - + vmid); - return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; -} - -static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, - uint8_t vmid) -{ - uint32_t reg; - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - - reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) - + vmid); - return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; -} - -static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) -{ - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - uint32_t req = (1 << vmid) | - (1 << VM_INVALIDATE_ENG16_REQ__FLUSH_TYPE__SHIFT) | /* light */ - VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PTES_MASK | - VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE0_MASK | - VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE1_MASK | - VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE2_MASK | - VM_INVALIDATE_ENG16_REQ__INVALIDATE_L1_PTES_MASK; - - spin_lock(&adev->tlb_invalidation_lock); - - /* Use light weight invalidation. - * - * TODO 1: agree on the right set of invalidation registers for - * KFD use. Use the last one for now. Invalidate both GC and - * MMHUB. 
- * - * TODO 2: support range-based invalidation, requires kfg2kgd - * interface change - */ - WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_LO32), - 0xffffffff); - WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_HI32), - 0x0000001f); - - WREG32(SOC15_REG_OFFSET(MMHUB, 0, - mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32), - 0xffffffff); - WREG32(SOC15_REG_OFFSET(MMHUB, 0, - mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32), - 0x0000001f); - - WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_REQ), req); - - WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_INVALIDATE_ENG16_REQ), - req); - - while (!(RREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ACK)) & - (1 << vmid))) - cpu_relax(); - - while (!(RREG32(SOC15_REG_OFFSET(MMHUB, 0, - mmMMHUB_VM_INVALIDATE_ENG16_ACK)) & - (1 << vmid))) - cpu_relax(); - - spin_unlock(&adev->tlb_invalidation_lock); - -} - -static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid) -{ - signed long r; - struct dma_fence *f; - struct amdgpu_ring *ring = &adev->gfx.kiq.ring; - - mutex_lock(&adev->gfx.kiq.ring_mutex); - amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs package*/ - amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0)); - amdgpu_ring_write(ring, - PACKET3_INVALIDATE_TLBS_DST_SEL(1) | - PACKET3_INVALIDATE_TLBS_ALL_HUB(1) | - PACKET3_INVALIDATE_TLBS_PASID(pasid) | - PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(2)); - amdgpu_fence_emit(ring, &f); - amdgpu_ring_commit(ring); - mutex_unlock(&adev->gfx.kiq.ring_mutex); - - r = dma_fence_wait(f, false); - if (r) - DRM_ERROR("wait for kiq fence error: %ld.\n", r); - dma_fence_put(f); - - return r; -} - -static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid) -{ - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - int vmid; - struct amdgpu_ring *ring = &adev->gfx.kiq.ring; - - if (ring->ready) - return invalidate_tlbs_with_kiq(adev, pasid); - - for (vmid = 0; vmid < 16; vmid++) { - if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) - continue; - if (get_atc_vmid_pasid_mapping_valid(kgd, vmid)) { - if (get_atc_vmid_pasid_mapping_pasid(kgd, vmid) - == pasid) { - write_vmid_invalidate_request(kgd, vmid); - break; - } - } - } - - return 0; -} - -static int kgd_address_watch_disable(struct kgd_dev *kgd) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - union TCP_WATCH_CNTL_BITS cntl; - unsigned int i; - uint32_t watch_base_addr; - - cntl.u32All = 0; - - cntl.bitfields.valid = 0; - cntl.bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK; - cntl.bitfields.atc = 1; - - watch_base_addr = get_watch_base_addr(); - /* Turning off this address until we set all the registers */ - for (i = 0; i < MAX_WATCH_ADDRESSES; i++) - WREG32(watch_base_addr + - watchRegs[i * ADDRESS_WATCH_REG_MAX + - ADDRESS_WATCH_REG_CNTL], - cntl.u32All); - - return 0; -} - -static int kgd_address_watch_execute(struct kgd_dev *kgd, - unsigned int watch_point_id, - uint32_t cntl_val, - uint32_t addr_hi, - uint32_t addr_lo) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - union TCP_WATCH_CNTL_BITS cntl; - uint32_t watch_base_addr; - - watch_base_addr = get_watch_base_addr(); - cntl.u32All = cntl_val; - - /* Turning off this watch point until we set all the registers */ - cntl.bitfields.valid = 0; - WREG32(watch_base_addr + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], - cntl.u32All); - - WREG32(watch_base_addr + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_HI], - addr_hi); - - WREG32(watch_base_addr + 
watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_LO], - addr_lo); - - /* Enable the watch point */ - cntl.bitfields.valid = 1; - - WREG32(watch_base_addr + - watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + - ADDRESS_WATCH_REG_CNTL], - cntl.u32All); - - return 0; -} - -static int kgd_wave_control_execute(struct kgd_dev *kgd, - uint32_t gfx_index_val, - uint32_t sq_cmd) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - uint32_t data = 0; - - mutex_lock(&adev->grbm_idx_mutex); - - WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), gfx_index_val); - WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CMD), sq_cmd); - - data = REG_SET_FIELD(data, GRBM_GFX_INDEX, - INSTANCE_BROADCAST_WRITES, 1); - data = REG_SET_FIELD(data, GRBM_GFX_INDEX, - SH_BROADCAST_WRITES, 1); - data = REG_SET_FIELD(data, GRBM_GFX_INDEX, - SE_BROADCAST_WRITES, 1); - - WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data); - mutex_unlock(&adev->grbm_idx_mutex); - - return 0; -} - -static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, - unsigned int watch_point_id, - unsigned int reg_offset) -{ - return get_watch_base_addr() + - watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + reg_offset]; -} - -static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, - uint8_t element_size, uint8_t index_stride, uint8_t mtype) -{ - /* No longer needed on GFXv9. These values are now hard-coded, - * except for the MTYPE which comes from the page table. - */ - - return 0; -} -static int alloc_memory_of_scratch(struct kgd_dev *kgd, - uint64_t va, uint32_t vmid) -{ - /* No longer needed on GFXv9. The scratch base address is - * passed to the shader by the CP. It's the user mode driver's - * responsibility. - */ - - return 0; -} - -/* FIXME: Does this need to be ASIC-specific code? 
*/ -static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) -{ - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - const union amdgpu_firmware_header *hdr; - - switch (type) { - case KGD_ENGINE_PFP: - hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data; - break; - - case KGD_ENGINE_ME: - hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data; - break; - - case KGD_ENGINE_CE: - hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data; - break; - - case KGD_ENGINE_MEC1: - hdr = (const union amdgpu_firmware_header *)adev->gfx.mec_fw->data; - break; - - case KGD_ENGINE_MEC2: - hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data; - break; - - case KGD_ENGINE_RLC: - hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data; - break; - - case KGD_ENGINE_SDMA1: - hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data; - break; - - case KGD_ENGINE_SDMA2: - hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data; - break; - - default: - return 0; - } - - if (hdr == NULL) - return 0; - - /* Only 12 bit in use*/ - return hdr->common.ucode_version; -} - -static void set_num_of_requests(struct kgd_dev *kgd, - uint8_t num_of_requests) -{ - pr_debug("This is a stub\n"); -} - -static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, - uint32_t page_table_base) -{ - struct amdgpu_device *adev = get_amdgpu_device(kgd); - uint64_t base = (uint64_t)page_table_base << PAGE_SHIFT | - AMDGPU_PTE_VALID; - - /* TODO: Don't use hardcoded VMIDs */ - if (vmid < 8 || vmid > 15) { - pr_err("trying to set page table base for wrong VMID %u\n", - vmid); - return; - } - - /* TODO: take advantage of per-process address space size. For - * now, all processes share the same address space size, like - * on GFX8 and older. 
- */ - WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0); - WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0); - - WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2), - lower_32_bits(adev->vm_manager.max_pfn - 1)); - WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2), - upper_32_bits(adev->vm_manager.max_pfn - 1)); - - WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base)); - WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base)); - - WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0); - WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0); - - WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2), - lower_32_bits(adev->vm_manager.max_pfn - 1)); - WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2), - upper_32_bits(adev->vm_manager.max_pfn - 1)); - - WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base)); - WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base)); -} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c deleted file mode 100644 index 7df892d..0000000 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ /dev/null @@ -1,2578 +0,0 @@ -/* - * Copyright 2014 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#undef pr_fmt -#define pr_fmt(fmt) "kfd2kgd: " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "amdgpu_amdkfd.h" -#include "amdgpu_ucode.h" -#include "gca/gfx_8_0_sh_mask.h" -#include "gca/gfx_8_0_d.h" -#include "gca/gfx_8_0_enum.h" -#include "oss/oss_3_0_sh_mask.h" -#include "oss/oss_3_0_d.h" -#include "gmc/gmc_8_1_sh_mask.h" -#include "gmc/gmc_8_1_d.h" - -/* Special VM and GART address alignment needed for VI pre-Fiji due to - * a HW bug. 
- */ -#define VI_BO_SIZE_ALIGN (0x8000) - -/* BO flag to indicate a KFD userptr BO */ -#define AMDGPU_AMDKFD_USERPTR_BO (1ULL << 63) - -/* Impose limit on how much memory KFD can use */ -struct kfd_mem_usage_limit { - uint64_t max_system_mem_limit; - uint64_t max_userptr_mem_limit; - int64_t system_mem_used; - int64_t userptr_mem_used; - spinlock_t mem_limit_lock; -}; - -static struct kfd_mem_usage_limit kfd_mem_limit; - -/* Struct used for amdgpu_amdkfd_bo_validate */ -struct amdgpu_vm_parser { - uint32_t domain; - bool wait; -}; - -static const char * const domain_bit_to_string[] = { - "CPU", - "GTT", - "VRAM", - "GDS", - "GWS", - "OA" -}; - -#define domain_string(domain) domain_bit_to_string[ffs(domain)-1] - -static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work); - - -static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) -{ - return (struct amdgpu_device *)kgd; -} - -static bool check_if_add_bo_to_vm(struct amdgpu_vm *avm, - struct kgd_mem *mem) -{ - struct kfd_bo_va_list *entry; - - list_for_each_entry(entry, &mem->bo_va_list, bo_list) - if (entry->bo_va->base.vm == avm) - return false; - - return true; -} - -/* Set memory usage limits. Current, limits are - * System (kernel) memory - 15/16th System RAM - * Userptr memory - 15/16th System RAM - */ -void amdgpu_amdkfd_gpuvm_init_mem_limits(void) -{ - struct sysinfo si; - uint64_t mem; - - si_meminfo(&si); - mem = si.totalram - si.totalhigh; - mem *= si.mem_unit; - - spin_lock_init(&kfd_mem_limit.mem_limit_lock); - kfd_mem_limit.max_system_mem_limit = mem - (mem >> 4); /* 15/16 */ - kfd_mem_limit.max_userptr_mem_limit = mem - (mem >> 4); /* 15/16 */ - pr_debug("Kernel memory limit %lluM, userptr limit %lluM\n", - (kfd_mem_limit.max_system_mem_limit >> 20), - (kfd_mem_limit.max_userptr_mem_limit >> 20)); -} - -static int amdgpu_amdkfd_reserve_system_mem_limit(struct amdgpu_device *adev, - uint64_t size, u32 domain) -{ - size_t acc_size; - int ret = 0; - - acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size, - sizeof(struct amdgpu_bo)); - - spin_lock(&kfd_mem_limit.mem_limit_lock); - if (domain == AMDGPU_GEM_DOMAIN_GTT) { - if (kfd_mem_limit.system_mem_used + (acc_size + size) > - kfd_mem_limit.max_system_mem_limit) { - ret = -ENOMEM; - goto err_no_mem; - } - kfd_mem_limit.system_mem_used += (acc_size + size); - } else if (domain == AMDGPU_GEM_DOMAIN_CPU) { - if ((kfd_mem_limit.system_mem_used + acc_size > - kfd_mem_limit.max_system_mem_limit) || - (kfd_mem_limit.userptr_mem_used + (size + acc_size) > - kfd_mem_limit.max_userptr_mem_limit)) { - ret = -ENOMEM; - goto err_no_mem; - } - kfd_mem_limit.system_mem_used += acc_size; - kfd_mem_limit.userptr_mem_used += size; - } -err_no_mem: - spin_unlock(&kfd_mem_limit.mem_limit_lock); - return ret; -} - -static void unreserve_system_mem_limit(struct amdgpu_device *adev, - uint64_t size, u32 domain) -{ - size_t acc_size; - - acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size, - sizeof(struct amdgpu_bo)); - - spin_lock(&kfd_mem_limit.mem_limit_lock); - if (domain == AMDGPU_GEM_DOMAIN_GTT) { - kfd_mem_limit.system_mem_used -= (acc_size + size); - } else if (domain == AMDGPU_GEM_DOMAIN_CPU) { - kfd_mem_limit.system_mem_used -= acc_size; - kfd_mem_limit.userptr_mem_used -= size; - } - WARN_ONCE(kfd_mem_limit.system_mem_used < 0, - "kfd system memory accounting unbalanced"); - WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0, - "kfd userptr memory accounting unbalanced"); - - spin_unlock(&kfd_mem_limit.mem_limit_lock); -} - -void 
amdgpu_amdkfd_unreserve_system_memory_limit(struct amdgpu_bo *bo) -{ - spin_lock(&kfd_mem_limit.mem_limit_lock); - - if (bo->flags & AMDGPU_AMDKFD_USERPTR_BO) { - kfd_mem_limit.system_mem_used -= bo->tbo.acc_size; - kfd_mem_limit.userptr_mem_used -= amdgpu_bo_size(bo); - } else if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT) { - kfd_mem_limit.system_mem_used -= - (bo->tbo.acc_size + amdgpu_bo_size(bo)); - } - WARN_ONCE(kfd_mem_limit.system_mem_used < 0, - "kfd system memory accounting unbalanced"); - WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0, - "kfd userptr memory accounting unbalanced"); - - spin_unlock(&kfd_mem_limit.mem_limit_lock); -} - - -/* amdgpu_amdkfd_remove_eviction_fence - Removes eviction fence(s) from BO's - * reservation object. - * - * @bo: [IN] Remove eviction fence(s) from this BO - * @ef: [IN] If ef is specified, then this eviction fence is removed if it - * is present in the shared list. - * @ef_list: [OUT] Returns list of eviction fences. These fences are removed - * from BO's reservation object shared list. - * @ef_count: [OUT] Number of fences in ef_list. - * - * NOTE: If called with ef_list, then amdgpu_amdkfd_add_eviction_fence must be - * called to restore the eviction fences and to avoid memory leak. This is - * useful for shared BOs. - * NOTE: Must be called with BO reserved i.e. bo->tbo.resv->lock held. - */ -static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo, - struct amdgpu_amdkfd_fence *ef, - struct amdgpu_amdkfd_fence ***ef_list, - unsigned int *ef_count) -{ - struct reservation_object_list *fobj; - struct reservation_object *resv; - unsigned int i = 0, j = 0, k = 0, shared_count; - unsigned int count = 0; - struct amdgpu_amdkfd_fence **fence_list; - - if (!ef && !ef_list) - return -EINVAL; - - if (ef_list) { - *ef_list = NULL; - *ef_count = 0; - } - - resv = bo->tbo.resv; - fobj = reservation_object_get_list(resv); - - if (!fobj) - return 0; - - preempt_disable(); - write_seqcount_begin(&resv->seq); - - /* Go through all the shared fences in the resevation object. If - * ef is specified and it exists in the list, remove it and reduce the - * count. If ef is not specified, then get the count of eviction fences - * present. - */ - shared_count = fobj->shared_count; - for (i = 0; i < shared_count; ++i) { - struct dma_fence *f; - - f = rcu_dereference_protected(fobj->shared[i], - reservation_object_held(resv)); - - if (ef) { - if (f->context == ef->base.context) { - dma_fence_put(f); - fobj->shared_count--; - } else - RCU_INIT_POINTER(fobj->shared[j++], f); - - } else if (to_amdgpu_amdkfd_fence(f)) - count++; - } - write_seqcount_end(&resv->seq); - preempt_enable(); - - if (ef || !count) - return 0; - - /* Alloc memory for count number of eviction fence pointers. 
Fill the - * ef_list array and ef_count - */ - - fence_list = kcalloc(count, sizeof(struct amdgpu_amdkfd_fence *), - GFP_KERNEL); - if (!fence_list) - return -ENOMEM; - - preempt_disable(); - write_seqcount_begin(&resv->seq); - - j = 0; - for (i = 0; i < shared_count; ++i) { - struct dma_fence *f; - struct amdgpu_amdkfd_fence *efence; - - f = rcu_dereference_protected(fobj->shared[i], - reservation_object_held(resv)); - - efence = to_amdgpu_amdkfd_fence(f); - if (efence) { - fence_list[k++] = efence; - fobj->shared_count--; - } else - RCU_INIT_POINTER(fobj->shared[j++], f); - } - - write_seqcount_end(&resv->seq); - preempt_enable(); - - *ef_list = fence_list; - *ef_count = k; - - return 0; -} - -/* amdgpu_amdkfd_add_eviction_fence - Adds eviction fence(s) back into BO's - * reservation object. - * - * @bo: [IN] Add eviction fences to this BO - * @ef_list: [IN] List of eviction fences to be added - * @ef_count: [IN] Number of fences in ef_list. - * - * NOTE: Must call amdgpu_amdkfd_remove_eviction_fence before calling this - * function. - */ -static void amdgpu_amdkfd_add_eviction_fence(struct amdgpu_bo *bo, - struct amdgpu_amdkfd_fence **ef_list, - unsigned int ef_count) -{ - int i; - - if (!ef_list || !ef_count) - return; - - for (i = 0; i < ef_count; i++) { - amdgpu_bo_fence(bo, &ef_list[i]->base, true); - /* Readding the fence takes an additional reference. Drop that - * reference. - */ - dma_fence_put(&ef_list[i]->base); - } - - kfree(ef_list); -} - -static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain, - bool wait) -{ - int ret; - - if (WARN(amdgpu_ttm_tt_get_usermm(bo->tbo.ttm), - "Called with userptr BO")) - return -EINVAL; - - amdgpu_ttm_placement_from_domain(bo, domain); - - ret = ttm_bo_validate(&bo->tbo, &bo->placement, false, false); - if (ret) - goto validate_fail; - if (wait) { - struct amdgpu_amdkfd_fence **ef_list; - unsigned int ef_count; - - ret = amdgpu_amdkfd_remove_eviction_fence(bo, NULL, &ef_list, - &ef_count); - if (ret) - goto validate_fail; - - ttm_bo_wait(&bo->tbo, false, false); - amdgpu_amdkfd_add_eviction_fence(bo, ef_list, ef_count); - } - -validate_fail: - return ret; -} - -static int amdgpu_amdkfd_validate(void *param, struct amdgpu_bo *bo) -{ - struct amdgpu_vm_parser *p = param; - - return amdgpu_amdkfd_bo_validate(bo, p->domain, p->wait); -} - -/* vm_validate_pt_pd_bos - Validate page table and directory BOs - * - * Also updates page directory entries so we don't need to do this - * again later until the page directory is validated again (e.g. after - * an eviction or allocating new page tables). - */ -static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm) -{ - struct amdgpu_bo *pd = vm->root.base.bo; - struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev); - struct amdgpu_vm_parser param; - int ret; - - param.domain = AMDGPU_GEM_DOMAIN_VRAM; - param.wait = false; - - ret = amdgpu_vm_validate_pt_bos(adev, vm, amdgpu_amdkfd_validate, - ¶m); - if (ret) { - pr_err("amdgpu: failed to validate PT BOs\n"); - return ret; - } - - ret = amdgpu_amdkfd_validate(¶m, pd); - if (ret) { - pr_err("amdgpu: failed to validate PD\n"); - return ret; - } - - ret = amdgpu_vm_update_directories(adev, vm); - if (ret != 0) - return ret; - - return 0; -} - -/* add_bo_to_vm - Add a BO to a VM - * - * Everything that needs to bo done only once when a BO is first added - * to a VM. It can later be mapped and unmapped many times without - * repeating these steps. - * - * 1. Allocate and initialize BO VA entry data structure - * 2. Add BO to the VM - * 3. 
Determine ASIC-specific PTE flags - * 4. Alloc page tables and directories if needed - * 4a. Validate new page tables and directories and update directories - */ -static int add_bo_to_vm(struct amdgpu_device *adev, struct kgd_mem *mem, - struct amdgpu_vm *avm, bool is_aql, - struct kfd_bo_va_list **p_bo_va_entry) -{ - int ret; - struct kfd_bo_va_list *bo_va_entry; - struct amdkfd_vm *kvm = container_of(avm, - struct amdkfd_vm, base); - struct amdgpu_bo *pd = avm->root.base.bo; - struct amdgpu_bo *bo = mem->bo; - uint64_t va = mem->va; - struct list_head *list_bo_va = &mem->bo_va_list; - unsigned long bo_size = bo->tbo.mem.size; - - if (!va) { - pr_err("Invalid VA when adding BO to VM\n"); - return -EINVAL; - } - - if (is_aql) - va += bo_size; - - bo_va_entry = kzalloc(sizeof(*bo_va_entry), GFP_KERNEL); - if (!bo_va_entry) - return -ENOMEM; - - pr_debug("\t add VA 0x%llx - 0x%llx to vm %p\n", va, - va + bo_size, avm); - - /* Add BO to VM internal data structures*/ - bo_va_entry->bo_va = amdgpu_vm_bo_add(adev, avm, bo); - if (bo_va_entry->bo_va == NULL) { - ret = -EINVAL; - pr_err("Failed to add BO object to VM. ret == %d\n", - ret); - goto err_vmadd; - } - - bo_va_entry->va = va; - bo_va_entry->pte_flags = amdgpu_vm_get_pte_flags(adev, - mem->mapping_flags); - bo_va_entry->kgd_dev = (void *)adev; - list_add(&bo_va_entry->bo_list, list_bo_va); - - if (p_bo_va_entry) - *p_bo_va_entry = bo_va_entry; - - /* Allocate new page tables if neeeded and validate - * them. Clearing of new page tables and validate need to wait - * on move fences. We don't want that to trigger the eviction - * fence, so remove it temporarily. - */ - amdgpu_amdkfd_remove_eviction_fence(pd, - kvm->process_info->eviction_fence, - NULL, NULL); - - ret = amdgpu_vm_alloc_pts(adev, avm, va, amdgpu_bo_size(bo)); - if (ret) { - pr_err("Failed to allocate pts, err=%d\n", ret); - goto err_alloc_pts; - } - - ret = vm_validate_pt_pd_bos(avm); - if (ret != 0) { - pr_err("validate_pt_pd_bos() failed\n"); - goto err_alloc_pts; - } - - /* Add the eviction fence back */ - amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true); - - return 0; - -err_alloc_pts: - amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true); - amdgpu_vm_bo_rmv(adev, bo_va_entry->bo_va); - list_del(&bo_va_entry->bo_list); -err_vmadd: - kfree(bo_va_entry); - return ret; -} - -static void remove_bo_from_vm(struct amdgpu_device *adev, - struct kfd_bo_va_list *entry, unsigned long size) -{ - pr_debug("\t remove VA 0x%llx - 0x%llx in entry %p\n", - entry->va, - entry->va + size, entry); - amdgpu_vm_bo_rmv(adev, entry->bo_va); - list_del(&entry->bo_list); - kfree(entry); -} - -static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem, - struct amdkfd_process_info *process_info, - bool userptr) -{ - struct ttm_validate_buffer *entry = &mem->validate_list; - struct amdgpu_bo *bo = mem->bo; - - INIT_LIST_HEAD(&entry->head); - entry->shared = true; - entry->bo = &bo->tbo; - mutex_lock(&process_info->lock); - if (userptr) - list_add_tail(&entry->head, &process_info->userptr_valid_list); - else - list_add_tail(&entry->head, &process_info->kfd_bo_list); - mutex_unlock(&process_info->lock); -} - -/* Initializes user pages. It registers the MMU notifier and validates - * the userptr BO in the GTT domain. - * - * The BO must already be on the userptr_valid_list. Otherwise an - * eviction and restore may happen that leaves the new BO unmapped - * with the user mode queues running. 
- * - * Takes the process_info->lock to protect against concurrent restore - * workers. - * - * Returns 0 for success, negative errno for errors. - */ -static int init_user_pages(struct kgd_mem *mem, struct mm_struct *mm, - uint64_t user_addr) -{ - struct amdkfd_process_info *process_info = mem->process_info; - struct amdgpu_bo *bo = mem->bo; - int ret = 0; - - mutex_lock(&process_info->lock); - - ret = amdgpu_ttm_tt_set_userptr(bo->tbo.ttm, user_addr, 0); - if (ret) { - pr_err("%s: Failed to set userptr: %d\n", __func__, ret); - goto out; - } - - ret = amdgpu_mn_register(bo, user_addr); - if (ret) { - pr_err("%s: Failed to register MMU notifier: %d\n", - __func__, ret); - goto out; - } - - /* If no restore worker is running concurrently, user_pages - * should not be allocated - */ - WARN(mem->user_pages, "Leaking user_pages array"); - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) - mem->user_pages = drm_calloc_large(bo->tbo.ttm->num_pages, - sizeof(struct page *)); -#else - mem->user_pages = kvmalloc_array(bo->tbo.ttm->num_pages, - sizeof(struct page *), - GFP_KERNEL | __GFP_ZERO); -#endif - if (!mem->user_pages) { - pr_err("%s: Failed to allocate pages array\n", __func__); - ret = -ENOMEM; - goto unregister_out; - } - - ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, mem->user_pages); - if (ret) { - pr_err("%s: Failed to get user pages: %d\n", __func__, ret); - goto free_out; - } - - amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, mem->user_pages); - - ret = amdgpu_bo_reserve(bo, true); - if (ret) { - pr_err("%s: Failed to reserve BO\n", __func__); - goto release_out; - } - amdgpu_ttm_placement_from_domain(bo, mem->domain); - ret = ttm_bo_validate(&bo->tbo, &bo->placement, - true, false); - if (ret) - pr_err("%s: failed to validate BO\n", __func__); - amdgpu_bo_unreserve(bo); - -release_out: - if (ret) - release_pages(mem->user_pages, bo->tbo.ttm->num_pages, 0); -free_out: -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) - drm_free_large(mem->user_pages); -#else - kvfree(mem->user_pages); -#endif - mem->user_pages = NULL; -unregister_out: - if (ret) - amdgpu_mn_unregister(bo); -out: - mutex_unlock(&process_info->lock); - return ret; -} - -static int __map_bo_to_kernel(struct amdgpu_bo *bo, u32 domain, void **kptr) -{ - int ret; - - ret = amdgpu_bo_reserve(bo, true); - if (ret) { - pr_err("Failed to reserve bo. ret %d\n", ret); - return ret; - } - - ret = amdgpu_bo_pin(bo, domain, NULL); - if (ret) { - pr_err("Failed to pin bo. ret %d\n", ret); - goto pin_failed; - } - - ret = amdgpu_bo_kmap(bo, kptr); - if (ret) { - pr_err("Failed to map bo to kernel. 
ret %d\n", ret); - goto kmap_failed; - } - - amdgpu_bo_unreserve(bo); - - return ret; - -kmap_failed: - amdgpu_bo_unpin(bo); -pin_failed: - amdgpu_bo_unreserve(bo); - - return ret; -} - -static int __alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va, - uint64_t size, void *vm, struct kgd_mem **mem, - uint64_t *offset, u32 domain, u64 flags, - struct sg_table *sg, bool aql_queue, - bool readonly, bool execute, bool coherent, bool no_sub, - bool userptr) -{ - struct amdgpu_device *adev; - int ret; - struct amdgpu_bo *bo; - uint64_t user_addr = 0; - int byte_align; - u32 alloc_domain; - uint32_t mapping_flags; - struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm; - - if (aql_queue) - size = size >> 1; - if (userptr) { - if (!offset || !*offset) - return -EINVAL; - user_addr = *offset; - } - - adev = get_amdgpu_device(kgd); - byte_align = (adev->family == AMDGPU_FAMILY_VI && - adev->asic_type != CHIP_FIJI && - adev->asic_type != CHIP_POLARIS10 && - adev->asic_type != CHIP_POLARIS11) ? - VI_BO_SIZE_ALIGN : 1; - - *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); - if (*mem == NULL) { - ret = -ENOMEM; - goto err; - } - INIT_LIST_HEAD(&(*mem)->bo_va_list); - mutex_init(&(*mem)->lock); - (*mem)->coherent = coherent; - (*mem)->no_substitute = no_sub; - (*mem)->aql_queue = aql_queue; - - mapping_flags = AMDGPU_VM_PAGE_READABLE; - if (!readonly) - mapping_flags |= AMDGPU_VM_PAGE_WRITEABLE; - if (execute) - mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE; - if (coherent) - mapping_flags |= AMDGPU_VM_MTYPE_UC; - else - mapping_flags |= AMDGPU_VM_MTYPE_NC; - - (*mem)->mapping_flags = mapping_flags; - - alloc_domain = userptr ? AMDGPU_GEM_DOMAIN_CPU : domain; - - amdgpu_sync_create(&(*mem)->sync); - - ret = amdgpu_amdkfd_reserve_system_mem_limit(adev, size, alloc_domain); - if (ret) { - pr_err("Insufficient system memory\n"); - goto err_bo_create; - } - - pr_debug("\t create BO VA 0x%llx size 0x%llx domain %s\n", - va, size, domain_string(alloc_domain)); - - /* Allocate buffer object. Userptr objects need to start out - * in the CPU domain, get moved to GTT when pinned. - */ - ret = amdgpu_bo_create(adev, size, byte_align, false, - alloc_domain, - flags, sg, NULL, 0, &bo); - if (ret != 0) { - pr_err("Failed to create BO on domain %s. ret %d\n", - domain_string(alloc_domain), ret); - unreserve_system_mem_limit(adev, size, alloc_domain); - goto err_bo_create; - } - bo->kfd_bo = *mem; - (*mem)->bo = bo; - if (userptr) - bo->flags |= AMDGPU_AMDKFD_USERPTR_BO; - - (*mem)->va = va; - (*mem)->domain = domain; - (*mem)->mapped_to_gpu_memory = 0; - (*mem)->process_info = kfd_vm->process_info; - add_kgd_mem_to_kfd_bo_list(*mem, kfd_vm->process_info, userptr); - - if (userptr) { - ret = init_user_pages(*mem, current->mm, user_addr); - if (ret) { - mutex_lock(&kfd_vm->process_info->lock); - list_del(&(*mem)->validate_list.head); - mutex_unlock(&kfd_vm->process_info->lock); - goto allocate_init_user_pages_failed; - } - } - - if (offset) - *offset = amdgpu_bo_mmap_offset(bo); - - return 0; - -allocate_init_user_pages_failed: - amdgpu_bo_unref(&bo); -err_bo_create: - kfree(*mem); -err: - return ret; -} - -/* Reserving a BO and its page table BOs must happen atomically to - * avoid deadlocks. When updating userptrs we need to temporarily - * back-off the reservation and then reacquire it. Track all the - * reservation info in a context structure. Buffers can be mapped to - * multiple VMs simultaneously (buffers being restored on multiple - * GPUs). 
- */ -struct bo_vm_reservation_context { - struct amdgpu_bo_list_entry kfd_bo; - unsigned int n_vms; - struct amdgpu_bo_list_entry *vm_pd; - struct ww_acquire_ctx ticket; - struct list_head list, duplicates; - struct amdgpu_sync *sync; - bool reserved; -}; - -/** - * reserve_bo_and_vm - reserve a BO and a VM unconditionally. - * @mem: KFD BO structure. - * @vm: the VM to reserve. - * @ctx: the struct that will be used in unreserve_bo_and_vms(). - */ -static int reserve_bo_and_vm(struct kgd_mem *mem, - struct amdgpu_vm *vm, - struct bo_vm_reservation_context *ctx) -{ - struct amdgpu_bo *bo = mem->bo; - int ret; - - WARN_ON(!vm); - - ctx->reserved = false; - ctx->n_vms = 1; - ctx->sync = &mem->sync; - - INIT_LIST_HEAD(&ctx->list); - INIT_LIST_HEAD(&ctx->duplicates); - - ctx->vm_pd = kzalloc(sizeof(struct amdgpu_bo_list_entry) - * ctx->n_vms, GFP_KERNEL); - if (ctx->vm_pd == NULL) - return -ENOMEM; - - ctx->kfd_bo.robj = bo; - ctx->kfd_bo.priority = 0; - ctx->kfd_bo.tv.bo = &bo->tbo; - ctx->kfd_bo.tv.shared = true; - ctx->kfd_bo.user_pages = NULL; - list_add(&ctx->kfd_bo.tv.head, &ctx->list); - - amdgpu_vm_get_pd_bo(vm, &ctx->list, &ctx->vm_pd[0]); - - ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list, - false, &ctx->duplicates); - if (!ret) - ctx->reserved = true; - else - pr_err("Failed to reserve buffers in ttm\n"); - - if (ret) { - kfree(ctx->vm_pd); - ctx->vm_pd = NULL; - } - - return ret; -} - -enum VA_TYPE { - VA_NOT_MAPPED = 0, - VA_MAPPED, - VA_DO_NOT_CARE, -}; - -/** - * reserve_bo_and_vm - reserve a BO and some VMs that the BO has been added - * to, conditionally based on map_type. - * @mem: KFD BO structure. - * @vm: the VM to reserve. If NULL, then all VMs associated with the BO - * is used. Otherwise, a single VM associated with the BO. - * @map_type: the mapping status that will be used to filter the VMs. - * @ctx: the struct that will be used in unreserve_bo_and_vms(). 
- */ -static int reserve_bo_and_cond_vms(struct kgd_mem *mem, - struct amdgpu_vm *vm, enum VA_TYPE map_type, - struct bo_vm_reservation_context *ctx) -{ - struct amdgpu_bo *bo = mem->bo; - struct kfd_bo_va_list *entry; - unsigned int i; - int ret; - - ctx->reserved = false; - ctx->n_vms = 0; - ctx->vm_pd = NULL; - ctx->sync = &mem->sync; - - INIT_LIST_HEAD(&ctx->list); - INIT_LIST_HEAD(&ctx->duplicates); - - list_for_each_entry(entry, &mem->bo_va_list, bo_list) { - if ((vm && vm != entry->bo_va->base.vm) || - (entry->is_mapped != map_type - && map_type != VA_DO_NOT_CARE)) - continue; - - ctx->n_vms++; - } - - if (ctx->n_vms != 0) { - ctx->vm_pd = kzalloc(sizeof(struct amdgpu_bo_list_entry) - * ctx->n_vms, GFP_KERNEL); - if (ctx->vm_pd == NULL) - return -ENOMEM; - } - - ctx->kfd_bo.robj = bo; - ctx->kfd_bo.priority = 0; - ctx->kfd_bo.tv.bo = &bo->tbo; - ctx->kfd_bo.tv.shared = true; - ctx->kfd_bo.user_pages = NULL; - list_add(&ctx->kfd_bo.tv.head, &ctx->list); - - i = 0; - list_for_each_entry(entry, &mem->bo_va_list, bo_list) { - if ((vm && vm != entry->bo_va->base.vm) || - (entry->is_mapped != map_type - && map_type != VA_DO_NOT_CARE)) - continue; - - amdgpu_vm_get_pd_bo(entry->bo_va->base.vm, &ctx->list, - &ctx->vm_pd[i]); - i++; - } - - ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list, - false, &ctx->duplicates); - if (!ret) - ctx->reserved = true; - else - pr_err("Failed to reserve buffers in ttm.\n"); - - if (ret) { - kfree(ctx->vm_pd); - ctx->vm_pd = NULL; - } - - return ret; -} - -static int unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx, - bool wait, bool intr) -{ - int ret = 0; - - if (wait) - ret = amdgpu_sync_wait(ctx->sync, intr); - - if (ctx->reserved) - ttm_eu_backoff_reservation(&ctx->ticket, &ctx->list); - kfree(ctx->vm_pd); - - ctx->sync = NULL; - - ctx->reserved = false; - ctx->vm_pd = NULL; - - return ret; -} - -static int unmap_bo_from_gpuvm(struct amdgpu_device *adev, - struct kfd_bo_va_list *entry, - struct amdgpu_sync *sync) -{ - struct amdgpu_bo_va *bo_va = entry->bo_va; - struct amdgpu_vm *vm = bo_va->base.vm; - struct amdkfd_vm *kvm = container_of(vm, struct amdkfd_vm, base); - struct amdgpu_bo *pd = vm->root.base.bo; - - /* Remove eviction fence from PD (and thereby from PTs too as they - * share the resv. object. Otherwise during PT update job (see - * amdgpu_vm_bo_update_mapping), eviction fence will get added to - * job->sync object - */ - amdgpu_amdkfd_remove_eviction_fence(pd, - kvm->process_info->eviction_fence, - NULL, NULL); - amdgpu_vm_bo_unmap(adev, bo_va, entry->va); - - amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update); - - /* Add the eviction fence back */ - amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true); - - amdgpu_sync_fence(adev, sync, bo_va->last_pt_update); - - /* Sync objects can't handle multiple GPUs (contexts) updating - * sync->last_vm_update. Fortunately we don't need it for - * KFD's purposes, so we can just drop that fence. 
- */ - if (sync->last_vm_update) { - dma_fence_put(sync->last_vm_update); - sync->last_vm_update = NULL; - } - - return 0; -} - -static int update_gpuvm_pte(struct amdgpu_device *adev, - struct kfd_bo_va_list *entry, - struct amdgpu_sync *sync) -{ - int ret; - struct amdgpu_vm *vm; - struct amdgpu_bo_va *bo_va; - struct amdgpu_bo *bo; - - bo_va = entry->bo_va; - vm = bo_va->base.vm; - bo = bo_va->base.bo; - - /* Update the page tables */ - ret = amdgpu_vm_bo_update(adev, bo_va, false); - if (ret != 0) { - pr_err("amdgpu_vm_bo_update failed\n"); - return ret; - } - - amdgpu_sync_fence(adev, sync, bo_va->last_pt_update); - - /* Sync objects can't handle multiple GPUs (contexts) updating - * sync->last_vm_update. Fortunately we don't need it for - * KFD's purposes, so we can just drop that fence. - */ - if (sync->last_vm_update) { - dma_fence_put(sync->last_vm_update); - sync->last_vm_update = NULL; - } - - return 0; -} - -static int map_bo_to_gpuvm(struct amdgpu_device *adev, - struct kfd_bo_va_list *entry, struct amdgpu_sync *sync, - bool no_update_pte) -{ - int ret; - - /* Set virtual address for the allocation */ - ret = amdgpu_vm_bo_map(adev, entry->bo_va, entry->va, 0, - amdgpu_bo_size(entry->bo_va->base.bo), entry->pte_flags); - if (ret != 0) { - pr_err("Failed to map VA 0x%llx in vm. ret %d\n", - entry->va, ret); - return ret; - } - - if (no_update_pte) - return 0; - - ret = update_gpuvm_pte(adev, entry, sync); - if (ret != 0) { - pr_err("update_gpuvm_pte() failed\n"); - goto update_gpuvm_pte_failed; - } - - return 0; - -update_gpuvm_pte_failed: - unmap_bo_from_gpuvm(adev, entry, sync); - return ret; -} - -static struct sg_table *create_doorbell_sg(uint64_t addr, uint32_t size) -{ - struct sg_table *sg = kmalloc(sizeof(*sg), GFP_KERNEL); - - if (!sg) - return NULL; - if (sg_alloc_table(sg, 1, GFP_KERNEL)) { - kfree(sg); - return NULL; - } - sg->sgl->dma_address = addr; - sg->sgl->length = size; -#ifdef CONFIG_NEED_SG_DMA_LENGTH - sg->sgl->dma_length = size; -#endif - return sg; -} - -int amdgpu_amdkfd_gpuvm_sync_memory( - struct kgd_dev *kgd, struct kgd_mem *mem, bool intr) -{ - int ret = 0; - struct amdgpu_sync sync; - struct amdgpu_device *adev; - - adev = get_amdgpu_device(kgd); - amdgpu_sync_create(&sync); - - mutex_lock(&mem->lock); - amdgpu_sync_clone(adev, &mem->sync, &sync); - mutex_unlock(&mem->lock); - - ret = amdgpu_sync_wait(&sync, intr); - amdgpu_sync_free(&sync); - return ret; -} - -#define BOOL_TO_STR(b) (b == true) ? "true" : "false" - -int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( - struct kgd_dev *kgd, uint64_t va, uint64_t size, - void *vm, struct kgd_mem **mem, - uint64_t *offset, uint32_t flags) -{ - bool aql_queue, public, readonly, execute, coherent, no_sub, userptr; - u64 alloc_flag; - uint32_t domain; - uint64_t *temp_offset; - struct sg_table *sg = NULL; - - if (!(flags & ALLOC_MEM_FLAGS_NONPAGED)) { - pr_err("current hw doesn't support paged memory\n"); - return -EINVAL; - } - - domain = 0; - alloc_flag = 0; - temp_offset = NULL; - - aql_queue = (flags & ALLOC_MEM_FLAGS_AQL_QUEUE_MEM) ? true : false; - public = (flags & ALLOC_MEM_FLAGS_PUBLIC) ? true : false; - readonly = (flags & ALLOC_MEM_FLAGS_READONLY) ? true : false; - execute = (flags & ALLOC_MEM_FLAGS_EXECUTE_ACCESS) ? true : false; - coherent = (flags & ALLOC_MEM_FLAGS_COHERENT) ? true : false; - no_sub = (flags & ALLOC_MEM_FLAGS_NO_SUBSTITUTE) ? true : false; - userptr = (flags & ALLOC_MEM_FLAGS_USERPTR) ? 
true : false; - - /* - * Check on which domain to allocate BO - */ - if (flags & ALLOC_MEM_FLAGS_VRAM) { - domain = AMDGPU_GEM_DOMAIN_VRAM; - alloc_flag = AMDGPU_GEM_CREATE_NO_CPU_ACCESS; - if (public) { - alloc_flag = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; - temp_offset = offset; - } - alloc_flag |= AMDGPU_GEM_CREATE_VRAM_CLEARED; - } else if (flags & (ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_USERPTR)) { - domain = AMDGPU_GEM_DOMAIN_GTT; - alloc_flag = 0; - temp_offset = offset; - } else if (flags & ALLOC_MEM_FLAGS_DOORBELL) { - domain = AMDGPU_GEM_DOMAIN_GTT; - alloc_flag = 0; - temp_offset = offset; - if (size > UINT_MAX) - return -EINVAL; - sg = create_doorbell_sg(*offset, size); - if (!sg) - return -ENOMEM; - } - - if (offset && !userptr) - *offset = 0; - - pr_debug("Allocate VA 0x%llx - 0x%llx domain %s aql %s\n", - va, va + size, domain_string(domain), - BOOL_TO_STR(aql_queue)); - - pr_debug("\t alloc_flag 0x%llx public %s readonly %s execute %s coherent %s no_sub %s\n", - alloc_flag, BOOL_TO_STR(public), - BOOL_TO_STR(readonly), BOOL_TO_STR(execute), - BOOL_TO_STR(coherent), BOOL_TO_STR(no_sub)); - - return __alloc_memory_of_gpu(kgd, va, size, vm, mem, - temp_offset, domain, - alloc_flag, sg, - aql_queue, readonly, execute, - coherent, no_sub, userptr); -} - -int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( - struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) -{ - struct amdgpu_device *adev; - struct kfd_bo_va_list *entry, *tmp; - struct bo_vm_reservation_context ctx; - int ret = 0; - struct ttm_validate_buffer *bo_list_entry; - struct amdkfd_process_info *process_info; - unsigned long bo_size; - - adev = get_amdgpu_device(kgd); - process_info = ((struct amdkfd_vm *)vm)->process_info; - - bo_size = mem->bo->tbo.mem.size; - - mutex_lock(&mem->lock); - - if (mem->mapped_to_gpu_memory > 0) { - pr_err("BO VA 0x%llx size 0x%lx is already mapped to vm %p.\n", - mem->va, bo_size, vm); - mutex_unlock(&mem->lock); - return -EBUSY; - } - - mutex_unlock(&mem->lock); - /* lock is not needed after this, since mem is unused and will - * be freed anyway - */ - - /* No more MMU notifiers */ - amdgpu_mn_unregister(mem->bo); - - /* Make sure restore workers don't access the BO any more */ - bo_list_entry = &mem->validate_list; - mutex_lock(&process_info->lock); - list_del(&bo_list_entry->head); - mutex_unlock(&process_info->lock); - - /* Free user pages if necessary */ - if (mem->user_pages) { - pr_debug("%s: Freeing user_pages array\n", __func__); - if (mem->user_pages[0]) - release_pages(mem->user_pages, - mem->bo->tbo.ttm->num_pages, 0); -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) - drm_free_large(mem->user_pages); -#else - kvfree(mem->user_pages); -#endif - } - - ret = reserve_bo_and_cond_vms(mem, NULL, VA_DO_NOT_CARE, &ctx); - if (unlikely(ret != 0)) - return ret; - - /* The eviction fence should be removed by the last unmap. - * TODO: Log an error condition if the bo still has the eviction fence - * attached - */ - amdgpu_amdkfd_remove_eviction_fence(mem->bo, - process_info->eviction_fence, - NULL, NULL); - pr_debug("Release VA 0x%llx - 0x%llx\n", mem->va, - mem->va + bo_size * (1 + mem->aql_queue)); - - /* Remove from VM internal data structures */ - list_for_each_entry_safe(entry, tmp, &mem->bo_va_list, bo_list) { - remove_bo_from_vm((struct amdgpu_device *)entry->kgd_dev, - entry, bo_size); - } - - ret = unreserve_bo_and_vms(&ctx, false, false); - - /* Free the sync object */ - amdgpu_sync_free(&mem->sync); - - /* If the SG is not NULL, it's one we created for a doorbell - * BO. 
We need to free it. - */ - if (mem->bo->tbo.sg) { - sg_free_table(mem->bo->tbo.sg); - kfree(mem->bo->tbo.sg); - } - - /* Free the BO*/ - amdgpu_bo_unref(&mem->bo); - kfree(mem); - - return ret; -} - -int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( - struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) -{ - struct amdgpu_device *adev; - int ret; - struct amdgpu_bo *bo; - uint32_t domain; - struct kfd_bo_va_list *entry; - struct bo_vm_reservation_context ctx; - struct kfd_bo_va_list *bo_va_entry = NULL; - struct kfd_bo_va_list *bo_va_entry_aql = NULL; - struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm; - unsigned long bo_size; - bool is_invalid_userptr; - - adev = get_amdgpu_device(kgd); - - /* Make sure restore is not running concurrently. Since we - * don't map invalid userptr BOs, we rely on the next restore - * worker to do the mapping - */ - mutex_lock(&mem->process_info->lock); - - /* Lock mmap-sem. If we find an invalid userptr BO, we can be - * sure that the MMU notifier is no longer running - * concurrently and the queues are actually stopped - */ - down_read(¤t->mm->mmap_sem); - is_invalid_userptr = atomic_read(&mem->invalid); - up_read(¤t->mm->mmap_sem); - - mutex_lock(&mem->lock); - - bo = mem->bo; - - if (!bo) { - pr_err("Invalid BO when mapping memory to GPU\n"); - return -EINVAL; - } - - domain = mem->domain; - bo_size = bo->tbo.mem.size; - - pr_debug("Map VA 0x%llx - 0x%llx to vm %p domain %s\n", - mem->va, - mem->va + bo_size * (1 + mem->aql_queue), - vm, domain_string(domain)); - - ret = reserve_bo_and_vm(mem, vm, &ctx); - if (unlikely(ret != 0)) - goto bo_reserve_failed; - - /* Userptr can be marked as "not invalid", but not actually be - * validated yet (still in the system domain). In that case - * the queues are still stopped and we can leave mapping for - * the next restore worker - */ - if (bo->tbo.mem.mem_type == TTM_PL_SYSTEM) - is_invalid_userptr = true; - - if (check_if_add_bo_to_vm((struct amdgpu_vm *)vm, mem)) { - ret = add_bo_to_vm(adev, mem, (struct amdgpu_vm *)vm, false, - &bo_va_entry); - if (ret != 0) - goto add_bo_to_vm_failed; - if (mem->aql_queue) { - ret = add_bo_to_vm(adev, mem, (struct amdgpu_vm *)vm, - true, &bo_va_entry_aql); - if (ret != 0) - goto add_bo_to_vm_failed_aql; - } - } - - if (mem->mapped_to_gpu_memory == 0 && - !amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) { - /* Validate BO only once. The eviction fence gets added to BO - * the first time it is mapped. Validate will wait for all - * background evictions to complete. 
- */ - ret = amdgpu_amdkfd_bo_validate(bo, domain, true); - if (ret) { - pr_debug("Validate failed\n"); - goto map_bo_to_gpuvm_failed; - } - } - - list_for_each_entry(entry, &mem->bo_va_list, bo_list) { - if (entry->bo_va->base.vm == vm && !entry->is_mapped) { - pr_debug("\t map VA 0x%llx - 0x%llx in entry %p\n", - entry->va, entry->va + bo_size, - entry); - - ret = map_bo_to_gpuvm(adev, entry, ctx.sync, - is_invalid_userptr); - if (ret != 0) { - pr_err("Failed to map radeon bo to gpuvm\n"); - goto map_bo_to_gpuvm_failed; - } - entry->is_mapped = true; - mem->mapped_to_gpu_memory++; - pr_debug("\t INC mapping count %d\n", - mem->mapped_to_gpu_memory); - } - } - - if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm) == NULL) - amdgpu_bo_fence(bo, - &kfd_vm->process_info->eviction_fence->base, - true); - ret = unreserve_bo_and_vms(&ctx, false, false); - - mutex_unlock(&mem->process_info->lock); - mutex_unlock(&mem->lock); - return ret; - -map_bo_to_gpuvm_failed: - if (bo_va_entry_aql) - remove_bo_from_vm(adev, bo_va_entry_aql, bo_size); -add_bo_to_vm_failed_aql: - if (bo_va_entry) - remove_bo_from_vm(adev, bo_va_entry, bo_size); -add_bo_to_vm_failed: - unreserve_bo_and_vms(&ctx, false, false); -bo_reserve_failed: - mutex_unlock(&mem->process_info->lock); - mutex_unlock(&mem->lock); - return ret; -} - -static u64 get_vm_pd_gpu_offset(void *vm) -{ - struct amdgpu_vm *avm = (struct amdgpu_vm *) vm; - struct amdgpu_device *adev = - amdgpu_ttm_adev(avm->root.base.bo->tbo.bdev); - u64 offset; - - BUG_ON(avm == NULL); - - amdgpu_bo_reserve(avm->root.base.bo, false); - - offset = amdgpu_bo_gpu_offset(avm->root.base.bo); - - amdgpu_bo_unreserve(avm->root.base.bo); - - /* On some ASICs the FB doesn't start at 0. Adjust FB offset - * to an actual MC address. - */ - if (adev->gart.gart_funcs->get_vm_pde) - offset = amdgpu_gart_get_vm_pde(adev, offset); - - return offset; -} - -int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm, - void **process_info, - struct dma_fence **ef) -{ - int ret; - struct amdkfd_vm *new_vm; - struct amdkfd_process_info *info; - struct amdgpu_device *adev = get_amdgpu_device(kgd); - - new_vm = kzalloc(sizeof(*new_vm), GFP_KERNEL); - if (new_vm == NULL) - return -ENOMEM; - - /* Initialize the VM context, allocate the page directory and zero it */ - ret = amdgpu_vm_init(adev, &new_vm->base, AMDGPU_VM_CONTEXT_COMPUTE); - if (ret != 0) { - pr_err("Failed init vm ret %d\n", ret); - /* Undo everything related to the new VM context */ - goto vm_init_fail; - } - new_vm->adev = adev; - - if (!*process_info) { - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) { - pr_err("Failed to create amdkfd_process_info"); - ret = -ENOMEM; - goto alloc_process_info_fail; - } - - mutex_init(&info->lock); - INIT_LIST_HEAD(&info->vm_list_head); - INIT_LIST_HEAD(&info->kfd_bo_list); - INIT_LIST_HEAD(&info->userptr_valid_list); - INIT_LIST_HEAD(&info->userptr_inval_list); - - info->eviction_fence = - amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1), - current->mm); - if (info->eviction_fence == NULL) { - pr_err("Failed to create eviction fence\n"); - goto create_evict_fence_fail; - } - - info->pid = get_task_pid(current->group_leader, - PIDTYPE_PID); - atomic_set(&info->evicted_bos, 0); - INIT_DELAYED_WORK(&info->work, - amdgpu_amdkfd_restore_userptr_worker); - - *process_info = info; - *ef = dma_fence_get(&info->eviction_fence->base); - } - - new_vm->process_info = *process_info; - - mutex_lock(&new_vm->process_info->lock); - list_add_tail(&new_vm->vm_list_node, - 
&(new_vm->process_info->vm_list_head)); - new_vm->process_info->n_vms++; - mutex_unlock(&new_vm->process_info->lock); - - *vm = (void *) new_vm; - - pr_debug("Created process vm %p\n", *vm); - - return ret; - -create_evict_fence_fail: - kfree(info); -alloc_process_info_fail: - amdgpu_vm_fini(adev, &new_vm->base); -vm_init_fail: - kfree(new_vm); - return ret; - -} - -void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm) -{ - struct amdgpu_device *adev = (struct amdgpu_device *) kgd; - struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *) vm; - struct amdgpu_vm *avm = &kfd_vm->base; - struct amdgpu_bo *pd; - struct amdkfd_process_info *process_info; - - if (WARN_ON(!kgd || !vm)) - return; - - pr_debug("Destroying process vm %p\n", vm); - /* Release eviction fence from PD */ - pd = avm->root.base.bo; - amdgpu_bo_reserve(pd, false); - amdgpu_bo_fence(pd, NULL, false); - amdgpu_bo_unreserve(pd); - - process_info = kfd_vm->process_info; - - mutex_lock(&process_info->lock); - process_info->n_vms--; - list_del(&kfd_vm->vm_list_node); - mutex_unlock(&process_info->lock); - - /* Release per-process resources */ - if (!process_info->n_vms) { - WARN_ON(!list_empty(&process_info->kfd_bo_list)); - WARN_ON(!list_empty(&process_info->userptr_valid_list)); - WARN_ON(!list_empty(&process_info->userptr_inval_list)); - - dma_fence_put(&process_info->eviction_fence->base); - cancel_delayed_work_sync(&process_info->work); - put_pid(process_info->pid); - kfree(process_info); - } - - /* Release the VM context */ - amdgpu_vm_fini(adev, avm); - kfree(vm); -} - -uint32_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm) -{ - return get_vm_pd_gpu_offset(vm) >> AMDGPU_GPU_PAGE_SHIFT; -} - -int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd, - struct kfd_vm_fault_info *mem) -{ - struct amdgpu_device *adev; - - adev = (struct amdgpu_device *) kgd; - if (atomic_read(&adev->mc.vm_fault_info_updated) == 1) { - *mem = *adev->mc.vm_fault_info; - mb(); - atomic_set(&adev->mc.vm_fault_info_updated, 0); - } - return 0; -} - -static bool is_mem_on_local_device(struct kgd_dev *kgd, - struct list_head *bo_va_list, void *vm) -{ - struct kfd_bo_va_list *entry; - - list_for_each_entry(entry, bo_va_list, bo_list) { - if (entry->kgd_dev == kgd && entry->bo_va->base.vm == vm) - return true; - } - - return false; -} - -int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( - struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) -{ - struct kfd_bo_va_list *entry; - struct amdgpu_device *adev; - unsigned int mapped_before; - int ret = 0; - struct bo_vm_reservation_context ctx; - struct amdkfd_process_info *process_info; - unsigned long bo_size; - - adev = (struct amdgpu_device *) kgd; - process_info = ((struct amdkfd_vm *)vm)->process_info; - - bo_size = mem->bo->tbo.mem.size; - - mutex_lock(&mem->lock); - - /* - * Make sure that this BO mapped on KGD before unmappping it - */ - if (!is_mem_on_local_device(kgd, &mem->bo_va_list, vm)) { - ret = -EINVAL; - goto out; - } - - if (mem->mapped_to_gpu_memory == 0) { - pr_debug("BO VA 0x%llx size 0x%lx is not mapped to vm %p\n", - mem->va, bo_size, vm); - ret = -EINVAL; - goto out; - } - mapped_before = mem->mapped_to_gpu_memory; - - ret = reserve_bo_and_cond_vms(mem, vm, VA_MAPPED, &ctx); - if (unlikely(ret != 0)) - goto out; - - pr_debug("Unmap VA 0x%llx - 0x%llx from vm %p\n", - mem->va, - mem->va + bo_size * (1 + mem->aql_queue), - vm); - - list_for_each_entry(entry, &mem->bo_va_list, bo_list) { - if (entry->bo_va->base.vm == vm && entry->is_mapped) { - pr_debug("\t unmap VA 
0x%llx - 0x%llx from entry %p\n", - entry->va, - entry->va + bo_size, - entry); - - ret = unmap_bo_from_gpuvm(adev, entry, ctx.sync); - if (ret == 0) { - entry->is_mapped = false; - } else { - pr_err("failed to unmap VA 0x%llx\n", - mem->va); - goto unreserve_out; - } - - mem->mapped_to_gpu_memory--; - pr_debug("\t DEC mapping count %d\n", - mem->mapped_to_gpu_memory); - } - } - - /* If BO is unmapped from all VMs, unfence it. It can be evicted if - * required. - */ - if (mem->mapped_to_gpu_memory == 0 && - !amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm)) - amdgpu_amdkfd_remove_eviction_fence(mem->bo, - process_info->eviction_fence, - NULL, NULL); - - if (mapped_before == mem->mapped_to_gpu_memory) { - pr_debug("BO VA 0x%llx size 0x%lx is not mapped to vm %p\n", - mem->va, bo_size, vm); - ret = -EINVAL; - } - -unreserve_out: - unreserve_bo_and_vms(&ctx, false, false); -out: - mutex_unlock(&mem->lock); - return ret; -} - -int amdgpu_amdkfd_gpuvm_mmap_bo(struct kgd_dev *kgd, struct vm_area_struct *vma) -{ - struct amdgpu_device *adev; - - adev = get_amdgpu_device(kgd); - if (!adev) { - pr_err("Could not get amdgpu device in %s\n", __func__); - return -ENODEV; - } - - return amdgpu_bo_mmap(NULL, vma, &adev->mman.bdev); -} - -int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd, - struct kgd_mem *mem, void **kptr) -{ - int ret; - struct amdgpu_bo *bo = mem->bo; - - if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) { - pr_err("userptr can't be mapped to kernel\n"); - return -EINVAL; - } - - /* delete kgd_mem from kfd_bo_list to avoid re-validating - * this BO in BO's restoring after eviction. - */ - mutex_lock(&mem->process_info->lock); - - list_del_init(&mem->validate_list.head); - - ret = __map_bo_to_kernel(bo, AMDGPU_GEM_DOMAIN_GTT, kptr); - if (!ret) - mem->kptr = *kptr; - - mutex_unlock(&mem->process_info->lock); - - return ret; -} - -static int pin_bo_wo_map(struct kgd_mem *mem) -{ - struct amdgpu_bo *bo = mem->bo; - int ret = 0; - - ret = amdgpu_bo_reserve(bo, false); - if (unlikely(ret != 0)) - return ret; - - ret = amdgpu_bo_pin(bo, mem->domain, NULL); - amdgpu_bo_unreserve(bo); - - return ret; -} - -static void unpin_bo_wo_map(struct kgd_mem *mem) -{ - struct amdgpu_bo *bo = mem->bo; - int ret = 0; - - ret = amdgpu_bo_reserve(bo, false); - if (unlikely(ret != 0)) - return; - - amdgpu_bo_unpin(bo); - amdgpu_bo_unreserve(bo); -} - -#define AMD_GPU_PAGE_SHIFT PAGE_SHIFT -#define AMD_GPU_PAGE_SIZE (_AC(1, UL) << AMD_GPU_PAGE_SHIFT) - -static int get_sg_table(struct amdgpu_device *adev, - struct kgd_mem *mem, uint64_t offset, - uint64_t size, struct sg_table **ret_sg) -{ - struct amdgpu_bo *bo = mem->bo; - struct sg_table *sg = NULL; - unsigned long bus_addr; - unsigned int chunks; - unsigned int i; - struct scatterlist *s; - uint64_t offset_in_page; - unsigned int page_size; - int ret; - - sg = kmalloc(sizeof(*sg), GFP_KERNEL); - if (!sg) { - ret = -ENOMEM; - goto out; - } - - if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM) - page_size = AMD_GPU_PAGE_SIZE; - else - page_size = PAGE_SIZE; - - - offset_in_page = offset & (page_size - 1); - chunks = (size + offset_in_page + page_size - 1) - / page_size; - - ret = sg_alloc_table(sg, chunks, GFP_KERNEL); - if (unlikely(ret)) - goto out; - - if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM) { - bus_addr = bo->tbo.offset + adev->mc.aper_base + offset; - - for_each_sg(sg->sgl, s, sg->orig_nents, i) { - uint64_t chunk_size, length; - - chunk_size = page_size - offset_in_page; - length = min(size, chunk_size); - - sg_set_page(s, NULL, 
length, offset_in_page); - s->dma_address = bus_addr; - s->dma_length = length; - - size -= length; - offset_in_page = 0; - bus_addr += length; - } - } else { - struct page **pages; - unsigned int cur_page; - - pages = bo->tbo.ttm->pages; - - cur_page = offset / page_size; - for_each_sg(sg->sgl, s, sg->orig_nents, i) { - uint64_t chunk_size, length; - - chunk_size = page_size - offset_in_page; - length = min(size, chunk_size); - - sg_set_page(s, pages[cur_page], length, offset_in_page); - s->dma_address = page_to_phys(pages[cur_page]); - s->dma_length = length; - - size -= length; - offset_in_page = 0; - cur_page++; - } - } - - *ret_sg = sg; - return 0; -out: - kfree(sg); - *ret_sg = NULL; - return ret; -} - -int amdgpu_amdkfd_gpuvm_pin_get_sg_table(struct kgd_dev *kgd, - struct kgd_mem *mem, uint64_t offset, - uint64_t size, struct sg_table **ret_sg) -{ - int ret; - struct amdgpu_device *adev; - - ret = pin_bo_wo_map(mem); - if (unlikely(ret != 0)) - return ret; - - adev = get_amdgpu_device(kgd); - - ret = get_sg_table(adev, mem, offset, size, ret_sg); - if (ret) - unpin_bo_wo_map(mem); - - return ret; -} - -void amdgpu_amdkfd_gpuvm_unpin_put_sg_table( - struct kgd_mem *mem, struct sg_table *sg) -{ - sg_free_table(sg); - kfree(sg); - - unpin_bo_wo_map(mem); -} - -int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd, - struct dma_buf *dma_buf, - uint64_t va, void *vm, - struct kgd_mem **mem, uint64_t *size, - uint64_t *mmap_offset) -{ - struct amdgpu_device *adev = (struct amdgpu_device *)kgd; - struct drm_gem_object *obj; - struct amdgpu_bo *bo; - struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm; - - if (dma_buf->ops != &drm_gem_prime_dmabuf_ops) - /* Can't handle non-graphics buffers */ - return -EINVAL; - - obj = dma_buf->priv; - if (obj->dev->dev_private != adev) - /* Can't handle buffers from other devices */ - return -EINVAL; - - bo = gem_to_amdgpu_bo(obj); - if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | - AMDGPU_GEM_DOMAIN_GTT | - AMDGPU_GEM_DOMAIN_DGMA))) - /* Only VRAM and GTT BOs are supported */ - return -EINVAL; - - *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); - if (*mem == NULL) - return -ENOMEM; - - if (size) - *size = amdgpu_bo_size(bo); - - if (mmap_offset) - *mmap_offset = amdgpu_bo_mmap_offset(bo); - - INIT_LIST_HEAD(&(*mem)->bo_va_list); - mutex_init(&(*mem)->lock); - (*mem)->mapping_flags = - AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE | - AMDGPU_VM_PAGE_EXECUTABLE | AMDGPU_VM_MTYPE_NC; - - (*mem)->bo = amdgpu_bo_ref(bo); - (*mem)->va = va; - if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) - (*mem)->domain = AMDGPU_GEM_DOMAIN_VRAM; - else if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_GTT) - (*mem)->domain = AMDGPU_GEM_DOMAIN_GTT; - else - (*mem)->domain = AMDGPU_GEM_DOMAIN_DGMA; - (*mem)->mapped_to_gpu_memory = 0; - (*mem)->process_info = kfd_vm->process_info; - add_kgd_mem_to_kfd_bo_list(*mem, kfd_vm->process_info, false); - amdgpu_sync_create(&(*mem)->sync); - - return 0; -} - -int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_dev *kgd, void *vm, - struct kgd_mem *mem, - struct dma_buf **dmabuf) -{ - struct amdgpu_device *adev = NULL; - struct amdgpu_bo *bo = NULL; - struct drm_gem_object *gobj = NULL; - - if (!dmabuf || !kgd || !vm || !mem) - return -EINVAL; - - adev = get_amdgpu_device(kgd); - bo = mem->bo; - - gobj = amdgpu_gem_prime_foreign_bo(adev, bo); - if (gobj == NULL) { - pr_err("Export BO failed. 
Unable to find/create GEM object\n"); - return -EINVAL; - } - - *dmabuf = amdgpu_gem_prime_export(adev->ddev, gobj, 0); - return 0; -} - -static int process_validate_vms(struct amdkfd_process_info *process_info) -{ - struct amdkfd_vm *peer_vm; - int ret; - - list_for_each_entry(peer_vm, &process_info->vm_list_head, - vm_list_node) { - ret = vm_validate_pt_pd_bos(&peer_vm->base); - if (ret) - return ret; - } - - return 0; -} - -/* Evict a userptr BO by stopping the queues if necessary - * - * Runs in MMU notifier, may be in RECLAIM_FS context. This means it - * cannot do any memory allocations, and cannot take any locks that - * are held elsewhere while allocating memory. Therefore this is as - * simple as possible, using atomic counters. - * - * It doesn't do anything to the BO itself. The real work happens in - * restore, where we get updated page addresses. This function only - * ensures that GPU access to the BO is stopped. - */ -int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, - struct mm_struct *mm) -{ - struct amdkfd_process_info *process_info = mem->process_info; - int invalid, evicted_bos; - int r = 0; - - invalid = atomic_inc_return(&mem->invalid); - evicted_bos = atomic_inc_return(&process_info->evicted_bos); - if (evicted_bos == 1) { - /* First eviction, stop the queues */ - r = kgd2kfd->quiesce_mm(NULL, mm); - if (r != 0) - pr_err("Failed to quiesce KFD\n"); - schedule_delayed_work(&process_info->work, 1); - } - - return r; -} - -/* Update invalid userptr BOs - * - * Moves invalidated (evicted) userptr BOs from userptr_valid_list to - * userptr_inval_list and updates user pages for all BOs that have - * been invalidated since their last update. - */ -static int update_invalid_user_pages(struct amdkfd_process_info *process_info, - struct mm_struct *mm) -{ - struct kgd_mem *mem, *tmp_mem; - struct amdgpu_bo *bo; - int invalid, ret; - - /* Move all invalidated BOs to the userptr_inval_list and - * release their user pages by migration to the CPU domain - */ - list_for_each_entry_safe(mem, tmp_mem, - &process_info->userptr_valid_list, - validate_list.head) { - if (!atomic_read(&mem->invalid)) - continue; /* BO is still valid */ - - bo = mem->bo; - - if (amdgpu_bo_reserve(bo, true)) - return -EAGAIN; - amdgpu_ttm_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU); - ret = ttm_bo_validate(&bo->tbo, &bo->placement, false, false); - amdgpu_bo_unreserve(bo); - if (ret) { - pr_err("%s: Failed to invalidate userptr BO\n", - __func__); - return -EAGAIN; - } - - list_move_tail(&mem->validate_list.head, - &process_info->userptr_inval_list); - } - - if (list_empty(&process_info->userptr_inval_list)) - return 0; /* All evicted userptr BOs were freed */ - - /* Go through userptr_inval_list and update any invalid user_pages */ - list_for_each_entry(mem, &process_info->userptr_inval_list, - validate_list.head) { - invalid = atomic_read(&mem->invalid); - if (!invalid) - /* BO hasn't been invalidated since the last - * revalidation attempt. Keep its BO list. 
- */ - continue; - - bo = mem->bo; - - if (!mem->user_pages) { -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) - mem->user_pages = - drm_calloc_large(bo->tbo.ttm->num_pages, - sizeof(struct page *)); -#else - mem->user_pages = - kvmalloc_array(bo->tbo.ttm->num_pages, - sizeof(struct page *), - GFP_KERNEL | __GFP_ZERO); -#endif - if (!mem->user_pages) { - pr_err("%s: Failed to allocate pages array\n", - __func__); - return -ENOMEM; - } - } else if (mem->user_pages[0]) { - release_pages(mem->user_pages, - bo->tbo.ttm->num_pages, 0); - } - - /* Get updated user pages */ - ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, - mem->user_pages); - if (ret) { - mem->user_pages[0] = NULL; - pr_info("%s: Failed to get user pages: %d\n", - __func__, ret); - /* Pretend it succeeded. It will fail later - * with a VM fault if the GPU tries to access - * it. Better than hanging indefinitely with - * stalled user mode queues. - */ - } - - /* Mark the BO as valid unless it was invalidated - * again concurrently - */ - if (atomic_cmpxchg(&mem->invalid, invalid, 0) != invalid) - return -EAGAIN; - } - return 0; -} - -/* Validate invalid userptr BOs - * - * Validates BOs on the userptr_inval_list, and moves them back to the - * userptr_valid_list. Also updates GPUVM page tables with new page - * addresses and waits for the page table updates to complete. - */ -static int validate_invalid_user_pages(struct amdkfd_process_info *process_info) -{ - struct amdgpu_bo_list_entry *pd_bo_list_entries; - struct list_head resv_list, duplicates; - struct ww_acquire_ctx ticket; - struct amdgpu_sync sync; - - struct amdkfd_vm *peer_vm; - struct kgd_mem *mem, *tmp_mem; - struct amdgpu_bo *bo; - int i, ret; - - pd_bo_list_entries = kcalloc(process_info->n_vms, - sizeof(struct amdgpu_bo_list_entry), - GFP_KERNEL); - if (!pd_bo_list_entries) { - pr_err("%s: Failed to allocate PD BO list entries\n", __func__); - return -ENOMEM; - } - - INIT_LIST_HEAD(&resv_list); - INIT_LIST_HEAD(&duplicates); - - /* Get all the page directory BOs that need to be reserved */ - i = 0; - list_for_each_entry(peer_vm, &process_info->vm_list_head, - vm_list_node) - amdgpu_vm_get_pd_bo(&peer_vm->base, &resv_list, - &pd_bo_list_entries[i++]); - /* Add the userptr_inval_list entries to resv_list */ - list_for_each_entry(mem, &process_info->userptr_inval_list, - validate_list.head) { - list_add_tail(&mem->resv_list.head, &resv_list); - mem->resv_list.bo = mem->validate_list.bo; - mem->resv_list.shared = mem->validate_list.shared; - } - - /* Reserve all BOs and page tables for validation */ - ret = ttm_eu_reserve_buffers(&ticket, &resv_list, false, &duplicates); - WARN(!list_empty(&duplicates), "Duplicates should be empty"); - if (ret) - goto out; - - amdgpu_sync_create(&sync); - - /* Avoid triggering eviction fences when unmapping invalid - * userptr BOs (waits for all fences, doesn't use - * FENCE_OWNER_VM) - */ - list_for_each_entry(peer_vm, &process_info->vm_list_head, - vm_list_node) - amdgpu_amdkfd_remove_eviction_fence(peer_vm->base.root.base.bo, - process_info->eviction_fence, - NULL, NULL); - - ret = process_validate_vms(process_info); - if (ret) - goto unreserve_out; - - /* Validate BOs and update GPUVM page tables */ - list_for_each_entry_safe(mem, tmp_mem, - &process_info->userptr_inval_list, - validate_list.head) { - struct kfd_bo_va_list *bo_va_entry; - - bo = mem->bo; - - /* Copy pages array and validate the BO if we got user pages */ - if (mem->user_pages[0]) { - amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, - mem->user_pages); - 
amdgpu_ttm_placement_from_domain(bo, mem->domain); - ret = ttm_bo_validate(&bo->tbo, &bo->placement, - false, false); - if (ret) { - pr_err("%s: failed to validate BO\n", __func__); - goto unreserve_out; - } - } - - /* Validate succeeded, now the BO owns the pages, free - * our copy of the pointer array. Put this BO back on - * the userptr_valid_list. If we need to revalidate - * it, we need to start from scratch. - */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) - drm_free_large(mem->user_pages); -#else - kvfree(mem->user_pages); -#endif - mem->user_pages = NULL; - list_move_tail(&mem->validate_list.head, - &process_info->userptr_valid_list); - - /* Update mapping. If the BO was not validated - * (because we couldn't get user pages), this will - * clear the page table entries, which will result in - * VM faults if the GPU tries to access the invalid - * memory. - */ - list_for_each_entry(bo_va_entry, &mem->bo_va_list, bo_list) { - if (!bo_va_entry->is_mapped) - continue; - - ret = update_gpuvm_pte((struct amdgpu_device *) - bo_va_entry->kgd_dev, - bo_va_entry, &sync); - if (ret) { - pr_err("%s: update PTE failed\n", __func__); - /* make sure this gets validated again */ - atomic_inc(&mem->invalid); - goto unreserve_out; - } - } - } -unreserve_out: - list_for_each_entry(peer_vm, &process_info->vm_list_head, - vm_list_node) - amdgpu_bo_fence(peer_vm->base.root.base.bo, - &process_info->eviction_fence->base, true); - ttm_eu_backoff_reservation(&ticket, &resv_list); - amdgpu_sync_wait(&sync, false); - amdgpu_sync_free(&sync); -out: - kfree(pd_bo_list_entries); - - return ret; -} - -/* Worker callback to restore evicted userptr BOs - * - * Tries to update and validate all userptr BOs. If successful and no - * concurrent evictions happened, the queues are restarted. Otherwise, - * reschedule for another attempt later. - */ -static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) -{ - struct delayed_work *dwork = to_delayed_work(work); - struct amdkfd_process_info *process_info = - container_of(dwork, struct amdkfd_process_info, work); - struct task_struct *usertask; - struct mm_struct *mm; - int evicted_bos; - - evicted_bos = atomic_read(&process_info->evicted_bos); - if (!evicted_bos) - return; - - /* Reference task and mm in case of concurrent process termination */ - usertask = get_pid_task(process_info->pid, PIDTYPE_PID); - if (!usertask) - return; - mm = get_task_mm(usertask); - if (!mm) { - put_task_struct(usertask); - return; - } - - mutex_lock(&process_info->lock); - - if (update_invalid_user_pages(process_info, mm)) - goto unlock_out; - /* userptr_inval_list can be empty if all evicted userptr BOs - * have been freed. In that case there is nothing to validate - * and we can just restart the queues. - */ - if (!list_empty(&process_info->userptr_inval_list)) { - if (atomic_read(&process_info->evicted_bos) != evicted_bos) - goto unlock_out; /* Concurrent eviction, try again */ - - if (validate_invalid_user_pages(process_info)) - goto unlock_out; - } - /* Final check for concurrent evicton and atomic update. If - * another eviction happens after successful update, it will - * be a first eviction that calls quiesce_mm. The eviction - * reference counting inside KFD will handle this case. - */ - if (atomic_cmpxchg(&process_info->evicted_bos, evicted_bos, 0) != - evicted_bos) - goto unlock_out; - evicted_bos = 0; - if (kgd2kfd->resume_mm(NULL, mm)) { - pr_err("%s: Failed to resume KFD\n", __func__); - /* No recovery from this failure. Probably the CP is - * hanging. 
No point trying again. - */ - } -unlock_out: - mutex_unlock(&process_info->lock); - mmput(mm); - put_task_struct(usertask); - - /* If validation failed, reschedule another attempt */ - if (evicted_bos) - schedule_delayed_work(&process_info->work, 1); -} - -/** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given - * KFD process identified by process_info - * - * @process_info: amdkfd_process_info of the KFD process - * - * After memory eviction, restore thread calls this function. The function - * should be called when the Process is still valid. BO restore involves - - * - * 1. Release old eviction fence and create new one - * 2. Get two copies of PD BO list from all the VMs. Keep one copy as pd_list. - * 3 Use the second PD list and kfd_bo_list to create a list (ctx.list) of - * BOs that need to be reserved. - * 4. Reserve all the BOs - * 5. Validate of PD and PT BOs. - * 6. Validate all KFD BOs using kfd_bo_list and Map them and add new fence - * 7. Add fence to all PD and PT BOs. - * 8. Unreserve all BOs - */ - -int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef) -{ - struct amdgpu_bo_list_entry *pd_bo_list; - struct amdkfd_process_info *process_info = info; - struct amdkfd_vm *peer_vm; - struct kgd_mem *mem; - struct bo_vm_reservation_context ctx; - struct amdgpu_amdkfd_fence *new_fence; - int ret = 0, i; - struct list_head duplicate_save; - struct amdgpu_sync sync_obj; - - INIT_LIST_HEAD(&duplicate_save); - INIT_LIST_HEAD(&ctx.list); - INIT_LIST_HEAD(&ctx.duplicates); - - pd_bo_list = kcalloc(process_info->n_vms, - sizeof(struct amdgpu_bo_list_entry), - GFP_KERNEL); - if (pd_bo_list == NULL) - return -ENOMEM; - - i = 0; - mutex_lock(&process_info->lock); - list_for_each_entry(peer_vm, &process_info->vm_list_head, - vm_list_node) - amdgpu_vm_get_pd_bo(&peer_vm->base, &ctx.list, - &pd_bo_list[i++]); - - /* Reserve all BOs and page tables/directory. Add all BOs from - * kfd_bo_list to ctx.list - */ - list_for_each_entry(mem, &process_info->kfd_bo_list, - validate_list.head) { - - list_add_tail(&mem->resv_list.head, &ctx.list); - mem->resv_list.bo = mem->validate_list.bo; - mem->resv_list.shared = mem->validate_list.shared; - } - - ret = ttm_eu_reserve_buffers(&ctx.ticket, &ctx.list, - false, &duplicate_save); - if (ret) { - pr_debug("Memory eviction: TTM Reserve Failed. Try again\n"); - goto ttm_reserve_fail; - } - - amdgpu_sync_create(&sync_obj); - ctx.sync = &sync_obj; - - /* Validate PDs and PTs */ - ret = process_validate_vms(process_info); - if (ret) - goto validate_map_fail; - - /* Wait for PD/PTs validate to finish */ - /* FIXME: I think this isn't needed */ - list_for_each_entry(peer_vm, &process_info->vm_list_head, - vm_list_node) { - struct amdgpu_bo *bo = peer_vm->base.root.base.bo; - - ttm_bo_wait(&bo->tbo, false, false); - } - - /* Validate BOs and map them to GPUVM (update VM page tables). */ - list_for_each_entry(mem, &process_info->kfd_bo_list, - validate_list.head) { - - struct amdgpu_bo *bo = mem->bo; - uint32_t domain = mem->domain; - struct kfd_bo_va_list *bo_va_entry; - - ret = amdgpu_amdkfd_bo_validate(bo, domain, false); - if (ret) { - pr_debug("Memory eviction: Validate BOs failed. Try again\n"); - goto validate_map_fail; - } - - list_for_each_entry(bo_va_entry, &mem->bo_va_list, - bo_list) { - ret = update_gpuvm_pte((struct amdgpu_device *) - bo_va_entry->kgd_dev, - bo_va_entry, - ctx.sync); - if (ret) { - pr_debug("Memory eviction: update PTE failed. 
Try again\n"); - goto validate_map_fail; - } - } - } - - amdgpu_sync_wait(ctx.sync, false); - - /* Release old eviction fence and create new one, because fence only - * goes from unsignaled to signaled, fence cannot be reused. - * Use context and mm from the old fence. - */ - new_fence = amdgpu_amdkfd_fence_create( - process_info->eviction_fence->base.context, - process_info->eviction_fence->mm); - if (!new_fence) { - pr_err("Failed to create eviction fence\n"); - ret = -ENOMEM; - goto validate_map_fail; - } - dma_fence_put(&process_info->eviction_fence->base); - process_info->eviction_fence = new_fence; - *ef = dma_fence_get(&new_fence->base); - - /* Wait for validate to finish and attach new eviction fence */ - list_for_each_entry(mem, &process_info->kfd_bo_list, - validate_list.head) - ttm_bo_wait(&mem->bo->tbo, false, false); - list_for_each_entry(mem, &process_info->kfd_bo_list, - validate_list.head) - amdgpu_bo_fence(mem->bo, - &process_info->eviction_fence->base, true); - - /* Attach eviction fence to PD / PT BOs */ - list_for_each_entry(peer_vm, &process_info->vm_list_head, - vm_list_node) { - struct amdgpu_bo *bo = peer_vm->base.root.base.bo; - - amdgpu_bo_fence(bo, &process_info->eviction_fence->base, true); - } -validate_map_fail: - ttm_eu_backoff_reservation(&ctx.ticket, &ctx.list); - amdgpu_sync_free(&sync_obj); -ttm_reserve_fail: - mutex_unlock(&process_info->lock); -evict_fence_fail: - kfree(pd_bo_list); - return ret; -} - -int amdgpu_amdkfd_copy_mem_to_mem(struct kgd_dev *kgd, struct kgd_mem *src_mem, - uint64_t src_offset, struct kgd_mem *dst_mem, - uint64_t dst_offset, uint64_t size, - struct dma_fence **f, uint64_t *actual_size) -{ - struct amdgpu_device *adev = NULL; - struct ttm_mem_reg *src = NULL, *dst = NULL; - struct ttm_buffer_object *src_ttm_bo, *dst_ttm_bo; - struct drm_mm_node *src_mm, *dst_mm; - struct amdgpu_ring *ring; - struct ww_acquire_ctx ticket; - struct list_head list; - struct ttm_validate_buffer resv_list[2]; - uint64_t src_start, dst_start; - uint64_t src_left, dst_left, cur_copy_size, total_copy_size = 0; - struct dma_fence *fence = NULL; - int r; - - if (!kgd || !src_mem || !dst_mem) - return -EINVAL; - - if (actual_size) - *actual_size = 0; - - adev = get_amdgpu_device(kgd); - src_ttm_bo = &src_mem->bo->tbo; - dst_ttm_bo = &dst_mem->bo->tbo; - src = &src_ttm_bo->mem; - dst = &dst_ttm_bo->mem; - src_mm = (struct drm_mm_node *)src->mm_node; - dst_mm = (struct drm_mm_node *)dst->mm_node; - - ring = adev->mman.buffer_funcs_ring; - - INIT_LIST_HEAD(&list); - - resv_list[0].bo = src_ttm_bo; - resv_list[0].shared = true; - resv_list[1].bo = dst_ttm_bo; - resv_list[1].shared = true; - - list_add_tail(&resv_list[0].head, &list); - list_add_tail(&resv_list[1].head, &list); - - if (!ring->ready) { - pr_err("Trying to move memory with ring turned off.\n"); - return -EINVAL; - } - - r = ttm_eu_reserve_buffers(&ticket, &list, false, NULL); - if (r) { - pr_err("Copy buffer failed. Unable to reserve bo (%d)\n", r); - return r; - } - - switch (src->mem_type) { - case TTM_PL_TT: - r = amdgpu_ttm_bind(src_ttm_bo, src); - if (r) { - DRM_ERROR("Copy failed. Cannot bind to gart\n"); - goto copy_fail; - } - break; - case TTM_PL_VRAM: - /* VRAM could be scattered. 
Find the node in which the offset - * belongs to - */ - while (src_offset >= (src_mm->size << PAGE_SHIFT)) { - src_offset -= (src_mm->size << PAGE_SHIFT); - ++src_mm; - } - break; - default: - DRM_ERROR("Unknown placement %d\n", src->mem_type); - r = -EINVAL; - goto copy_fail; - } - src_start = src_mm->start << PAGE_SHIFT; - src_start += src_ttm_bo->bdev->man[src->mem_type].gpu_offset; - src_start += src_offset; - src_left = (src_mm->size << PAGE_SHIFT) - src_offset; - - switch (dst->mem_type) { - case TTM_PL_TT: - r = amdgpu_ttm_bind(dst_ttm_bo, dst); - if (r) { - DRM_ERROR("Copy failed. Cannot bind to gart\n"); - goto copy_fail; - } - break; - case TTM_PL_VRAM: - while (dst_offset >= (dst_mm->size << PAGE_SHIFT)) { - dst_offset -= (dst_mm->size << PAGE_SHIFT); - ++dst_mm; - } - break; - default: - DRM_ERROR("Unknown placement %d\n", dst->mem_type); - r = -EINVAL; - goto copy_fail; - } - dst_start = dst_mm->start << PAGE_SHIFT; - dst_start += dst_ttm_bo->bdev->man[dst->mem_type].gpu_offset; - dst_start += dst_offset; - dst_left = (dst_mm->size << PAGE_SHIFT) - dst_offset; - - do { - struct dma_fence *next; - - /* src_left/dst_left: amount of space left in the current node - * Copy minimum of (src_left, dst_left, amount of bytes left to - * copy) - */ - cur_copy_size = min3(src_left, dst_left, - (size - total_copy_size)); - - r = amdgpu_copy_buffer(ring, src_start, dst_start, - cur_copy_size, NULL, &next, false, false); - if (r) - break; - - /* Just keep the last fence */ - dma_fence_put(fence); - fence = next; - - total_copy_size += cur_copy_size; - /* Required amount of bytes copied. Done. */ - if (total_copy_size >= size) - break; - - /* If end of src or dst node is reached, move to next node */ - src_left -= cur_copy_size; - if (!src_left) { - ++src_mm; - src_start = src_mm->start << PAGE_SHIFT; - src_start += - src_ttm_bo->bdev->man[src->mem_type].gpu_offset; - src_left = src_mm->size << PAGE_SHIFT; - } else - src_start += cur_copy_size; - - dst_left -= cur_copy_size; - if (!dst_left) { - ++dst_mm; - dst_start = dst_mm->start << PAGE_SHIFT; - dst_start += - dst_ttm_bo->bdev->man[dst->mem_type].gpu_offset; - dst_left = dst_mm->size << PAGE_SHIFT; - } else - dst_start += cur_copy_size; - - } while (total_copy_size < size); - - /* Failure could occur after partial copy. 
So fill in amount copied - * and fence, still fill-in - */ - if (actual_size) - *actual_size = total_copy_size; - - if (fence) { - amdgpu_bo_fence(src_mem->bo, fence, true); - amdgpu_bo_fence(dst_mem->bo, fence, true); - } - - if (f) - *f = fence; - -copy_fail: - ttm_eu_backoff_reservation(&ticket, &list); - return r; -} - diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index ff6f90a..5ad0580 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -27,7 +27,9 @@ #include #include #include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) #include +#endif #include "amdgpu.h" #include "amdgpu_trace.h" @@ -38,7 +40,7 @@ static int amdgpu_cs_user_fence_chunk(struct amdgpu_cs_parser *p, struct drm_gem_object *gobj; unsigned long size; - gobj = drm_gem_object_lookup(p->filp, data->handle); + gobj = kcl_drm_gem_object_lookup(p->adev->ddev, p->filp, data->handle); if (gobj == NULL) return -EINVAL; @@ -54,7 +56,7 @@ static int amdgpu_cs_user_fence_chunk(struct amdgpu_cs_parser *p, *offset = data->offset; - drm_gem_object_put_unlocked(gobj); + kcl_drm_gem_object_put_unlocked(gobj); if (amdgpu_ttm_tt_get_usermm(p->uf_entry.robj->tbo.ttm)) { amdgpu_bo_unref(&p->uf_entry.robj); @@ -90,7 +92,7 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data) } /* get chunks */ - chunk_array_user = u64_to_user_ptr(cs->in.chunks); + chunk_array_user = kcl_u64_to_user_ptr(cs->in.chunks); if (copy_from_user(chunk_array, chunk_array_user, sizeof(uint64_t)*cs->in.num_chunks)) { ret = -EFAULT; @@ -110,7 +112,7 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data) struct drm_amdgpu_cs_chunk user_chunk; uint32_t __user *cdata; - chunk_ptr = u64_to_user_ptr(chunk_array[i]); + chunk_ptr = kcl_u64_to_user_ptr(chunk_array[i]); if (copy_from_user(&user_chunk, chunk_ptr, sizeof(struct drm_amdgpu_cs_chunk))) { ret = -EFAULT; @@ -121,9 +123,13 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data) p->chunks[i].length_dw = user_chunk.length_dw; size = p->chunks[i].length_dw; - cdata = u64_to_user_ptr(user_chunk.chunk_data); + cdata = kcl_u64_to_user_ptr(user_chunk.chunk_data); +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) + p->chunks[i].kdata = drm_malloc_ab(size, sizeof(uint32_t)); +#else p->chunks[i].kdata = kvmalloc_array(size, sizeof(uint32_t), GFP_KERNEL); +#endif if (p->chunks[i].kdata == NULL) { ret = -ENOMEM; i--; @@ -155,8 +161,10 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data) break; case AMDGPU_CHUNK_ID_DEPENDENCIES: +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) case AMDGPU_CHUNK_ID_SYNCOBJ_IN: case AMDGPU_CHUNK_ID_SYNCOBJ_OUT: +#endif break; default: @@ -178,7 +186,11 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data) i = p->nchunks - 1; free_partial_kdata: for (; i >= 0; i--) +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) + drm_free_large(p->chunks[i].kdata); +#else kvfree(p->chunks[i].kdata); +#endif kfree(p->chunks); p->chunks = NULL; p->nchunks = 0; @@ -477,16 +489,11 @@ static int amdgpu_cs_list_validate(struct amdgpu_cs_parser *p, return -EPERM; /* Check if we have user pages and nobody bound the BO already */ - if (amdgpu_ttm_tt_userptr_needs_pages(bo->tbo.ttm) && - lobj->user_pages) { - amdgpu_ttm_placement_from_domain(bo, - AMDGPU_GEM_DOMAIN_CPU); - r = ttm_bo_validate(&bo->tbo, &bo->placement, true, - false); - if (r) - return r; - amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, - 
lobj->user_pages); + if (lobj->user_pages && bo->tbo.ttm->state != tt_bound) { + size_t size = sizeof(struct page *); + + size *= bo->tbo.ttm->num_pages; + memcpy(bo->tbo.ttm->pages, lobj->user_pages, size); binding_userptr = true; } @@ -498,7 +505,11 @@ static int amdgpu_cs_list_validate(struct amdgpu_cs_parser *p, return r; if (binding_userptr) { +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) + drm_free_large(lobj->user_pages); +#else kvfree(lobj->user_pages); +#endif lobj->user_pages = NULL; } } @@ -511,6 +522,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, struct amdgpu_fpriv *fpriv = p->filp->driver_priv; struct amdgpu_bo_list_entry *e; struct list_head duplicates; + bool need_mmap_lock = false; unsigned i, tries = 10; int r; @@ -518,9 +530,9 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, p->bo_list = amdgpu_bo_list_get(fpriv, cs->in.bo_list_handle); if (p->bo_list) { + need_mmap_lock = p->bo_list->first_userptr != + p->bo_list->num_entries; amdgpu_bo_list_get_list(p->bo_list, &p->validated); - if (p->bo_list->first_userptr != p->bo_list->num_entries) - p->mn = amdgpu_mn_get(p->adev); } INIT_LIST_HEAD(&duplicates); @@ -529,6 +541,9 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, if (p->uf_entry.robj && !p->uf_entry.robj->parent) list_add(&p->uf_entry.tv.head, &p->validated); + if (need_mmap_lock) + down_read(&current->mm->mmap_sem); + while (1) { struct list_head need_pages; unsigned i; @@ -548,25 +563,27 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, INIT_LIST_HEAD(&need_pages); for (i = p->bo_list->first_userptr; i < p->bo_list->num_entries; ++i) { - struct amdgpu_bo *bo; e = &p->bo_list->array[i]; - bo = e->robj; - - if (amdgpu_ttm_tt_userptr_invalidated(bo->tbo.ttm, + + if (amdgpu_ttm_tt_userptr_invalidated(e->robj->tbo.ttm, &e->user_invalidated) && e->user_pages) { /* We acquired a page array, but somebody * invalidated it.
Free it and try again */ release_pages(e->user_pages, - bo->tbo.ttm->num_pages, + e->robj->tbo.ttm->num_pages, false); +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) + drm_free_large(e->user_pages); +#else kvfree(e->user_pages); +#endif e->user_pages = NULL; } - if (amdgpu_ttm_tt_userptr_needs_pages(bo->tbo.ttm) && + if (e->robj->tbo.ttm->state != tt_bound && !e->user_pages) { list_del(&e->tv.head); list_add(&e->tv.head, &need_pages); @@ -592,9 +609,14 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, list_for_each_entry(e, &need_pages, tv.head) { struct ttm_tt *ttm = e->robj->tbo.ttm; +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) + e->user_pages = drm_calloc_large(ttm->num_pages, + sizeof(struct page*)); +#else e->user_pages = kvmalloc_array(ttm->num_pages, sizeof(struct page*), GFP_KERNEL | __GFP_ZERO); +#endif if (!e->user_pages) { r = -ENOMEM; DRM_ERROR("calloc failure in %s\n", __func__); @@ -604,7 +626,11 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, r = amdgpu_ttm_tt_get_user_pages(ttm, e->user_pages); if (r) { DRM_ERROR("amdgpu_ttm_tt_get_user_pages failed.\n"); +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) + drm_free_large(e->user_pages); +#else kvfree(e->user_pages); +#endif e->user_pages = NULL; goto error_free_pages; } @@ -643,6 +669,9 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved, p->bytes_moved_vis); + fpriv->vm.last_eviction_counter = + atomic64_read(&p->adev->num_evictions); + if (p->bo_list) { struct amdgpu_bo *gds = p->bo_list->gds_obj; struct amdgpu_bo *gws = p->bo_list->gws_obj; @@ -683,6 +712,9 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, error_free_pages: + if (need_mmap_lock) + up_read(&current->mm->mmap_sem); + if (p->bo_list) { for (i = p->bo_list->first_userptr; i < p->bo_list->num_entries; ++i) { @@ -694,7 +726,11 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, release_pages(e->user_pages, e->robj->tbo.ttm->num_pages, false); +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) + drm_free_large(e->user_pages); +#else kvfree(e->user_pages); +#endif } } @@ -729,13 +765,19 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error, { unsigned i; - if (error && backoff) + if (!error) + ttm_eu_fence_buffer_objects(&parser->ticket, + &parser->validated, + parser->fence); + else if (backoff) ttm_eu_backoff_reservation(&parser->ticket, &parser->validated); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) for (i = 0; i < parser->num_post_dep_syncobjs; i++) drm_syncobj_put(parser->post_dep_syncobjs[i]); kfree(parser->post_dep_syncobjs); +#endif dma_fence_put(parser->fence); @@ -745,7 +787,11 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error, amdgpu_bo_list_put(parser->bo_list); for (i = 0; i < parser->nchunks; i++) +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) + drm_free_large(parser->chunks[i].kdata); +#else kvfree(parser->chunks[i].kdata); +#endif kfree(parser->chunks); if (parser->job) amdgpu_job_free(parser->job); @@ -765,6 +811,10 @@ static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p) if (r) return r; + r = amdgpu_sync_fence(adev, &p->job->sync, vm->last_dir_update); + if (r) + return r; + r = amdgpu_vm_clear_freed(adev, vm, NULL); if (r) return r; @@ -818,13 +868,7 @@ static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p) } - r = amdgpu_vm_handle_moved(adev, vm); - if (r) - return r; - - r = amdgpu_sync_fence(adev, &p->job->sync, vm->last_update); - if
(r) - return r; + r = amdgpu_vm_clear_moved(adev, vm, &p->job->sync); if (amdgpu_vm_debug && p->bo_list) { /* Invalidate all BOs to test for userspace bugs */ @@ -834,7 +878,7 @@ static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p) if (!bo) continue; - amdgpu_vm_bo_invalidate(adev, bo, false); + amdgpu_vm_bo_invalidate(adev, bo); } } @@ -859,7 +903,7 @@ static int amdgpu_cs_ib_vm_chunk(struct amdgpu_device *adev, } if (p->job->vm) { - p->job->vm_pd_addr = amdgpu_bo_gpu_offset(vm->root.base.bo); + p->job->vm_pd_addr = amdgpu_bo_gpu_offset(vm->root.bo); r = amdgpu_bo_vm_update_pte(p); if (r) @@ -927,11 +971,11 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev, uint64_t offset; uint8_t *kptr; - r = amdgpu_cs_find_mapping(parser, chunk_ib->va_start, - &aobj, &m); - if (r) { + m = amdgpu_cs_find_mapping(parser, chunk_ib->va_start, + &aobj); + if (!aobj) { DRM_ERROR("IB va_start is invalid\n"); - return r; + return -EINVAL; } if ((chunk_ib->va_start + chunk_ib->ib_bytes) > @@ -1029,12 +1073,13 @@ static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p, return 0; } +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) static int amdgpu_syncobj_lookup_and_add_to_sync(struct amdgpu_cs_parser *p, uint32_t handle) { int r; struct dma_fence *fence; - r = drm_syncobj_find_fence(p->filp, handle, &fence); + r = drm_syncobj_fence_get(p->filp, handle, &fence); if (r) return r; @@ -1089,6 +1134,7 @@ static int amdgpu_cs_process_syncobj_out_dep(struct amdgpu_cs_parser *p, } return 0; } +#endif static int amdgpu_cs_dependencies(struct amdgpu_device *adev, struct amdgpu_cs_parser *p) @@ -1104,6 +1150,7 @@ static int amdgpu_cs_dependencies(struct amdgpu_device *adev, r = amdgpu_cs_process_fence_dep(p, chunk); if (r) return r; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) } else if (chunk->chunk_id == AMDGPU_CHUNK_ID_SYNCOBJ_IN) { r = amdgpu_cs_process_syncobj_in_dep(p, chunk); if (r) @@ -1112,12 +1159,14 @@ static int amdgpu_cs_dependencies(struct amdgpu_device *adev, r = amdgpu_cs_process_syncobj_out_dep(p, chunk); if (r) return r; +#endif } } return amdgpu_sem_add_cs(p->ctx, p->job->ring, &p->job->dep_sync); } +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) static void amdgpu_cs_post_dependencies(struct amdgpu_cs_parser *p) { int i; @@ -1125,6 +1174,7 @@ static void amdgpu_cs_post_dependencies(struct amdgpu_cs_parser *p) for (i = 0; i < p->num_post_dep_syncobjs; ++i) drm_syncobj_replace_fence(p->post_dep_syncobjs[i], p->fence); } +#endif static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, union drm_amdgpu_cs *cs) @@ -1132,29 +1182,14 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, struct amdgpu_ring *ring = p->job->ring; struct amd_sched_entity *entity = &p->ctx->rings[ring->idx].entity; struct amdgpu_job *job; - unsigned i; int r; - amdgpu_mn_lock(p->mn); - if (p->bo_list) { - for (i = p->bo_list->first_userptr; - i < p->bo_list->num_entries; ++i) { - struct amdgpu_bo *bo = p->bo_list->array[i].robj; - - if (amdgpu_ttm_tt_userptr_needs_pages(bo->tbo.ttm)) { - amdgpu_mn_unlock(p->mn); - return -ERESTARTSYS; - } - } - } - job = p->job; p->job = NULL; r = amd_sched_job_init(&job->base, &ring->sched, entity, p->filp); if (r) { amdgpu_job_free(job); - amdgpu_mn_unlock(p->mn); return r; } @@ -1162,18 +1197,17 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, job->fence_ctx = entity->fence_context; p->fence = dma_fence_get(&job->base.s_fence->finished); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) amdgpu_cs_post_dependencies(p); +#endif 
cs->out.handle = amdgpu_ctx_add_fence(p->ctx, ring, p->fence); job->uf_sequence = cs->out.handle; amdgpu_job_free_resources(job); + amdgpu_cs_parser_fini(p, 0, true); trace_amdgpu_cs_ioctl(job); amd_sched_entity_push_job(&job->base); - - ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence); - amdgpu_mn_unlock(p->mn); - return 0; } @@ -1228,7 +1262,10 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) goto out; r = amdgpu_cs_submit(&parser, cs); + if (r) + goto out; + return 0; out: amdgpu_cs_parser_fini(&parser, r, reserved_buffers); return r; @@ -1274,7 +1311,7 @@ int amdgpu_cs_wait_ioctl(struct drm_device *dev, void *data, if (IS_ERR(fence)) r = PTR_ERR(fence); else if (fence) { - r = dma_fence_wait_timeout(fence, true, timeout); + r = kcl_fence_wait_timeout(fence, true, timeout); dma_fence_put(fence); } else r = 1; @@ -1349,7 +1386,7 @@ static int amdgpu_cs_wait_all_fences(struct amdgpu_device *adev, else if (!fence) continue; - r = dma_fence_wait_timeout(fence, true, timeout); + r = kcl_fence_wait_timeout(fence, true, timeout); dma_fence_put(fence); if (r < 0) return r; @@ -1401,13 +1438,12 @@ static int amdgpu_cs_wait_any_fence(struct amdgpu_device *adev, array[i] = fence; } else { /* NULL, the fence has been already signaled */ r = 1; - first = i; goto out; } } - r = dma_fence_wait_any_timeout(array, fence_count, true, timeout, - &first); + r = kcl_fence_wait_any_timeout(array, fence_count, true, timeout, + &first); if (r < 0) goto err_free_fence_array; @@ -1452,7 +1488,7 @@ int amdgpu_cs_wait_fences_ioctl(struct drm_device *dev, void *data, if (fences == NULL) return -ENOMEM; - fences_user = u64_to_user_ptr(wait->in.fences); + fences_user = kcl_u64_to_user_ptr(wait->in.fences); if (copy_from_user(fences, fences_user, sizeof(struct drm_amdgpu_fence) * fence_count)) { r = -EFAULT; @@ -1481,36 +1517,78 @@ int amdgpu_cs_wait_fences_ioctl(struct drm_device *dev, void *data, * virtual memory address. Returns allocation structure when found, NULL * otherwise. 
*/ -int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser, - uint64_t addr, struct amdgpu_bo **bo, - struct amdgpu_bo_va_mapping **map) +struct amdgpu_bo_va_mapping * +amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser, + uint64_t addr, struct amdgpu_bo **bo) { - struct amdgpu_fpriv *fpriv = parser->filp->driver_priv; - struct amdgpu_vm *vm = &fpriv->vm; struct amdgpu_bo_va_mapping *mapping; - int r; + unsigned i; + + if (!parser->bo_list) + return NULL; addr /= AMDGPU_GPU_PAGE_SIZE; - mapping = amdgpu_vm_bo_lookup_mapping(vm, addr); - if (!mapping || !mapping->bo_va || !mapping->bo_va->base.bo) - return -EINVAL; + for (i = 0; i < parser->bo_list->num_entries; i++) { + struct amdgpu_bo_list_entry *lobj; - *bo = mapping->bo_va->base.bo; - *map = mapping; + lobj = &parser->bo_list->array[i]; + if (!lobj->bo_va) + continue; - /* Double check that the BO is reserved by this CS */ - if (READ_ONCE((*bo)->tbo.resv->lock.ctx) != &parser->ticket) - return -EINVAL; + list_for_each_entry(mapping, &lobj->bo_va->valids, list) { + if (mapping->start > addr || + addr > mapping->last) + continue; - r = amdgpu_ttm_bind(&(*bo)->tbo, &(*bo)->tbo.mem); - if (unlikely(r)) - return r; + *bo = lobj->bo_va->base.bo; + return mapping; + } + + list_for_each_entry(mapping, &lobj->bo_va->invalids, list) { + if (mapping->start > addr || + addr > mapping->last) + continue; - if ((*bo)->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) + *bo = lobj->bo_va->base.bo; + return mapping; + } + } + + return NULL; +} + +/** + * amdgpu_cs_sysvm_access_required - make BOs accessible by the system VM + * + * @parser: command submission parser context + * + * Helper for UVD/VCE VM emulation, make sure BOs are accessible by the system VM. + */ +int amdgpu_cs_sysvm_access_required(struct amdgpu_cs_parser *parser) +{ + unsigned i; + int r; + + if (!parser->bo_list) return 0; - (*bo)->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; - amdgpu_ttm_placement_from_domain(*bo, (*bo)->allowed_domains); - return ttm_bo_validate(&(*bo)->tbo, &(*bo)->placement, false, false); + for (i = 0; i < parser->bo_list->num_entries; i++) { + struct amdgpu_bo *bo = parser->bo_list->array[i].robj; + + r = amdgpu_ttm_bind(&bo->tbo, &bo->tbo.mem); + if (unlikely(r)) + return r; + + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) + continue; + + bo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; + amdgpu_ttm_placement_from_domain(bo, bo->allowed_domains); + r = ttm_bo_validate(&bo->tbo, &bo->placement, false, false); + if (unlikely(r)) + return r; + } + + return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f032e87..37398e3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -404,15 +404,6 @@ void amdgpu_pci_config_reset(struct amdgpu_device *adev) */ static int amdgpu_doorbell_init(struct amdgpu_device *adev) { - /* No doorbell on SI hardware generation */ - if (adev->asic_type < CHIP_BONAIRE) { - adev->doorbell.base = 0; - adev->doorbell.size = 0; - adev->doorbell.num_doorbells = 0; - adev->doorbell.ptr = NULL; - return 0; - } - /* doorbell bar mapping */ adev->doorbell.base = pci_resource_start(adev->pdev, 2); adev->doorbell.size = pci_resource_len(adev->pdev, 2); @@ -2130,8 +2121,9 @@ int amdgpu_device_init(struct amdgpu_device *adev, DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); - /* doorbell bar mapping */ - amdgpu_doorbell_init(adev); + if 
(adev->asic_type >= CHIP_BONAIRE) + /* doorbell bar mapping */ + amdgpu_doorbell_init(adev); /* io port mapping */ for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { @@ -2348,7 +2340,9 @@ void amdgpu_device_fini(struct amdgpu_device *adev) amdgpu_atombios_fini(adev); kfree(adev->bios); adev->bios = NULL; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 12, 0) if (!pci_is_thunderbolt_attached(adev->pdev)) +#endif vga_switcheroo_unregister_client(adev->pdev); if (adev->flags & AMD_IS_PX) vga_switcheroo_fini_domain_pm_ops(adev->dev); @@ -2358,7 +2352,8 @@ void amdgpu_device_fini(struct amdgpu_device *adev) adev->rio_mem = NULL; iounmap(adev->rmmio); adev->rmmio = NULL; - amdgpu_doorbell_fini(adev); + if (adev->asic_type >= CHIP_BONAIRE) + amdgpu_doorbell_fini(adev); amdgpu_debugfs_regs_cleanup(adev); } @@ -3159,6 +3154,27 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev, return 0; } +#if defined(BUILD_AS_DKMS) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0) +void amdgpu_debugfs_cleanup(struct drm_minor *minor) +{ + struct drm_info_node *node, *tmp; + + if (!&minor->debugfs_root) + return 0; + + mutex_lock(&minor->debugfs_lock); + list_for_each_entry_safe(node, tmp, + &minor->debugfs_list, list) { + debugfs_remove(node->dent); + list_del(&node->list); + kfree(node); + } + mutex_unlock(&minor->debugfs_lock); + + return 0; +} +#endif + #if defined(CONFIG_DEBUG_FS) static ssize_t amdgpu_debugfs_regs_read(struct file *f, char __user *buf, @@ -3570,7 +3586,10 @@ static ssize_t amdgpu_debugfs_sensor_read(struct file *f, char __user *buf, valuesize = sizeof(values); if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->read_sensor) - r = amdgpu_dpm_read_sensor(adev, idx, &values[0], &valuesize); + r = adev->powerplay.pp_funcs->read_sensor(adev->powerplay.pp_handle, idx, &values[0], &valuesize); + else if (adev->pm.funcs && adev->pm.funcs->read_sensor) + r = adev->pm.funcs->read_sensor(adev, idx, &values[0], + &valuesize); else return -EINVAL; @@ -3594,7 +3613,7 @@ static ssize_t amdgpu_debugfs_sensor_read(struct file *f, char __user *buf, static ssize_t amdgpu_debugfs_wave_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; int r, x; ssize_t result=0; uint32_t offset, se, sh, cu, wave, simd, data[32]; @@ -3644,7 +3663,8 @@ static ssize_t amdgpu_debugfs_wave_read(struct file *f, char __user *buf, static ssize_t amdgpu_debugfs_gpr_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; + int r; ssize_t result = 0; uint32_t offset, se, sh, cu, wave, simd, thread, bank, *data; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h index 0d22259..12a4a78 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h @@ -427,6 +427,7 @@ struct amdgpu_pm { struct amdgpu_dpm dpm; const struct firmware *fw; /* SMC firmware */ uint32_t fw_version; + const struct amdgpu_dpm_funcs *funcs; uint32_t pcie_gen_mask; uint32_t pcie_mlw_mask; struct amd_pp_display_configuration pm_display_cfg;/* set by dc */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 2be2e05..0720358 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -69,10 +69,9 @@ * - 3.17.0 - Add AMDGPU_NUM_VRAM_CPU_PAGE_FAULTS. 
* - 3.18.0 - Export gpu always on cu bitmap * - 3.19.0 - Add support for UVD MJPEG decode - * - 3.20.0 - Add support for local BOs */ #define KMS_DRIVER_MAJOR 3 -#define KMS_DRIVER_MINOR 20 +#define KMS_DRIVER_MINOR 19 #define KMS_DRIVER_PATCHLEVEL 0 int amdgpu_vram_limit = 0; @@ -124,7 +123,6 @@ int amdgpu_cntl_sb_buf_per_se = 0; int amdgpu_param_buf_per_se = 0; int amdgpu_job_hang_limit = 0; int amdgpu_lbpw = -1; -int amdgpu_compute_multipipe = -1; MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes"); module_param_named(vramlimit, amdgpu_vram_limit, int, 0600); @@ -274,9 +272,6 @@ module_param_named(job_hang_limit, amdgpu_job_hang_limit, int ,0444); MODULE_PARM_DESC(lbpw, "Load Balancing Per Watt (LBPW) support (1 = enable, 0 = disable, -1 = auto)"); module_param_named(lbpw, amdgpu_lbpw, int, 0444); -MODULE_PARM_DESC(compute_multipipe, "Force compute queues to be spread across pipes (1 = enable, 0 = disable, -1 = auto)"); -module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444); - #ifdef CONFIG_DRM_AMDGPU_SI int amdgpu_si_support = 1; @@ -822,10 +817,8 @@ static struct drm_driver kms_driver = { .open = amdgpu_driver_open_kms, .postclose = amdgpu_driver_postclose_kms, .lastclose = amdgpu_driver_lastclose_kms, + .set_busid = drm_pci_set_busid, .unload = amdgpu_driver_unload_kms, - .get_vblank_counter = amdgpu_get_vblank_counter_kms, - .enable_vblank = amdgpu_enable_vblank_kms, - .disable_vblank = amdgpu_disable_vblank_kms, .get_vblank_timestamp = drm_calc_vbltimestamp_from_scanoutpos, .get_scanout_position = amdgpu_get_crtc_scanout_position, #if defined(CONFIG_DEBUG_FS) @@ -841,6 +834,7 @@ static struct drm_driver kms_driver = { .gem_close_object = amdgpu_gem_object_close, .dumb_create = amdgpu_mode_dumb_create, .dumb_map_offset = amdgpu_mode_dumb_mmap, + .dumb_destroy = drm_gem_dumb_destroy, .fops = &amdgpu_driver_kms_fops, .prime_handle_to_fd = drm_gem_prime_handle_to_fd, @@ -931,4 +925,3 @@ module_exit(amdgpu_exit); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL and additional rights"); -MODULE_VERSION("17.50.2.13"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index 3d08c6f..fdb9d85 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -44,12 +44,20 @@ * This is the main unload function for KMS (all asics). * Returns 0 on success. 
*/ +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0) +int amdgpu_driver_unload_kms(struct drm_device *dev) +#else void amdgpu_driver_unload_kms(struct drm_device *dev) +#endif { struct amdgpu_device *adev = dev->dev_private; if (adev == NULL) +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0) + return 0; +#else return; +#endif if (adev->rmmio == NULL) goto done_free; @@ -71,6 +79,9 @@ void amdgpu_driver_unload_kms(struct drm_device *dev) done_free: kfree(adev); dev->dev_private = NULL; +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0) + return 0; +#endif } /** @@ -129,8 +140,12 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags) amdgpu_has_atpx() && (amdgpu_is_atpx_hybrid() || amdgpu_has_atpx_dgpu_power_cntl()) && +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) + ((flags & AMD_IS_APU) == 0)) +#else ((flags & AMD_IS_APU) == 0) && !pci_is_thunderbolt_attached(dev->pdev)) +#endif flags |= AMD_IS_PX; /* amdgpu_device_init should report only fatal error @@ -1052,6 +1067,72 @@ void amdgpu_disable_vblank_kms(struct drm_device *dev, unsigned int pipe) amdgpu_irq_put(adev, &adev->crtc_irq, idx); } +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0) +/** + * amdgpu_get_vblank_timestamp_kms - get vblank timestamp + * + * @dev: drm dev pointer + * @crtc: crtc to get the timestamp for + * @max_error: max error + * @vblank_time: time value + * @flags: flags passed to the driver + * + * Gets the timestamp on the requested crtc based on the + * scanout position. (all asics). + * Returns postive status flags on success, negative error on failure. + */ +int amdgpu_get_vblank_timestamp_kms(struct drm_device *dev, unsigned int pipe, + int *max_error, + struct timeval *vblank_time, + unsigned flags) +{ + struct drm_crtc *crtc; + struct amdgpu_device *adev = dev->dev_private; + + if (pipe >= dev->num_crtcs) { + DRM_ERROR("Invalid crtc %u\n", pipe); + return -EINVAL; + } + + /* Get associated drm_crtc: */ + crtc = &adev->mode_info.crtcs[pipe]->base; + if (!crtc) { + /* This can occur on driver load if some component fails to + * initialize completely and driver is unloaded */ + DRM_ERROR("Uninitialized crtc %d\n", pipe); + return -EINVAL; + } + + /* Helper routine in DRM core does all the work: */ + return kcl_drm_calc_vbltimestamp_from_scanoutpos(dev, pipe, max_error, + vblank_time, flags, + crtc, &crtc->hwmode); +} +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) +const struct drm_ioctl_desc amdgpu_ioctls_kms[] = { + DRM_IOCTL_DEF_DRV(AMDGPU_GEM_CREATE, amdgpu_gem_create_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(AMDGPU_CTX, amdgpu_ctx_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(AMDGPU_BO_LIST, amdgpu_bo_list_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), + /* KMS */ + DRM_IOCTL_DEF_DRV(AMDGPU_GEM_MMAP, amdgpu_gem_mmap_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(AMDGPU_GEM_WAIT_IDLE, amdgpu_gem_wait_idle_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(AMDGPU_CS, amdgpu_cs_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(AMDGPU_INFO, amdgpu_info_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(AMDGPU_WAIT_CS, amdgpu_cs_wait_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(AMDGPU_WAIT_FENCES, amdgpu_cs_wait_fences_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(AMDGPU_GEM_METADATA, amdgpu_gem_metadata_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(AMDGPU_GEM_VA, amdgpu_gem_va_ioctl, 
DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(AMDGPU_GEM_OP, amdgpu_gem_op_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(AMDGPU_GEM_USERPTR, amdgpu_gem_userptr_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(AMDGPU_GEM_FIND_BO, amdgpu_gem_find_bo_by_cpu_mapping_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(AMDGPU_FREESYNC, amdgpu_freesync_ioctl, DRM_MASTER|DRM_UNLOCKED), + DRM_IOCTL_DEF_DRV(AMDGPU_GEM_FIND_BO, amdgpu_gem_find_bo_by_cpu_mapping_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(AMDGPU_GEM_DGMA, amdgpu_gem_dgma_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(AMDGPU_SEM, amdgpu_sem_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), +}; +#else const struct drm_ioctl_desc amdgpu_ioctls_kms[] = { DRM_IOCTL_DEF_DRV(AMDGPU_GEM_CREATE, amdgpu_gem_create_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(AMDGPU_CTX, amdgpu_ctx_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), @@ -1073,6 +1154,7 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = { DRM_IOCTL_DEF_DRV(AMDGPU_GEM_DGMA, amdgpu_gem_dgma_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(AMDGPU_SEM, amdgpu_sem_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), }; +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) */ const int amdgpu_max_kms_ioctl = ARRAY_SIZE(amdgpu_ioctls_kms); /* diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index d25ec38..430c622 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -50,10 +50,8 @@ struct amdgpu_mn { struct hlist_node node; /* objects protected by lock */ - struct rw_semaphore lock; - struct rb_root_cached objects; - struct mutex read_lock; - atomic_t recursion; + struct mutex lock; + struct rb_root objects; }; struct amdgpu_mn_node { @@ -76,17 +74,17 @@ static void amdgpu_mn_destroy(struct work_struct *work) struct amdgpu_bo *bo, *next_bo; mutex_lock(&adev->mn_lock); - down_write(&rmn->lock); + mutex_lock(&rmn->lock); hash_del(&rmn->node); - rbtree_postorder_for_each_entry_safe(node, next_node, - &rmn->objects.rb_root, it.rb) { + rbtree_postorder_for_each_entry_safe(node, next_node, &rmn->objects, + it.rb) { list_for_each_entry_safe(bo, next_bo, &node->bos, mn_list) { bo->mn = NULL; list_del_init(&bo->mn_list); } kfree(node); } - up_write(&rmn->lock); + mutex_unlock(&rmn->lock); mutex_unlock(&adev->mn_lock); mmu_notifier_unregister_no_release(&rmn->mn, rmn->mm); kfree(rmn); @@ -108,53 +106,6 @@ static void amdgpu_mn_release(struct mmu_notifier *mn, schedule_work(&rmn->work); } - -/** - * amdgpu_mn_lock - take the write side lock for this mn - */ -void amdgpu_mn_lock(struct amdgpu_mn *mn) -{ - if (mn) - down_write(&mn->lock); -} - -/** - * amdgpu_mn_unlock - drop the write side lock for this mn - */ -void amdgpu_mn_unlock(struct amdgpu_mn *mn) -{ - if (mn) - up_write(&mn->lock); -} - -/** - * amdgpu_mn_read_lock - take the rmn read lock - * - * @rmn: our notifier - * - * Take the rmn read side lock. - */ -static void amdgpu_mn_read_lock(struct amdgpu_mn *rmn) -{ - mutex_lock(&rmn->read_lock); - if (atomic_inc_return(&rmn->recursion) == 1) - down_read_non_owner(&rmn->lock); - mutex_unlock(&rmn->read_lock); -} - -/** - * amdgpu_mn_read_unlock - drop the rmn read lock - * - * @rmn: our notifier - * - * Drop the rmn read side lock. 
- */ -static void amdgpu_mn_read_unlock(struct amdgpu_mn *rmn) -{ - if (atomic_dec_return(&rmn->recursion) == 0) - up_read_non_owner(&rmn->lock); -} - /** * amdgpu_mn_invalidate_node - unmap all BOs of a node * @@ -175,12 +126,23 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node, if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, start, end)) continue; - r = reservation_object_wait_timeout_rcu(bo->tbo.resv, + r = amdgpu_bo_reserve(bo, true); + if (r) { + DRM_ERROR("(%ld) failed to reserve user bo\n", r); + continue; + } + + r = kcl_reservation_object_wait_timeout_rcu(bo->tbo.resv, true, false, MAX_SCHEDULE_TIMEOUT); if (r <= 0) DRM_ERROR("(%ld) failed to wait for user bo\n", r); - amdgpu_ttm_tt_mark_user_pages(bo->tbo.ttm); + amdgpu_ttm_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU); + r = ttm_bo_validate(&bo->tbo, &bo->placement, false, false); + if (r) + DRM_ERROR("(%ld) failed to validate user bo\n", r); + + amdgpu_bo_unreserve(bo); } } @@ -206,7 +168,7 @@ static void amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn, /* notification is exclusive, but interval is inclusive */ end -= 1; - amdgpu_mn_read_lock(rmn); + mutex_lock(&rmn->lock); it = interval_tree_iter_first(&rmn->objects, start, end); while (it) { @@ -218,33 +180,12 @@ static void amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn, amdgpu_mn_invalidate_node(node, start, end); } - up_read(&rmn->lock); -} - -/** - * amdgpu_mn_invalidate_range_end - callback to notify about mm change - * - * @mn: our notifier - * @mn: the mm this callback is about - * @start: start of updated range - * @end: end of updated range - * - * Release the lock again to allow new command submissions. - */ -static void amdgpu_mn_invalidate_range_end(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, - unsigned long end) -{ - struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn); - - amdgpu_mn_read_unlock(rmn); + mutex_unlock(&rmn->lock); } static const struct mmu_notifier_ops amdgpu_mn_ops = { .release = amdgpu_mn_release, .invalidate_range_start = amdgpu_mn_invalidate_range_start, - .invalidate_range_end = amdgpu_mn_invalidate_range_end, }; /** @@ -254,19 +195,30 @@ static const struct mmu_notifier_ops amdgpu_mn_ops = { * * Creates a notifier context for current->mm. 
*/ -struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) +static struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) { struct mm_struct *mm = current->mm; struct amdgpu_mn *rmn; int r; +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) + struct hlist_node *node; +#endif mutex_lock(&adev->mn_lock); +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 7, 0) + down_write(&mm->mmap_sem); +#else if (down_write_killable(&mm->mmap_sem)) { mutex_unlock(&adev->mn_lock); return ERR_PTR(-EINTR); } +#endif +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) + hash_for_each_possible(adev->mn_hash, rmn, node, node, (unsigned long)mm) +#else hash_for_each_possible(adev->mn_hash, rmn, node, (unsigned long)mm) +#endif if (rmn->mm == mm) goto release_locks; @@ -279,10 +231,8 @@ struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) rmn->adev = adev; rmn->mm = mm; rmn->mn.ops = &amdgpu_mn_ops; - init_rwsem(&rmn->lock); - rmn->objects = RB_ROOT_CACHED; - mutex_init(&rmn->read_lock); - atomic_set(&rmn->recursion, 0); + mutex_init(&rmn->lock); + rmn->objects = RB_ROOT; r = __mmu_notifier_register(&rmn->mn, mm); if (r) @@ -328,7 +278,7 @@ int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr) INIT_LIST_HEAD(&bos); - down_write(&rmn->lock); + mutex_lock(&rmn->lock); while ((it = interval_tree_iter_first(&rmn->objects, addr, end))) { kfree(node); @@ -340,9 +290,9 @@ int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr) } if (!node) { - node = kmalloc(sizeof(struct amdgpu_mn_node), GFP_NOIO); + node = kmalloc(sizeof(struct amdgpu_mn_node), GFP_KERNEL); if (!node) { - up_write(&rmn->lock); + mutex_unlock(&rmn->lock); return -ENOMEM; } } @@ -357,7 +307,7 @@ int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr) interval_tree_insert(&node->it, &rmn->objects); - up_write(&rmn->lock); + mutex_unlock(&rmn->lock); return 0; } @@ -383,7 +333,7 @@ void amdgpu_mn_unregister(struct amdgpu_bo *bo) return; } - down_write(&rmn->lock); + mutex_lock(&rmn->lock); /* save the next list entry for later */ head = bo->mn_list.next; @@ -398,7 +348,6 @@ void amdgpu_mn_unregister(struct amdgpu_bo *bo) kfree(node); } - up_write(&rmn->lock); + mutex_unlock(&rmn->lock); mutex_unlock(&adev->mn_lock); } - diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index f421505..fb6c3d6 100755 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -36,7 +36,6 @@ #include #include "amdgpu.h" #include "amdgpu_trace.h" -#include "amdgpu_amdkfd.h" static void amdgpu_ttm_bo_destroy(struct ttm_buffer_object *tbo) { @@ -47,9 +46,8 @@ static void amdgpu_ttm_bo_destroy(struct ttm_buffer_object *tbo) if (bo->tbo.mem.mem_type == AMDGPU_PL_DGMA_IMPORT) kfree(tbo->mem.bus.addr); - if (bo->kfd_bo) - amdgpu_amdkfd_unreserve_system_memory_limit(bo); amdgpu_bo_kunmap(bo); + drm_gem_object_release(&bo->gem_base); if (bo->gem_base.import_attach) drm_prime_gem_destroy(&bo->gem_base, bo->tbo.sg); @@ -70,12 +68,11 @@ bool amdgpu_ttm_bo_is_amdgpu_bo(struct ttm_buffer_object *bo) return false; } -void amdgpu_ttm_placement_from_domain(struct amdgpu_bo *abo, u32 domain) +static void amdgpu_ttm_placement_init(struct amdgpu_device *adev, + struct ttm_placement *placement, + struct ttm_place *places, + u32 domain, u64 flags) { - struct amdgpu_device *adev = amdgpu_ttm_adev(abo->tbo.bdev); - struct ttm_placement *placement = &abo->placement; - struct ttm_place *places = abo->placements; - u64 flags = abo->flags; u32 c = 0, i; if ((domain & 
AMDGPU_GEM_DOMAIN_DGMA) && amdgpu_direct_gma_size) { @@ -178,6 +175,27 @@ void amdgpu_ttm_placement_from_domain(struct amdgpu_bo *abo, u32 domain) placement->busy_placement = places; } +void amdgpu_ttm_placement_from_domain(struct amdgpu_bo *abo, u32 domain) +{ + struct amdgpu_device *adev = amdgpu_ttm_adev(abo->tbo.bdev); + + amdgpu_ttm_placement_init(adev, &abo->placement, abo->placements, + domain, abo->flags); +} + +static void amdgpu_fill_placement_to_bo(struct amdgpu_bo *bo, + struct ttm_placement *placement) +{ + BUG_ON(placement->num_placement > (AMDGPU_GEM_DOMAIN_MAX + 1)); + + memcpy(bo->placements, placement->placement, + placement->num_placement * sizeof(struct ttm_place)); + bo->placement.num_placement = placement->num_placement; + bo->placement.num_busy_placement = placement->num_busy_placement; + bo->placement.placement = bo->placements; + bo->placement.busy_placement = bo->placements; +} + /** * amdgpu_bo_create_reserved - create reserved BO for kernel use * @@ -309,13 +327,14 @@ void amdgpu_bo_free_kernel(struct amdgpu_bo **bo, u64 *gpu_addr, *cpu_addr = NULL; } -static int amdgpu_bo_do_create(struct amdgpu_device *adev, - unsigned long size, int byte_align, - bool kernel, u32 domain, u64 flags, - struct sg_table *sg, - struct reservation_object *resv, - uint64_t init_value, - struct amdgpu_bo **bo_ptr) +int amdgpu_bo_create_restricted(struct amdgpu_device *adev, + unsigned long size, int byte_align, + bool kernel, u32 domain, u64 flags, + struct sg_table *sg, + struct ttm_placement *placement, + struct reservation_object *resv, + uint64_t init_value, + struct amdgpu_bo **bo_ptr) { struct amdgpu_bo *bo; enum ttm_bo_type type; @@ -342,10 +361,13 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev, bo = kzalloc(sizeof(struct amdgpu_bo), GFP_KERNEL); if (bo == NULL) return -ENOMEM; - + r = drm_gem_object_init(adev->ddev, &bo->gem_base, size); + if (unlikely(r)) { + kfree(bo); + return r; + } INIT_LIST_HEAD(&bo->shadow_list); INIT_LIST_HEAD(&bo->va); - INIT_LIST_HEAD(&bo->gem_objects); bo->preferred_domains = domain & (AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT | AMDGPU_GEM_DOMAIN_CPU | @@ -388,17 +410,13 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev, bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC; #endif - bo->tbo.bdev = &adev->mman.bdev; - amdgpu_ttm_placement_from_domain(bo, domain); + amdgpu_fill_placement_to_bo(bo, placement); + /* Kernel allocation are uninterruptible */ initial_bytes_moved = atomic64_read(&adev->num_bytes_moved); - /* Kernel allocation are uninterruptible */ r = ttm_bo_init_reserved(&adev->mman.bdev, &bo->tbo, size, type, &bo->placement, page_align, !kernel, NULL, acc_size, sg, resv, &amdgpu_ttm_bo_destroy); - if (unlikely(r != 0)) - return r; - bytes_moved = atomic64_read(&adev->num_bytes_moved) - initial_bytes_moved; if (adev->mc.visible_vram_size < adev->mc.real_vram_size && @@ -408,6 +426,9 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev, else amdgpu_cs_report_moved_bytes(adev, bytes_moved, 0); + if (unlikely(r != 0)) + return r; + if (domain & AMDGPU_GEM_DOMAIN_DGMA && adev->ssg.enabled) bo->tbo.ssg_can_map = true; @@ -422,9 +443,13 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev, if (unlikely(r)) goto fail_unreserve; +#if defined(BUILD_AS_DKMS) + dma_fence_wait(fence, false); +#else amdgpu_bo_fence(bo, fence, false); dma_fence_put(bo->tbo.moving); bo->tbo.moving = dma_fence_get(fence); +#endif dma_fence_put(fence); } if (!resv) @@ -459,17 +484,27 @@ static int amdgpu_bo_create_shadow(struct 
amdgpu_device *adev, unsigned long size, int byte_align, struct amdgpu_bo *bo) { + struct ttm_placement placement = {0}; + struct ttm_place placements[AMDGPU_GEM_DOMAIN_MAX + 1]; int r; if (bo->shadow) return 0; - r = amdgpu_bo_do_create(adev, size, byte_align, true, - AMDGPU_GEM_DOMAIN_GTT, - AMDGPU_GEM_CREATE_CPU_GTT_USWC | - AMDGPU_GEM_CREATE_SHADOW, - NULL, bo->tbo.resv, 0, - &bo->shadow); + memset(&placements, 0, sizeof(placements)); + amdgpu_ttm_placement_init(adev, &placement, placements, + AMDGPU_GEM_DOMAIN_GTT, + AMDGPU_GEM_CREATE_CPU_GTT_USWC | + AMDGPU_GEM_CREATE_SHADOW); + + r = amdgpu_bo_create_restricted(adev, size, byte_align, true, + AMDGPU_GEM_DOMAIN_GTT, + AMDGPU_GEM_CREATE_CPU_GTT_USWC | + AMDGPU_GEM_CREATE_SHADOW, + NULL, &placement, + bo->tbo.resv, + 0, + &bo->shadow); if (!r) { bo->shadow->parent = amdgpu_bo_ref(bo); mutex_lock(&adev->shadow_list_lock); @@ -491,11 +526,18 @@ int amdgpu_bo_create(struct amdgpu_device *adev, uint64_t init_value, struct amdgpu_bo **bo_ptr) { + struct ttm_placement placement = {0}; + struct ttm_place placements[AMDGPU_GEM_DOMAIN_MAX + 1]; uint64_t parent_flags = flags & ~AMDGPU_GEM_CREATE_SHADOW; int r; - r = amdgpu_bo_do_create(adev, size, byte_align, kernel, domain, - parent_flags, sg, resv, init_value, bo_ptr); + memset(&placements, 0, sizeof(placements)); + amdgpu_ttm_placement_init(adev, &placement, placements, + domain, parent_flags); + + r = amdgpu_bo_create_restricted(adev, size, byte_align, kernel, domain, + parent_flags, sg, &placement, resv, + init_value, bo_ptr); if (r) return r; @@ -931,7 +973,7 @@ void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, return; abo = container_of(bo, struct amdgpu_bo, tbo); - amdgpu_vm_bo_invalidate(adev, abo, evict); + amdgpu_vm_bo_invalidate(adev, abo); amdgpu_bo_kunmap(abo); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h index f73dba5..024e5cb 100755 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h @@ -35,7 +35,6 @@ /* bo virtual addresses in a vm */ struct amdgpu_bo_va_mapping { - struct amdgpu_bo_va *bo_va; struct list_head list; struct rb_node rb; uint64_t start; @@ -50,17 +49,12 @@ struct amdgpu_bo_va { struct amdgpu_vm_bo_base base; /* protected by bo being reserved */ - unsigned ref_count; - - /* all other members protected by the VM PD being reserved */ struct dma_fence *last_pt_update; + unsigned ref_count; /* mappings for this bo_va */ struct list_head invalids; struct list_head valids; - - /* If the mappings are cleared or filled */ - bool cleared; }; struct amdgpu_bo { @@ -78,18 +72,16 @@ struct amdgpu_bo { void *metadata; u32 metadata_size; unsigned prime_shared_count; - /* GEM objects refereing to this BO */ - struct list_head gem_objects; - /* list of all virtual address to which this bo is associated to */ struct list_head va; /* Constant after initialization */ + struct drm_gem_object gem_base; struct amdgpu_bo *parent; struct amdgpu_bo *shadow; struct ttm_bo_kmap_obj dma_buf_vmap; struct amdgpu_mn *mn; - struct kgd_mem *kfd_bo; + struct kfd_process_device *pdd; union { struct list_head mn_list; @@ -207,6 +199,14 @@ int amdgpu_bo_create(struct amdgpu_device *adev, struct reservation_object *resv, uint64_t init_value, struct amdgpu_bo **bo_ptr); +int amdgpu_bo_create_restricted(struct amdgpu_device *adev, + unsigned long size, int byte_align, + bool kernel, u32 domain, u64 flags, + struct sg_table *sg, + struct ttm_placement *placement, + struct reservation_object *resv, + 
uint64_t init_value, + struct amdgpu_bo **bo_ptr); int amdgpu_bo_create_reserved(struct amdgpu_device *adev, unsigned long size, int align, u32 domain, struct amdgpu_bo **bo_ptr, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index 90adff8..06b824c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -136,8 +136,7 @@ void amdgpu_ring_commit(struct amdgpu_ring *ring) if (ring->funcs->end_use) ring->funcs->end_use(ring); - if (ring->funcs->type != AMDGPU_RING_TYPE_KIQ) - amdgpu_ring_lru_touch(ring->adev, ring); + amdgpu_ring_lru_touch(ring->adev, ring); } /** @@ -382,7 +381,7 @@ void amdgpu_ring_lru_touch(struct amdgpu_device *adev, struct amdgpu_ring *ring) static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_ring *ring = file_inode(f)->i_private; + struct amdgpu_ring *ring = (struct amdgpu_ring*)kcl_file_private(f); int r, i; uint32_t value, result, early[3]; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h index af8e544..322d2529 100755 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h @@ -36,7 +36,6 @@ /* some special values for the owner field */ #define AMDGPU_FENCE_OWNER_UNDEFINED ((void*)0ul) #define AMDGPU_FENCE_OWNER_VM ((void*)1ul) -#define AMDGPU_FENCE_OWNER_KFD ((void *)2ul) #define AMDGPU_FENCE_FLAG_64BIT (1 << 0) #define AMDGPU_FENCE_FLAG_INT (1 << 1) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c index 7ee8247..8492a26 100755 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c @@ -31,7 +31,6 @@ #include #include "amdgpu.h" #include "amdgpu_trace.h" -#include "amdgpu_amdkfd.h" struct amdgpu_sync_entry { struct hlist_node node; @@ -85,20 +84,11 @@ static bool amdgpu_sync_same_dev(struct amdgpu_device *adev, */ static void *amdgpu_sync_get_owner(struct dma_fence *f) { - struct amd_sched_fence *s_fence; - struct amdgpu_amdkfd_fence *kfd_fence; - - if (f == NULL) - return AMDGPU_FENCE_OWNER_UNDEFINED; + struct amd_sched_fence *s_fence = to_amd_sched_fence(f); - s_fence = to_amd_sched_fence(f); if (s_fence) return s_fence->owner; - kfd_fence = to_amdgpu_amdkfd_fence(f); - if (kfd_fence) - return AMDGPU_FENCE_OWNER_KFD; - return AMDGPU_FENCE_OWNER_UNDEFINED; } @@ -180,9 +170,7 @@ int amdgpu_sync_fence(struct amdgpu_device *adev, struct amdgpu_sync *sync, * @sync: sync object to add fences from reservation object to * @resv: reservation object with embedded fence * @shared: true if we should only sync to the exclusive fence - * - * Sync to the fence except if it is KFD eviction fence and owner is - * AMDGPU_FENCE_OWNER_VM. + * Sync to the fence */ int amdgpu_sync_resv(struct amdgpu_device *adev, struct amdgpu_sync *sync, @@ -209,15 +197,12 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, for (i = 0; i < flist->shared_count; ++i) { f = rcu_dereference_protected(flist->shared[i], reservation_object_held(resv)); - fence_owner = amdgpu_sync_get_owner(f); - if (fence_owner == AMDGPU_FENCE_OWNER_KFD && - owner != AMDGPU_FENCE_OWNER_UNDEFINED) - continue; if (amdgpu_sync_same_dev(adev, f)) { /* VM updates are only interesting * for other VM updates and moves. 
*/ + fence_owner = amdgpu_sync_get_owner(f); if ((owner != AMDGPU_FENCE_OWNER_UNDEFINED) && (fence_owner != AMDGPU_FENCE_OWNER_UNDEFINED) && ((owner == AMDGPU_FENCE_OWNER_VM) != diff --git a/drivers/gpu/drm/amd/amdgpu/vid.h b/drivers/gpu/drm/amd/amdgpu/vid.h index d09592a..a648525 100755 --- a/drivers/gpu/drm/amd/amdgpu/vid.h +++ b/drivers/gpu/drm/amd/amdgpu/vid.h @@ -27,8 +27,6 @@ #define SDMA1_REGISTER_OFFSET 0x200 /* not a register */ #define SDMA_MAX_INSTANCE 2 -#define KFD_VI_SDMA_QUEUE_OFFSET 0x80 /* not a register */ - /* crtc instance offsets */ #define CRTC0_REGISTER_OFFSET (0x1b9c - 0x1b9c) #define CRTC1_REGISTER_OFFSET (0x1d9c - 0x1b9c) @@ -369,10 +367,6 @@ * x=0: tmz_begin * x=1: tmz_end */ -#define PACKET3_INVALIDATE_TLBS 0x98 -# define PACKET3_INVALIDATE_TLBS_DST_SEL(x) ((x) << 0) -# define PACKET3_INVALIDATE_TLBS_PASID(x) ((x) << 5) - #define PACKET3_SET_RESOURCES 0xA0 /* 1. header * 2. CONTROL diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig index 95be0dd..e13c67c 100644 --- a/drivers/gpu/drm/amd/amdkfd/Kconfig +++ b/drivers/gpu/drm/amd/amdkfd/Kconfig @@ -4,7 +4,6 @@ config HSA_AMD tristate "HSA kernel driver for AMD GPU devices" - depends on (DRM_RADEON || DRM_AMDGPU) && (X86_64 || PPC64 || ARM64) - select DRM_AMDGPU_USERPTR + depends on (DRM_RADEON || DRM_AMDGPU) && AMD_IOMMU_V2 && X86_64 help Enable this if you want to use HSA features on AMD GPU devices. diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile old mode 100755 new mode 100644 index dba08ec..b400d56 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -1,28 +1,19 @@ -# SPDX-License-Identifier: GPL-2.0 # # Makefile for Heterogenous System Architecture support for AMD GPU devices # -FULL_AMD_PATH=$(src)/.. 
- -ccflags-y := -I$(FULL_AMD_PATH)/include/ \ - -I$(FULL_AMD_PATH)/include/asic_reg +ccflags-y := -Idrivers/gpu/drm/amd/include/ \ + -Idrivers/gpu/drm/amd/include/asic_reg amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \ kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \ kfd_process.o kfd_queue.o kfd_mqd_manager.o \ kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \ - kfd_mqd_manager_v9.o \ kfd_kernel_queue.o kfd_kernel_queue_cik.o \ - kfd_kernel_queue_vi.o kfd_kernel_queue_v9.o \ - kfd_packet_manager.o kfd_process_queue_manager.o \ - kfd_device_queue_manager.o kfd_device_queue_manager_cik.o \ - kfd_device_queue_manager_vi.o kfd_device_queue_manager_v9.o \ - kfd_interrupt.o kfd_events.o cik_event_interrupt.o kfd_int_process_v9.o \ - kfd_dbgdev.o kfd_dbgmgr.o kfd_flat_memory.o kfd_crat.o kfd_rdma.o \ - kfd_peerdirect.o kfd_ipc.o - -amdkfd-$(CONFIG_DEBUG_FS) += kfd_debugfs.o + kfd_kernel_queue_vi.o kfd_packet_manager.o \ + kfd_process_queue_manager.o kfd_device_queue_manager.o \ + kfd_device_queue_manager_cik.o kfd_device_queue_manager_vi.o \ + kfd_interrupt.o kfd_events.o cik_event_interrupt.o \ + kfd_dbgdev.o kfd_dbgmgr.o obj-$(CONFIG_HSA_AMD) += amdkfd.o - diff --git a/drivers/gpu/drm/amd/amdkfd/backport/Makefile b/drivers/gpu/drm/amd/amdkfd/backport/Makefile deleted file mode 100644 index 6a3845e..0000000 --- a/drivers/gpu/drm/amd/amdkfd/backport/Makefile +++ /dev/null @@ -1,7 +0,0 @@ - - -LINUXINCLUDE := $(DKMS_INCLUDE_PREFIX) $(LINUXINCLUDE) - -ccflags-y += \ - -I$(AMDKFD_FULL_PATH) \ - -include backport/backport.h diff --git a/drivers/gpu/drm/amd/amdkfd/backport/backport.h b/drivers/gpu/drm/amd/amdkfd/backport/backport.h deleted file mode 100644 index e1f8c1d..0000000 --- a/drivers/gpu/drm/amd/amdkfd/backport/backport.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef AMDKFD_BACKPORT_H -#define AMDKFD_BACKPORT_H - -#include - -#endif diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 00536a1..211fc48 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -24,89 +24,40 @@ #include "kfd_events.h" #include "cik_int.h" -static bool is_cpc_vm_fault(struct kfd_dev *dev, - const uint32_t *ih_ring_entry) -{ - const struct cik_ih_ring_entry *ihre = - (const struct cik_ih_ring_entry *)ih_ring_entry; - - if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || - ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) && - ihre->vmid >= dev->vm_info.first_vmid_kfd && - ihre->vmid <= dev->vm_info.last_vmid_kfd) - return true; - return false; -} - static bool cik_event_interrupt_isr(struct kfd_dev *dev, - const uint32_t *ih_ring_entry, - uint32_t *patched_ihre, - bool *patched_flag) + const uint32_t *ih_ring_entry) { + unsigned int pasid; const struct cik_ih_ring_entry *ihre = (const struct cik_ih_ring_entry *)ih_ring_entry; - const struct kfd2kgd_calls *f2g = dev->kfd2kgd; - struct cik_ih_ring_entry *tmp_ihre = - (struct cik_ih_ring_entry *) patched_ihre; - /* This workaround is due to HW/FW limitation on Hawaii that - * VMID and PASID are not written into ih_ring_entry - */ - if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || - ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) && - dev->device_info->asic_family == CHIP_HAWAII) { - *patched_flag = true; - *tmp_ihre = *ihre; + pasid = (ihre->ring_id & 0xffff0000) >> 16; - tmp_ihre->vmid = f2g->read_vmid_from_vmfault_reg(dev->kgd); - tmp_ihre->pasid = f2g->get_atc_vmid_pasid_mapping_pasid( - dev->kgd, tmp_ihre->vmid); 
- return (tmp_ihre->pasid != 0) && - tmp_ihre->vmid >= dev->vm_info.first_vmid_kfd && - tmp_ihre->vmid <= dev->vm_info.last_vmid_kfd; - } /* Do not process in ISR, just request it to be forwarded to WQ. */ - return (ihre->pasid != 0) && + return (pasid != 0) && (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || - ihre->source_id == CIK_INTSRC_SDMA_TRAP || ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG || - ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE || - is_cpc_vm_fault(dev, ih_ring_entry)); + ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE); } static void cik_event_interrupt_wq(struct kfd_dev *dev, const uint32_t *ih_ring_entry) { + unsigned int pasid; const struct cik_ih_ring_entry *ihre = (const struct cik_ih_ring_entry *)ih_ring_entry; - if (ihre->pasid == 0) + pasid = (ihre->ring_id & 0xffff0000) >> 16; + + if (pasid == 0) return; if (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE) - kfd_signal_event_interrupt(ihre->pasid, 0, 0); - else if (ihre->source_id == CIK_INTSRC_SDMA_TRAP) - kfd_signal_event_interrupt(ihre->pasid, 0, 0); + kfd_signal_event_interrupt(pasid, 0, 0); else if (ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG) - kfd_signal_event_interrupt(ihre->pasid, ihre->data & 0xFF, 8); + kfd_signal_event_interrupt(pasid, ihre->data & 0xFF, 8); else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE) - kfd_signal_hw_exception_event(ihre->pasid); - else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || - ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { - struct kfd_vm_fault_info info; - - kfd_process_vm_fault(dev->dqm, ihre->pasid); - - memset(&info, 0, sizeof(info)); - dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info); - if (!info.page_addr && !info.status) - return; - - if (info.vmid == ihre->vmid) - kfd_signal_vm_fault_event(dev, ihre->pasid, &info); - else - kfd_signal_vm_fault_event(dev, ihre->pasid, NULL); - } + kfd_signal_hw_exception_event(pasid); } const struct kfd_event_interrupt_class event_interrupt_class_cik = { diff --git a/drivers/gpu/drm/amd/amdkfd/cik_int.h b/drivers/gpu/drm/amd/amdkfd/cik_int.h index ff8255d..79a16d2 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_int.h +++ b/drivers/gpu/drm/amd/amdkfd/cik_int.h @@ -26,32 +26,16 @@ #include struct cik_ih_ring_entry { - uint32_t source_id:8; - uint32_t reserved1:8; - uint32_t reserved2:16; - - uint32_t data:28; - uint32_t reserved3:4; - - /* pipeid, meid and unused3 are officially called RINGID, - * but for our purposes, they always decode into pipe and ME. 
- */ - uint32_t pipeid:2; - uint32_t meid:2; - uint32_t reserved4:4; - uint32_t vmid:8; - uint32_t pasid:16; - - uint32_t reserved5; + uint32_t source_id; + uint32_t data; + uint32_t ring_id; + uint32_t reserved; }; #define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6 #define CIK_INTSRC_CP_END_OF_PIPE 0xB5 #define CIK_INTSRC_CP_BAD_OPCODE 0xB7 #define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF -#define CIK_INTSRC_GFX_PAGE_INV_FAULT 0x92 -#define CIK_INTSRC_GFX_MEM_PROT_FAULT 0x93 -#define CIK_INTSRC_SDMA_TRAP 0xE0 #endif diff --git a/drivers/gpu/drm/amd/amdkfd/cik_regs.h b/drivers/gpu/drm/amd/amdkfd/cik_regs.h index 37ce6dd..48769d1 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_regs.h +++ b/drivers/gpu/drm/amd/amdkfd/cik_regs.h @@ -33,8 +33,7 @@ #define APE1_MTYPE(x) ((x) << 7) /* valid for both DEFAULT_MTYPE and APE1_MTYPE */ -#define MTYPE_CACHED_NV 0 -#define MTYPE_CACHED 1 +#define MTYPE_CACHED 0 #define MTYPE_NONCACHED 3 #define DEFAULT_CP_HQD_PERSISTENT_STATE (0x33U << 8) diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h deleted file mode 100644 index d5d1331..0000000 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h +++ /dev/null @@ -1,1384 +0,0 @@ -/* - * Copyright 2015 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#if 0 -HW (VI) source code for CWSR trap handler -#Version 18 + multiple trap handler - -// this performance-optimal version was originally from Seven Xu at SRDC - -// Revison #18 --... -/* Rev History -** #1. Branch from gc dv. //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) -** #4. SR Memory Layout: -** 1. VGPR-SGPR-HWREG-{LDS} -** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern.. -** #5. Update: 1. Accurate g8sr_ts_save_d timestamp -** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation) -** #7. Update: 1. don't barrier if noLDS -** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version -** 2. Fix SQ issue by s_sleep 2 -** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last -** 2. optimize s_buffer save by burst 16sgprs... -** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs. -** #11. Update 1. 
Add 2 more timestamp for debug version -** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance -** #13. Integ 1. Always use MUBUF for PV trap shader... -** #14. Update 1. s_buffer_store soft clause... -** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot. -** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree -** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part] -** 2. PERF - Save LDS before save VGPR to cover LDS save long latency... -** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32 -** 2. FUNC - Handle non-CWSR traps -*/ - -var G8SR_WDMEM_HWREG_OFFSET = 0 -var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes - -// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore. - -var G8SR_DEBUG_TIMESTAMP = 0 -var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset -var s_g8sr_ts_save_s = s[34:35] // save start -var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi -var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ -var s_g8sr_ts_save_d = s[40:41] // save end -var s_g8sr_ts_restore_s = s[42:43] // restore start -var s_g8sr_ts_restore_d = s[44:45] // restore end - -var G8SR_VGPR_SR_IN_DWX4 = 0 -var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes -var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 - - -/*************************************************************************/ -/* control on how to run the shader */ -/*************************************************************************/ -//any hack that needs to be made to run this code in EMU (either becasue various EMU code are not ready or no compute save & restore in EMU run) -var EMU_RUN_HACK = 0 -var EMU_RUN_HACK_RESTORE_NORMAL = 0 -var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 -var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 -var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK -var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK -var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK -var SAVE_LDS = 1 -var WG_BASE_ADDR_LO = 0x9000a000 -var WG_BASE_ADDR_HI = 0x0 -var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem -var CTX_SAVE_CONTROL = 0x0 -var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL -var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either becasue various RTL code are not ready or no compute save & restore in RTL run) -var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write -var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //becasue TC EMU curently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes -var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing - -/**************************************************************************/ -/* variables */ -/**************************************************************************/ -var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 -var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 -var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 - -var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 -var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 -var 
SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 -var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 -var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 -var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits - -var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 -var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask -var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 -var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 -var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 -var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF -var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 -var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 -var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 -var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 -var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 - -var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME -var SQ_WAVE_IB_STS_RCNT_SIZE = 4 //FIXME -var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME -var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1 //FIXME -var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME - -var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 -var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 - - -/* Save */ -var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes -var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE - -var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit -var S_SAVE_SPI_INIT_ATC_SHIFT = 27 -var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype -var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 -var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG -var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 - -var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used -var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME -var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME -var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME - -var s_save_spi_init_lo = exec_lo -var s_save_spi_init_hi = exec_hi - - //tba_lo and tba_hi need to be saved/restored -var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3¡¯h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} -var s_save_pc_hi = ttmp1 -var s_save_exec_lo = ttmp2 -var s_save_exec_hi = ttmp3 -var s_save_status = ttmp4 -var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine -var s_save_xnack_mask_lo = ttmp6 -var s_save_xnack_mask_hi = ttmp7 -var s_save_buf_rsrc0 = ttmp8 -var s_save_buf_rsrc1 = ttmp9 -var s_save_buf_rsrc2 = ttmp10 -var s_save_buf_rsrc3 = ttmp11 - -var s_save_mem_offset = tma_lo -var s_save_alloc_size = s_save_trapsts //conflict -var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time) -var s_save_m0 = tma_hi - -/* Restore */ -var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE -var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC - -var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit -var S_RESTORE_SPI_INIT_ATC_SHIFT = 27 -var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype -var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28 -var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG -var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 - -var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT -var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK -var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT -var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK - -var 
s_restore_spi_init_lo = exec_lo -var s_restore_spi_init_hi = exec_hi - -var s_restore_mem_offset = ttmp2 -var s_restore_alloc_size = ttmp3 -var s_restore_tmp = ttmp6 //tba_lo/hi need to be restored -var s_restore_mem_offset_save = s_restore_tmp //no conflict - -var s_restore_m0 = s_restore_alloc_size //no conflict - -var s_restore_mode = ttmp7 - -var s_restore_pc_lo = ttmp0 -var s_restore_pc_hi = ttmp1 -var s_restore_exec_lo = tma_lo //no conflict -var s_restore_exec_hi = tma_hi //no conflict -var s_restore_status = ttmp4 -var s_restore_trapsts = ttmp5 -var s_restore_xnack_mask_lo = xnack_mask_lo -var s_restore_xnack_mask_hi = xnack_mask_hi -var s_restore_buf_rsrc0 = ttmp8 -var s_restore_buf_rsrc1 = ttmp9 -var s_restore_buf_rsrc2 = ttmp10 -var s_restore_buf_rsrc3 = ttmp11 - -/**************************************************************************/ -/* trap handler entry points */ -/**************************************************************************/ -/* Shader Main*/ - -shader main - asic(VI) - type(CS) - - - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore - //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC - s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC - s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. - s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE - //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE - s_branch L_SKIP_RESTORE //NOT restore, SAVE actually - else - s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save - end - -L_JUMP_TO_RESTORE: - s_branch L_RESTORE //restore - -L_SKIP_RESTORE: - - s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC - s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) - s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save - s_cbranch_scc1 L_SAVE //this is the operation for save - - // ********* Handle non-CWSR traps ******************* -if (!EMU_RUN_HACK) - /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */ - s_load_dwordx4 [ttmp8,ttmp9,ttmp10, ttmp11], [tma_lo,tma_hi], 0 - s_waitcnt lgkmcnt(0) - s_or_b32 ttmp7, ttmp8, ttmp9 - s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set - s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) - s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler - -L_NO_NEXT_TRAP: - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) - s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception - s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. 
- s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 - s_addc_u32 ttmp1, ttmp1, 0 -L_EXCP_CASE: - s_and_b32 ttmp1, ttmp1, 0xFFFF - s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) - s_rfe_b64 [ttmp0, ttmp1] -end - // ********* End handling of non-CWSR traps ******************* - -/**************************************************************************/ -/* save routine */ -/**************************************************************************/ - -L_SAVE: - -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_save_s - s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? -end - - //check whether there is mem_viol - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) - s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK - s_cbranch_scc0 L_NO_PC_REWIND - - //if so, need rewind PC assuming GDS operation gets NACKed - s_mov_b32 s_save_tmp, 0 //clear mem_viol bit - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit - s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] - s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8 - s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc - -L_NO_PC_REWIND: - s_mov_b32 s_save_tmp, 0 //clear saveCtx bit - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit - - s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK - s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //save XNACK must before any memory operation - s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT - s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT - s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp - s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY - s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT - s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp - s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS - s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG - - s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp - - /* inform SPI the readiness and wait for SPI's go signal */ - s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI - s_mov_b32 s_save_exec_hi, exec_hi - s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive - -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_sq_save_msg - s_waitcnt lgkmcnt(0) -end - - if (EMU_RUN_HACK) - - else - s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC - end - - L_SLEEP: - s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 - - if (EMU_RUN_HACK) - - else - s_cbranch_execz L_SLEEP - end - -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_spi_wrexec - s_waitcnt lgkmcnt(0) -end - - /* setup Resource Contants */ - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) - //calculate wd_addr using absolute thread id - v_readlane_b32 s_save_tmp, v9, 0 - s_lshr_b32 s_save_tmp, s_save_tmp, 6 - s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE - s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO - s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI - s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL - else - end - if ((EMU_RUN_HACK) && 
(EMU_RUN_HACK_SAVE_SINGLE_WAVE)) - s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO - s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI - s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL - else - end - - - s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo - s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE - s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited - s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC - s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK - s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position - s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC - s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK - s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position - s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE - - //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?) - s_mov_b32 s_save_m0, m0 //save M0 - - /* global mem offset */ - s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 - - - - - /* save HW registers */ - ////////////////////////////// - - L_SAVE_HWREG: - // HWREG SR memory offset : size(VGPR)+size(SGPR) - get_vgpr_size_bytes(s_save_mem_offset) - get_sgpr_size_bytes(s_save_tmp) - s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp - - - s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
- else - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - - write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0 - - if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) - s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 - s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over - s_mov_b32 tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO - s_mov_b32 tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI - end - - write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC - write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) - write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC - write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) - write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS - - //s_save_trapsts conflicts with s_save_alloc_size - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) - write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS - - write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO - write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI - - //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 - s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE - write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) - write_hwreg_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset) //TBA_LO - write_hwreg_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset) //TBA_HI - - - - /* the first wave in the threadgroup */ - // save fist_wave bits in tba_hi unused bit.26 - s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract fisrt wave bit - //s_or_b32 tba_hi, s_save_tmp, tba_hi // save first wave bit to tba_hi.bits[26] - s_mov_b32 s_save_exec_hi, 0x0 - s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26] - - - /* save SGPRs */ - // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... - ////////////////////////////// - - // SGPR SR memory offset : size(VGPR) - get_vgpr_size_bytes(s_save_mem_offset) - // TODO, change RSRC word to rearrange memory layout for SGPRS - - s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size - s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 - s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) - - if (SGPR_SAVE_USE_SQC) - s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes - else - s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) - end - - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
- else - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - - // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 - //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0 - s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0 - s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset - s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0 - - s_mov_b32 m0, 0x0 //SGPR initial index value =0 - L_SAVE_SGPR_LOOP: - // SGPR is allocated in 16 SGPR granularity - s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] - s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] - s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] - s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] - s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] - s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] - s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] - s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] - - write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4 - s_add_u32 m0, m0, 16 //next sgpr index - s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete? - // restore s_save_buf_rsrc0,1 - //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo - s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo - - - - - /* save first 4 VGPR, then LDS save could use */ - // each wave will alloc 4 vgprs at least... - ///////////////////////////////////////////////////////////////////////////////////// - - s_mov_b32 s_save_mem_offset, 0 - s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on - s_mov_b32 exec_hi, 0xFFFFFFFF - - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - - // VGPR Allocated in 4-GPR granularity - -if G8SR_VGPR_SR_IN_DWX4 - // the const stride for DWx4 is 4*4 bytes - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes - - buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes -else - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 -end - - - - /* save LDS */ - ////////////////////////////// - - L_SAVE_LDS: - - // Change EXEC to all threads... - s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on - s_mov_b32 exec_hi, 0xFFFFFFFF - - s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size - s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? - s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE - - s_barrier //LDS is used? 
wait for other waves in the same TG - //s_and_b32 s_save_tmp, tba_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here - s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here - s_cbranch_scc0 L_SAVE_LDS_DONE - - // first wave do LDS save; - - s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw - s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes - s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes - - // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) - // - get_vgpr_size_bytes(s_save_mem_offset) - get_sgpr_size_bytes(s_save_tmp) - s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp - s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() - - - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - s_mov_b32 m0, 0x0 //lds_offset initial value = 0 - - -var LDS_DMA_ENABLE = 0 -var UNROLL = 0 -if UNROLL==0 && LDS_DMA_ENABLE==1 - s_mov_b32 s3, 256*2 - s_nop 0 - s_nop 0 - s_nop 0 - L_SAVE_LDS_LOOP: - //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.??? - if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW - end - - s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes - s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes - s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete? - -elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL , has ichace miss - // store from higest LDS address to lowest - s_mov_b32 s3, 256*2 - s_sub_u32 m0, s_save_alloc_size, s3 - s_add_u32 s_save_mem_offset, s_save_mem_offset, m0 - s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128 trunks... - s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from higheset addr to lowest - s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction - s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2is the below 2 inst...//s_addc and s_setpc - s_nop 0 - s_nop 0 - s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes - s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved - s_add_u32 s0, s0,s_save_alloc_size - s_addc_u32 s1, s1, 0 - s_setpc_b64 s[0:1] - - - for var i =0; i< 128; i++ - // be careful to make here a 64Byte aligned address, which could improve performance... - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW - - if i!=127 - s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e. 
pack more LDS_DMA inst to one Cacheline - s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3 - end - end - -else // BUFFER_STORE - v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 - v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid - v_mul_i32_i24 v2, v3, 8 // tid*8 - v_mov_b32 v3, 256*2 - s_mov_b32 m0, 0x10000 - s_mov_b32 s0, s_save_buf_rsrc3 - s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid - s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT - -L_SAVE_LDS_LOOP_VECTOR: - ds_read_b64 v[0:1], v2 //x =LDS[a], byte address - s_waitcnt lgkmcnt(0) - buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1 -// s_waitcnt vmcnt(0) - v_add_u32 v2, vcc[0:1], v2, v3 - v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size - s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR - - // restore rsrc3 - s_mov_b32 s_save_buf_rsrc3, s0 - -end - -L_SAVE_LDS_DONE: - - - /* save VGPRs - set the Rest VGPRs */ - ////////////////////////////////////////////////////////////////////////////////////// - L_SAVE_VGPR: - // VGPR SR memory offset: 0 - // TODO rearrange the RSRC words to use swizzle for VGPR save... - - s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs - s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on - s_mov_b32 exec_hi, 0xFFFFFFFF - - s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size - s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 - s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible - s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - - // VGPR Allocated in 4-GPR granularity - -if G8SR_VGPR_SR_IN_DWX4 - // the const stride for DWx4 is 4*4 bytes - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes - - s_mov_b32 m0, 4 // skip first 4 VGPRs - s_cmp_lt_u32 m0, s_save_alloc_size - s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs - - s_set_gpr_idx_on m0, 0x1 // This will change M0 - s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 -L_SAVE_VGPR_LOOP: - v_mov_b32 v0, v0 // v0 = v[0+m0] - v_mov_b32 v1, v1 - v_mov_b32 v2, v2 - v_mov_b32 v3, v3 - - - buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - s_add_u32 m0, m0, 4 - s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 - s_cmp_lt_u32 m0, s_save_alloc_size - s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? 
- s_set_gpr_idx_off -L_SAVE_VGPR_LOOP_END: - - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes -else - // VGPR store using dw burst - s_mov_b32 m0, 0x4 //VGPR initial index value =0 - s_cmp_lt_u32 m0, s_save_alloc_size - s_cbranch_scc0 L_SAVE_VGPR_END - - - s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 - s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later - - L_SAVE_VGPR_LOOP: - v_mov_b32 v0, v0 //v0 = v[0+m0] - v_mov_b32 v1, v1 //v0 = v[0+m0] - v_mov_b32 v2, v2 //v0 = v[0+m0] - v_mov_b32 v3, v3 //v0 = v[0+m0] - - if(USE_MTBUF_INSTEAD_OF_MUBUF) - tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 - else - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 - end - - s_add_u32 m0, m0, 4 //next vgpr index - s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes - s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? - s_set_gpr_idx_off -end - -L_SAVE_VGPR_END: - - - - - - - /* S_PGM_END_SAVED */ //FIXME graphics ONLY - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) - s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] - s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 - s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over - s_rfe_b64 s_save_pc_lo //Return to the main shader program - else - end - -// Save Done timestamp -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_save_d - // SGPR SR memory offset : size(VGPR) - get_vgpr_size_bytes(s_save_mem_offset) - s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET - s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? - // Need reset rsrc2?? - s_mov_b32 m0, s_save_mem_offset - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1 -end - - - s_branch L_END_PGM - - - -/**************************************************************************/ -/* restore routine */ -/**************************************************************************/ - -L_RESTORE: - /* Setup Resource Contants */ - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) - //calculate wd_addr using absolute thread id - v_readlane_b32 s_restore_tmp, v9, 0 - s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 - s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE - s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO - s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI - s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL - else - end - -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_restore_s - s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? - // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case... - s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0] - s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored.. 
-end - - - - s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo - s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi - s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE - s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) - s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC - s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK - s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position - s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC - s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK - s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position - s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE - - /* global mem offset */ -// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0 - - /* the first wave in the threadgroup */ - s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK - s_cbranch_scc0 L_RESTORE_VGPR - - /* restore LDS */ - ////////////////////////////// - L_RESTORE_LDS: - - s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead - s_mov_b32 exec_hi, 0xFFFFFFFF - - s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size - s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? - s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR - s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw - s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes - s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes - - // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) - // - get_vgpr_size_bytes(s_restore_mem_offset) - get_sgpr_size_bytes(s_restore_tmp) - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? - - - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else - s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - s_mov_b32 m0, 0x0 //lds_offset initial value = 0 - - L_RESTORE_LDS_LOOP: - if (SAVE_LDS) - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW - end - s_add_u32 m0, m0, 256*2 // 128 DW - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW - s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? 
- - - /* restore VGPRs */ - ////////////////////////////// - L_RESTORE_VGPR: - // VGPR SR memory offset : 0 - s_mov_b32 s_restore_mem_offset, 0x0 - s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead - s_mov_b32 exec_hi, 0xFFFFFFFF - - s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size - s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 - s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) - s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else - s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - -if G8SR_VGPR_SR_IN_DWX4 - get_vgpr_size_bytes(s_restore_mem_offset) - s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 - - // the const stride for DWx4 is 4*4 bytes - s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes - - s_mov_b32 m0, s_restore_alloc_size - s_set_gpr_idx_on m0, 0x8 // Note.. This will change m0 - -L_RESTORE_VGPR_LOOP: - buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 - s_waitcnt vmcnt(0) - s_sub_u32 m0, m0, 4 - v_mov_b32 v0, v0 // v[0+m0] = v0 - v_mov_b32 v1, v1 - v_mov_b32 v2, v2 - v_mov_b32 v3, v3 - s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 - s_cmp_eq_u32 m0, 0x8000 - s_cbranch_scc0 L_RESTORE_VGPR_LOOP - s_set_gpr_idx_off - - s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes - -else - // VGPR load using dw burst - s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 - s_mov_b32 m0, 4 //VGPR initial index value = 1 - s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 - s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later - - L_RESTORE_VGPR_LOOP: - if(USE_MTBUF_INSTEAD_OF_MUBUF) - tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 - else - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 - end - s_waitcnt vmcnt(0) //ensure data ready - v_mov_b32 v0, v0 //v[0+m0] = v0 - v_mov_b32 v1, v1 - v_mov_b32 v2, v2 - v_mov_b32 v3, v3 - s_add_u32 m0, m0, 4 //next vgpr index - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes - s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? 
- s_set_gpr_idx_off - /* VGPR restore on v0 */ - if(USE_MTBUF_INSTEAD_OF_MUBUF) - tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 - else - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 - end - -end - - /* restore SGPRs */ - ////////////////////////////// - - // SGPR SR memory offset : size(VGPR) - get_vgpr_size_bytes(s_restore_mem_offset) - get_sgpr_size_bytes(s_restore_tmp) - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp - s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group - // TODO, change RSRC word to rearrange memory layout for SGPRS - - s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size - s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 - s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) - - if (SGPR_SAVE_USE_SQC) - s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes - else - s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) - end - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else - s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - /* If 112 SGPRs ar allocated, 4 sgprs are not used TBA(108,109),TMA(110,111), - However, we are safe to restore these 4 SGPRs anyway, since TBA,TMA will later be restored by HWREG - */ - s_mov_b32 m0, s_restore_alloc_size - - L_RESTORE_SGPR_LOOP: - read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made - s_waitcnt lgkmcnt(0) //ensure data ready - - s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] - - s_movreld_b64 s0, s0 //s[0+m0] = s0 - s_movreld_b64 s2, s2 - s_movreld_b64 s4, s4 - s_movreld_b64 s6, s6 - s_movreld_b64 s8, s8 - s_movreld_b64 s10, s10 - s_movreld_b64 s12, s12 - s_movreld_b64 s14, s14 - - s_cmp_eq_u32 m0, 0 //scc = (m0 < s_restore_alloc_size) ? 1 : 0 - s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete? - - /* restore HW registers */ - ////////////////////////////// - L_RESTORE_HWREG: - - -if G8SR_DEBUG_TIMESTAMP - s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo - s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi -end - - // HWREG SR memory offset : size(VGPR)+size(SGPR) - get_vgpr_size_bytes(s_restore_mem_offset) - get_sgpr_size_bytes(s_restore_tmp) - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp - - - s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
- else - s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 - read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC - read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) - read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC - read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) - read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS - read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS - read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO - read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI - read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE - read_hwreg_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_LO - read_hwreg_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_HI - - s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS - - s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS - - //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) - s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) - s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over - end - if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) - s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal - s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over - end - - s_mov_b32 m0, s_restore_m0 - s_mov_b32 exec_lo, s_restore_exec_lo - s_mov_b32 exec_hi, s_restore_exec_hi - - s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 - s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts - s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 - //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore - s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode - //reuse s_restore_m0 as a temp register - s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK - s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT - s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT - s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero - s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 - s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK - s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT - s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT - s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 - s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK - s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT - s_setreg_b32 hwreg(HW_REG_IB_STS), 
s_restore_tmp - - s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 - s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 - s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu - - s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time - -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_restore_d - s_waitcnt lgkmcnt(0) -end - -// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution - s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc - - -/**************************************************************************/ -/* the END */ -/**************************************************************************/ -L_END_PGM: - s_endpgm - -end - - -/**************************************************************************/ -/* the helper functions */ -/**************************************************************************/ - -//Only for save hwreg to mem -function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) - s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on - s_mov_b32 m0, s_mem_offset - s_buffer_store_dword s, s_rsrc, m0 glc:1 - s_add_u32 s_mem_offset, s_mem_offset, 4 - s_mov_b32 m0, exec_lo -end - - -// HWREG are saved before SGPRs, so all HWREG could be use. -function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) - - s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 - s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 - s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 - s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 - s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 - s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc -end - - -function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) - s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 - s_add_u32 s_mem_offset, s_mem_offset, 4 -end - -function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) - s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 - s_sub_u32 s_mem_offset, s_mem_offset, 4*16 -end - - - -function get_lds_size_bytes(s_lds_size_byte) - // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW - s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size - s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW -end - -function get_vgpr_size_bytes(s_vgpr_size_byte) - s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size - s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 - s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible -end - -function get_sgpr_size_bytes(s_sgpr_size_byte) - s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size - s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1 - s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value) -end - -function get_hwreg_size_bytes - return 128 //HWREG size 128 bytes -end - - -#endif - -static const uint32_t cwsr_trap_carrizo_hex[] = { - 0xbf820001, 0xbf820123, - 0xb8f4f802, 0x89748674, - 0xb8f5f803, 0x8675ff75, - 0x00000400, 0xbf850011, - 0xc00a1e37, 
0x00000000, - 0xbf8c007f, 0x87777978, - 0xbf840002, 0xb974f802, - 0xbe801d78, 0xb8f5f803, - 0x8675ff75, 0x000001ff, - 0xbf850002, 0x80708470, - 0x82718071, 0x8671ff71, - 0x0000ffff, 0xb974f802, - 0xbe801f70, 0xb8f5f803, - 0x8675ff75, 0x00000100, - 0xbf840006, 0xbefa0080, - 0xb97a0203, 0x8671ff71, - 0x0000ffff, 0x80f08870, - 0x82f18071, 0xbefa0080, - 0xb97a0283, 0xbef60068, - 0xbef70069, 0xb8fa1c07, - 0x8e7a9c7a, 0x87717a71, - 0xb8fa03c7, 0x8e7a9b7a, - 0x87717a71, 0xb8faf807, - 0x867aff7a, 0x00007fff, - 0xb97af807, 0xbef2007e, - 0xbef3007f, 0xbefe0180, - 0xbf900004, 0xbf8e0002, - 0xbf88fffe, 0xbef8007e, - 0x8679ff7f, 0x0000ffff, - 0x8779ff79, 0x00040000, - 0xbefa0080, 0xbefb00ff, - 0x00807fac, 0x867aff7f, - 0x08000000, 0x8f7a837a, - 0x877b7a7b, 0x867aff7f, - 0x70000000, 0x8f7a817a, - 0x877b7a7b, 0xbeef007c, - 0xbeee0080, 0xb8ee2a05, - 0x806e816e, 0x8e6e8a6e, - 0xb8fa1605, 0x807a817a, - 0x8e7a867a, 0x806e7a6e, - 0xbefa0084, 0xbefa00ff, - 0x01000000, 0xbefe007c, - 0xbefc006e, 0xc0611bfc, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611c3c, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611c7c, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611cbc, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611cfc, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611d3c, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xb8f5f803, - 0xbefe007c, 0xbefc006e, - 0xc0611d7c, 0x0000007c, - 0x806e846e, 0xbefc007e, - 0xbefe007c, 0xbefc006e, - 0xc0611dbc, 0x0000007c, - 0x806e846e, 0xbefc007e, - 0xbefe007c, 0xbefc006e, - 0xc0611dfc, 0x0000007c, - 0x806e846e, 0xbefc007e, - 0xb8eff801, 0xbefe007c, - 0xbefc006e, 0xc0611bfc, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611b3c, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611b7c, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0x867aff7f, - 0x04000000, 0xbef30080, - 0x8773737a, 0xb8ee2a05, - 0x806e816e, 0x8e6e8a6e, - 0xb8f51605, 0x80758175, - 0x8e758475, 0x8e7a8275, - 0xbefa00ff, 0x01000000, - 0xbef60178, 0x80786e78, - 0x82798079, 0xbefc0080, - 0xbe802b00, 0xbe822b02, - 0xbe842b04, 0xbe862b06, - 0xbe882b08, 0xbe8a2b0a, - 0xbe8c2b0c, 0xbe8e2b0e, - 0xc06b003c, 0x00000000, - 0xc06b013c, 0x00000010, - 0xc06b023c, 0x00000020, - 0xc06b033c, 0x00000030, - 0x8078c078, 0x82798079, - 0x807c907c, 0xbf0a757c, - 0xbf85ffeb, 0xbef80176, - 0xbeee0080, 0xbefe00c1, - 0xbeff00c1, 0xbefa00ff, - 0x01000000, 0xe0724000, - 0x6e1e0000, 0xe0724100, - 0x6e1e0100, 0xe0724200, - 0x6e1e0200, 0xe0724300, - 0x6e1e0300, 0xbefe00c1, - 0xbeff00c1, 0xb8f54306, - 0x8675c175, 0xbf84002c, - 0xbf8a0000, 0x867aff73, - 0x04000000, 0xbf840028, - 0x8e758675, 0x8e758275, - 0xbefa0075, 0xb8ee2a05, - 0x806e816e, 0x8e6e8a6e, - 0xb8fa1605, 0x807a817a, - 0x8e7a867a, 0x806e7a6e, - 0x806eff6e, 0x00000080, - 0xbefa00ff, 0x01000000, - 0xbefc0080, 0xd28c0002, - 0x000100c1, 0xd28d0003, - 0x000204c1, 0xd1060002, - 0x00011103, 0x7e0602ff, - 0x00000200, 0xbefc00ff, - 0x00010000, 0xbe80007b, - 0x867bff7b, 0xff7fffff, - 0x877bff7b, 0x00058000, - 0xd8ec0000, 0x00000002, - 0xbf8c007f, 0xe0765000, - 0x6e1e0002, 0x32040702, - 0xd0c9006a, 0x0000eb02, - 0xbf87fff7, 0xbefb0000, - 0xbeee00ff, 0x00000400, - 0xbefe00c1, 0xbeff00c1, - 0xb8f52a05, 0x80758175, - 0x8e758275, 0x8e7a8875, - 0xbefa00ff, 0x01000000, - 0xbefc0084, 0xbf0a757c, - 0xbf840015, 0xbf11017c, - 0x8075ff75, 0x00001000, - 0x7e000300, 0x7e020301, - 0x7e040302, 0x7e060303, - 0xe0724000, 0x6e1e0000, - 0xe0724100, 0x6e1e0100, - 
0xe0724200, 0x6e1e0200, - 0xe0724300, 0x6e1e0300, - 0x807c847c, 0x806eff6e, - 0x00000400, 0xbf0a757c, - 0xbf85ffef, 0xbf9c0000, - 0xbf8200ca, 0xbef8007e, - 0x8679ff7f, 0x0000ffff, - 0x8779ff79, 0x00040000, - 0xbefa0080, 0xbefb00ff, - 0x00807fac, 0x8676ff7f, - 0x08000000, 0x8f768376, - 0x877b767b, 0x8676ff7f, - 0x70000000, 0x8f768176, - 0x877b767b, 0x8676ff7f, - 0x04000000, 0xbf84001e, - 0xbefe00c1, 0xbeff00c1, - 0xb8f34306, 0x8673c173, - 0xbf840019, 0x8e738673, - 0x8e738273, 0xbefa0073, - 0xb8f22a05, 0x80728172, - 0x8e728a72, 0xb8f61605, - 0x80768176, 0x8e768676, - 0x80727672, 0x8072ff72, - 0x00000080, 0xbefa00ff, - 0x01000000, 0xbefc0080, - 0xe0510000, 0x721e0000, - 0xe0510100, 0x721e0000, - 0x807cff7c, 0x00000200, - 0x8072ff72, 0x00000200, - 0xbf0a737c, 0xbf85fff6, - 0xbef20080, 0xbefe00c1, - 0xbeff00c1, 0xb8f32a05, - 0x80738173, 0x8e738273, - 0x8e7a8873, 0xbefa00ff, - 0x01000000, 0xbef60072, - 0x8072ff72, 0x00000400, - 0xbefc0084, 0xbf11087c, - 0x8073ff73, 0x00008000, - 0xe0524000, 0x721e0000, - 0xe0524100, 0x721e0100, - 0xe0524200, 0x721e0200, - 0xe0524300, 0x721e0300, - 0xbf8c0f70, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0x807c847c, - 0x8072ff72, 0x00000400, - 0xbf0a737c, 0xbf85ffee, - 0xbf9c0000, 0xe0524000, - 0x761e0000, 0xe0524100, - 0x761e0100, 0xe0524200, - 0x761e0200, 0xe0524300, - 0x761e0300, 0xb8f22a05, - 0x80728172, 0x8e728a72, - 0xb8f61605, 0x80768176, - 0x8e768676, 0x80727672, - 0x80f2c072, 0xb8f31605, - 0x80738173, 0x8e738473, - 0x8e7a8273, 0xbefa00ff, - 0x01000000, 0xbefc0073, - 0xc031003c, 0x00000072, - 0x80f2c072, 0xbf8c007f, - 0x80fc907c, 0xbe802d00, - 0xbe822d02, 0xbe842d04, - 0xbe862d06, 0xbe882d08, - 0xbe8a2d0a, 0xbe8c2d0c, - 0xbe8e2d0e, 0xbf06807c, - 0xbf84fff1, 0xb8f22a05, - 0x80728172, 0x8e728a72, - 0xb8f61605, 0x80768176, - 0x8e768676, 0x80727672, - 0xbefa0084, 0xbefa00ff, - 0x01000000, 0xc0211cfc, - 0x00000072, 0x80728472, - 0xc0211c3c, 0x00000072, - 0x80728472, 0xc0211c7c, - 0x00000072, 0x80728472, - 0xc0211bbc, 0x00000072, - 0x80728472, 0xc0211bfc, - 0x00000072, 0x80728472, - 0xc0211d3c, 0x00000072, - 0x80728472, 0xc0211d7c, - 0x00000072, 0x80728472, - 0xc0211a3c, 0x00000072, - 0x80728472, 0xc0211a7c, - 0x00000072, 0x80728472, - 0xc0211dfc, 0x00000072, - 0x80728472, 0xc0211b3c, - 0x00000072, 0x80728472, - 0xc0211b7c, 0x00000072, - 0x80728472, 0xbf8c007f, - 0x8671ff71, 0x0000ffff, - 0xbefc0073, 0xbefe006e, - 0xbeff006f, 0x867375ff, - 0x000003ff, 0xb9734803, - 0x867375ff, 0xfffff800, - 0x8f738b73, 0xb973a2c3, - 0xb977f801, 0x8673ff71, - 0xf0000000, 0x8f739c73, - 0x8e739073, 0xbef60080, - 0x87767376, 0x8673ff71, - 0x08000000, 0x8f739b73, - 0x8e738f73, 0x87767376, - 0x8673ff74, 0x00800000, - 0x8f739773, 0xb976f807, - 0x86fe7e7e, 0x86ea6a6a, - 0xb974f802, 0xbf8a0000, - 0x95807370, 0xbf810000, -}; - diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm deleted file mode 100644 index ae2af3d..0000000 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +++ /dev/null @@ -1,1388 +0,0 @@ -/* - * Copyright 2016 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#if 0 -HW (GFX9) source code for CWSR trap handler -#Version 18 + multiple trap handler - -// this performance-optimal version was originally from Seven Xu at SRDC - -// Revison #18 --... -/* Rev History -** #1. Branch from gc dv. //gfxip/gfx9/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) -** #4. SR Memory Layout: -** 1. VGPR-SGPR-HWREG-{LDS} -** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern.. -** #5. Update: 1. Accurate g8sr_ts_save_d timestamp -** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation) -** #7. Update: 1. don't barrier if noLDS -** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version -** 2. Fix SQ issue by s_sleep 2 -** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last -** 2. optimize s_buffer save by burst 16sgprs... -** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs. -** #11. Update 1. Add 2 more timestamp for debug version -** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance -** #13. Integ 1. Always use MUBUF for PV trap shader... -** #14. Update 1. s_buffer_store soft clause... -** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot. -** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree -** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part] -** 2. PERF - Save LDS before save VGPR to cover LDS save long latency... -** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32 -** 2. FUNC - Handle non-CWSR traps -*/ - -var G8SR_WDMEM_HWREG_OFFSET = 0 -var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes - -// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore. 
- -var G8SR_DEBUG_TIMESTAMP = 0 -var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset -var s_g8sr_ts_save_s = s[34:35] // save start -var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi -var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ -var s_g8sr_ts_save_d = s[40:41] // save end -var s_g8sr_ts_restore_s = s[42:43] // restore start -var s_g8sr_ts_restore_d = s[44:45] // restore end - -var G8SR_VGPR_SR_IN_DWX4 = 0 -var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes -var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 - - -/*************************************************************************/ -/* control on how to run the shader */ -/*************************************************************************/ -//any hack that needs to be made to run this code in EMU (either becasue various EMU code are not ready or no compute save & restore in EMU run) -var EMU_RUN_HACK = 0 -var EMU_RUN_HACK_RESTORE_NORMAL = 0 -var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 -var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 -var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK -var SAVE_LDS = 1 -var WG_BASE_ADDR_LO = 0x9000a000 -var WG_BASE_ADDR_HI = 0x0 -var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem -var CTX_SAVE_CONTROL = 0x0 -var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL -var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either becasue various RTL code are not ready or no compute save & restore in RTL run) -var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write -var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //becasue TC EMU curently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes -var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing - -/**************************************************************************/ -/* variables */ -/**************************************************************************/ -var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 -var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 -var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 -var SQ_WAVE_STATUS_HALT_MASK = 0x2000 - -var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 -var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 -var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 -var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 -var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 -var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits - -var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 -var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask -var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 -var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 -var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 -var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF -var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 -var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 -var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 -var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 -var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 -var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800 - -var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME -var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME -var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME - -var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 -var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 - - -/* Save */ -var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes -var S_SAVE_BUF_RSRC_WORD3_MISC = 
0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE - -var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit -var S_SAVE_SPI_INIT_ATC_SHIFT = 27 -var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype -var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 -var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG -var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 - -var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used -var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME -var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME -var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME - -var s_save_spi_init_lo = exec_lo -var s_save_spi_init_hi = exec_hi - -var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3¡¯h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} -var s_save_pc_hi = ttmp1 -var s_save_exec_lo = ttmp2 -var s_save_exec_hi = ttmp3 -var s_save_status = ttmp4 -var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine -var s_save_xnack_mask_lo = ttmp6 -var s_save_xnack_mask_hi = ttmp7 -var s_save_buf_rsrc0 = ttmp8 -var s_save_buf_rsrc1 = ttmp9 -var s_save_buf_rsrc2 = ttmp10 -var s_save_buf_rsrc3 = ttmp11 - -var s_save_mem_offset = ttmp14 -var s_save_alloc_size = s_save_trapsts //conflict -var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time) -var s_save_m0 = ttmp15 - -/* Restore */ -var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE -var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC - -var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit -var S_RESTORE_SPI_INIT_ATC_SHIFT = 27 -var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype -var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28 -var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG -var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 - -var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT -var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK -var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT -var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK - -var s_restore_spi_init_lo = exec_lo -var s_restore_spi_init_hi = exec_hi - -var s_restore_mem_offset = ttmp12 -var s_restore_alloc_size = ttmp3 -var s_restore_tmp = ttmp6 -var s_restore_mem_offset_save = s_restore_tmp //no conflict - -var s_restore_m0 = s_restore_alloc_size //no conflict - -var s_restore_mode = ttmp7 - -var s_restore_pc_lo = ttmp0 -var s_restore_pc_hi = ttmp1 -var s_restore_exec_lo = ttmp14 -var s_restore_exec_hi = ttmp15 -var s_restore_status = ttmp4 -var s_restore_trapsts = ttmp5 -var s_restore_xnack_mask_lo = xnack_mask_lo -var s_restore_xnack_mask_hi = xnack_mask_hi -var s_restore_buf_rsrc0 = ttmp8 -var s_restore_buf_rsrc1 = ttmp9 -var s_restore_buf_rsrc2 = ttmp10 -var s_restore_buf_rsrc3 = ttmp11 - -/**************************************************************************/ -/* trap handler entry points */ -/**************************************************************************/ -/* Shader Main*/ - -shader main - asic(GFX9) - type(CS) - - - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore - //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC - s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change 
SCC - s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. - s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE - //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE - s_branch L_SKIP_RESTORE //NOT restore, SAVE actually - else - s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save - end - -L_JUMP_TO_RESTORE: - s_branch L_RESTORE //restore - -L_SKIP_RESTORE: - - s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC - s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) - s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save - s_cbranch_scc1 L_SAVE //this is the operation for save - - // ********* Handle non-CWSR traps ******************* -if (!EMU_RUN_HACK) - // Illegal instruction is a non-maskable exception which blocks context save. - // Halt the wavefront and return from the trap. - s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK - s_cbranch_scc1 L_HALT_WAVE - - // If STATUS.MEM_VIOL is asserted then we cannot fetch from the TMA. - // Instead, halt the wavefront and return from the trap. - s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK - s_cbranch_scc0 L_NO_MEM_VIOL - -L_HALT_WAVE: - s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK - s_branch L_EXCP_CASE - -L_NO_MEM_VIOL: - /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */ - s_getreg_b32 ttmp14,hwreg(HW_REG_SQ_SHADER_TMA_LO) - s_getreg_b32 ttmp15,hwreg(HW_REG_SQ_SHADER_TMA_HI) - s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 - s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [ttmp14, ttmp15], 0 - s_waitcnt lgkmcnt(0) - s_or_b32 ttmp7, ttmp8, ttmp9 - s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set - s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) - s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler - -L_NO_NEXT_TRAP: - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) - s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception - s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. - s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 - s_addc_u32 ttmp1, ttmp1, 0 -L_EXCP_CASE: - s_and_b32 ttmp1, ttmp1, 0xFFFF - s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) - s_rfe_b64 [ttmp0, ttmp1] -end - // ********* End handling of non-CWSR traps ******************* - -/**************************************************************************/ -/* save routine */ -/**************************************************************************/ - -L_SAVE: - -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_save_s - s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? 
-end - - //check whether there is mem_viol - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) - s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK - s_cbranch_scc0 L_NO_PC_REWIND - - //if so, need rewind PC assuming GDS operation gets NACKed - s_mov_b32 s_save_tmp, 0 //clear mem_viol bit - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit - s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] - s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8 - s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc - -L_NO_PC_REWIND: - s_mov_b32 s_save_tmp, 0 //clear saveCtx bit - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit - - s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK - s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //save XNACK must before any memory operation - s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT - s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT - s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp - s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY - s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT - s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp - s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS - s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG - - s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp - - /* inform SPI the readiness and wait for SPI's go signal */ - s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI - s_mov_b32 s_save_exec_hi, exec_hi - s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive - -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_sq_save_msg - s_waitcnt lgkmcnt(0) -end - - if (EMU_RUN_HACK) - - else - s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC - end - - L_SLEEP: - s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 - - if (EMU_RUN_HACK) - - else - s_cbranch_execz L_SLEEP - end - -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_spi_wrexec - s_waitcnt lgkmcnt(0) -end - - /* setup Resource Contants */ - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) - //calculate wd_addr using absolute thread id - v_readlane_b32 s_save_tmp, v9, 0 - s_lshr_b32 s_save_tmp, s_save_tmp, 6 - s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE - s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO - s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI - s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL - else - end - if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) - s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO - s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI - s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL - else - end - - - s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo - s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE - s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited - s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC - 
s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK - s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position - s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC - s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK - s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position - s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE - - //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?) - s_mov_b32 s_save_m0, m0 //save M0 - - /* global mem offset */ - s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 - - - - - /* save HW registers */ - ////////////////////////////// - - L_SAVE_HWREG: - // HWREG SR memory offset : size(VGPR)+size(SGPR) - get_vgpr_size_bytes(s_save_mem_offset) - get_sgpr_size_bytes(s_save_tmp) - s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp - - - s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - - write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0 - - if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) - s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 - s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over - end - - write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC - write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) - write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC - write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) - write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS - - //s_save_trapsts conflicts with s_save_alloc_size - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) - write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS - - write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO - write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI - - //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 - s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE - write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) - - - - /* the first wave in the threadgroup */ - s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract fisrt wave bit - s_mov_b32 s_save_exec_hi, 0x0 - s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26] - - - /* save SGPRs */ - // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... 
- ////////////////////////////// - - // SGPR SR memory offset : size(VGPR) - get_vgpr_size_bytes(s_save_mem_offset) - // TODO, change RSRC word to rearrange memory layout for SGPRS - - s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size - s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 - s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) - - if (SGPR_SAVE_USE_SQC) - s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes - else - s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) - end - - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - - // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 - //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0 - s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0 - s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset - s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0 - - s_mov_b32 m0, 0x0 //SGPR initial index value =0 - s_nop 0x0 //Manually inserted wait states - L_SAVE_SGPR_LOOP: - // SGPR is allocated in 16 SGPR granularity - s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] - s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] - s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] - s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] - s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] - s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] - s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] - s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] - - write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4 - s_add_u32 m0, m0, 16 //next sgpr index - s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete? - // restore s_save_buf_rsrc0,1 - //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo - s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo - - - - - /* save first 4 VGPR, then LDS save could use */ - // each wave will alloc 4 vgprs at least... - ///////////////////////////////////////////////////////////////////////////////////// - - s_mov_b32 s_save_mem_offset, 0 - s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on - s_mov_b32 exec_hi, 0xFFFFFFFF - - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
- else - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - - // VGPR Allocated in 4-GPR granularity - -if G8SR_VGPR_SR_IN_DWX4 - // the const stride for DWx4 is 4*4 bytes - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes - - buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes -else - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 -end - - - - /* save LDS */ - ////////////////////////////// - - L_SAVE_LDS: - - // Change EXEC to all threads... - s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on - s_mov_b32 exec_hi, 0xFFFFFFFF - - s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size - s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? - s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE - - s_barrier //LDS is used? wait for other waves in the same TG - s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here - s_cbranch_scc0 L_SAVE_LDS_DONE - - // first wave do LDS save; - - s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw - s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes - s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes - - // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) - // - get_vgpr_size_bytes(s_save_mem_offset) - get_sgpr_size_bytes(s_save_tmp) - s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp - s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() - - - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - s_mov_b32 m0, 0x0 //lds_offset initial value = 0 - - -var LDS_DMA_ENABLE = 0 -var UNROLL = 0 -if UNROLL==0 && LDS_DMA_ENABLE==1 - s_mov_b32 s3, 256*2 - s_nop 0 - s_nop 0 - s_nop 0 - L_SAVE_LDS_LOOP: - //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.??? - if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW - end - - s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes - s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes - s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete? 
- -elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL , has ichace miss - // store from higest LDS address to lowest - s_mov_b32 s3, 256*2 - s_sub_u32 m0, s_save_alloc_size, s3 - s_add_u32 s_save_mem_offset, s_save_mem_offset, m0 - s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128 trunks... - s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from higheset addr to lowest - s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction - s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2is the below 2 inst...//s_addc and s_setpc - s_nop 0 - s_nop 0 - s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes - s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved - s_add_u32 s0, s0,s_save_alloc_size - s_addc_u32 s1, s1, 0 - s_setpc_b64 s[0:1] - - - for var i =0; i< 128; i++ - // be careful to make here a 64Byte aligned address, which could improve performance... - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW - - if i!=127 - s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e. pack more LDS_DMA inst to one Cacheline - s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3 - end - end - -else // BUFFER_STORE - v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 - v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid - v_mul_i32_i24 v2, v3, 8 // tid*8 - v_mov_b32 v3, 256*2 - s_mov_b32 m0, 0x10000 - s_mov_b32 s0, s_save_buf_rsrc3 - s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid - s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT - -L_SAVE_LDS_LOOP_VECTOR: - ds_read_b64 v[0:1], v2 //x =LDS[a], byte address - s_waitcnt lgkmcnt(0) - buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1 -// s_waitcnt vmcnt(0) -// v_add_u32 v2, vcc[0:1], v2, v3 - v_add_u32 v2, v2, v3 - v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size - s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR - - // restore rsrc3 - s_mov_b32 s_save_buf_rsrc3, s0 - -end - -L_SAVE_LDS_DONE: - - - /* save VGPRs - set the Rest VGPRs */ - ////////////////////////////////////////////////////////////////////////////////////// - L_SAVE_VGPR: - // VGPR SR memory offset: 0 - // TODO rearrange the RSRC words to use swizzle for VGPR save... - - s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs - s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on - s_mov_b32 exec_hi, 0xFFFFFFFF - - s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size - s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 - s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible - s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
- else - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - - // VGPR Allocated in 4-GPR granularity - -if G8SR_VGPR_SR_IN_DWX4 - // the const stride for DWx4 is 4*4 bytes - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes - - s_mov_b32 m0, 4 // skip first 4 VGPRs - s_cmp_lt_u32 m0, s_save_alloc_size - s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs - - s_set_gpr_idx_on m0, 0x1 // This will change M0 - s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 -L_SAVE_VGPR_LOOP: - v_mov_b32 v0, v0 // v0 = v[0+m0] - v_mov_b32 v1, v1 - v_mov_b32 v2, v2 - v_mov_b32 v3, v3 - - - buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - s_add_u32 m0, m0, 4 - s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 - s_cmp_lt_u32 m0, s_save_alloc_size - s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? - s_set_gpr_idx_off -L_SAVE_VGPR_LOOP_END: - - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes -else - // VGPR store using dw burst - s_mov_b32 m0, 0x4 //VGPR initial index value =0 - s_cmp_lt_u32 m0, s_save_alloc_size - s_cbranch_scc0 L_SAVE_VGPR_END - - - s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 - s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later - - L_SAVE_VGPR_LOOP: - v_mov_b32 v0, v0 //v0 = v[0+m0] - v_mov_b32 v1, v1 //v0 = v[0+m0] - v_mov_b32 v2, v2 //v0 = v[0+m0] - v_mov_b32 v3, v3 //v0 = v[0+m0] - - if(USE_MTBUF_INSTEAD_OF_MUBUF) - tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 - else - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 - end - - s_add_u32 m0, m0, 4 //next vgpr index - s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes - s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? - s_set_gpr_idx_off -end - -L_SAVE_VGPR_END: - - - - - - - /* S_PGM_END_SAVED */ //FIXME graphics ONLY - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) - s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] - s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 - s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over - s_rfe_b64 s_save_pc_lo //Return to the main shader program - else - end - -// Save Done timestamp -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_save_d - // SGPR SR memory offset : size(VGPR) - get_vgpr_size_bytes(s_save_mem_offset) - s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET - s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? - // Need reset rsrc2?? 
- s_mov_b32 m0, s_save_mem_offset - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1 -end - - - s_branch L_END_PGM - - - -/**************************************************************************/ -/* restore routine */ -/**************************************************************************/ - -L_RESTORE: - /* Setup Resource Contants */ - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) - //calculate wd_addr using absolute thread id - v_readlane_b32 s_restore_tmp, v9, 0 - s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 - s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE - s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO - s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI - s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL - else - end - -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_restore_s - s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? - // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case... - s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0] - s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored.. -end - - - - s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo - s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi - s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE - s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) - s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC - s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK - s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position - s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC - s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK - s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position - s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE - - /* global mem offset */ -// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0 - - /* the first wave in the threadgroup */ - s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK - s_cbranch_scc0 L_RESTORE_VGPR - - /* restore LDS */ - ////////////////////////////// - L_RESTORE_LDS: - - s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead - s_mov_b32 exec_hi, 0xFFFFFFFF - - s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size - s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? - s_cbranch_scc0 L_RESTORE_VGPR //no lds used? 
jump to L_RESTORE_VGPR - s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw - s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes - s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes - - // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) - // - get_vgpr_size_bytes(s_restore_mem_offset) - get_sgpr_size_bytes(s_restore_tmp) - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? - - - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else - s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - s_mov_b32 m0, 0x0 //lds_offset initial value = 0 - - L_RESTORE_LDS_LOOP: - if (SAVE_LDS) - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW - end - s_add_u32 m0, m0, 256*2 // 128 DW - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW - s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? - - - /* restore VGPRs */ - ////////////////////////////// - L_RESTORE_VGPR: - // VGPR SR memory offset : 0 - s_mov_b32 s_restore_mem_offset, 0x0 - s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead - s_mov_b32 exec_hi, 0xFFFFFFFF - - s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size - s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 - s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) - s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else - s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - -if G8SR_VGPR_SR_IN_DWX4 - get_vgpr_size_bytes(s_restore_mem_offset) - s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 - - // the const stride for DWx4 is 4*4 bytes - s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes - - s_mov_b32 m0, s_restore_alloc_size - s_set_gpr_idx_on m0, 0x8 // Note.. 
This will change m0 - -L_RESTORE_VGPR_LOOP: - buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 - s_waitcnt vmcnt(0) - s_sub_u32 m0, m0, 4 - v_mov_b32 v0, v0 // v[0+m0] = v0 - v_mov_b32 v1, v1 - v_mov_b32 v2, v2 - v_mov_b32 v3, v3 - s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 - s_cmp_eq_u32 m0, 0x8000 - s_cbranch_scc0 L_RESTORE_VGPR_LOOP - s_set_gpr_idx_off - - s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes - -else - // VGPR load using dw burst - s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 - s_mov_b32 m0, 4 //VGPR initial index value = 1 - s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 - s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later - - L_RESTORE_VGPR_LOOP: - if(USE_MTBUF_INSTEAD_OF_MUBUF) - tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 - else - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 - end - s_waitcnt vmcnt(0) //ensure data ready - v_mov_b32 v0, v0 //v[0+m0] = v0 - v_mov_b32 v1, v1 - v_mov_b32 v2, v2 - v_mov_b32 v3, v3 - s_add_u32 m0, m0, 4 //next vgpr index - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes - s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? 
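[Editor's note] The dword-burst restore loop above uses v0..v3 as staging registers through the M0 gpr-index mode, which is why it starts at VGPR 4 and reloads the first group afterwards from s_restore_mem_offset_save. Below is a hedged C model of the loop's index and offset progression, including the 0x8000 bias the comments mention; it is a sketch of the control flow only, not driver code, and the VGPR count is an example value.

/* Not driver code: a small model of the restore loop's control flow. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t total_vgprs = 24;              /* example: (vgpr_size + 1) * 4 */
	uint32_t m0 = 4 | 0x8000;               /* s_set_gpr_idx_on sets M0[15:12] = 0x8 */
	uint32_t limit = total_vgprs + 0x8000;  /* "add 0x8000 since we compare m0 against it later" */
	uint32_t mem_offset = 256 * 4;          /* the first 4 VGPRs are skipped here */
	unsigned int bursts = 0;

	while (m0 < limit) {                    /* s_cmp_lt_u32 m0 / s_cbranch_scc1 */
		bursts++;                       /* four buffer_load_dword per pass */
		m0 += 4;                        /* next 4-VGPR group */
		mem_offset += 256 * 4;          /* every load covers 256 bytes */
	}

	/* after the loop, VGPRs 0..3 are reloaded from the saved start offset */
	printf("%u bursts, VGPRs 4..%u restored\n", bursts,
	       (unsigned)(total_vgprs - 1));
	return 0;
}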
- s_set_gpr_idx_off - /* VGPR restore on v0 */ - if(USE_MTBUF_INSTEAD_OF_MUBUF) - tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 - else - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 - end - -end - - /* restore SGPRs */ - ////////////////////////////// - - // SGPR SR memory offset : size(VGPR) - get_vgpr_size_bytes(s_restore_mem_offset) - get_sgpr_size_bytes(s_restore_tmp) - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp - s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group - // TODO, change RSRC word to rearrange memory layout for SGPRS - - s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size - s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 - s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) - - if (SGPR_SAVE_USE_SQC) - s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes - else - s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) - end - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else - s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - s_mov_b32 m0, s_restore_alloc_size - - L_RESTORE_SGPR_LOOP: - read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made - s_waitcnt lgkmcnt(0) //ensure data ready - - s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] - s_nop 0 // hazard SALU M0=> S_MOVREL - - s_movreld_b64 s0, s0 //s[0+m0] = s0 - s_movreld_b64 s2, s2 - s_movreld_b64 s4, s4 - s_movreld_b64 s6, s6 - s_movreld_b64 s8, s8 - s_movreld_b64 s10, s10 - s_movreld_b64 s12, s12 - s_movreld_b64 s14, s14 - - s_cmp_eq_u32 m0, 0 //scc = (m0 < s_restore_alloc_size) ? 1 : 0 - s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete? - - /* restore HW registers */ - ////////////////////////////// - L_RESTORE_HWREG: - - -if G8SR_DEBUG_TIMESTAMP - s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo - s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi -end - - // HWREG SR memory offset : size(VGPR)+size(SGPR) - get_vgpr_size_bytes(s_restore_mem_offset) - get_sgpr_size_bytes(s_restore_tmp) - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp - - - s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
- else - s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 - read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC - read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) - read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC - read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) - read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS - read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS - read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO - read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI - read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE - - s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS - - s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS - - //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) - s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) - s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over - end - if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) - s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal - s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over - end - - s_mov_b32 m0, s_restore_m0 - s_mov_b32 exec_lo, s_restore_exec_lo - s_mov_b32 exec_hi, s_restore_exec_hi - - s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 - s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts - s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 - //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore - s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode - //reuse s_restore_m0 as a temp register - s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK - s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT - s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT - s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero - s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 - s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK - s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT - s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT - s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 - s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK - s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT - s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp - - s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 - s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by 
s_setreg_b32 - s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu - - s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time - -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_restore_d - s_waitcnt lgkmcnt(0) -end - -// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution - s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc - - -/**************************************************************************/ -/* the END */ -/**************************************************************************/ -L_END_PGM: - s_endpgm - -end - - -/**************************************************************************/ -/* the helper functions */ -/**************************************************************************/ - -//Only for save hwreg to mem -function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) - s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on - s_mov_b32 m0, s_mem_offset - s_buffer_store_dword s, s_rsrc, m0 glc:1 - s_add_u32 s_mem_offset, s_mem_offset, 4 - s_mov_b32 m0, exec_lo -end - - -// HWREG are saved before SGPRs, so all HWREG could be use. -function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) - - s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 - s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 - s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 - s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 - s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 - s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc -end - - -function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) - s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 - s_add_u32 s_mem_offset, s_mem_offset, 4 -end - -function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) - s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 - s_sub_u32 s_mem_offset, s_mem_offset, 4*16 -end - - - -function get_lds_size_bytes(s_lds_size_byte) - // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW - s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size - s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW -end - -function get_vgpr_size_bytes(s_vgpr_size_byte) - s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size - s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 - s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible -end - -function get_sgpr_size_bytes(s_sgpr_size_byte) - s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size - s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1 - s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value) -end - -function get_hwreg_size_bytes - return 128 //HWREG size 128 bytes -end - - - -#endif - -static const uint32_t cwsr_trap_gfx9_hex[] = { - 0xbf820001, 0xbf820124, - 0xb8f0f802, 0x89708670, - 0xb8f1f803, 0x8674ff71, - 0x00000400, 0xbf85001d, - 0x8674ff71, 0x00000800, - 0xbf850003, 0x8674ff71, - 0x00000100, 0xbf840003, - 0x8770ff70, 0x00002000, - 0xbf820010, 0xb8faf812, - 0xb8fbf813, 0x8efa887a, - 0xc00a1d3d, 0x00000000, - 
0xbf8cc07f, 0x87737574, - 0xbf840002, 0xb970f802, - 0xbe801d74, 0xb8f1f803, - 0x8671ff71, 0x000001ff, - 0xbf850002, 0x806c846c, - 0x826d806d, 0x866dff6d, - 0x0000ffff, 0xb970f802, - 0xbe801f6c, 0xb8f1f803, - 0x8671ff71, 0x00000100, - 0xbf840006, 0xbef60080, - 0xb9760203, 0x866dff6d, - 0x0000ffff, 0x80ec886c, - 0x82ed806d, 0xbef60080, - 0xb9760283, 0xbef20068, - 0xbef30069, 0xb8f62407, - 0x8e769c76, 0x876d766d, - 0xb8f603c7, 0x8e769b76, - 0x876d766d, 0xb8f6f807, - 0x8676ff76, 0x00007fff, - 0xb976f807, 0xbeee007e, - 0xbeef007f, 0xbefe0180, - 0xbf900004, 0xbf8e0002, - 0xbf88fffe, 0xbef4007e, - 0x8675ff7f, 0x0000ffff, - 0x8775ff75, 0x00040000, - 0xbef60080, 0xbef700ff, - 0x00807fac, 0x8676ff7f, - 0x08000000, 0x8f768376, - 0x87777677, 0x8676ff7f, - 0x70000000, 0x8f768176, - 0x87777677, 0xbefb007c, - 0xbefa0080, 0xb8fa2a05, - 0x807a817a, 0x8e7a8a7a, - 0xb8f61605, 0x80768176, - 0x8e768676, 0x807a767a, - 0xbef60084, 0xbef600ff, - 0x01000000, 0xbefe007c, - 0xbefc007a, 0xc0611efa, - 0x0000007c, 0x807a847a, - 0xbefc007e, 0xbefe007c, - 0xbefc007a, 0xc0611b3a, - 0x0000007c, 0x807a847a, - 0xbefc007e, 0xbefe007c, - 0xbefc007a, 0xc0611b7a, - 0x0000007c, 0x807a847a, - 0xbefc007e, 0xbefe007c, - 0xbefc007a, 0xc0611bba, - 0x0000007c, 0x807a847a, - 0xbefc007e, 0xbefe007c, - 0xbefc007a, 0xc0611bfa, - 0x0000007c, 0x807a847a, - 0xbefc007e, 0xbefe007c, - 0xbefc007a, 0xc0611c3a, - 0x0000007c, 0x807a847a, - 0xbefc007e, 0xb8f1f803, - 0xbefe007c, 0xbefc007a, - 0xc0611c7a, 0x0000007c, - 0x807a847a, 0xbefc007e, - 0xbefe007c, 0xbefc007a, - 0xc0611cba, 0x0000007c, - 0x807a847a, 0xbefc007e, - 0xbefe007c, 0xbefc007a, - 0xc0611cfa, 0x0000007c, - 0x807a847a, 0xbefc007e, - 0xb8fbf801, 0xbefe007c, - 0xbefc007a, 0xc0611efa, - 0x0000007c, 0x807a847a, - 0xbefc007e, 0x8676ff7f, - 0x04000000, 0xbeef0080, - 0x876f6f76, 0xb8fa2a05, - 0x807a817a, 0x8e7a8a7a, - 0xb8f11605, 0x80718171, - 0x8e718471, 0x8e768271, - 0xbef600ff, 0x01000000, - 0xbef20174, 0x80747a74, - 0x82758075, 0xbefc0080, - 0xbf800000, 0xbe802b00, - 0xbe822b02, 0xbe842b04, - 0xbe862b06, 0xbe882b08, - 0xbe8a2b0a, 0xbe8c2b0c, - 0xbe8e2b0e, 0xc06b003a, - 0x00000000, 0xc06b013a, - 0x00000010, 0xc06b023a, - 0x00000020, 0xc06b033a, - 0x00000030, 0x8074c074, - 0x82758075, 0x807c907c, - 0xbf0a717c, 0xbf85ffeb, - 0xbef40172, 0xbefa0080, - 0xbefe00c1, 0xbeff00c1, - 0xbef600ff, 0x01000000, - 0xe0724000, 0x7a1d0000, - 0xe0724100, 0x7a1d0100, - 0xe0724200, 0x7a1d0200, - 0xe0724300, 0x7a1d0300, - 0xbefe00c1, 0xbeff00c1, - 0xb8f14306, 0x8671c171, - 0xbf84002c, 0xbf8a0000, - 0x8676ff6f, 0x04000000, - 0xbf840028, 0x8e718671, - 0x8e718271, 0xbef60071, - 0xb8fa2a05, 0x807a817a, - 0x8e7a8a7a, 0xb8f61605, - 0x80768176, 0x8e768676, - 0x807a767a, 0x807aff7a, - 0x00000080, 0xbef600ff, - 0x01000000, 0xbefc0080, - 0xd28c0002, 0x000100c1, - 0xd28d0003, 0x000204c1, - 0xd1060002, 0x00011103, - 0x7e0602ff, 0x00000200, - 0xbefc00ff, 0x00010000, - 0xbe800077, 0x8677ff77, - 0xff7fffff, 0x8777ff77, - 0x00058000, 0xd8ec0000, - 0x00000002, 0xbf8cc07f, - 0xe0765000, 0x7a1d0002, - 0x68040702, 0xd0c9006a, - 0x0000e302, 0xbf87fff7, - 0xbef70000, 0xbefa00ff, - 0x00000400, 0xbefe00c1, - 0xbeff00c1, 0xb8f12a05, - 0x80718171, 0x8e718271, - 0x8e768871, 0xbef600ff, - 0x01000000, 0xbefc0084, - 0xbf0a717c, 0xbf840015, - 0xbf11017c, 0x8071ff71, - 0x00001000, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0xe0724000, - 0x7a1d0000, 0xe0724100, - 0x7a1d0100, 0xe0724200, - 0x7a1d0200, 0xe0724300, - 0x7a1d0300, 0x807c847c, - 0x807aff7a, 0x00000400, - 0xbf0a717c, 0xbf85ffef, - 0xbf9c0000, 0xbf8200c5, - 0xbef4007e, 
0x8675ff7f, - 0x0000ffff, 0x8775ff75, - 0x00040000, 0xbef60080, - 0xbef700ff, 0x00807fac, - 0x8672ff7f, 0x08000000, - 0x8f728372, 0x87777277, - 0x8672ff7f, 0x70000000, - 0x8f728172, 0x87777277, - 0x8672ff7f, 0x04000000, - 0xbf84001e, 0xbefe00c1, - 0xbeff00c1, 0xb8ef4306, - 0x866fc16f, 0xbf840019, - 0x8e6f866f, 0x8e6f826f, - 0xbef6006f, 0xb8f82a05, - 0x80788178, 0x8e788a78, - 0xb8f21605, 0x80728172, - 0x8e728672, 0x80787278, - 0x8078ff78, 0x00000080, - 0xbef600ff, 0x01000000, - 0xbefc0080, 0xe0510000, - 0x781d0000, 0xe0510100, - 0x781d0000, 0x807cff7c, - 0x00000200, 0x8078ff78, - 0x00000200, 0xbf0a6f7c, - 0xbf85fff6, 0xbef80080, - 0xbefe00c1, 0xbeff00c1, - 0xb8ef2a05, 0x806f816f, - 0x8e6f826f, 0x8e76886f, - 0xbef600ff, 0x01000000, - 0xbef20078, 0x8078ff78, - 0x00000400, 0xbefc0084, - 0xbf11087c, 0x806fff6f, - 0x00008000, 0xe0524000, - 0x781d0000, 0xe0524100, - 0x781d0100, 0xe0524200, - 0x781d0200, 0xe0524300, - 0x781d0300, 0xbf8c0f70, - 0x7e000300, 0x7e020301, - 0x7e040302, 0x7e060303, - 0x807c847c, 0x8078ff78, - 0x00000400, 0xbf0a6f7c, - 0xbf85ffee, 0xbf9c0000, - 0xe0524000, 0x721d0000, - 0xe0524100, 0x721d0100, - 0xe0524200, 0x721d0200, - 0xe0524300, 0x721d0300, - 0xb8f82a05, 0x80788178, - 0x8e788a78, 0xb8f21605, - 0x80728172, 0x8e728672, - 0x80787278, 0x80f8c078, - 0xb8ef1605, 0x806f816f, - 0x8e6f846f, 0x8e76826f, - 0xbef600ff, 0x01000000, - 0xbefc006f, 0xc031003a, - 0x00000078, 0x80f8c078, - 0xbf8cc07f, 0x80fc907c, - 0xbf800000, 0xbe802d00, - 0xbe822d02, 0xbe842d04, - 0xbe862d06, 0xbe882d08, - 0xbe8a2d0a, 0xbe8c2d0c, - 0xbe8e2d0e, 0xbf06807c, - 0xbf84fff0, 0xb8f82a05, - 0x80788178, 0x8e788a78, - 0xb8f21605, 0x80728172, - 0x8e728672, 0x80787278, - 0xbef60084, 0xbef600ff, - 0x01000000, 0xc0211bfa, - 0x00000078, 0x80788478, - 0xc0211b3a, 0x00000078, - 0x80788478, 0xc0211b7a, - 0x00000078, 0x80788478, - 0xc0211eba, 0x00000078, - 0x80788478, 0xc0211efa, - 0x00000078, 0x80788478, - 0xc0211c3a, 0x00000078, - 0x80788478, 0xc0211c7a, - 0x00000078, 0x80788478, - 0xc0211a3a, 0x00000078, - 0x80788478, 0xc0211a7a, - 0x00000078, 0x80788478, - 0xc0211cfa, 0x00000078, - 0x80788478, 0xbf8cc07f, - 0x866dff6d, 0x0000ffff, - 0xbefc006f, 0xbefe007a, - 0xbeff007b, 0x866f71ff, - 0x000003ff, 0xb96f4803, - 0x866f71ff, 0xfffff800, - 0x8f6f8b6f, 0xb96fa2c3, - 0xb973f801, 0x866fff6d, - 0xf0000000, 0x8f6f9c6f, - 0x8e6f906f, 0xbef20080, - 0x87726f72, 0x866fff6d, - 0x08000000, 0x8f6f9b6f, - 0x8e6f8f6f, 0x87726f72, - 0x866fff70, 0x00800000, - 0x8f6f976f, 0xb972f807, - 0x86fe7e7e, 0x86ea6a6a, - 0xb970f802, 0xbf8a0000, - 0x95806f6c, 0xbf810000, -}; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 207a05e..6316aad 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -34,17 +33,13 @@ #include #include #include -#include - #include "kfd_priv.h" #include "kfd_device_queue_manager.h" #include "kfd_dbgmgr.h" -#include "kfd_ipc.h" static long kfd_ioctl(struct file *, unsigned int, unsigned long); static int kfd_open(struct inode *, struct file *); static int kfd_mmap(struct file *, struct vm_area_struct *); -static bool kfd_is_large_bar(struct kfd_dev *dev); static const char kfd_dev_name[] = "kfd"; @@ -60,14 +55,6 @@ static int kfd_char_dev_major = -1; static struct class *kfd_class; struct device *kfd_device; -static char *kfd_devnode(struct device *dev, umode_t *mode) -{ - if (mode && dev->devt == MKDEV(kfd_char_dev_major, 0)) - *mode = 
0666; - - return NULL; -} - int kfd_chardev_init(void) { int err = 0; @@ -82,8 +69,6 @@ int kfd_chardev_init(void) if (IS_ERR(kfd_class)) goto err_class_create; - kfd_class->devnode = kfd_devnode; - kfd_device = device_create(kfd_class, NULL, MKDEV(kfd_char_dev_major, 0), NULL, kfd_dev_name); @@ -132,7 +117,7 @@ static int kfd_open(struct inode *inode, struct file *filep) return -EPERM; } - process = kfd_create_process(filep); + process = kfd_create_process(current); if (IS_ERR(process)) return PTR_ERR(process); @@ -157,12 +142,12 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, struct kfd_ioctl_create_queue_args *args) { if (args->queue_percentage > KFD_MAX_QUEUE_PERCENTAGE) { - pr_err("Queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n"); + pr_err("kfd: queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n"); return -EINVAL; } if (args->queue_priority > KFD_MAX_QUEUE_PRIORITY) { - pr_err("Queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n"); + pr_err("kfd: queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n"); return -EINVAL; } @@ -170,26 +155,26 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, (!access_ok(VERIFY_WRITE, (const void __user *) args->ring_base_address, sizeof(uint64_t)))) { - pr_err("Can't access ring base address\n"); + pr_err("kfd: can't access ring base address\n"); return -EFAULT; } if (!is_power_of_2(args->ring_size) && (args->ring_size != 0)) { - pr_err("Ring size must be a power of 2 or 0\n"); + pr_err("kfd: ring size must be a power of 2 or 0\n"); return -EINVAL; } if (!access_ok(VERIFY_WRITE, (const void __user *) args->read_pointer_address, sizeof(uint32_t))) { - pr_err("Can't access read pointer\n"); + pr_err("kfd: can't access read pointer\n"); return -EFAULT; } if (!access_ok(VERIFY_WRITE, (const void __user *) args->write_pointer_address, sizeof(uint32_t))) { - pr_err("Can't access write pointer\n"); + pr_err("kfd: can't access write pointer\n"); return -EFAULT; } @@ -197,7 +182,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, !access_ok(VERIFY_WRITE, (const void __user *) args->eop_buffer_address, sizeof(uint32_t))) { - pr_debug("Can't access eop buffer"); + pr_debug("kfd: can't access eop buffer"); return -EFAULT; } @@ -205,7 +190,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, !access_ok(VERIFY_WRITE, (const void __user *) args->ctx_save_restore_address, sizeof(uint32_t))) { - pr_debug("Can't access ctx save restore buffer"); + pr_debug("kfd: can't access ctx save restore buffer"); return -EFAULT; } @@ -221,7 +206,6 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, q_properties->ctx_save_restore_area_address = args->ctx_save_restore_address; q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size; - q_properties->ctl_stack_size = args->ctl_stack_size; if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE || args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) q_properties->type = KFD_QUEUE_TYPE_COMPUTE; @@ -235,27 +219,27 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, else q_properties->format = KFD_QUEUE_FORMAT_PM4; - pr_debug("Queue Percentage: %d, %d\n", + pr_debug("Queue Percentage (%d, %d)\n", q_properties->queue_percent, args->queue_percentage); - pr_debug("Queue Priority: %d, %d\n", + pr_debug("Queue Priority (%d, %d)\n", q_properties->priority, args->queue_priority); - 
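[Editor's note] set_queue_properties_from_user() above rejects malformed user arguments before any queue state is touched. The following user-space sketch mirrors the shape of those checks for callers who want to pre-validate arguments before issuing the ioctl; the numeric limits are assumptions standing in for KFD_MAX_QUEUE_PERCENTAGE and KFD_MAX_QUEUE_PRIORITY, and the kernel additionally verifies the ring and pointer addresses with access_ok().

/* Illustrative pre-validation only; the limits are assumed values, not
 * taken from the uAPI header. */
#include <stdbool.h>
#include <stdint.h>

#define MAX_QUEUE_PERCENTAGE_GUESS 100	/* stand-in for KFD_MAX_QUEUE_PERCENTAGE */
#define MAX_QUEUE_PRIORITY_GUESS   15	/* stand-in for KFD_MAX_QUEUE_PRIORITY */

static bool ring_size_ok(uint64_t ring_size)
{
	/* "ring size must be a power of 2 or 0" */
	return ring_size == 0 || (ring_size & (ring_size - 1)) == 0;
}

static bool create_queue_args_ok(uint32_t percentage, uint32_t priority,
				 uint64_t ring_size)
{
	return percentage <= MAX_QUEUE_PERCENTAGE_GUESS &&
	       priority <= MAX_QUEUE_PRIORITY_GUESS &&
	       ring_size_ok(ring_size);
}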
pr_debug("Queue Address: 0x%llX, 0x%llX\n", + pr_debug("Queue Address (0x%llX, 0x%llX)\n", q_properties->queue_address, args->ring_base_address); - pr_debug("Queue Size: 0x%llX, %u\n", + pr_debug("Queue Size (0x%llX, %u)\n", q_properties->queue_size, args->ring_size); - pr_debug("Queue r/w Pointers: %p, %p\n", - q_properties->read_ptr, - q_properties->write_ptr); + pr_debug("Queue r/w Pointers (0x%llX, 0x%llX)\n", + (uint64_t) q_properties->read_ptr, + (uint64_t) q_properties->write_ptr); - pr_debug("Queue Format: %d\n", q_properties->format); + pr_debug("Queue Format (%d)\n", q_properties->format); - pr_debug("Queue EOP: 0x%llX\n", q_properties->eop_ring_buffer_address); + pr_debug("Queue EOP (0x%llX)\n", q_properties->eop_ring_buffer_address); - pr_debug("Queue CTX save area: 0x%llX\n", + pr_debug("Queue CTX save arex (0x%llX)\n", q_properties->ctx_save_restore_area_address); return 0; @@ -273,16 +257,16 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, memset(&q_properties, 0, sizeof(struct queue_properties)); - pr_debug("Creating queue ioctl\n"); + pr_debug("kfd: creating queue ioctl\n"); err = set_queue_properties_from_user(&q_properties, args); if (err) return err; - pr_debug("Looking for gpu id 0x%x\n", args->gpu_id); + pr_debug("kfd: looking for gpu id 0x%x\n", args->gpu_id); dev = kfd_device_by_id(args->gpu_id); - if (!dev) { - pr_debug("Could not find gpu id 0x%x\n", args->gpu_id); + if (dev == NULL) { + pr_debug("kfd: gpu id 0x%x was not found\n", args->gpu_id); return -EINVAL; } @@ -294,11 +278,12 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, goto err_bind_process; } - pr_debug("Creating queue for PASID %d on gpu 0x%x\n", + pr_debug("kfd: creating queue for PASID %d on GPU 0x%x\n", p->pasid, dev->id); - err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id); + err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, + 0, q_properties.type, &queue_id); if (err != 0) goto err_create_queue; @@ -306,28 +291,20 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, /* Return gpu_id as doorbell offset for mmap usage */ - args->doorbell_offset = KFD_MMAP_TYPE_DOORBELL; - args->doorbell_offset |= KFD_MMAP_GPU_ID(args->gpu_id); + args->doorbell_offset = (KFD_MMAP_DOORBELL_MASK | args->gpu_id); args->doorbell_offset <<= PAGE_SHIFT; - if (KFD_IS_SOC15(dev->device_info->asic_family)) - /* On SOC15 ASICs, doorbell allocation must be - * per-device, and independent from the per-process - * queue_id. Return the doorbell offset within the - * doorbell aperture to user mode. 
- */ - args->doorbell_offset |= q_properties.doorbell_off; mutex_unlock(&p->mutex); - pr_debug("Queue id %d was created successfully\n", args->queue_id); + pr_debug("kfd: queue id %d was created successfully\n", args->queue_id); - pr_debug("Ring buffer address == 0x%016llX\n", + pr_debug("ring buffer address == 0x%016llX\n", args->ring_base_address); - pr_debug("Read ptr address == 0x%016llX\n", + pr_debug("read ptr address == 0x%016llX\n", args->read_pointer_address); - pr_debug("Write ptr address == 0x%016llX\n", + pr_debug("write ptr address == 0x%016llX\n", args->write_pointer_address); return 0; @@ -344,7 +321,7 @@ static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p, int retval; struct kfd_ioctl_destroy_queue_args *args = data; - pr_debug("Destroying queue id %d for pasid %d\n", + pr_debug("kfd: destroying queue id %d for PASID %d\n", args->queue_id, p->pasid); @@ -364,12 +341,12 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, struct queue_properties properties; if (args->queue_percentage > KFD_MAX_QUEUE_PERCENTAGE) { - pr_err("Queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n"); + pr_err("kfd: queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n"); return -EINVAL; } if (args->queue_priority > KFD_MAX_QUEUE_PRIORITY) { - pr_err("Queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n"); + pr_err("kfd: queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n"); return -EINVAL; } @@ -377,12 +354,12 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, (!access_ok(VERIFY_WRITE, (const void __user *) args->ring_base_address, sizeof(uint64_t)))) { - pr_err("Can't access ring base address\n"); + pr_err("kfd: can't access ring base address\n"); return -EFAULT; } if (!is_power_of_2(args->ring_size) && (args->ring_size != 0)) { - pr_err("Ring size must be a power of 2 or 0\n"); + pr_err("kfd: ring size must be a power of 2 or 0\n"); return -EINVAL; } @@ -391,7 +368,7 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, properties.queue_percent = args->queue_percentage; properties.priority = args->queue_priority; - pr_debug("Updating queue id %d for pasid %d\n", + pr_debug("kfd: updating queue id %d for PASID %d\n", args->queue_id, p->pasid); mutex_lock(&p->mutex); @@ -403,58 +380,6 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, return retval; } -static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p, - void *data) -{ - int retval; - const int max_num_cus = 1024; - struct kfd_ioctl_set_cu_mask_args *args = data; - struct queue_properties properties; - uint32_t __user *cu_mask_ptr = (uint32_t __user *)args->cu_mask_ptr; - size_t cu_mask_size = sizeof(uint32_t) * (args->num_cu_mask / 32); - - if ((args->num_cu_mask % 32) != 0) { - pr_debug("num_cu_mask 0x%x must be a multiple of 32", - args->num_cu_mask); - return -EINVAL; - } - - properties.cu_mask_count = args->num_cu_mask; - if (properties.cu_mask_count == 0) { - pr_debug("CU mask cannot be 0"); - return -EINVAL; - } - - /* To prevent an unreasonably large CU mask size, set an arbitrary - * limit of max_num_cus bits. We can then just drop any CU mask bits - * past max_num_cus bits and just use the first max_num_cus bits. 
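[Editor's note] The doorbell handling restored above packs a marker mask and the gpu_id into the mmap offset returned for the queue, shifted by PAGE_SHIFT so the driver's mmap handler can recover both pieces from vma->vm_pgoff. A hedged sketch of that encoding follows; the mask value, shift, and helper names are placeholders, not the driver's actual KFD_MMAP_DOORBELL_MASK definition.

/* Placeholder encoding sketch; the real mask and shift live in the
 * driver headers. */
#include <stdint.h>

#define PAGE_SHIFT_EXAMPLE    12
#define DOORBELL_MASK_EXAMPLE (1ull << 50)	/* stand-in for KFD_MMAP_DOORBELL_MASK */

uint64_t encode_doorbell_offset(uint32_t gpu_id)
{
	/* mirrors: (KFD_MMAP_DOORBELL_MASK | gpu_id) << PAGE_SHIFT */
	return (DOORBELL_MASK_EXAMPLE | gpu_id) << PAGE_SHIFT_EXAMPLE;
}

uint32_t decode_gpu_id(uint64_t vm_pgoff)
{
	/* mmap() re-divides the offset by PAGE_SIZE, leaving mask | gpu_id */
	return (uint32_t)(vm_pgoff & ~DOORBELL_MASK_EXAMPLE);
}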
- */ - if (properties.cu_mask_count > max_num_cus) { - pr_debug("CU mask cannot be greater than 1024 bits"); - properties.cu_mask_count = max_num_cus; - cu_mask_size = sizeof(uint32_t) * (max_num_cus/32); - } - - properties.cu_mask = kzalloc(cu_mask_size, GFP_KERNEL); - if (!properties.cu_mask) - return -ENOMEM; - - retval = copy_from_user(properties.cu_mask, cu_mask_ptr, cu_mask_size); - if (retval) { - pr_debug("Could not copy CU mask from userspace"); - kfree(properties.cu_mask); - return -EFAULT; - } - - mutex_lock(&p->mutex); - - retval = pqm_set_cu_mask(&p->pqm, args->queue_id, &properties); - - mutex_unlock(&p->mutex); - - return retval; -} - static int kfd_ioctl_set_memory_policy(struct file *filep, struct kfd_process *p, void *data) { @@ -475,7 +400,7 @@ static int kfd_ioctl_set_memory_policy(struct file *filep, } dev = kfd_device_by_id(args->gpu_id); - if (!dev) + if (dev == NULL) return -EINVAL; mutex_lock(&p->mutex); @@ -507,38 +432,6 @@ static int kfd_ioctl_set_memory_policy(struct file *filep, return err; } -static int kfd_ioctl_set_trap_handler(struct file *filep, - struct kfd_process *p, void *data) -{ - struct kfd_ioctl_set_trap_handler_args *args = data; - struct kfd_dev *dev; - int err = 0; - struct kfd_process_device *pdd; - - dev = kfd_device_by_id(args->gpu_id); - if (!dev) - return -EINVAL; - - mutex_lock(&p->mutex); - - pdd = kfd_bind_process_to_device(dev, p); - if (IS_ERR(pdd)) { - err = -ESRCH; - goto out; - } - - if (dev->dqm->ops.set_trap_handler(dev->dqm, - &pdd->qpd, - args->tba_addr, - args->tma_addr)) - err = -EINVAL; - -out: - mutex_unlock(&p->mutex); - - return err; -} - static int kfd_ioctl_dbg_register(struct file *filep, struct kfd_process *p, void *data) { @@ -550,11 +443,16 @@ static int kfd_ioctl_dbg_register(struct file *filep, long status = 0; dev = kfd_device_by_id(args->gpu_id); - if (!dev) + if (dev == NULL) return -EINVAL; - mutex_lock(&p->mutex); + if (dev->device_info->asic_family == CHIP_CARRIZO) { + pr_debug("kfd_ioctl_dbg_register not supported on CZ\n"); + return -EINVAL; + } + mutex_lock(kfd_get_dbgmgr_mutex()); + mutex_lock(&p->mutex); /* * make sure that we have pdd, if this the first queue created for @@ -562,11 +460,12 @@ static int kfd_ioctl_dbg_register(struct file *filep, */ pdd = kfd_bind_process_to_device(dev, p); if (IS_ERR(pdd)) { - status = PTR_ERR(pdd); - goto out; + mutex_unlock(&p->mutex); + mutex_unlock(kfd_get_dbgmgr_mutex()); + return PTR_ERR(pdd); } - if (!dev->dbgmgr) { + if (dev->dbgmgr == NULL) { /* In case of a legal call, we have no dbgmgr yet */ create_ok = kfd_dbgmgr_create(&dbgmgr_ptr, dev); if (create_ok) { @@ -581,9 +480,8 @@ static int kfd_ioctl_dbg_register(struct file *filep, status = -EINVAL; } -out: - mutex_unlock(kfd_get_dbgmgr_mutex()); mutex_unlock(&p->mutex); + mutex_unlock(kfd_get_dbgmgr_mutex()); return status; } @@ -596,7 +494,7 @@ static int kfd_ioctl_dbg_unregister(struct file *filep, long status; dev = kfd_device_by_id(args->gpu_id); - if (!dev) + if (dev == NULL) return -EINVAL; if (dev->device_info->asic_family == CHIP_CARRIZO) { @@ -607,7 +505,7 @@ static int kfd_ioctl_dbg_unregister(struct file *filep, mutex_lock(kfd_get_dbgmgr_mutex()); status = kfd_dbgmgr_unregister(dev->dbgmgr, p); - if (!status) { + if (status == 0) { kfd_dbgmgr_destroy(dev->dbgmgr); dev->dbgmgr = NULL; } @@ -641,13 +539,21 @@ static int kfd_ioctl_dbg_address_watch(struct file *filep, memset((void *) &aw_info, 0, sizeof(struct dbg_address_watch_info)); dev = kfd_device_by_id(args->gpu_id); - if (!dev) + if (dev == 
NULL) + return -EINVAL; + + if (dev->device_info->asic_family == CHIP_CARRIZO) { + pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); return -EINVAL; + } cmd_from_user = (void __user *) args->content_ptr; - if (args->buf_size_in_bytes > MAX_ALLOWED_AW_BUFF_SIZE || - (args->buf_size_in_bytes <= sizeof(*args))) + /* Validate arguments */ + + if ((args->buf_size_in_bytes > MAX_ALLOWED_AW_BUFF_SIZE) || + (args->buf_size_in_bytes <= sizeof(*args) + sizeof(int) * 2) || + (cmd_from_user == NULL)) return -EINVAL; /* this is the actual buffer to work with */ @@ -673,9 +579,9 @@ static int kfd_ioctl_dbg_address_watch(struct file *filep, /* skip over the addresses buffer */ args_idx += sizeof(aw_info.watch_address) * aw_info.num_watch_points; - if (args_idx >= args->buf_size_in_bytes) { - status = -EINVAL; - goto out; + if (args_idx >= args->buf_size_in_bytes - sizeof(*args)) { + kfree(args_buff); + return -EINVAL; } watch_mask_value = (uint64_t) args_buff[args_idx]; @@ -697,9 +603,9 @@ static int kfd_ioctl_dbg_address_watch(struct file *filep, args_idx += sizeof(aw_info.watch_mask); } - if (args_idx > args->buf_size_in_bytes) { - status = -EINVAL; - goto out; + if (args_idx >= args->buf_size_in_bytes - sizeof(args)) { + kfree(args_buff); + return -EINVAL; } /* Currently HSA Event is not supported for DBG */ @@ -711,7 +617,6 @@ static int kfd_ioctl_dbg_address_watch(struct file *filep, mutex_unlock(kfd_get_dbgmgr_mutex()); -out: kfree(args_buff); return status; @@ -741,9 +646,14 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep, sizeof(wac_info.trapId); dev = kfd_device_by_id(args->gpu_id); - if (!dev) + if (dev == NULL) return -EINVAL; + if (dev->device_info->asic_family == CHIP_CARRIZO) { + pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); + return -EINVAL; + } + /* input size must match the computed "compact" size */ if (args->buf_size_in_bytes != computed_buff_size) { pr_debug("size mismatch, computed : actual %u : %u\n", @@ -802,37 +712,22 @@ static int kfd_ioctl_get_clock_counters(struct file *filep, { struct kfd_ioctl_get_clock_counters_args *args = data; struct kfd_dev *dev; -#if (defined OS_NAME_RHEL) && (OS_VERSION_MAJOR == 6) \ - || (defined OS_NAME_RHEL_7_2) - struct timespec time; -#else struct timespec64 time; -#endif dev = kfd_device_by_id(args->gpu_id); - if (dev) - /* Reading GPU clock counter from KGD */ - args->gpu_clock_counter = - dev->kfd2kgd->get_gpu_clock_counter(dev->kgd); - else - /* Node without GPU resource */ - args->gpu_clock_counter = 0; + if (dev == NULL) + return -EINVAL; + + /* Reading GPU clock counter from KGD */ + args->gpu_clock_counter = + dev->kfd2kgd->get_gpu_clock_counter(dev->kgd); /* No access to rdtsc. 
Using raw monotonic time */ -#if (defined OS_NAME_RHEL) && (OS_VERSION_MAJOR == 6) \ - || (defined OS_NAME_RHEL_7_2) - getrawmonotonic(&time); - args->cpu_clock_counter = (uint64_t)timespec_to_ns(&time); - - get_monotonic_boottime(&time); - args->system_clock_counter = (uint64_t)timespec_to_ns(&time); -#else getrawmonotonic64(&time); args->cpu_clock_counter = (uint64_t)timespec64_to_ns(&time); get_monotonic_boottime64(&time); args->system_clock_counter = (uint64_t)timespec64_to_ns(&time); -#endif /* Since the counter is in nano-seconds we use 1GHz frequency */ args->system_clock_freq = 1000000000; @@ -887,104 +782,12 @@ static int kfd_ioctl_get_process_apertures(struct file *filp, "scratch_limit %llX\n", pdd->scratch_limit); args->num_of_nodes++; - - pdd = kfd_get_next_process_device_data(p, pdd); - } while (pdd && (args->num_of_nodes < NUM_OF_SUPPORTED_GPUS)); - } - - mutex_unlock(&p->mutex); - - return 0; -} - -static int kfd_ioctl_get_process_apertures_new(struct file *filp, - struct kfd_process *p, void *data) -{ - struct kfd_ioctl_get_process_apertures_new_args *args = data; - struct kfd_process_device_apertures *pa; - struct kfd_process_device *pdd; - uint32_t nodes = 0; - int ret; - - dev_dbg(kfd_device, "get apertures for PASID %d", p->pasid); - - if (args->num_of_nodes == 0) { - /* Return number of nodes, so that user space can alloacate - * sufficient memory - */ - mutex_lock(&p->mutex); - - if (!kfd_has_process_device_data(p)) - goto out_upwrite; - - /* Run over all pdd of the process */ - pdd = kfd_get_first_process_device_data(p); - do { - args->num_of_nodes++; - pdd = kfd_get_next_process_device_data(p, pdd); - } while (pdd); - - goto out_upwrite; - } - - /* Fill in process-aperture information for all available - * nodes, but not more than args->num_of_nodes as that is - * the amount of memory allocated by user - */ - pa = kzalloc((sizeof(struct kfd_process_device_apertures) * - args->num_of_nodes), GFP_KERNEL); - if (!pa) - return -ENOMEM; - - mutex_lock(&p->mutex); - - if (!kfd_has_process_device_data(p)) { - args->num_of_nodes = 0; - kfree(pa); - goto out_upwrite; + } while ((pdd = kfd_get_next_process_device_data(p, pdd)) != NULL && + (args->num_of_nodes < NUM_OF_SUPPORTED_GPUS)); } - /* Run over all pdd of the process */ - pdd = kfd_get_first_process_device_data(p); - do { - pa[nodes].gpu_id = pdd->dev->id; - pa[nodes].lds_base = pdd->lds_base; - pa[nodes].lds_limit = pdd->lds_limit; - pa[nodes].gpuvm_base = pdd->gpuvm_base; - pa[nodes].gpuvm_limit = pdd->gpuvm_limit; - pa[nodes].scratch_base = pdd->scratch_base; - pa[nodes].scratch_limit = pdd->scratch_limit; - - dev_dbg(kfd_device, - "gpu id %u\n", pdd->dev->id); - dev_dbg(kfd_device, - "lds_base %llX\n", pdd->lds_base); - dev_dbg(kfd_device, - "lds_limit %llX\n", pdd->lds_limit); - dev_dbg(kfd_device, - "gpuvm_base %llX\n", pdd->gpuvm_base); - dev_dbg(kfd_device, - "gpuvm_limit %llX\n", pdd->gpuvm_limit); - dev_dbg(kfd_device, - "scratch_base %llX\n", pdd->scratch_base); - dev_dbg(kfd_device, - "scratch_limit %llX\n", pdd->scratch_limit); - nodes++; - - pdd = kfd_get_next_process_device_data(p, pdd); - } while (pdd && (nodes < args->num_of_nodes)); mutex_unlock(&p->mutex); - args->num_of_nodes = nodes; - ret = copy_to_user( - (void __user *)args->kfd_process_device_apertures_ptr, - pa, - (nodes * sizeof(struct kfd_process_device_apertures))); - kfree(pa); - return ret ? 
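[Editor's note] The removed kfd_ioctl_get_process_apertures_new() above implements a two-call protocol: a first call with num_of_nodes == 0 only reports how many GPU nodes exist, and a second call fills a caller-allocated array of per-device apertures. The user-space sketch below shows how a caller would drive that protocol; the struct layouts and the ioctl request macro are assumptions for illustration, not the real uAPI definitions.

/* Assumed, simplified uAPI mirror for illustration only. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>

struct pa_entry {			/* fields mirror the ones logged above */
	uint64_t lds_base, lds_limit;
	uint64_t gpuvm_base, gpuvm_limit;
	uint64_t scratch_base, scratch_limit;
	uint32_t gpu_id, pad;
};

struct pa_args {
	uint64_t kfd_process_device_apertures_ptr;
	uint32_t num_of_nodes;
	uint32_t pad;
};

#define GET_APERTURES_NEW_GUESS _IOWR('K', 0x14, struct pa_args)	/* request number assumed */

int query_apertures(int kfd_fd)
{
	struct pa_args args = { 0 };
	struct pa_entry *pa;
	uint32_t i;

	/* pass 1: num_of_nodes == 0 asks only for the node count */
	if (ioctl(kfd_fd, GET_APERTURES_NEW_GUESS, &args) < 0)
		return -1;

	pa = calloc(args.num_of_nodes, sizeof(*pa));
	if (!pa)
		return -1;

	/* pass 2: the kernel fills at most num_of_nodes entries */
	args.kfd_process_device_apertures_ptr = (uint64_t)(uintptr_t)pa;
	if (ioctl(kfd_fd, GET_APERTURES_NEW_GUESS, &args) < 0) {
		free(pa);
		return -1;
	}

	for (i = 0; i < args.num_of_nodes; i++)
		printf("gpu 0x%x gpuvm 0x%llx-0x%llx\n", pa[i].gpu_id,
		       (unsigned long long)pa[i].gpuvm_base,
		       (unsigned long long)pa[i].gpuvm_limit);

	free(pa);
	return 0;
}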
-EFAULT : 0; - -out_upwrite: - mutex_unlock(&p->mutex); return 0; } @@ -992,57 +795,15 @@ static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p, void *data) { struct kfd_ioctl_create_event_args *args = data; - struct kfd_dev *kfd; - struct kfd_process_device *pdd; - int err = -EINVAL; - void *mem, *kern_addr = NULL; - - pr_debug("Event page offset 0x%llx\n", args->event_page_offset); - - if (args->event_page_offset) { - kfd = kfd_device_by_id(GET_GPU_ID(args->event_page_offset)); - if (!kfd) { - pr_err("Getting device by id failed in %s\n", __func__); - return -EFAULT; - } - if (!kfd->device_info->is_need_iommu_device) { - mutex_lock(&p->mutex); - pdd = kfd_bind_process_to_device(kfd, p); - if (IS_ERR(pdd)) { - err = PTR_ERR(pdd); - goto out_upwrite; - } - mem = kfd_process_device_translate_handle(pdd, - GET_IDR_HANDLE(args->event_page_offset)); - if (!mem) { - pr_err("Can't find BO, offset is 0x%llx\n", - args->event_page_offset); - err = -EFAULT; - goto out_upwrite; - } - mutex_unlock(&p->mutex); - - /* Map dGPU gtt BO to kernel */ - kfd->kfd2kgd->map_gtt_bo_to_kernel(kfd->kgd, - mem, &kern_addr); - } - } + int err; - err = kfd_event_create(filp, p, - args->event_type, - args->auto_reset != 0, - args->node_id, - &args->event_id, - &args->event_trigger_data, - &args->event_page_offset, - &args->event_slot_index, - kern_addr); + err = kfd_event_create(filp, p, args->event_type, + args->auto_reset != 0, args->node_id, + &args->event_id, &args->event_trigger_data, + &args->event_page_offset, + &args->event_slot_index); return err; - -out_upwrite: - mutex_unlock(&p->mutex); - return err; } static int kfd_ioctl_destroy_event(struct file *filp, struct kfd_process *p, @@ -1085,870 +846,9 @@ static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, return err; } -static int kfd_ioctl_alloc_scratch_memory(struct file *filep, - struct kfd_process *p, void *data) -{ - struct kfd_ioctl_alloc_memory_of_scratch_args *args = data; - struct kfd_process_device *pdd; - struct kfd_dev *dev; - long err; - - if (args->size == 0) - return -EINVAL; - - dev = kfd_device_by_id(args->gpu_id); - if (!dev) - return -EINVAL; - - mutex_lock(&p->mutex); - - pdd = kfd_bind_process_to_device(dev, p); - if (IS_ERR(pdd)) { - err = PTR_ERR(pdd); - goto bind_process_to_device_fail; - } - - pdd->sh_hidden_private_base_vmid = args->va_addr; - pdd->qpd.sh_hidden_private_base = args->va_addr; - - mutex_unlock(&p->mutex); - - if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS && - pdd->qpd.vmid != 0) { - err = dev->kfd2kgd->alloc_memory_of_scratch( - dev->kgd, args->va_addr, pdd->qpd.vmid); - if (err != 0) - goto alloc_memory_of_scratch_failed; - } - - return 0; - -bind_process_to_device_fail: - mutex_unlock(&p->mutex); -alloc_memory_of_scratch_failed: - return -EFAULT; -} - -bool kfd_is_large_bar(struct kfd_dev *dev) -{ - struct kfd_local_mem_info mem_info; - - if (debug_largebar) { - pr_debug("Simulate large-bar allocation on non large-bar machine\n"); - return true; - } - - if (dev->device_info->is_need_iommu_device) - return false; - - dev->kfd2kgd->get_local_mem_info(dev->kgd, &mem_info); - if (mem_info.local_mem_size_private == 0 && - mem_info.local_mem_size_public > 0) - return true; - return false; -} - -static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, - struct kfd_process *p, void *data) -{ - struct kfd_ioctl_alloc_memory_of_gpu_args *args = data; - struct kfd_process_device *pdd; - void *mem; - struct kfd_dev *dev; - int idr_handle; - long err; - uint64_t offset = 
args->mmap_offset; - uint32_t flags = args->flags; - struct vm_area_struct *vma; - - if (args->size == 0) - return -EINVAL; - - dev = kfd_device_by_id(args->gpu_id); - if (!dev) - return -EINVAL; - - if (flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) { - /* Check if the userptr corresponds to another (or third-party) - * device local memory. If so treat is as a doorbell. User - * space will be oblivious of this and will use this doorbell - * BO as a regular userptr BO - */ - vma = find_vma(current->mm, args->mmap_offset); - if (vma && (vma->vm_flags & VM_IO)) { - unsigned long pfn; - - follow_pfn(vma, args->mmap_offset, &pfn); - flags |= KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL; - flags &= ~KFD_IOC_ALLOC_MEM_FLAGS_USERPTR; - offset = (pfn << PAGE_SHIFT); - } - } else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) { - if (args->size != kfd_doorbell_process_slice(dev)) - return -EINVAL; - offset = kfd_get_process_doorbells(dev, p); - } - - mutex_lock(&p->mutex); - - pdd = kfd_bind_process_to_device(dev, p); - if (IS_ERR(pdd)) { - err = PTR_ERR(pdd); - goto err_unlock; - } - - err = dev->kfd2kgd->alloc_memory_of_gpu( - dev->kgd, args->va_addr, args->size, - pdd->vm, (struct kgd_mem **) &mem, &offset, - flags); - - if (err) - goto err_unlock; - - idr_handle = kfd_process_device_create_obj_handle(pdd, mem, - args->va_addr, args->size, NULL); - if (idr_handle < 0) { - err = -EFAULT; - goto err_free; - } - - mutex_unlock(&p->mutex); - - args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); - if ((args->flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) != 0 && - !kfd_is_large_bar(dev)) { - args->mmap_offset = 0; - } else { - args->mmap_offset = KFD_MMAP_TYPE_MAP_BO; - args->mmap_offset |= KFD_MMAP_GPU_ID(args->gpu_id); - args->mmap_offset <<= PAGE_SHIFT; - args->mmap_offset |= offset; - } - - return 0; - -err_free: - dev->kfd2kgd->free_memory_of_gpu(dev->kgd, - (struct kgd_mem *) mem, - pdd->vm); -err_unlock: - mutex_unlock(&p->mutex); - return err; -} - -static int kfd_ioctl_free_memory_of_gpu(struct file *filep, - struct kfd_process *p, void *data) -{ - struct kfd_ioctl_free_memory_of_gpu_args *args = data; - struct kfd_process_device *pdd; - struct kfd_bo *buf_obj; - struct kfd_dev *dev; - int ret; - - dev = kfd_device_by_id(GET_GPU_ID(args->handle)); - if (!dev) - return -EINVAL; - - mutex_lock(&p->mutex); - - pdd = kfd_get_process_device_data(dev, p); - if (!pdd) { - pr_err("Process device data doesn't exist\n"); - ret = -EINVAL; - goto err_unlock; - } - - buf_obj = kfd_process_device_find_bo(pdd, - GET_IDR_HANDLE(args->handle)); - if (!buf_obj) { - ret = -EINVAL; - goto err_unlock; - } - run_rdma_free_callback(buf_obj); - - ret = dev->kfd2kgd->free_memory_of_gpu(dev->kgd, buf_obj->mem, - pdd->vm); - - /* If freeing the buffer failed, leave the handle in place for - * clean-up during process tear-down. 
- */ - if (ret == 0) - kfd_process_device_remove_obj_handle( - pdd, GET_IDR_HANDLE(args->handle)); - -err_unlock: - mutex_unlock(&p->mutex); - return ret; -} - -static int kfd_ioctl_map_memory_to_gpu(struct file *filep, - struct kfd_process *p, void *data) -{ - struct kfd_ioctl_map_memory_to_gpu_args *args = data; - struct kfd_process_device *pdd, *peer_pdd; - void *mem; - struct kfd_dev *dev, *peer; - long err = 0; - int i, num_dev = 0; - uint32_t *devices_arr = NULL; - - dev = kfd_device_by_id(GET_GPU_ID(args->handle)); - if (!dev) - return -EINVAL; - - if (args->device_ids_array_size > 0 && - (args->device_ids_array_size < sizeof(uint32_t))) { - pr_err("Node IDs array size %u\n", - args->device_ids_array_size); - return -EFAULT; - } - - if (args->device_ids_array_size > 0) { - devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL); - if (!devices_arr) - return -ENOMEM; - - err = copy_from_user(devices_arr, - (void __user *)args->device_ids_array_ptr, - args->device_ids_array_size); - if (err != 0) { - err = -EFAULT; - goto copy_from_user_failed; - } - } - - mutex_lock(&p->mutex); - - pdd = kfd_bind_process_to_device(dev, p); - if (IS_ERR(pdd)) { - err = PTR_ERR(pdd); - goto bind_process_to_device_failed; - } - - mem = kfd_process_device_translate_handle(pdd, - GET_IDR_HANDLE(args->handle)); - if (!mem) { - err = PTR_ERR(mem); - goto get_mem_obj_from_handle_failed; - } - - if (args->device_ids_array_size > 0) { - num_dev = args->device_ids_array_size / sizeof(uint32_t); - for (i = 0 ; i < num_dev; i++) { - peer = kfd_device_by_id(devices_arr[i]); - if (!peer) { - pr_err("Getting device by id failed for 0x%x\n", - devices_arr[i]); - err = -EFAULT; - goto get_mem_obj_from_handle_failed; - } - - peer_pdd = kfd_bind_process_to_device(peer, p); - if (!peer_pdd) { - err = -EFAULT; - goto get_mem_obj_from_handle_failed; - } - err = peer->kfd2kgd->map_memory_to_gpu( - peer->kgd, (struct kgd_mem *)mem, peer_pdd->vm); - if (err != 0) - pr_err("Failed to map\n"); - } - } else { - err = dev->kfd2kgd->map_memory_to_gpu( - dev->kgd, (struct kgd_mem *)mem, pdd->vm); - if (err != 0) - pr_err("Failed to map\n"); - } - - mutex_unlock(&p->mutex); - - err = dev->kfd2kgd->sync_memory(dev->kgd, (struct kgd_mem *) mem, true); - if (err) { - pr_debug("Sync memory failed, wait interrupted by user signal\n"); - goto sync_memory_failed; - } - - /* Flush TLBs after waiting for the page table updates to complete */ - if (args->device_ids_array_size > 0) { - for (i = 0; i < num_dev; i++) { - peer = kfd_device_by_id(devices_arr[i]); - if (WARN_ON_ONCE(!peer)) - continue; - peer_pdd = kfd_get_process_device_data(dev, p); - if (WARN_ON_ONCE(!peer_pdd)) - continue; - kfd_flush_tlb(peer, p->pasid); - } - } else { - kfd_flush_tlb(dev, p->pasid); - } - - if (args->device_ids_array_size > 0 && devices_arr) - kfree(devices_arr); - - return err; - -bind_process_to_device_failed: -get_mem_obj_from_handle_failed: - mutex_unlock(&p->mutex); -copy_from_user_failed: -sync_memory_failed: - kfree(devices_arr); - return err; -} - -int kfd_unmap_memory_from_gpu(void *mem, struct kfd_process_device *pdd) -{ - int err; - struct kfd_dev *dev = pdd->dev; - - err = dev->kfd2kgd->unmap_memory_to_gpu( - dev->kgd, (struct kgd_mem *) mem, pdd->vm); - - if (err != 0) - return err; - - kfd_flush_tlb(dev, pdd->process->pasid); - - return 0; -} - -static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, - struct kfd_process *p, void *data) -{ - struct kfd_ioctl_unmap_memory_from_gpu_args *args = data; - struct kfd_process_device *pdd, 
*peer_pdd; - void *mem; - struct kfd_dev *dev, *peer; - long err = 0; - uint32_t *devices_arr = NULL, num_dev, i; - - dev = kfd_device_by_id(GET_GPU_ID(args->handle)); - if (!dev) - return -EINVAL; - - if (args->device_ids_array_size > 0 && - (args->device_ids_array_size < sizeof(uint32_t))) { - pr_err("Node IDs array size %u\n", - args->device_ids_array_size); - return -EFAULT; - } - - if (args->device_ids_array_size > 0) { - devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL); - if (!devices_arr) - return -ENOMEM; - - err = copy_from_user(devices_arr, - (void __user *)args->device_ids_array_ptr, - args->device_ids_array_size); - if (err != 0) { - err = -EFAULT; - goto copy_from_user_failed; - } - } - - mutex_lock(&p->mutex); - - pdd = kfd_get_process_device_data(dev, p); - if (!pdd) { - pr_err("Process device data doesn't exist\n"); - err = PTR_ERR(pdd); - goto bind_process_to_device_failed; - } - - mem = kfd_process_device_translate_handle(pdd, - GET_IDR_HANDLE(args->handle)); - if (!mem) { - err = PTR_ERR(mem); - goto get_mem_obj_from_handle_failed; - } - - if (args->device_ids_array_size > 0) { - num_dev = args->device_ids_array_size / sizeof(uint32_t); - for (i = 0 ; i < num_dev; i++) { - peer = kfd_device_by_id(devices_arr[i]); - if (!peer) { - err = -EFAULT; - goto get_mem_obj_from_handle_failed; - } - - peer_pdd = kfd_get_process_device_data(peer, p); - if (!peer_pdd) { - err = -EFAULT; - goto get_mem_obj_from_handle_failed; - } - kfd_unmap_memory_from_gpu(mem, peer_pdd); - } - kfree(devices_arr); - } else - kfd_unmap_memory_from_gpu(mem, pdd); - - mutex_unlock(&p->mutex); - - return 0; - -bind_process_to_device_failed: -get_mem_obj_from_handle_failed: - mutex_unlock(&p->mutex); -copy_from_user_failed: - kfree(devices_arr); - return err; -} - -static int kfd_ioctl_set_process_dgpu_aperture(struct file *filep, - struct kfd_process *p, void *data) -{ - struct kfd_ioctl_set_process_dgpu_aperture_args *args = data; - struct kfd_dev *dev; - struct kfd_process_device *pdd; - long err; - - dev = kfd_device_by_id(args->gpu_id); - if (!dev) - return -EINVAL; - - mutex_lock(&p->mutex); - - pdd = kfd_bind_process_to_device(dev, p); - if (IS_ERR(pdd)) { - err = PTR_ERR(pdd); - goto exit; - } - - err = kfd_set_process_dgpu_aperture(pdd, args->dgpu_base, - args->dgpu_limit); - -exit: - mutex_unlock(&p->mutex); - return err; -} - -static int kfd_ioctl_get_dmabuf_info(struct file *filep, - struct kfd_process *p, void *data) -{ - struct kfd_ioctl_get_dmabuf_info_args *args = data; - struct kfd_dev *dev = NULL; - struct kgd_dev *dma_buf_kgd; - void *metadata_buffer = NULL; - uint32_t flags; - unsigned int i; - int r; - - /* Find a KFD GPU device that supports the get_dmabuf_info query */ - for (i = 0; kfd_topology_enum_kfd_devices(i, &dev) == 0; i++) - if (dev && dev->kfd2kgd->get_dmabuf_info) - break; - if (!dev) - return -EINVAL; - - if (args->metadata_ptr) { - metadata_buffer = kzalloc(args->metadata_size, GFP_KERNEL); - if (!metadata_buffer) - return -ENOMEM; - } - - /* Get dmabuf info from KGD */ - r = dev->kfd2kgd->get_dmabuf_info(dev->kgd, args->dmabuf_fd, - &dma_buf_kgd, &args->size, - metadata_buffer, args->metadata_size, - &args->metadata_size, &flags); - if (r) - goto exit; - - /* Reverse-lookup gpu_id from kgd pointer */ - dev = kfd_device_by_kgd(dma_buf_kgd); - if (!dev) { - r = -EINVAL; - goto exit; - } - args->gpu_id = dev->id; - args->flags = flags; - - /* Copy metadata buffer to user mode */ - if (metadata_buffer) { - r = copy_to_user((void __user *)args->metadata_ptr, - 
metadata_buffer, args->metadata_size); - if (r != 0) - r = -EFAULT; - } - -exit: - kfree(metadata_buffer); - - return r; -} - -static int kfd_ioctl_import_dmabuf(struct file *filep, - struct kfd_process *p, void *data) -{ - struct kfd_ioctl_import_dmabuf_args *args = data; - struct kfd_dev *dev; - int r; - - dev = kfd_device_by_id(args->gpu_id); - if (!dev) - return -EINVAL; - - r = kfd_ipc_import_dmabuf(dev, p, args->gpu_id, args->dmabuf_fd, - args->va_addr, &args->handle, NULL); - if (r) - pr_err("Failed to import dmabuf\n"); - - return r; -} - -static int kfd_ioctl_ipc_export_handle(struct file *filep, - struct kfd_process *p, - void *data) -{ - struct kfd_ioctl_ipc_export_handle_args *args = data; - struct kfd_dev *dev; - int r; - - dev = kfd_device_by_id(args->gpu_id); - if (!dev) - return -EINVAL; - - r = kfd_ipc_export_as_handle(dev, p, args->handle, args->share_handle); - if (r) - pr_err("Failed to export IPC handle\n"); - - return r; -} - -static int kfd_ioctl_ipc_import_handle(struct file *filep, - struct kfd_process *p, - void *data) -{ - struct kfd_ioctl_ipc_import_handle_args *args = data; - struct kfd_dev *dev = NULL; - int r; - - dev = kfd_device_by_id(args->gpu_id); - if (!dev) - return -EINVAL; - - r = kfd_ipc_import_handle(dev, p, args->gpu_id, args->share_handle, - args->va_addr, &args->handle, - &args->mmap_offset); - if (r) - pr_err("Failed to import IPC handle\n"); - - return r; -} - -static int kfd_ioctl_get_tile_config(struct file *filep, - struct kfd_process *p, void *data) -{ - struct kfd_ioctl_get_tile_config_args *args = data; - struct kfd_dev *dev; - struct tile_config config; - int err = 0; - - dev = kfd_device_by_id(args->gpu_id); - - dev->kfd2kgd->get_tile_config(dev->kgd, &config); - - args->gb_addr_config = config.gb_addr_config; - args->num_banks = config.num_banks; - args->num_ranks = config.num_ranks; - - if (args->num_tile_configs > config.num_tile_configs) - args->num_tile_configs = config.num_tile_configs; - err = copy_to_user((void __user *)args->tile_config_ptr, - config.tile_config_ptr, - args->num_tile_configs * sizeof(uint32_t)); - if (err) { - args->num_tile_configs = 0; - return -EFAULT; - } - - if (args->num_macro_tile_configs > config.num_macro_tile_configs) - args->num_macro_tile_configs = - config.num_macro_tile_configs; - err = copy_to_user((void __user *)args->macro_tile_config_ptr, - config.macro_tile_config_ptr, - args->num_macro_tile_configs * sizeof(uint32_t)); - if (err) { - args->num_macro_tile_configs = 0; - return -EFAULT; - } - - return 0; -} - -#if defined(BUILD_AS_DKMS) -static int kfd_ioctl_cross_memory_copy(struct file *filep, - struct kfd_process *local_p, void *data) -{ - return 0; -} -#else -static int kfd_ioctl_cross_memory_copy(struct file *filep, - struct kfd_process *local_p, void *data) -{ - struct kfd_ioctl_cross_memory_copy_args *args = data; - struct kfd_memory_range *src_array, *dst_array; - struct kfd_bo *src_bo, *dst_bo; - struct kfd_process *remote_p, *src_p, *dst_p; - struct task_struct *remote_task; - struct mm_struct *remote_mm; - struct pid *remote_pid; - struct dma_fence *fence = NULL, *lfence = NULL; - uint64_t dst_va_addr; - uint64_t copied, total_copied = 0; - uint64_t src_offset, dst_offset; - int i, j = 0, err = 0; - - /* Check parameters */ - if (args->src_mem_range_array == 0 || args->dst_mem_range_array == 0 || - args->src_mem_array_size == 0 || args->dst_mem_array_size == 0) - return -EINVAL; - args->bytes_copied = 0; - - /* Allocate space for source and destination arrays */ - src_array = 
kmalloc_array((args->src_mem_array_size + - args->dst_mem_array_size), - sizeof(struct kfd_memory_range), - GFP_KERNEL); - if (!src_array) - return -ENOMEM; - dst_array = &src_array[args->src_mem_array_size]; - - if (copy_from_user(src_array, (void __user *)args->src_mem_range_array, - args->src_mem_array_size * - sizeof(struct kfd_memory_range))) { - err = -EFAULT; - goto copy_from_user_fail; - } - if (copy_from_user(dst_array, (void __user *)args->dst_mem_range_array, - args->dst_mem_array_size * - sizeof(struct kfd_memory_range))) { - err = -EFAULT; - goto copy_from_user_fail; - } - - /* Get remote process */ - remote_pid = find_get_pid(args->pid); - if (!remote_pid) { - pr_err("Cross mem copy failed. Invalid PID %d\n", args->pid); - err = -ESRCH; - goto copy_from_user_fail; - } - - remote_task = get_pid_task(remote_pid, PIDTYPE_PID); - if (!remote_pid) { - pr_err("Cross mem copy failed. Invalid PID or task died %d\n", - args->pid); - err = -ESRCH; - goto get_pid_task_fail; - } - - /* Check access permission */ - remote_mm = mm_access(remote_task, PTRACE_MODE_ATTACH_REALCREDS); - if (!remote_mm || IS_ERR(remote_mm)) { - err = IS_ERR(remote_mm) ? PTR_ERR(remote_mm) : -ESRCH; - if (err == -EACCES) { - pr_err("Cross mem copy failed. Permission error\n"); - err = -EPERM; - } else - pr_err("Cross mem copy failed. Invalid task %d\n", - err); - goto mm_access_fail; - } - - remote_p = kfd_get_process(remote_task); - if (!remote_p) { - pr_err("Cross mem copy failed. Invalid kfd process %d\n", - args->pid); - err = -EINVAL; - goto kfd_process_fail; - } - - if (KFD_IS_CROSS_MEMORY_WRITE(args->flags)) { - src_p = local_p; - dst_p = remote_p; - pr_debug("CMA WRITE: local -> remote\n"); - } else { - src_p = remote_p; - dst_p = local_p; - pr_debug("CMA READ: remote -> local\n"); - } - - - /* For each source kfd_range: - * - Find the BO. Each range has to be within the same BO. - * - Copy this range to single or multiple destination BOs. - * - dst_va_addr - will point to next va address into which data will - * be copied. - * - dst_bo & src_bo - the current destination and source BOs - * - src_offset & dst_offset - offset into the respective BOs from - * data will be sourced or copied - */ - dst_va_addr = dst_array[0].va_addr; - mutex_lock(&dst_p->mutex); - dst_bo = kfd_process_find_bo_from_interval(dst_p, - dst_va_addr, - dst_va_addr + dst_array[0].size - 1); - mutex_unlock(&dst_p->mutex); - if (!dst_bo) { - err = -EFAULT; - goto kfd_process_fail; - } - dst_offset = dst_va_addr - dst_bo->it.start; - - for (i = 0; i < args->src_mem_array_size; i++) { - uint64_t src_va_addr_end = src_array[i].va_addr + - src_array[i].size - 1; - uint64_t src_size_to_copy = src_array[i].size; - - mutex_lock(&src_p->mutex); - src_bo = kfd_process_find_bo_from_interval(src_p, - src_array[i].va_addr, - src_va_addr_end); - mutex_unlock(&src_p->mutex); - if (!src_bo || src_va_addr_end > src_bo->it.last) { - pr_err("Cross mem copy failed. Invalid range\n"); - err = -EFAULT; - break; - } - - src_offset = src_array[i].va_addr - src_bo->it.start; - - /* Copy src_bo to one or multiple dst_bo(s) based on size and - * and current copy location. - */ - while (j < args->dst_mem_array_size) { - uint64_t copy_size; - int64_t space_left; - - /* Find the current copy_size. This will be smaller of - * the following - * - space left in the current dest memory range - * - data left to copy from source range - */ - space_left = (dst_array[j].va_addr + dst_array[j].size) - - dst_va_addr; - copy_size = (src_size_to_copy < space_left) ? 
- src_size_to_copy : space_left; - - /* Check both BOs belong to same device */ - if (src_bo->dev->kgd != dst_bo->dev->kgd) { - pr_err("Cross Memory failed. Not same device\n"); - err = -EINVAL; - break; - } - - /* Store prev fence. Release it when a later fence is - * created - */ - lfence = fence; - fence = NULL; - - err = dst_bo->dev->kfd2kgd->copy_mem_to_mem( - src_bo->dev->kgd, - src_bo->mem, src_offset, - dst_bo->mem, dst_offset, - copy_size, - &fence, &copied); - - if (err) { - pr_err("GPU Cross mem copy failed\n"); - err = -EFAULT; - break; - } - - /* Later fence available. Release old fence */ - if (fence && lfence) { - dma_fence_put(lfence); - lfence = NULL; - } - - total_copied += copied; - src_size_to_copy -= copied; - space_left -= copied; - dst_va_addr += copied; - dst_offset += copied; - src_offset += copied; - if (dst_va_addr > dst_bo->it.last + 1) { - pr_err("Cross mem copy failed. Memory overflow\n"); - err = -EFAULT; - break; - } - - /* If the cur dest range is full move to next one */ - if (space_left <= 0) { - if (++j >= args->dst_mem_array_size) - break; - - dst_va_addr = dst_array[j].va_addr; - dst_bo = kfd_process_find_bo_from_interval( - dst_p, - dst_va_addr, - dst_va_addr + - dst_array[j].size - 1); - dst_offset = dst_va_addr - dst_bo->it.start; - } - - /* If the cur src range is done, move to next one */ - if (src_size_to_copy <= 0) - break; - } - if (err) - break; - } - - /* Wait for the last fence irrespective of error condition */ - if (fence) { - if (dma_fence_wait_timeout(fence, false, msecs_to_jiffies(1000)) - < 0) - pr_err("Cross mem copy failed. BO timed out\n"); - dma_fence_put(fence); - } else if (lfence) { - pr_debug("GPU copy fail. But wait for prev DMA to finish\n"); - dma_fence_wait_timeout(lfence, true, msecs_to_jiffies(1000)); - dma_fence_put(lfence); - } - -kfd_process_fail: - mmput(remote_mm); -mm_access_fail: - put_task_struct(remote_task); -get_pid_task_fail: - put_pid(remote_pid); -copy_from_user_fail: - kfree(src_array); - - /* An error could happen after partial copy. 
In that case this will
-	 * reflect partial amount of bytes copied
-	 */
-	args->bytes_copied = total_copied;
-	return err;
-}
-#endif
-
-static int kfd_ioctl_get_queue_wave_state(struct file *filep,
-		struct kfd_process *p, void *data)
-{
-	struct kfd_ioctl_get_queue_wave_state_args *args = data;
-	int r;
-
-	mutex_lock(&p->mutex);
-
-	r = pqm_get_wave_state(&p->pqm, args->queue_id,
-			(void __user *)args->ctl_stack_address,
-			&args->ctl_stack_used_size,
-			&args->save_area_used_size);
-
-	mutex_unlock(&p->mutex);
-
-	return r;
-}
 #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
-	[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
-			.cmd_drv = 0, .name = #ioctl}
+	[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, .cmd_drv = 0, .name = #ioctl}
 /** Ioctl table */
 static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
@@ -1999,55 +899,6 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 	AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_WAVE_CONTROL,
 			kfd_ioctl_dbg_wave_control, 0),
-
-	AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_GPU,
-			kfd_ioctl_alloc_memory_of_gpu, 0),
-
-	AMDKFD_IOCTL_DEF(AMDKFD_IOC_FREE_MEMORY_OF_GPU,
-			kfd_ioctl_free_memory_of_gpu, 0),
-
-	AMDKFD_IOCTL_DEF(AMDKFD_IOC_MAP_MEMORY_TO_GPU,
-			kfd_ioctl_map_memory_to_gpu, 0),
-
-	AMDKFD_IOCTL_DEF(AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU,
-			kfd_ioctl_unmap_memory_from_gpu, 0),
-
-	AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH,
-			kfd_ioctl_alloc_scratch_memory, 0),
-
-	AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_CU_MASK,
-			kfd_ioctl_set_cu_mask, 0),
-
-	AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE,
-			kfd_ioctl_set_process_dgpu_aperture, 0),
-
-	AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_TRAP_HANDLER,
-			kfd_ioctl_set_trap_handler, 0),
-
-	AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_PROCESS_APERTURES_NEW,
-			kfd_ioctl_get_process_apertures_new, 0),
-
-	AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_DMABUF_INFO,
-			kfd_ioctl_get_dmabuf_info, 0),
-
-	AMDKFD_IOCTL_DEF(AMDKFD_IOC_IMPORT_DMABUF,
-			kfd_ioctl_import_dmabuf, 0),
-
-	AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_TILE_CONFIG,
-			kfd_ioctl_get_tile_config, 0),
-
-	AMDKFD_IOCTL_DEF(AMDKFD_IOC_IPC_IMPORT_HANDLE,
-			kfd_ioctl_ipc_import_handle, 0),
-
-	AMDKFD_IOCTL_DEF(AMDKFD_IOC_IPC_EXPORT_HANDLE,
-			kfd_ioctl_ipc_export_handle, 0),
-
-	AMDKFD_IOCTL_DEF(AMDKFD_IOC_CROSS_MEMORY_COPY,
-			kfd_ioctl_cross_memory_copy, 0),
-
-	AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_QUEUE_WAVE_STATE,
-			kfd_ioctl_get_queue_wave_state, 0)
-
 };
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
@@ -2143,37 +994,19 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 static int kfd_mmap(struct file *filp, struct vm_area_struct *vma)
 {
 	struct kfd_process *process;
-	struct kfd_dev *kfd;
-	unsigned long vm_pgoff;
-	int retval;
 	process = kfd_get_process(current);
 	if (IS_ERR(process))
 		return PTR_ERR(process);
-	vm_pgoff = vma->vm_pgoff;
-	vma->vm_pgoff = KFD_MMAP_OFFSET_VALUE_GET(vma->vm_pgoff);
-
-	switch (vm_pgoff & KFD_MMAP_TYPE_MASK) {
-	case KFD_MMAP_TYPE_DOORBELL:
-		kfd = kfd_device_by_id(KFD_MMAP_GPU_ID_GET(vm_pgoff));
-		if (!kfd)
-			return -EFAULT;
-		return kfd_doorbell_mmap(kfd, process, vma);
-
-	case KFD_MMAP_TYPE_EVENTS:
+	if ((vma->vm_pgoff & KFD_MMAP_DOORBELL_MASK) ==
+			KFD_MMAP_DOORBELL_MASK) {
+		vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_DOORBELL_MASK;
+		return kfd_doorbell_mmap(process, vma);
+	} else if ((vma->vm_pgoff & KFD_MMAP_EVENTS_MASK) ==
+			KFD_MMAP_EVENTS_MASK) {
+		vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_EVENTS_MASK;
 		return kfd_event_mmap(process, vma);
-
-	case KFD_MMAP_TYPE_MAP_BO:
-		kfd =
kfd_device_by_id(KFD_MMAP_GPU_ID_GET(vm_pgoff)); - if (!kfd) - return -EFAULT; - retval = kfd->kfd2kgd->mmap_bo(kfd->kgd, vma); - return retval; - - case KFD_MMAP_TYPE_RESERVED_MEM: - return kfd_reserved_mem_mmap(process, vma); - } return -EFAULT; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c deleted file mode 100644 index 4e94081..0000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ /dev/null @@ -1,1304 +0,0 @@ -#include -#include -#include -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) -#include -#endif -#include -#include "kfd_crat.h" -#include "kfd_priv.h" -#include "kfd_topology.h" - -/* GPU Processor ID base for dGPUs for which VCRAT needs to be created. - * GPU processor ID are expressed with Bit[31]=1. - * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs - * used in the CRAT. - */ -static uint32_t gpu_processor_id_low = 0x80001000; - -/* Return the next available gpu_processor_id and increment it for next GPU - * @total_cu_count - Total CUs present in the GPU including ones - * masked off - */ -static inline unsigned int get_and_inc_gpu_processor_id( - unsigned int total_cu_count) -{ - int current_id = gpu_processor_id_low; - - gpu_processor_id_low += total_cu_count; - return current_id; -} - -/* Static table to describe GPU Cache information */ -struct kfd_gpu_cache_info { - uint32_t cache_size; - uint32_t cache_level; - uint32_t flags; - /* Indicates how many Compute Units share this cache - * Value = 1 indicates the cache is not shared - */ - uint32_t num_cu_shared; -}; - -static struct kfd_gpu_cache_info kaveri_cache_info[] = { - { - /* TCP L1 Cache per CU */ - .cache_size = 16, - .cache_level = 1, - .flags = (CRAT_CACHE_FLAGS_ENABLED | - CRAT_CACHE_FLAGS_DATA_CACHE | - CRAT_CACHE_FLAGS_SIMD_CACHE), - .num_cu_shared = 1, - - }, - { - /* Scalar L1 Instruction Cache (in SQC module) per bank */ - .cache_size = 16, - .cache_level = 1, - .flags = (CRAT_CACHE_FLAGS_ENABLED | - CRAT_CACHE_FLAGS_INST_CACHE | - CRAT_CACHE_FLAGS_SIMD_CACHE), - .num_cu_shared = 2, - }, - { - /* Scalar L1 Data Cache (in SQC module) per bank */ - .cache_size = 8, - .cache_level = 1, - .flags = (CRAT_CACHE_FLAGS_ENABLED | - CRAT_CACHE_FLAGS_DATA_CACHE | - CRAT_CACHE_FLAGS_SIMD_CACHE), - .num_cu_shared = 2, - }, - - /* TODO: Add L2 Cache information */ -}; - - -static struct kfd_gpu_cache_info carrizo_cache_info[] = { - { - /* TCP L1 Cache per CU */ - .cache_size = 16, - .cache_level = 1, - .flags = (CRAT_CACHE_FLAGS_ENABLED | - CRAT_CACHE_FLAGS_DATA_CACHE | - CRAT_CACHE_FLAGS_SIMD_CACHE), - .num_cu_shared = 1, - }, - { - /* Scalar L1 Instruction Cache (in SQC module) per bank */ - .cache_size = 8, - .cache_level = 1, - .flags = (CRAT_CACHE_FLAGS_ENABLED | - CRAT_CACHE_FLAGS_INST_CACHE | - CRAT_CACHE_FLAGS_SIMD_CACHE), - .num_cu_shared = 4, - }, - { - /* Scalar L1 Data Cache (in SQC module) per bank. */ - .cache_size = 4, - .cache_level = 1, - .flags = (CRAT_CACHE_FLAGS_ENABLED | - CRAT_CACHE_FLAGS_DATA_CACHE | - CRAT_CACHE_FLAGS_SIMD_CACHE), - .num_cu_shared = 4, - }, - - /* TODO: Add L2 Cache information */ -}; - -/* NOTE: In future if more information is added to struct kfd_gpu_cache_info - * the following ASICs may need a separate table. 
- */ -#define hawaii_cache_info kaveri_cache_info -#define tonga_cache_info carrizo_cache_info -#define fiji_cache_info carrizo_cache_info -#define polaris10_cache_info carrizo_cache_info -#define polaris11_cache_info carrizo_cache_info -/* TODO - check & update Vega10 cache details */ -#define vega10_cache_info carrizo_cache_info -#define raven_cache_info carrizo_cache_info - -static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, - struct crat_subtype_computeunit *cu) -{ - dev->node_props.cpu_cores_count = cu->num_cpu_cores; - dev->node_props.cpu_core_id_base = cu->processor_id_low; -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) - dev->node_props.capability |= HSA_CAP_ATS_PRESENT; -#endif - - pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, - cu->processor_id_low); -} - -static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, - struct crat_subtype_computeunit *cu) -{ - dev->node_props.simd_id_base = cu->processor_id_low; - dev->node_props.simd_count = cu->num_simd_cores; - dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; - dev->node_props.max_waves_per_simd = cu->max_waves_simd; - dev->node_props.wave_front_size = cu->wave_front_size; - dev->node_props.array_count = cu->array_count; - dev->node_props.cu_per_simd_array = cu->num_cu_per_array; - dev->node_props.simd_per_cu = cu->num_simd_per_cu; - dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; - if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) - dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; - pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low); -} - -/* kfd_parse_subtype_cu - parse compute unit subtypes and attach it to correct - * topology device present in the device_list - */ -static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu, - struct list_head *device_list) -{ - struct kfd_topology_device *dev; - - pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", - cu->proximity_domain, cu->hsa_capability); - list_for_each_entry(dev, device_list, list) { - if (cu->proximity_domain == dev->proximity_domain) { - if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) - kfd_populated_cu_info_cpu(dev, cu); - - if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) - kfd_populated_cu_info_gpu(dev, cu); - break; - } - } - - return 0; -} - -/* kfd_parse_subtype_mem - parse memory subtypes and attach it to correct - * topology device present in the device_list - */ -static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem, - struct list_head *device_list) -{ - struct kfd_mem_properties *props; - struct kfd_topology_device *dev; - - pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n", - mem->proximity_domain); - list_for_each_entry(dev, device_list, list) { - if (mem->proximity_domain == dev->proximity_domain) { - props = kfd_alloc_struct(props); - if (!props) - return -ENOMEM; - - /* We're on GPU node */ - if (dev->node_props.cpu_cores_count == 0) { - /* APU */ - if (mem->visibility_type == 0) - props->heap_type = - HSA_MEM_HEAP_TYPE_FB_PRIVATE; - /* dGPU */ - else - props->heap_type = mem->visibility_type; - } else - props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; - - if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) - props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; - if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) - props->flags |= HSA_MEM_FLAGS_NON_VOLATILE; - - props->size_in_bytes = - ((uint64_t)mem->length_high << 32) + - mem->length_low; - props->width = 
mem->width; - - dev->node_props.mem_banks_count++; - list_add_tail(&props->list, &dev->mem_props); - - break; - } - } - - return 0; -} - -/* kfd_parse_subtype_cache - parse cache subtypes and attach it to correct - * topology device present in the device_list - */ -static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache, - struct list_head *device_list) -{ - struct kfd_cache_properties *props; - struct kfd_topology_device *dev; - uint32_t id; - uint32_t total_num_of_cu; - - id = cache->processor_id_low; - - list_for_each_entry(dev, device_list, list) { - total_num_of_cu = (dev->node_props.array_count * - dev->node_props.cu_per_simd_array); - - /* Cache infomration in CRAT doesn't have proximity_domain - * information as it is associated with a CPU core or GPU - * Compute Unit. So map the cache using CPU core Id or SIMD - * (GPU) ID. - * TODO: This works because currently we can safely assume that - * Compute Units are parsed before caches are parsed. In - * future, remove this dependency - */ - if ((id >= dev->node_props.cpu_core_id_base && - id <= dev->node_props.cpu_core_id_base + - dev->node_props.cpu_cores_count) || - (id >= dev->node_props.simd_id_base && - id < dev->node_props.simd_id_base + - total_num_of_cu)) { - props = kfd_alloc_struct(props); - if (!props) - return -ENOMEM; - - props->processor_id_low = id; - props->cache_level = cache->cache_level; - props->cache_size = cache->cache_size; - props->cacheline_size = cache->cache_line_size; - props->cachelines_per_tag = cache->lines_per_tag; - props->cache_assoc = cache->associativity; - props->cache_latency = cache->cache_latency; - memcpy(props->sibling_map, cache->sibling_map, - sizeof(props->sibling_map)); - - if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) - props->cache_type |= HSA_CACHE_TYPE_DATA; - if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) - props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; - if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) - props->cache_type |= HSA_CACHE_TYPE_CPU; - if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) - props->cache_type |= HSA_CACHE_TYPE_HSACU; - - dev->cache_count++; - dev->node_props.caches_count++; - list_add_tail(&props->list, &dev->cache_props); - - break; - } - } - - return 0; -} - -/* kfd_parse_subtype_iolink - parse iolink subtypes and attach it to correct - * topology device present in the device_list - */ -static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink, - struct list_head *device_list) -{ - struct kfd_iolink_properties *props = NULL, *props2; - struct kfd_topology_device *dev, *cpu_dev; - uint32_t id_from; - uint32_t id_to; - - id_from = iolink->proximity_domain_from; - id_to = iolink->proximity_domain_to; - - pr_debug("Found IO link entry in CRAT table with id_from=%d\n", - id_from); - list_for_each_entry(dev, device_list, list) { - if (id_from == dev->proximity_domain) { - props = kfd_alloc_struct(props); - if (!props) - return -ENOMEM; - - props->node_from = id_from; - props->node_to = id_to; - props->ver_maj = iolink->version_major; - props->ver_min = iolink->version_minor; - props->iolink_type = iolink->io_interface_type; - - if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) - props->weight = 20; - else - props->weight = node_distance(id_from, id_to); - - props->min_latency = iolink->minimum_latency; - props->max_latency = iolink->maximum_latency; - props->min_bandwidth = iolink->minimum_bandwidth_mbs; - props->max_bandwidth = iolink->maximum_bandwidth_mbs; - props->rec_transfer_size = - iolink->recommended_transfer_size; - - 
dev->io_link_count++; - dev->node_props.io_links_count++; - list_add_tail(&props->list, &dev->io_link_props); - break; - } - } - - /* CPU topology is created before GPUs are detected, so CPU->GPU - * links are not built at that time. If a PCIe type is discovered, it - * means a GPU is detected and we are adding GPU->CPU to the topology. - * At this time, also add the corresponded CPU->GPU link. - */ - if (props && props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) { - cpu_dev = kfd_topology_device_by_proximity_domain(id_to); - if (!cpu_dev) - return -ENODEV; - /* same everything but the other direction */ - props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL); - props2->node_from = id_to; - props2->node_to = id_from; - props2->kobj = NULL; - cpu_dev->io_link_count++; - cpu_dev->node_props.io_links_count++; - list_add_tail(&props2->list, &cpu_dev->io_link_props); - } - - return 0; -} - -/* kfd_parse_subtype - parse subtypes and attach it to correct topology device - * present in the device_list - * @sub_type_hdr - subtype section of crat_image - * @device_list - list of topology devices present in this crat_image - */ -static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr, - struct list_head *device_list) -{ - struct crat_subtype_computeunit *cu; - struct crat_subtype_memory *mem; - struct crat_subtype_cache *cache; - struct crat_subtype_iolink *iolink; - int ret = 0; - - switch (sub_type_hdr->type) { - case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: - cu = (struct crat_subtype_computeunit *)sub_type_hdr; - ret = kfd_parse_subtype_cu(cu, device_list); - break; - case CRAT_SUBTYPE_MEMORY_AFFINITY: - mem = (struct crat_subtype_memory *)sub_type_hdr; - ret = kfd_parse_subtype_mem(mem, device_list); - break; - case CRAT_SUBTYPE_CACHE_AFFINITY: - cache = (struct crat_subtype_cache *)sub_type_hdr; - ret = kfd_parse_subtype_cache(cache, device_list); - break; - case CRAT_SUBTYPE_TLB_AFFINITY: - /* For now, nothing to do here */ - pr_debug("Found TLB entry in CRAT table (not processing)\n"); - break; - case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: - /* For now, nothing to do here */ - pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n"); - break; - case CRAT_SUBTYPE_IOLINK_AFFINITY: - iolink = (struct crat_subtype_iolink *)sub_type_hdr; - ret = kfd_parse_subtype_iolink(iolink, device_list); - break; - default: - pr_warn("Unknown subtype %d in CRAT\n", - sub_type_hdr->type); - } - - return ret; -} - -/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT - * create a kfd_topology_device and add in to device_list. 
Also parse - * CRAT subtypes and attach it to appropriate kfd_topology_device - * @crat_image - input image containing CRAT - * @device_list - [OUT] list of kfd_topology_device generated after - * parsing crat_image - * @proximity_domain - Proximity domain of the first device in the table - * - * Return - 0 if successful else -ve value - */ -int kfd_parse_crat_table(void *crat_image, - struct list_head *device_list, - uint32_t proximity_domain) -{ - struct kfd_topology_device *top_dev = NULL; - struct crat_subtype_generic *sub_type_hdr; - uint16_t node_id; - int ret = 0; - struct crat_header *crat_table = (struct crat_header *)crat_image; - uint16_t num_nodes; - uint32_t image_len; - uint32_t last_header_type, last_header_length; - - if (!crat_image) - return -EINVAL; - - if (!list_empty(device_list)) { - pr_warn("Error device list should be empty\n"); - return -EINVAL; - } - - num_nodes = crat_table->num_domains; - image_len = crat_table->length; - - pr_info("Parsing CRAT table with %d nodes\n", num_nodes); - - for (node_id = 0; node_id < num_nodes; node_id++) { - top_dev = kfd_create_topology_device(device_list); - if (!top_dev) - break; - top_dev->proximity_domain = proximity_domain++; - } - - if (!top_dev) { - ret = -ENOMEM; - goto err; - } - - memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH); - memcpy(top_dev->oem_table_id, crat_table->oem_table_id, - CRAT_OEMTABLEID_LENGTH); - top_dev->oem_revision = crat_table->oem_revision; - - last_header_type = last_header_length = 0; - sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); - while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) < - ((char *)crat_image) + image_len) { - pr_debug("Parsing CRAT subtype header %p enabled: %s type: 0x%x length %d\n", - sub_type_hdr, - (sub_type_hdr->flags & - CRAT_SUBTYPE_FLAGS_ENABLED) - ? "true" : "false", - sub_type_hdr->type, - sub_type_hdr->length); - - if (sub_type_hdr->length == 0) { - pr_err("Parsing wrong CRAT's subtype header last header type: %d last header len %d\n", - last_header_type, last_header_type); - pr_err("Current header type %d length %d\n", - sub_type_hdr->type, sub_type_hdr->length); - break; - } - - if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) { - ret = kfd_parse_subtype(sub_type_hdr, device_list); - if (ret != 0) - break; - } - - last_header_type = sub_type_hdr->type; - last_header_length = sub_type_hdr->length; - sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + - sub_type_hdr->length); - } - -err: - if (ret) - kfd_release_topology_device_list(device_list); - - return ret; -} - -/* Helper function. See kfd_fill_gpu_cache_info for parameter description */ -static int fill_in_pcache(struct crat_subtype_cache *pcache, - struct kfd_gpu_cache_info *pcache_info, - struct kfd_cu_info *cu_info, - int mem_available, - int cu_bitmask, - int cache_type, unsigned int cu_processor_id, - int cu_block) -{ - unsigned int cu_sibling_map_mask; - int first_active_cu; - - /* First check if enough memory is available */ - if (mem_available - sizeof(struct crat_subtype_cache) < 0) - return -ENOMEM; - - cu_sibling_map_mask = cu_bitmask; - cu_sibling_map_mask >>= cu_block; - cu_sibling_map_mask &= - ((1 << pcache_info[cache_type].num_cu_shared) - 1); - first_active_cu = ffs(cu_sibling_map_mask); - - /* CU could be inactive. In case of shared cache find the first active - * CU. and incase of non-shared cache check if the CU is inactive. 
If - * inactive active skip it - */ - if (first_active_cu) { - memset(pcache, 0, sizeof(struct crat_subtype_cache)); - pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY; - pcache->length = sizeof(struct crat_subtype_cache); - pcache->flags = pcache_info[cache_type].flags; - pcache->processor_id_low = cu_processor_id - + (first_active_cu - 1); - pcache->cache_level = pcache_info[cache_type].cache_level; - pcache->cache_size = pcache_info[cache_type].cache_size; - - /* Sibling map is w.r.t processor_id_low, so shift out - * inactive CU - */ - cu_sibling_map_mask = - cu_sibling_map_mask >> (first_active_cu - 1); - - pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF); - pcache->sibling_map[1] = - (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF); - pcache->sibling_map[2] = - (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF); - pcache->sibling_map[3] = - (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF); - return 0; - } - return 1; -} - -/* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info - * tables - * - * @kdev - [IN] GPU device - * @gpu_processor_id - [IN] GPU processor ID to which these caches - * associate - * @available_size - [IN] Amount of memory available in pcache - * @cu_info - [IN] Compute Unit info obtained from KGD - * @pcache - [OUT] memory into which cache data is to be filled in. - * @size_filled - [OUT] amount of data used up in pcache. - * @num_of_entries - [OUT] number of caches added - */ -static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, - int gpu_processor_id, - int available_size, - struct kfd_cu_info *cu_info, - struct crat_subtype_cache *pcache, - int *size_filled, - int *num_of_entries) -{ - struct kfd_gpu_cache_info *pcache_info; - int num_of_cache_types = 0; - int i, j, k; - int ct = 0; - int mem_available = available_size; - unsigned int cu_processor_id; - int ret; - - switch (kdev->device_info->asic_family) { - case CHIP_KAVERI: - pcache_info = kaveri_cache_info; - num_of_cache_types = ARRAY_SIZE(kaveri_cache_info); - break; - case CHIP_HAWAII: - pcache_info = hawaii_cache_info; - num_of_cache_types = ARRAY_SIZE(hawaii_cache_info); - break; - case CHIP_CARRIZO: - pcache_info = carrizo_cache_info; - num_of_cache_types = ARRAY_SIZE(carrizo_cache_info); - break; - case CHIP_TONGA: - pcache_info = tonga_cache_info; - num_of_cache_types = ARRAY_SIZE(tonga_cache_info); - break; - case CHIP_FIJI: - pcache_info = fiji_cache_info; - num_of_cache_types = ARRAY_SIZE(fiji_cache_info); - break; - case CHIP_POLARIS10: - pcache_info = polaris10_cache_info; - num_of_cache_types = ARRAY_SIZE(polaris10_cache_info); - break; - case CHIP_POLARIS11: - pcache_info = polaris11_cache_info; - num_of_cache_types = ARRAY_SIZE(polaris11_cache_info); - break; - case CHIP_VEGA10: - pcache_info = vega10_cache_info; - num_of_cache_types = ARRAY_SIZE(vega10_cache_info); - break; - case CHIP_RAVEN: - pcache_info = raven_cache_info; - num_of_cache_types = ARRAY_SIZE(raven_cache_info); - break; - default: - return -EINVAL; - } - - *size_filled = 0; - *num_of_entries = 0; - - /* For each type of cache listed in the kfd_gpu_cache_info table, - * go through all available Compute Units. 
- * The [i,j,k] loop will - * if kfd_gpu_cache_info.num_cu_shared = 1 - * will parse through all available CU - * If (kfd_gpu_cache_info.num_cu_shared != 1) - * then it will consider only one CU from - * the shared unit - */ - - for (ct = 0; ct < num_of_cache_types; ct++) { - cu_processor_id = gpu_processor_id; - for (i = 0; i < cu_info->num_shader_engines; i++) { - for (j = 0; j < cu_info->num_shader_arrays_per_engine; - j++) { - for (k = 0; k < cu_info->num_cu_per_sh; - k += pcache_info[ct].num_cu_shared) { - - ret = fill_in_pcache(pcache, - pcache_info, - cu_info, - mem_available, - cu_info->cu_bitmap[i][j], - ct, - cu_processor_id, - k); - - if (ret < 0) - break; - - if (!ret) { - pcache++; - (*num_of_entries)++; - mem_available -= - sizeof(*pcache); - (*size_filled) += - sizeof(*pcache); - } - - /* Move to next CU block */ - cu_processor_id += - pcache_info[ct].num_cu_shared; - } - } - } - } - - pr_debug("Added [%d] GPU cache entries\n", *num_of_entries); - - return 0; -} - -/* - * kfd_create_crat_image_acpi - Allocates memory for CRAT image and - * copies CRAT from ACPI (if available). - * NOTE: Call kfd_destroy_crat_image to free CRAT image memory - * - * @crat_image: CRAT read from ACPI. If no CRAT in ACPI then - * crat_image will be NULL - * @size: [OUT] size of crat_image - * - * Return 0 if successful else return -ve value - */ -#ifdef CONFIG_ACPI -int kfd_create_crat_image_acpi(void **crat_image, size_t *size) -{ - struct acpi_table_header *crat_table; - acpi_status status; - void *pcrat_image; - - if (!crat_image) - return -EINVAL; - - *crat_image = NULL; - - /* Fetch the CRAT table from ACPI */ - status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); - if (status == AE_NOT_FOUND) { - pr_warn("CRAT table not found\n"); - return -ENODATA; - } else if (ACPI_FAILURE(status)) { - const char *err = acpi_format_exception(status); - - pr_err("CRAT table error: %s\n", err); - return -EINVAL; - } - - if (ignore_crat) { - pr_info("CRAT table disabled by module option\n"); - return -ENODATA; - } - - pcrat_image = kmalloc(crat_table->length, GFP_KERNEL); - if (!pcrat_image) { - pr_err("No memory for allocating CRAT image\n"); - return -ENOMEM; - } - - memcpy(pcrat_image, crat_table, crat_table->length); - - *crat_image = pcrat_image; - *size = crat_table->length; - - return 0; -} -#endif - -/* Memory required to create Virtual CRAT. - * Since there is no easy way to predict the amount of memory required, the - * following amount are allocated for CPU and GPU Virtual CRAT. This is - * expected to cover all known conditions. But to be safe additional check - * is put in the code to ensure we don't overwrite. 
- */ -#define VCRAT_SIZE_FOR_CPU (2 * PAGE_SIZE) -#define VCRAT_SIZE_FOR_GPU (3 * PAGE_SIZE) - -/* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node - * - * @numa_node_id: CPU NUMA node id - * @avail_size: Available size in the memory - * @sub_type_hdr: Memory into which compute info will be filled in - * - * Return 0 if successful else return -ve value - */ -static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size, - int proximity_domain, - struct crat_subtype_computeunit *sub_type_hdr) -{ - const struct cpumask *cpumask; - - *avail_size -= sizeof(struct crat_subtype_computeunit); - if (*avail_size < 0) - return -ENOMEM; - - memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); - - /* Fill in subtype header data */ - sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; - sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); - sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; - - cpumask = cpumask_of_node(numa_node_id); - - /* Fill in CU data */ - sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT; - sub_type_hdr->proximity_domain = proximity_domain; - sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id); - if (sub_type_hdr->processor_id_low == -1) - return -EINVAL; - - sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask); - - return 0; -} - -/* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node - * - * @numa_node_id: CPU NUMA node id - * @avail_size: Available size in the memory - * @sub_type_hdr: Memory into which compute info will be filled in - * - * Return 0 if successful else return -ve value - */ -static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size, - int proximity_domain, - struct crat_subtype_memory *sub_type_hdr) -{ - uint64_t mem_in_bytes = 0; - pg_data_t *pgdat; - int zone_type; - - *avail_size -= sizeof(struct crat_subtype_memory); - if (*avail_size < 0) - return -ENOMEM; - - memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); - - /* Fill in subtype header data */ - sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; - sub_type_hdr->length = sizeof(struct crat_subtype_memory); - sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; - - /* Fill in Memory Subunit data */ - - /* Unlike si_meminfo, si_meminfo_node is not exported. 
So - * the following lines are duplicated from si_meminfo_node - * function - */ - pgdat = NODE_DATA(numa_node_id); - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) - mem_in_bytes += pgdat->node_zones[zone_type].managed_pages; - mem_in_bytes <<= PAGE_SHIFT; - - sub_type_hdr->length_low = lower_32_bits(mem_in_bytes); - sub_type_hdr->length_high = upper_32_bits(mem_in_bytes); - sub_type_hdr->proximity_domain = proximity_domain; - - return 0; -} - -#ifdef CONFIG_X86_64 -static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size, - uint32_t *num_entries, - struct crat_subtype_iolink *sub_type_hdr) -{ - int nid; - struct cpuinfo_x86 *c = &cpu_data(0); - uint8_t link_type; - - if (c->x86_vendor == X86_VENDOR_AMD) - link_type = CRAT_IOLINK_TYPE_HYPERTRANSPORT; - else - link_type = CRAT_IOLINK_TYPE_QPI_1_1; - - *num_entries = 0; - - /* Create IO links from this node to other CPU nodes */ - for_each_online_node(nid) { - if (nid == numa_node_id) /* node itself */ - continue; - - *avail_size -= sizeof(struct crat_subtype_iolink); - if (*avail_size < 0) - return -ENOMEM; - - memset(sub_type_hdr, 0, sizeof(struct crat_subtype_iolink)); - - /* Fill in subtype header data */ - sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY; - sub_type_hdr->length = sizeof(struct crat_subtype_iolink); - sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; - - /* Fill in IO link data */ - sub_type_hdr->proximity_domain_from = numa_node_id; - sub_type_hdr->proximity_domain_to = nid; - sub_type_hdr->io_interface_type = link_type; - - (*num_entries)++; - sub_type_hdr++; - } - - return 0; -} -#endif - -/* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU - * - * @pcrat_image: Fill in VCRAT for CPU - * @size: [IN] allocated size of crat_image. - * [OUT] actual size of data filled in crat_image - */ -static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size) -{ - struct crat_header *crat_table = (struct crat_header *)pcrat_image; - struct crat_subtype_generic *sub_type_hdr; - int avail_size = *size; - int numa_node_id; - int ret = 0; -#ifdef CONFIG_ACPI - struct acpi_table_header *acpi_table; - acpi_status status; -#endif -#ifdef CONFIG_X86_64 - uint32_t entries = 0; -#endif - - if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_CPU) - return -EINVAL; - - /* Fill in CRAT Header. - * Modify length and total_entries as subunits are added. 
- */ - avail_size -= sizeof(struct crat_header); - if (avail_size < 0) - return -ENOMEM; - - memset(crat_table, 0, sizeof(struct crat_header)); - memcpy(&crat_table->signature, CRAT_SIGNATURE, - sizeof(crat_table->signature)); - crat_table->length = sizeof(struct crat_header); - -#ifdef CONFIG_ACPI - status = acpi_get_table("DSDT", 0, &acpi_table); - if (status == AE_NOT_FOUND) - pr_warn("DSDT table not found for OEM information\n"); - else { - crat_table->oem_revision = acpi_table->revision; - memcpy(crat_table->oem_id, acpi_table->oem_id, - CRAT_OEMID_LENGTH); - memcpy(crat_table->oem_table_id, acpi_table->oem_table_id, - CRAT_OEMTABLEID_LENGTH); - } -#else - crat_table->oem_revision = 0; - memcpy(crat_table->oem_id, "INV", CRAT_OEMID_LENGTH); - memcpy(crat_table->oem_table_id, "UNAVAIL", CRAT_OEMTABLEID_LENGTH); -#endif - crat_table->total_entries = 0; - crat_table->num_domains = 0; - - sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); - - for_each_online_node(numa_node_id) { - if (kfd_numa_node_to_apic_id(numa_node_id) == -1) - continue; - - /* Fill in Subtype: Compute Unit */ - ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size, - crat_table->num_domains, - (struct crat_subtype_computeunit *)sub_type_hdr); - if (ret < 0) - return ret; - crat_table->length += sub_type_hdr->length; - crat_table->total_entries++; - - sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + - sub_type_hdr->length); - - /* Fill in Subtype: Memory */ - ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size, - crat_table->num_domains, - (struct crat_subtype_memory *)sub_type_hdr); - if (ret < 0) - return ret; - crat_table->length += sub_type_hdr->length; - crat_table->total_entries++; - - sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + - sub_type_hdr->length); - - /* Fill in Subtype: IO Link */ -#ifdef CONFIG_X86_64 - ret = kfd_fill_iolink_info_for_cpu(numa_node_id, &avail_size, - &entries, - (struct crat_subtype_iolink *)sub_type_hdr); - if (ret < 0) - return ret; - crat_table->length += (sub_type_hdr->length * entries); - crat_table->total_entries += entries; - - sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + - sub_type_hdr->length * entries); -#else - pr_info("IO link not available for non x86 platforms\n"); -#endif - - crat_table->num_domains++; - } - - /* TODO: Add cache Subtype for CPU. - * Currently, CPU cache information is available in function - * detect_cache_attributes(cpu) defined in the file - * ./arch/x86/kernel/cpu/intel_cacheinfo.c. This function is not - * exported and to get the same information the code needs to be - * duplicated. 
- */ - - *size = crat_table->length; - pr_info("Virtual CRAT table created for CPU\n"); - - return 0; -} - -static int kfd_fill_gpu_memory_affinity(int *avail_size, - struct kfd_dev *kdev, uint8_t type, uint64_t size, - struct crat_subtype_memory *sub_type_hdr, - uint32_t proximity_domain, - const struct kfd_local_mem_info *local_mem_info) -{ - *avail_size -= sizeof(struct crat_subtype_memory); - if (*avail_size < 0) - return -ENOMEM; - - memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); - sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; - sub_type_hdr->length = sizeof(struct crat_subtype_memory); - sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED; - - sub_type_hdr->proximity_domain = proximity_domain; - - pr_debug("Fill gpu memory affinity - type 0x%x size 0x%llx\n", - type, size); - - sub_type_hdr->length_low = lower_32_bits(size); - sub_type_hdr->length_high = upper_32_bits(size); - - sub_type_hdr->width = local_mem_info->vram_width; - sub_type_hdr->visibility_type = type; - - return 0; -} - -/* kfd_fill_gpu_direct_io_link - Fill in direct io link from GPU - * to its NUMA node - * @avail_size: Available size in the memory - * @kdev - [IN] GPU device - * @sub_type_hdr: Memory into which io link info will be filled in - * @proximity_domain - proximity domain of the GPU node - * - * Return 0 if successful else return -ve value - */ -static int kfd_fill_gpu_direct_io_link(int *avail_size, - struct kfd_dev *kdev, - struct crat_subtype_iolink *sub_type_hdr, - uint32_t proximity_domain) -{ - *avail_size -= sizeof(struct crat_subtype_iolink); - if (*avail_size < 0) - return -ENOMEM; - - memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink)); - - /* Fill in subtype header data */ - sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY; - sub_type_hdr->length = sizeof(struct crat_subtype_iolink); - sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED; - - /* Fill in IOLINK subtype. - * TODO: Fill-in other fields of iolink subtype - */ - sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS; - sub_type_hdr->proximity_domain_from = proximity_domain; -#ifdef CONFIG_NUMA - if (kdev->pdev->dev.numa_node == NUMA_NO_NODE) - sub_type_hdr->proximity_domain_to = 0; - else - sub_type_hdr->proximity_domain_to = kdev->pdev->dev.numa_node; -#else - sub_type_hdr->proximity_domain_to = 0; -#endif - return 0; -} - -/* kfd_create_vcrat_image_gpu - Create Virtual CRAT for CPU - * - * @pcrat_image: Fill in VCRAT for GPU - * @size: [IN] allocated size of crat_image. - * [OUT] actual size of data filled in crat_image - */ -static int kfd_create_vcrat_image_gpu(void *pcrat_image, - size_t *size, struct kfd_dev *kdev, - uint32_t proximity_domain) -{ - struct crat_header *crat_table = (struct crat_header *)pcrat_image; - struct crat_subtype_generic *sub_type_hdr; - struct crat_subtype_computeunit *cu; - struct kfd_cu_info cu_info; - int avail_size = *size; - uint32_t total_num_of_cu; - int num_of_cache_entries = 0; - int cache_mem_filled = 0; - int ret = 0; -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - struct amd_iommu_device_info iommu_info; - const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP | - AMD_IOMMU_DEVICE_FLAG_PRI_SUP | - AMD_IOMMU_DEVICE_FLAG_PASID_SUP; -#endif - struct kfd_local_mem_info local_mem_info; - - if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU) - return -EINVAL; - - /* Fill the CRAT Header. - * Modify length and total_entries as subunits are added. 
- */ - avail_size -= sizeof(struct crat_header); - if (avail_size < 0) - return -ENOMEM; - - memset(crat_table, 0, sizeof(struct crat_header)); - - memcpy(&crat_table->signature, CRAT_SIGNATURE, - sizeof(crat_table->signature)); - /* Change length as we add more subtypes*/ - crat_table->length = sizeof(struct crat_header); - crat_table->num_domains = 1; - crat_table->total_entries = 0; - - /* Fill in Subtype: Compute Unit - * First fill in the sub type header and then sub type data - */ - avail_size -= sizeof(struct crat_subtype_computeunit); - if (avail_size < 0) - return -ENOMEM; - - sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1); - memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); - - sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; - sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); - sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; - - /* Fill CU subtype data */ - cu = (struct crat_subtype_computeunit *)sub_type_hdr; - cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT; - cu->proximity_domain = proximity_domain; - - kdev->kfd2kgd->get_cu_info(kdev->kgd, &cu_info); - cu->num_simd_per_cu = cu_info.simd_per_cu; - cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number; - cu->max_waves_simd = cu_info.max_waves_per_simd; - - cu->wave_front_size = cu_info.wave_front_size; - cu->array_count = cu_info.num_shader_arrays_per_engine * - cu_info.num_shader_engines; - total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh); - cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu); - cu->num_cu_per_array = cu_info.num_cu_per_sh; - cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu; - cu->num_banks = cu_info.num_shader_engines; - cu->lds_size_in_kb = cu_info.lds_size; - - cu->hsa_capability = 0; - - /* Check if this node supports IOMMU. During parsing this flag will - * translate to HSA_CAP_ATS_PRESENT - */ -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - iommu_info.flags = 0; - if (amd_iommu_device_info(kdev->pdev, &iommu_info) == 0) { - if ((iommu_info.flags & required_iommu_flags) == - required_iommu_flags) - cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT; - } -#endif - - crat_table->length += sub_type_hdr->length; - crat_table->total_entries++; - - /* Fill in Subtype: Memory. Only on systems with large BAR (no - * private FB), report memory as public. On other systems - * report the total FB size (public+private) as a single - * private heap. - */ - kdev->kfd2kgd->get_local_mem_info(kdev->kgd, &local_mem_info); - sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + - sub_type_hdr->length); - - if (debug_largebar) - local_mem_info.local_mem_size_private = 0; - - if (local_mem_info.local_mem_size_private == 0) - ret = kfd_fill_gpu_memory_affinity(&avail_size, - kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC, - local_mem_info.local_mem_size_public, - (struct crat_subtype_memory *)sub_type_hdr, - proximity_domain, - &local_mem_info); - else - ret = kfd_fill_gpu_memory_affinity(&avail_size, - kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE, - local_mem_info.local_mem_size_public + - local_mem_info.local_mem_size_private, - (struct crat_subtype_memory *)sub_type_hdr, - proximity_domain, - &local_mem_info); - if (ret < 0) - return ret; - - crat_table->length += sizeof(struct crat_subtype_memory); - crat_table->total_entries++; - - /* TODO: Fill in cache information. 
This information is NOT readily - * available in KGD - */ - sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + - sub_type_hdr->length); - ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low, - avail_size, - &cu_info, - (struct crat_subtype_cache *)sub_type_hdr, - &cache_mem_filled, - &num_of_cache_entries); - - if (ret < 0) - return ret; - - crat_table->length += cache_mem_filled; - crat_table->total_entries += num_of_cache_entries; - avail_size -= cache_mem_filled; - - /* Fill in Subtype: IO_LINKS - * Only direct links are added here which is Link from GPU to - * to its NUMA node. Indirect links are added by userspace. - */ - sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + - cache_mem_filled); - ret = kfd_fill_gpu_direct_io_link(&avail_size, kdev, - (struct crat_subtype_iolink *)sub_type_hdr, proximity_domain); - - if (ret < 0) - return ret; - - crat_table->length += sub_type_hdr->length; - crat_table->total_entries++; - - *size = crat_table->length; - pr_info("Virtual CRAT table created for GPU\n"); - - return ret; -} - -/* kfd_create_crat_image_virtual - Allocates memory for CRAT image and - * creates a Virtual CRAT (VCRAT) image - * - * NOTE: Call kfd_destroy_crat_image to free CRAT image memory - * - * @crat_image: VCRAT image created because ACPI does not have a - * CRAT for this device - * @size: [OUT] size of virtual crat_image - * @flags: COMPUTE_UNIT_CPU - Create VCRAT for CPU device - * COMPUTE_UNIT_GPU - Create VCRAT for GPU - * (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU - * -- this option is not currently implemented. - * The assumption is that all AMD APUs will have CRAT - * @kdev: Valid kfd_device required if flags contain COMPUTE_UNIT_GPU - * - * Return 0 if successful else return -ve value - */ -int kfd_create_crat_image_virtual(void **crat_image, size_t *size, - int flags, struct kfd_dev *kdev, uint32_t proximity_domain) -{ - void *pcrat_image = NULL; - int ret = 0; - - if (!crat_image) - return -EINVAL; - - *crat_image = NULL; - - /* Allocate one VCRAT_SIZE_FOR_CPU for CPU virtual CRAT image and - * VCRAT_SIZE_FOR_GPU for GPU virtual CRAT image. This should cover - * all the current conditions. A check is put not to overwrite beyond - * allocated size - */ - switch (flags) { - case COMPUTE_UNIT_CPU: - pcrat_image = kmalloc(VCRAT_SIZE_FOR_CPU, GFP_KERNEL); - if (!pcrat_image) - return -ENOMEM; - *size = VCRAT_SIZE_FOR_CPU; - ret = kfd_create_vcrat_image_cpu(pcrat_image, size); - break; - case COMPUTE_UNIT_GPU: - if (!kdev) - return -EINVAL; - pcrat_image = kmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL); - if (!pcrat_image) - return -ENOMEM; - *size = VCRAT_SIZE_FOR_GPU; - ret = kfd_create_vcrat_image_gpu(pcrat_image, size, - kdev, proximity_domain); - break; - case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU): - /* TODO: */ - ret = -EINVAL; - pr_err("VCRAT not implemented for APU\n"); - break; - default: - ret = -EINVAL; - } - - if (!ret) - *crat_image = pcrat_image; - else - kfree(pcrat_image); - - return ret; -} - - -/* kfd_destroy_crat_image - * - * @crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..) 
- *
- */
-void kfd_destroy_crat_image(void *crat_image)
-{
-	kfree(crat_image);
-}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
index 00de41f..a374fa3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
@@ -24,7 +24,6 @@ #define KFD_CRAT_H_INCLUDED
 #include
-#include "kfd_priv.h"
 #pragma pack(1)
@@ -45,10 +44,6 @@ #define CRAT_OEMID_64BIT_MASK ((1ULL << (CRAT_OEMID_LENGTH * 8)) - 1)
-/* Compute Unit flags */
-#define COMPUTE_UNIT_CPU	(1 << 0)	/* Create Virtual CRAT for CPU */
-#define COMPUTE_UNIT_GPU	(1 << 1)	/* Create Virtual CRAT for GPU */
-
 struct crat_header {
 	uint32_t	signature;
 	uint32_t	length;
@@ -110,7 +105,7 @@ struct crat_subtype_computeunit {
 	uint8_t		wave_front_size;
 	uint8_t		num_banks;
 	uint16_t	micro_engine_id;
-	uint8_t		array_count;
+	uint8_t		num_arrays;
 	uint8_t		num_cu_per_array;
 	uint8_t		num_simd_per_cu;
 	uint8_t		max_slots_scatch_cu;
@@ -132,14 +127,13 @@ struct crat_subtype_memory {
 	uint8_t		length;
 	uint16_t	reserved;
 	uint32_t	flags;
-	uint32_t	proximity_domain;
+	uint32_t	promixity_domain;
 	uint32_t	base_addr_low;
 	uint32_t	base_addr_high;
 	uint32_t	length_low;
 	uint32_t	length_high;
 	uint32_t	width;
-	uint8_t		visibility_type; /* for virtual (dGPU) CRAT */
-	uint8_t		reserved2[CRAT_MEMORY_RESERVED_LENGTH - 1];
+	uint8_t		reserved2[CRAT_MEMORY_RESERVED_LENGTH];
 };
 /*
@@ -228,12 +222,9 @@ struct crat_subtype_ccompute {
 /*
  * HSA IO Link Affinity structure and definitions
  */
-#define CRAT_IOLINK_FLAGS_ENABLED		(1 << 0)
-#define CRAT_IOLINK_FLAGS_NON_COHERENT		(1 << 1)
-#define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT	(1 << 2)
-#define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT	(1 << 3)
-#define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA	(1 << 4)
-#define CRAT_IOLINK_FLAGS_RESERVED_MASK		0xffffffe0
+#define CRAT_IOLINK_FLAGS_ENABLED	0x00000001
+#define CRAT_IOLINK_FLAGS_COHERENCY	0x00000002
+#define CRAT_IOLINK_FLAGS_RESERVED	0xfffffffc
 /*
  * IO interface types
@@ -241,16 +232,8 @@ struct crat_subtype_ccompute {
 #define CRAT_IOLINK_TYPE_UNDEFINED	0
 #define CRAT_IOLINK_TYPE_HYPERTRANSPORT	1
 #define CRAT_IOLINK_TYPE_PCIEXPRESS	2
-#define CRAT_IOLINK_TYPE_AMBA		3
-#define CRAT_IOLINK_TYPE_MIPI		4
-#define CRAT_IOLINK_TYPE_QPI_1_1	5
-#define CRAT_IOLINK_TYPE_RESERVED1	6
-#define CRAT_IOLINK_TYPE_RESERVED2	7
-#define CRAT_IOLINK_TYPE_RAPID_IO	8
-#define CRAT_IOLINK_TYPE_INFINIBAND	9
-#define CRAT_IOLINK_TYPE_RESERVED3	10
-#define CRAT_IOLINK_TYPE_OTHER		11
-#define CRAT_IOLINK_TYPE_MAX		255
+#define CRAT_IOLINK_TYPE_OTHER		3
+#define CRAT_IOLINK_TYPE_MAX		255
 #define CRAT_IOLINK_RESERVED_LENGTH	24
@@ -308,13 +291,4 @@ struct cdit_header {
 #pragma pack()
-#ifdef CONFIG_ACPI
-int kfd_create_crat_image_acpi(void **crat_image, size_t *size);
-#endif
-void kfd_destroy_crat_image(void *crat_image);
-int kfd_parse_crat_table(void *crat_image,
-		struct list_head *device_list,
-		uint32_t proximity_domain);
-int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
-		int flags, struct kfd_dev *kdev, uint32_t proximity_domain);
 #endif /* KFD_CRAT_H_INCLUDED */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
index af6d736..d5e19b5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
@@ -29,7 +29,7 @@
 #include
 #include
-#include "kfd_pm4_headers_vi.h"
+#include "kfd_pm4_headers.h"
 #include "kfd_pm4_headers_diq.h"
 #include "kfd_kernel_queue.h"
 #include "kfd_priv.h"
@@ -42,15 +42,16 @@ static void dbgdev_address_watch_disable_nodiq(struct kfd_dev
*dev) { + BUG_ON(!dev || !dev->kfd2kgd); + dev->kfd2kgd->address_watch_disable(dev->kgd); } static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, unsigned int pasid, uint64_t vmid0_address, - uint32_t *packet_buff, size_t size_in_bytes, - bool sync) + uint32_t *packet_buff, size_t size_in_bytes) { - struct pm4_mec_release_mem *rm_packet; + struct pm4__release_mem *rm_packet; struct pm4__indirect_buffer_pasid *ib_packet; struct kfd_mem_obj *mem_obj; size_t pq_packets_size_in_bytes; @@ -61,14 +62,12 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, unsigned int *ib_packet_buff; int status; - if (WARN_ON(!size_in_bytes)) - return -EINVAL; + BUG_ON(!dbgdev || !dbgdev->kq || !packet_buff || !size_in_bytes); kq = dbgdev->kq; - pq_packets_size_in_bytes = sizeof(struct pm4__indirect_buffer_pasid); - if (sync) - pq_packets_size_in_bytes += sizeof(struct pm4_mec_release_mem); + pq_packets_size_in_bytes = sizeof(struct pm4__release_mem) + + sizeof(struct pm4__indirect_buffer_pasid); /* * We acquire a buffer from DIQ @@ -78,8 +77,8 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, status = kq->ops.acquire_packet_buffer(kq, pq_packets_size_in_bytes / sizeof(uint32_t), &ib_packet_buff); - if (status) { - pr_err("acquire_packet_buffer failed\n"); + if (status != 0) { + pr_err("amdkfd: acquire_packet_buffer failed\n"); return status; } @@ -101,11 +100,6 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, ib_packet->bitfields5.pasid = pasid; - if (!sync) { - kq->ops.submit_packet(kq); - return status; - } - /* * for now we use release mem for GPU-CPU synchronization * Consider WaitRegMem + WriteData as a better alternative @@ -114,15 +108,15 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, * (a) Sync with HW * (b) Sync var is written by CP to mem. 
*/ - rm_packet = (struct pm4_mec_release_mem *) (ib_packet_buff + + rm_packet = (struct pm4__release_mem *) (ib_packet_buff + (sizeof(struct pm4__indirect_buffer_pasid) / sizeof(unsigned int))); status = kfd_gtt_sa_allocate(dbgdev->dev, sizeof(uint64_t), &mem_obj); - if (status) { - pr_err("Failed to allocate GART memory\n"); + if (status != 0) { + pr_err("amdkfd: Failed to allocate GART memory\n"); kq->ops.rollback_packet(kq); return status; } @@ -133,7 +127,7 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, rm_packet->header.opcode = IT_RELEASE_MEM; rm_packet->header.type = PM4_TYPE_3; - rm_packet->header.count = sizeof(struct pm4_mec_release_mem) / + rm_packet->header.count = sizeof(struct pm4__release_mem) / sizeof(unsigned int) - 2; rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; @@ -174,6 +168,8 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, static int dbgdev_register_nodiq(struct kfd_dbgdev *dbgdev) { + BUG_ON(!dbgdev); + /* * no action is needed in this case, * just make sure diq will not be used @@ -191,12 +187,14 @@ static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev) struct kernel_queue *kq = NULL; int status; - properties.type = KFD_QUEUE_TYPE_DIQ; + BUG_ON(!dbgdev || !dbgdev->pqm || !dbgdev->dev); + status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL, - &properties, &qid); + &properties, 0, KFD_QUEUE_TYPE_DIQ, + &qid); if (status) { - pr_err("Failed to create DIQ\n"); + pr_err("amdkfd: Failed to create DIQ\n"); return status; } @@ -204,8 +202,8 @@ static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev) kq = pqm_get_kernel_queue(dbgdev->pqm, qid); - if (!kq) { - pr_err("Error getting DIQ\n"); + if (kq == NULL) { + pr_err("amdkfd: Error getting DIQ\n"); pqm_destroy_queue(dbgdev->pqm, qid); return -EFAULT; } @@ -217,6 +215,8 @@ static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev) static int dbgdev_unregister_nodiq(struct kfd_dbgdev *dbgdev) { + BUG_ON(!dbgdev || !dbgdev->dev); + /* disable watch address */ dbgdev_address_watch_disable_nodiq(dbgdev->dev); return 0; @@ -227,6 +227,8 @@ static int dbgdev_unregister_diq(struct kfd_dbgdev *dbgdev) /* todo - disable address watch */ int status; + BUG_ON(!dbgdev || !dbgdev->pqm || !dbgdev->kq); + status = pqm_destroy_queue(dbgdev->pqm, dbgdev->kq->queue->properties.queue_id); dbgdev->kq = NULL; @@ -239,17 +241,18 @@ static void dbgdev_address_watch_set_registers( union TCP_WATCH_ADDR_H_BITS *addrHi, union TCP_WATCH_ADDR_L_BITS *addrLo, union TCP_WATCH_CNTL_BITS *cntl, - unsigned int index, unsigned int vmid, - bool is_apu) + unsigned int index, unsigned int vmid) { union ULARGE_INTEGER addr; + BUG_ON(!adw_info || !addrHi || !addrLo || !cntl); + addr.quad_part = 0; addrHi->u32All = 0; addrLo->u32All = 0; cntl->u32All = 0; - if (adw_info->watch_mask) + if (adw_info->watch_mask != NULL) cntl->bitfields.mask = (uint32_t) (adw_info->watch_mask[index] & ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK); @@ -265,9 +268,9 @@ static void dbgdev_address_watch_set_registers( cntl->bitfields.mode = adw_info->watch_mode[index]; cntl->bitfields.vmid = (uint32_t) vmid; - /* for APU assume it is an ATC address */ - if (is_apu) - cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT; + /* for now assume it is an ATC address */ + cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT; + pr_debug("\t\t%20s %08x\n", "set reg mask :", cntl->bitfields.mask); pr_debug("\t\t%20s %08x\n", "set reg add high :", addrHi->bitfields.addr); @@ -276,7 +279,7 @@ static void dbgdev_address_watch_set_registers( } static int 
dbgdev_address_watch_nodiq(struct kfd_dbgdev *dbgdev, - struct dbg_address_watch_info *adw_info) + struct dbg_address_watch_info *adw_info) { union TCP_WATCH_ADDR_H_BITS addrHi; union TCP_WATCH_ADDR_L_BITS addrLo; @@ -284,11 +287,13 @@ static int dbgdev_address_watch_nodiq(struct kfd_dbgdev *dbgdev, struct kfd_process_device *pdd; unsigned int i; + BUG_ON(!dbgdev || !dbgdev->dev || !adw_info); + /* taking the vmid for that process on the safe way using pdd */ pdd = kfd_get_process_device_data(dbgdev->dev, adw_info->process); if (!pdd) { - pr_err("Failed to get pdd for wave control no DIQ\n"); + pr_err("amdkfd: Failed to get pdd for wave control no DIQ\n"); return -EFAULT; } @@ -298,19 +303,19 @@ static int dbgdev_address_watch_nodiq(struct kfd_dbgdev *dbgdev, if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) || (adw_info->num_watch_points == 0)) { - pr_err("num_watch_points is invalid\n"); + pr_err("amdkfd: num_watch_points is invalid\n"); return -EINVAL; } - if (!adw_info->watch_mode || !adw_info->watch_address) { - pr_err("adw_info fields are not valid\n"); + if ((adw_info->watch_mode == NULL) || + (adw_info->watch_address == NULL)) { + pr_err("amdkfd: adw_info fields are not valid\n"); return -EINVAL; } - for (i = 0; i < adw_info->num_watch_points; i++) { + for (i = 0 ; i < adw_info->num_watch_points ; i++) { dbgdev_address_watch_set_registers(adw_info, &addrHi, &addrLo, - &cntl, i, pdd->qpd.vmid, - dbgdev->dev->device_info->is_need_iommu_device); + &cntl, i, pdd->qpd.vmid); pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); pr_debug("\t\t%20s %08x\n", "register index :", i); @@ -343,43 +348,48 @@ static int dbgdev_address_watch_nodiq(struct kfd_dbgdev *dbgdev, } static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, - struct dbg_address_watch_info *adw_info) + struct dbg_address_watch_info *adw_info) { struct pm4__set_config_reg *packets_vec; union TCP_WATCH_ADDR_H_BITS addrHi; union TCP_WATCH_ADDR_L_BITS addrLo; union TCP_WATCH_CNTL_BITS cntl; + struct kfd_mem_obj *mem_obj; unsigned int aw_reg_add_dword; uint32_t *packet_buff_uint; - uint64_t packet_buff_gpu_addr; unsigned int i; int status; size_t ib_size = sizeof(struct pm4__set_config_reg) * 4; /* we do not control the vmid in DIQ mode, just a place holder */ unsigned int vmid = 0; + BUG_ON(!dbgdev || !dbgdev->dev || !adw_info); + addrHi.u32All = 0; addrLo.u32All = 0; cntl.u32All = 0; if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) || (adw_info->num_watch_points == 0)) { - pr_err("num_watch_points is invalid\n"); + pr_err("amdkfd: num_watch_points is invalid\n"); return -EINVAL; } - if (!adw_info->watch_mode || !adw_info->watch_address) { - pr_err("adw_info fields are not valid\n"); + if ((NULL == adw_info->watch_mode) || + (NULL == adw_info->watch_address)) { + pr_err("amdkfd: adw_info fields are not valid\n"); return -EINVAL; } - status = dbgdev->kq->ops.acquire_inline_ib(dbgdev->kq, - ib_size/sizeof(uint32_t), - &packet_buff_uint, &packet_buff_gpu_addr); - if (status) { - pr_err("Failed to allocate IB from DIQ ring\n"); + status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); + + if (status != 0) { + pr_err("amdkfd: Failed to allocate GART memory\n"); return status; } + + packet_buff_uint = mem_obj->cpu_ptr; + memset(packet_buff_uint, 0, ib_size); packets_vec = (struct pm4__set_config_reg *) (packet_buff_uint); @@ -398,9 +408,12 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, packets_vec[3].bitfields2.insert_vmid = 1; for (i = 0; i < adw_info->num_watch_points; i++) { - 
dbgdev_address_watch_set_registers(adw_info, &addrHi, &addrLo, - &cntl, i, vmid, - dbgdev->dev->device_info->is_need_iommu_device); + dbgdev_address_watch_set_registers(adw_info, + &addrHi, + &addrLo, + &cntl, + i, + vmid); pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); pr_debug("\t\t%20s %08x\n", "register index :", i); @@ -429,6 +442,8 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, i, ADDRESS_WATCH_REG_CNTL); + aw_reg_add_dword /= sizeof(uint32_t); + packets_vec[0].bitfields2.reg_offset = aw_reg_add_dword - AMD_CONFIG_REG_BASE; @@ -440,6 +455,8 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, i, ADDRESS_WATCH_REG_ADDR_HI); + aw_reg_add_dword /= sizeof(uint32_t); + packets_vec[1].bitfields2.reg_offset = aw_reg_add_dword - AMD_CONFIG_REG_BASE; packets_vec[1].reg_data[0] = addrHi.u32All; @@ -450,6 +467,8 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, i, ADDRESS_WATCH_REG_ADDR_LO); + aw_reg_add_dword /= sizeof(uint32_t); + packets_vec[2].bitfields2.reg_offset = aw_reg_add_dword - AMD_CONFIG_REG_BASE; packets_vec[2].reg_data[0] = addrLo.u32All; @@ -466,6 +485,8 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, i, ADDRESS_WATCH_REG_CNTL); + aw_reg_add_dword /= sizeof(uint32_t); + packets_vec[3].bitfields2.reg_offset = aw_reg_add_dword - AMD_CONFIG_REG_BASE; packets_vec[3].reg_data[0] = cntl.u32All; @@ -473,30 +494,32 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, status = dbgdev_diq_submit_ib( dbgdev, adw_info->process->pasid, - packet_buff_gpu_addr, + mem_obj->gpu_addr, packet_buff_uint, - ib_size, true); + ib_size); - if (status) { - pr_err("Failed to submit IB to DIQ\n"); - return status; + if (status != 0) { + pr_err("amdkfd: Failed to submit IB to DIQ\n"); + break; } } + kfd_gtt_sa_free(dbgdev->dev, mem_obj); return status; } static int dbgdev_wave_control_set_registers( struct dbg_wave_control_info *wac_info, union SQ_CMD_BITS *in_reg_sq_cmd, - union GRBM_GFX_INDEX_BITS *in_reg_gfx_index, - unsigned int asic_family) + union GRBM_GFX_INDEX_BITS *in_reg_gfx_index) { int status = 0; union SQ_CMD_BITS reg_sq_cmd; union GRBM_GFX_INDEX_BITS reg_gfx_index; struct HsaDbgWaveMsgAMDGen2 *pMsg; + BUG_ON(!wac_info || !in_reg_sq_cmd || !in_reg_gfx_index); + reg_sq_cmd.u32All = 0; reg_gfx_index.u32All = 0; pMsg = &wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2; @@ -548,25 +571,11 @@ static int dbgdev_wave_control_set_registers( switch (wac_info->operand) { case HSA_DBG_WAVEOP_HALT: - if (asic_family == CHIP_KAVERI) { - reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT; - pr_debug("Halting KV\n"); - } else { - reg_sq_cmd.bits_sethalt.cmd = SQ_IND_CMD_NEW_SETHALT; - reg_sq_cmd.bits_sethalt.data = SQ_IND_CMD_DATA_HALT; - pr_debug("Halting CZ\n"); - } + reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT; break; case HSA_DBG_WAVEOP_RESUME: - if (asic_family == CHIP_KAVERI) { - reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME; - pr_debug("Resuming KV\n"); - } else { - reg_sq_cmd.bits_sethalt.cmd = SQ_IND_CMD_NEW_SETHALT; - reg_sq_cmd.bits_sethalt.data = SQ_IND_CMD_DATA_RESUME; - pr_debug("Resuming CZ\n"); - } + reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME; break; case HSA_DBG_WAVEOP_KILL: @@ -606,21 +615,23 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, int status; union SQ_CMD_BITS reg_sq_cmd; union GRBM_GFX_INDEX_BITS reg_gfx_index; + struct kfd_mem_obj *mem_obj; uint32_t *packet_buff_uint; - uint64_t packet_buff_gpu_addr; struct pm4__set_config_reg *packets_vec; size_t ib_size = sizeof(struct 
pm4__set_config_reg) * 3; + BUG_ON(!dbgdev || !wac_info); + reg_sq_cmd.u32All = 0; status = dbgdev_wave_control_set_registers(wac_info, ®_sq_cmd, - ®_gfx_index, dbgdev->dev->device_info->asic_family); + ®_gfx_index); if (status) { - pr_err("Failed to set wave control registers\n"); + pr_err("amdkfd: Failed to set wave control registers\n"); return status; } - /* we do not control the VMID in DIQ, so reset it to a known value */ + /* we do not control the VMID in DIQ,so reset it to a known value */ reg_sq_cmd.bits.vm_id = 0; pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); @@ -653,13 +664,15 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); - status = dbgdev->kq->ops.acquire_inline_ib(dbgdev->kq, - ib_size / sizeof(uint32_t), - &packet_buff_uint, &packet_buff_gpu_addr); - if (status) { - pr_err("Failed to allocate IB from DIQ ring\n"); + status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); + + if (status != 0) { + pr_err("amdkfd: Failed to allocate GART memory\n"); return status; } + + packet_buff_uint = mem_obj->cpu_ptr; + memset(packet_buff_uint, 0, ib_size); packets_vec = (struct pm4__set_config_reg *) packet_buff_uint; @@ -702,12 +715,14 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, status = dbgdev_diq_submit_ib( dbgdev, wac_info->process->pasid, - packet_buff_gpu_addr, + mem_obj->gpu_addr, packet_buff_uint, - ib_size, false); + ib_size); - if (status) - pr_err("Failed to submit IB to DIQ\n"); + if (status != 0) + pr_err("amdkfd: Failed to submit IB to DIQ\n"); + + kfd_gtt_sa_free(dbgdev->dev, mem_obj); return status; } @@ -720,19 +735,21 @@ static int dbgdev_wave_control_nodiq(struct kfd_dbgdev *dbgdev, union GRBM_GFX_INDEX_BITS reg_gfx_index; struct kfd_process_device *pdd; + BUG_ON(!dbgdev || !dbgdev->dev || !wac_info); + reg_sq_cmd.u32All = 0; /* taking the VMID for that process on the safe way using PDD */ pdd = kfd_get_process_device_data(dbgdev->dev, wac_info->process); if (!pdd) { - pr_err("Failed to get pdd for wave control no DIQ\n"); + pr_err("amdkfd: Failed to get pdd for wave control no DIQ\n"); return -EFAULT; } status = dbgdev_wave_control_set_registers(wac_info, ®_sq_cmd, - ®_gfx_index, dbgdev->dev->device_info->asic_family); + ®_gfx_index); if (status) { - pr_err("Failed to set wave control registers\n"); + pr_err("amdkfd: Failed to set wave control registers\n"); return status; } @@ -783,8 +800,13 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) union GRBM_GFX_INDEX_BITS reg_gfx_index; struct kfd_process_device *pdd; struct dbg_wave_control_info wac_info; - int first_vmid_to_scan = dev->vm_info.first_vmid_kfd; - int last_vmid_to_scan = dev->vm_info.last_vmid_kfd; + int temp; + int first_vmid_to_scan = 8; + int last_vmid_to_scan = 15; + + first_vmid_to_scan = ffs(dev->shared_resources.compute_vmid_bitmap) - 1; + temp = dev->shared_resources.compute_vmid_bitmap >> first_vmid_to_scan; + last_vmid_to_scan = first_vmid_to_scan + ffz(temp); reg_sq_cmd.u32All = 0; status = 0; @@ -796,13 +818,12 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) /* Scan all registers in the range ATC_VMID8_PASID_MAPPING .. * ATC_VMID15_PASID_MAPPING - * to check which VMID the current process is mapped to. - */ + * to check which VMID the current process is mapped to. 
*/ for (vmid = first_vmid_to_scan; vmid <= last_vmid_to_scan; vmid++) { if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_valid (dev->kgd, vmid)) { - if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_pasid + if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_valid (dev->kgd, vmid) == p->pasid) { pr_debug("Killing wave fronts of vmid %d and pasid %d\n", vmid, p->pasid); @@ -812,7 +833,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) } if (vmid > last_vmid_to_scan) { - pr_err("Didn't find vmid for pasid %d\n", p->pasid); + pr_err("amdkfd: didn't found vmid for pasid (%d)\n", p->pasid); return -EFAULT; } @@ -822,7 +843,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) return -EFAULT; status = dbgdev_wave_control_set_registers(&wac_info, ®_sq_cmd, - ®_gfx_index, dev->device_info->asic_family); + ®_gfx_index); if (status != 0) return -EINVAL; @@ -839,6 +860,8 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, enum DBGDEV_TYPE type) { + BUG_ON(!pdbgdev || !pdev); + pdbgdev->dev = pdev; pdbgdev->kq = NULL; pdbgdev->type = type; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h index 583aaa9..03424c2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h @@ -60,24 +60,6 @@ enum { SH_REG_SIZE = SH_REG_END - SH_REG_BASE }; -/* SQ_CMD definitions */ - -enum { - SQ_IND_CMD_DATA_RESUME = 0, - SQ_IND_CMD_DATA_HALT = 1 -}; - -enum SQ_IND_CMD_NEW { - SQ_IND_CMD_NEW_NULL = 0x00000000, - SQ_IND_CMD_NEW_SETHALT = 0x00000001, - SQ_IND_CMD_NEW_SAVECTX = 0x00000002, - SQ_IND_CMD_NEW_KILL = 0x00000003, - SQ_IND_CMD_NEW_DEBUG = 0x00000004, - SQ_IND_CMD_NEW_TRAP = 0x00000005, - SQ_IND_CMD_NEW_SET_PRIO = 0x00000006 - -}; - enum SQ_IND_CMD_CMD { SQ_IND_CMD_CMD_NULL = 0x00000000, SQ_IND_CMD_CMD_HALT = 0x00000001, @@ -136,20 +118,6 @@ union SQ_CMD_BITS { uint32_t:1; uint32_t vm_id:4; } bitfields, bits; - struct { - uint32_t cmd:3; - uint32_t:1; - uint32_t mode:3; - uint32_t check_vmid:1; - uint32_t data:3; - uint32_t:5; - uint32_t wave_id:4; - uint32_t simd_id:2; - uint32_t:2; - uint32_t queue_id:3; - uint32_t:1; - uint32_t vm_id:4; - } bitfields_sethalt, bits_sethalt; uint32_t u32All; signed int i32All; float f32All; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c index 9d4af96..56d6763 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c @@ -33,7 +33,6 @@ #include "kfd_pm4_headers_diq.h" #include "kfd_dbgmgr.h" #include "kfd_dbgdev.h" -#include "kfd_device_queue_manager.h" static DEFINE_MUTEX(kfd_dbgmgr_mutex); @@ -45,6 +44,8 @@ struct mutex *kfd_get_dbgmgr_mutex(void) static void kfd_dbgmgr_uninitialize(struct kfd_dbgmgr *pmgr) { + BUG_ON(!pmgr); + kfree(pmgr->dbgdev); pmgr->dbgdev = NULL; @@ -54,7 +55,7 @@ static void kfd_dbgmgr_uninitialize(struct kfd_dbgmgr *pmgr) void kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr) { - if (pmgr) { + if (pmgr != NULL) { kfd_dbgmgr_uninitialize(pmgr); kfree(pmgr); } @@ -65,12 +66,12 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) enum DBGDEV_TYPE type = DBGDEV_TYPE_DIQ; struct kfd_dbgmgr *new_buff; - if (WARN_ON(!pdev->init_complete)) - return false; + BUG_ON(pdev == NULL); + BUG_ON(!pdev->init_complete); new_buff = kfd_alloc_struct(new_buff); if (!new_buff) { - pr_err("Failed to allocate dbgmgr instance\n"); + pr_err("amdkfd: 
Failed to allocate dbgmgr instance\n"); return false; } @@ -78,13 +79,13 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) new_buff->dev = pdev; new_buff->dbgdev = kfd_alloc_struct(new_buff->dbgdev); if (!new_buff->dbgdev) { - pr_err("Failed to allocate dbgdev instance\n"); + pr_err("amdkfd: Failed to allocate dbgdev instance\n"); kfree(new_buff); return false; } /* get actual type of DBGDevice cpsch or not */ - if (pdev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) + if (sched_policy == KFD_SCHED_POLICY_NO_HWS) type = DBGDEV_TYPE_NODIQ; kfd_dbgdev_init(new_buff->dbgdev, pdev, type); @@ -95,6 +96,8 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p) { + BUG_ON(!p || !pmgr || !pmgr->dbgdev); + if (pmgr->pasid != 0) { pr_debug("H/W debugger is already active using pasid %d\n", pmgr->pasid); @@ -115,6 +118,8 @@ long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p) long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p) { + BUG_ON(!p || !pmgr || !pmgr->dbgdev); + /* Is the requests coming from the already registered process? */ if (pmgr->pasid != p->pasid) { pr_debug("H/W debugger is not registered by calling pasid %d\n", @@ -132,6 +137,8 @@ long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p) long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, struct dbg_wave_control_info *wac_info) { + BUG_ON(!pmgr || !pmgr->dbgdev || !wac_info); + /* Is the requests coming from the already registered process? */ if (pmgr->pasid != wac_info->process->pasid) { pr_debug("H/W debugger support was not registered for requester pasid %d\n", @@ -145,6 +152,9 @@ long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, struct dbg_address_watch_info *adw_info) { + BUG_ON(!pmgr || !pmgr->dbgdev || !adw_info); + + /* Is the requests coming from the already registered process? 
*/ if (pmgr->pasid != adw_info->process->pasid) { pr_debug("H/W debugger support was not registered for requester pasid %d\n", diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h index a04a1fe..257a745 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h @@ -30,11 +30,13 @@ #pragma pack(push, 4) enum HSA_DBG_WAVEOP { - HSA_DBG_WAVEOP_HALT = 1, /* Halts a wavefront */ - HSA_DBG_WAVEOP_RESUME = 2, /* Resumes a wavefront */ - HSA_DBG_WAVEOP_KILL = 3, /* Kills a wavefront */ - HSA_DBG_WAVEOP_DEBUG = 4, /* Causes wavefront to enter dbg mode */ - HSA_DBG_WAVEOP_TRAP = 5, /* Causes wavefront to take a trap */ + HSA_DBG_WAVEOP_HALT = 1, /* Halts a wavefront */ + HSA_DBG_WAVEOP_RESUME = 2, /* Resumes a wavefront */ + HSA_DBG_WAVEOP_KILL = 3, /* Kills a wavefront */ + HSA_DBG_WAVEOP_DEBUG = 4, /* Causes wavefront to enter + debug mode */ + HSA_DBG_WAVEOP_TRAP = 5, /* Causes wavefront to take + a trap */ HSA_DBG_NUM_WAVEOP = 5, HSA_DBG_MAX_WAVEOP = 0xFFFFFFFF }; @@ -79,13 +81,15 @@ struct HsaDbgWaveMsgAMDGen2 { uint32_t UserData:8; /* user data */ uint32_t ShaderArray:1; /* Shader array */ uint32_t Priv:1; /* Privileged */ - uint32_t Reserved0:4; /* Reserved, should be 0 */ + uint32_t Reserved0:4; /* This field is reserved, + should be 0 */ uint32_t WaveId:4; /* wave id */ uint32_t SIMD:2; /* SIMD id */ uint32_t HSACU:4; /* Compute unit */ uint32_t ShaderEngine:2;/* Shader engine */ uint32_t MessageType:2; /* see HSA_DBG_WAVEMSG_TYPE */ - uint32_t Reserved1:4; /* Reserved, should be 0 */ + uint32_t Reserved1:4; /* This field is reserved, + should be 0 */ } ui32; uint32_t Value; }; @@ -117,23 +121,20 @@ struct HsaDbgWaveMessage { * in the user mode instruction stream. The OS scheduler event is typically * associated and signaled by an interrupt issued by the GPU, but other HSA * system interrupt conditions from other HW (e.g. IOMMUv2) may be surfaced - * by the KFD by this mechanism, too. - */ + * by the KFD by this mechanism, too. */ /* these are the new definitions for events */ enum HSA_EVENTTYPE { HSA_EVENTTYPE_SIGNAL = 0, /* user-mode generated GPU signal */ HSA_EVENTTYPE_NODECHANGE = 1, /* HSA node change (attach/detach) */ HSA_EVENTTYPE_DEVICESTATECHANGE = 2, /* HSA device state change - * (start/stop) - */ + (start/stop) */ HSA_EVENTTYPE_HW_EXCEPTION = 3, /* GPU shader exception event */ HSA_EVENTTYPE_SYSTEM_EVENT = 4, /* GPU SYSCALL with parameter info */ HSA_EVENTTYPE_DEBUG_EVENT = 5, /* GPU signal for debugging */ HSA_EVENTTYPE_PROFILE_EVENT = 6,/* GPU signal for profiling */ HSA_EVENTTYPE_QUEUE_EVENT = 7, /* GPU signal queue idle state - * (EOP pm4) - */ + (EOP pm4) */ /* ... */ HSA_EVENTTYPE_MAXID, HSA_EVENTTYPE_TYPE_SIZE = 0xFFFFFFFF diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c deleted file mode 100644 index 232e28f..0000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright 2014 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include -#include "kfd_priv.h" - -static struct dentry *debugfs_root; - -static int kfd_debugfs_open(struct inode *inode, struct file *file) -{ - int (*show)(struct seq_file *, void *) = inode->i_private; - - return single_open(file, show, NULL); -} - -static const struct file_operations kfd_debugfs_fops = { - .owner = THIS_MODULE, - .open = kfd_debugfs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -void kfd_debugfs_init(void) -{ - struct dentry *ent; - - debugfs_root = debugfs_create_dir("kfd", NULL); - if (!debugfs_root || debugfs_root == ERR_PTR(-ENODEV)) { - pr_warn("Failed to create kfd debugfs dir\n"); - return; - } - - ent = debugfs_create_file("mqds", S_IFREG | 0444, debugfs_root, - kfd_debugfs_mqds_by_process, - &kfd_debugfs_fops); - if (!ent) - pr_warn("Failed to create mqds in kfd debugfs\n"); - - ent = debugfs_create_file("hqds", S_IFREG | 0444, debugfs_root, - kfd_debugfs_hqds_by_device, - &kfd_debugfs_fops); - if (!ent) - pr_warn("Failed to create hqds in kfd debugfs\n"); - - ent = debugfs_create_file("rls", S_IFREG | 0444, debugfs_root, - kfd_debugfs_rls_by_device, - &kfd_debugfs_fops); - if (!ent) - pr_warn("Failed to create rls in kfd debugfs\n"); -} - -void kfd_debugfs_fini(void) -{ - debugfs_remove_recursive(debugfs_root); -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 6b3a1fa..3f95f7c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -20,209 +20,36 @@ * OTHER DEALINGS IN THE SOFTWARE. 
*/ -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) #include -#endif +#include #include #include -#include #include "kfd_priv.h" #include "kfd_device_queue_manager.h" -#include "kfd_pm4_headers_vi.h" -#include "cwsr_trap_handler_carrizo.h" -#include "cwsr_trap_handler_gfx9.asm" +#include "kfd_pm4_headers.h" #define MQD_SIZE_ALIGNED 768 -static atomic_t kfd_device_suspended = ATOMIC_INIT(0); -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) static const struct kfd_device_info kaveri_device_info = { .asic_family = CHIP_KAVERI, .max_pasid_bits = 16, /* max num of queues for KV.TODO should be a dynamic value */ .max_no_of_hqd = 24, - .doorbell_size = 4, .ih_ring_entry_size = 4 * sizeof(uint32_t), .event_interrupt_class = &event_interrupt_class_cik, .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = true, - .supports_cwsr = false, - .needs_pci_atomics = false, + .mqd_size_aligned = MQD_SIZE_ALIGNED }; -#endif -static const struct kfd_device_info hawaii_device_info = { - .asic_family = CHIP_HAWAII, - .max_pasid_bits = 16, - /* max num of queues for KV.TODO should be a dynamic value */ - .max_no_of_hqd = 24, - .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = false, - .supports_cwsr = false, - .needs_pci_atomics = false, -}; - -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) static const struct kfd_device_info carrizo_device_info = { .asic_family = CHIP_CARRIZO, .max_pasid_bits = 16, /* max num of queues for CZ.TODO should be a dynamic value */ .max_no_of_hqd = 24, - .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = true, - .supports_cwsr = true, - .needs_pci_atomics = false, -}; -#endif - -static const struct kfd_device_info tonga_device_info = { - .asic_family = CHIP_TONGA, - .max_pasid_bits = 16, - .max_no_of_hqd = 24, - .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = false, - .supports_cwsr = false, - .needs_pci_atomics = true, -}; - -static const struct kfd_device_info tonga_vf_device_info = { - .asic_family = CHIP_TONGA, - .max_pasid_bits = 16, - .max_no_of_hqd = 24, - .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = false, - .supports_cwsr = false, - .needs_pci_atomics = false, -}; - -static const struct kfd_device_info fiji_device_info = { - .asic_family = CHIP_FIJI, - .max_pasid_bits = 16, - .max_no_of_hqd = 24, - .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = false, - .supports_cwsr = true, - .needs_pci_atomics = true, -}; - -static const struct kfd_device_info fiji_vf_device_info = { - .asic_family = CHIP_FIJI, - .max_pasid_bits = 16, - .max_no_of_hqd = 24, - .doorbell_size = 4, .ih_ring_entry_size = 4 * sizeof(uint32_t), .event_interrupt_class = 
&event_interrupt_class_cik, .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = false, - .supports_cwsr = true, - .needs_pci_atomics = false, -}; - - -static const struct kfd_device_info polaris10_device_info = { - .asic_family = CHIP_POLARIS10, - .max_pasid_bits = 16, - .max_no_of_hqd = 24, - .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = false, - .supports_cwsr = true, - .needs_pci_atomics = true, -}; - -static const struct kfd_device_info polaris10_vf_device_info = { - .asic_family = CHIP_POLARIS10, - .max_pasid_bits = 16, - .max_no_of_hqd = 24, - .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = false, - .supports_cwsr = true, - .needs_pci_atomics = false, -}; - -static const struct kfd_device_info polaris11_device_info = { - .asic_family = CHIP_POLARIS11, - .max_pasid_bits = 16, - .max_no_of_hqd = 24, - .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = false, - .supports_cwsr = true, - .needs_pci_atomics = true, -}; - -static const struct kfd_device_info vega10_device_info = { - .asic_family = CHIP_VEGA10, - .max_pasid_bits = 16, - .max_no_of_hqd = 24, - .doorbell_size = 8, - .ih_ring_entry_size = 8 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_v9, - .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = false, - .supports_cwsr = true, - .needs_pci_atomics = true, -}; - -static const struct kfd_device_info vega10_vf_device_info = { - .asic_family = CHIP_VEGA10, - .max_pasid_bits = 16, - .max_no_of_hqd = 24, - .doorbell_size = 8, - .ih_ring_entry_size = 8 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_v9, - .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = false, - .supports_cwsr = true, - .needs_pci_atomics = false, -}; - -static const struct kfd_device_info raven_device_info = { - .asic_family = CHIP_RAVEN, - .max_pasid_bits = 16, - .max_no_of_hqd = 24, - .doorbell_size = 8, - .ih_ring_entry_size = 8 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_v9, - .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = true, - .supports_cwsr = true, - .needs_pci_atomics = true, + .mqd_size_aligned = MQD_SIZE_ALIGNED }; struct kfd_deviceid { @@ -232,7 +59,6 @@ struct kfd_deviceid { /* Please keep this sorted by increasing device id. 
*/ static const struct kfd_deviceid supported_devices[] = { -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) { 0x1304, &kaveri_device_info }, /* Kaveri */ { 0x1305, &kaveri_device_info }, /* Kaveri */ { 0x1306, &kaveri_device_info }, /* Kaveri */ @@ -255,90 +81,28 @@ static const struct kfd_deviceid supported_devices[] = { { 0x131B, &kaveri_device_info }, /* Kaveri */ { 0x131C, &kaveri_device_info }, /* Kaveri */ { 0x131D, &kaveri_device_info }, /* Kaveri */ -#endif - { 0x67A0, &hawaii_device_info }, /* Hawaii */ - { 0x67A1, &hawaii_device_info }, /* Hawaii */ - { 0x67A2, &hawaii_device_info }, /* Hawaii */ - { 0x67A8, &hawaii_device_info }, /* Hawaii */ - { 0x67A9, &hawaii_device_info }, /* Hawaii */ - { 0x67AA, &hawaii_device_info }, /* Hawaii */ - { 0x67B0, &hawaii_device_info }, /* Hawaii */ - { 0x67B1, &hawaii_device_info }, /* Hawaii */ - { 0x67B8, &hawaii_device_info }, /* Hawaii */ - { 0x67B9, &hawaii_device_info }, /* Hawaii */ - { 0x67BA, &hawaii_device_info }, /* Hawaii */ - { 0x67BE, &hawaii_device_info }, /* Hawaii */ -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) { 0x9870, &carrizo_device_info }, /* Carrizo */ { 0x9874, &carrizo_device_info }, /* Carrizo */ { 0x9875, &carrizo_device_info }, /* Carrizo */ { 0x9876, &carrizo_device_info }, /* Carrizo */ - { 0x9877, &carrizo_device_info }, /* Carrizo */ -#endif - { 0x6920, &tonga_device_info }, /* Tonga */ - { 0x6921, &tonga_device_info }, /* Tonga */ - { 0x6928, &tonga_device_info }, /* Tonga */ - { 0x6929, &tonga_device_info }, /* Tonga */ - { 0x692B, &tonga_device_info }, /* Tonga */ - { 0x692F, &tonga_vf_device_info }, /* Tonga vf */ - { 0x6938, &tonga_device_info }, /* Tonga */ - { 0x6939, &tonga_device_info }, /* Tonga */ - { 0x7300, &fiji_device_info }, /* Fiji */ - { 0x730F, &fiji_vf_device_info }, /* Fiji vf*/ - { 0x67C0, &polaris10_device_info }, /* Polaris10 */ - { 0x67C1, &polaris10_device_info }, /* Polaris10 */ - { 0x67C2, &polaris10_device_info }, /* Polaris10 */ - { 0x67C4, &polaris10_device_info }, /* Polaris10 */ - { 0x67C7, &polaris10_device_info }, /* Polaris10 */ - { 0x67C8, &polaris10_device_info }, /* Polaris10 */ - { 0x67C9, &polaris10_device_info }, /* Polaris10 */ - { 0x67CA, &polaris10_device_info }, /* Polaris10 */ - { 0x67CC, &polaris10_device_info }, /* Polaris10 */ - { 0x67CF, &polaris10_device_info }, /* Polaris10 */ - { 0x67D0, &polaris10_vf_device_info }, /* Polaris10 vf*/ - { 0x67DF, &polaris10_device_info }, /* Polaris10 */ - { 0x67E0, &polaris11_device_info }, /* Polaris11 */ - { 0x67E1, &polaris11_device_info }, /* Polaris11 */ - { 0x67E3, &polaris11_device_info }, /* Polaris11 */ - { 0x67E7, &polaris11_device_info }, /* Polaris11 */ - { 0x67E8, &polaris11_device_info }, /* Polaris11 */ - { 0x67E9, &polaris11_device_info }, /* Polaris11 */ - { 0x67EB, &polaris11_device_info }, /* Polaris11 */ - { 0x67EF, &polaris11_device_info }, /* Polaris11 */ - { 0x67FF, &polaris11_device_info }, /* Polaris11 */ - { 0x6860, &vega10_device_info }, /* Vega10 */ - { 0x6861, &vega10_device_info }, /* Vega10 */ - { 0x6862, &vega10_device_info }, /* Vega10 */ - { 0x6863, &vega10_device_info }, /* Vega10 */ - { 0x6864, &vega10_device_info }, /* Vega10 */ - { 0x6867, &vega10_device_info }, /* Vega10 */ - { 0x6868, &vega10_device_info }, /* Vega10 */ - { 0x686C, &vega10_vf_device_info }, /* Vega10 vf*/ - { 0x687F, &vega10_device_info }, /* Vega10 */ -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - { 0x15DD, &raven_device_info } /* Raven */ 
-#endif + { 0x9877, &carrizo_device_info } /* Carrizo */ }; static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, unsigned int chunk_size); static void kfd_gtt_sa_fini(struct kfd_dev *kfd); -static int kfd_resume(struct kfd_dev *kfd); - static const struct kfd_device_info *lookup_device_info(unsigned short did) { size_t i; for (i = 0; i < ARRAY_SIZE(supported_devices); i++) { if (supported_devices[i].did == did) { - WARN_ON(!supported_devices[i].device_info); + BUG_ON(supported_devices[i].device_info == NULL); return supported_devices[i].device_info; } } - WARN(1, "device is not added to supported_devices\n"); - return NULL; } @@ -350,21 +114,8 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, const struct kfd_device_info *device_info = lookup_device_info(pdev->device); - if (!device_info) { - dev_err(kfd_device, "kgd2kfd_probe failed\n"); + if (!device_info) return NULL; - } - - if (device_info->needs_pci_atomics) { - /* Allow BIF to recode atomics to PCIe 3.0 AtomicOps. - */ - if (pci_enable_atomic_ops_to_root(pdev) < 0) { - dev_info(kfd_device, - "skipped device %x:%x, PCI rejects atomics", - pdev->vendor, pdev->device); - return NULL; - } - } kfd = kzalloc(sizeof(*kfd), GFP_KERNEL); if (!kfd) @@ -383,7 +134,6 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, return kfd; } -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) static bool device_iommu_pasid_init(struct kfd_dev *kfd) { const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP | @@ -402,16 +152,15 @@ static bool device_iommu_pasid_init(struct kfd_dev *kfd) } if ((iommu_info.flags & required_iommu_flags) != required_iommu_flags) { - dev_err(kfd_device, "error required iommu flags ats %i, pri %i, pasid %i\n", + dev_err(kfd_device, "error required iommu flags ats(%i), pri(%i), pasid(%i)\n", (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP) != 0, (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) != 0, - (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) - != 0); + (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) != 0); return false; } pasid_limit = min_t(unsigned int, - (unsigned int)(1 << kfd->device_info->max_pasid_bits), + (unsigned int)1 << kfd->device_info->max_pasid_bits, iommu_info.max_pasids); /* * last pasid is used for kernel queues doorbells @@ -421,8 +170,15 @@ static bool device_iommu_pasid_init(struct kfd_dev *kfd) pasid_limit, kfd->doorbell_process_limit - 1); + err = amd_iommu_init_device(kfd->pdev, pasid_limit); + if (err < 0) { + dev_err(kfd_device, "error initializing iommu device\n"); + return false; + } + if (!kfd_set_pasid_limit(pasid_limit)) { dev_err(kfd_device, "error setting pasid limit\n"); + amd_iommu_free_device(kfd->pdev); return false; } @@ -434,7 +190,7 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid) struct kfd_dev *dev = kfd_device_by_pci_dev(pdev); if (dev) - kfd_process_iommu_unbind_callback(dev, pasid); + kfd_unbind_process_from_device(dev, pasid); } /* @@ -455,108 +211,21 @@ static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid, flags); dev = kfd_device_by_pci_dev(pdev); - if (!WARN_ON(!dev)) - kfd_signal_iommu_event(dev, pasid, address, + BUG_ON(dev == NULL); + + kfd_signal_iommu_event(dev, pasid, address, flags & PPR_FAULT_WRITE, flags & PPR_FAULT_EXEC); return AMD_IOMMU_INV_PRI_RSP_INVALID; } -#endif /* CONFIG_AMD_IOMMU_V2 */ - -static int kfd_cwsr_init(struct kfd_dev *kfd) -{ - /* - * Initialize the CWSR required memory for TBA and TMA - */ - if (cwsr_enable && 
kfd->device_info->supports_cwsr) { - const uint32_t *cwsr_hex; - void *cwsr_addr = NULL; - unsigned int size; - - if (kfd->device_info->asic_family < CHIP_VEGA10) { - cwsr_hex = cwsr_trap_carrizo_hex; - size = sizeof(cwsr_trap_carrizo_hex); - } else { - cwsr_hex = cwsr_trap_gfx9_hex; - size = sizeof(cwsr_trap_gfx9_hex); - } - - if (size > PAGE_SIZE) { - pr_err("Wrong CWSR ISA size.\n"); - return -EINVAL; - } - kfd->cwsr_size = - ALIGN(size, PAGE_SIZE) + PAGE_SIZE; - kfd->cwsr_pages = alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, - get_order(kfd->cwsr_size)); - if (!kfd->cwsr_pages) { - pr_err("Failed to allocate CWSR isa memory.\n"); - return -ENOMEM; - } - /*Only first page used for cwsr ISA code */ - cwsr_addr = kmap(kfd->cwsr_pages); - memset(cwsr_addr, 0, PAGE_SIZE); - memcpy(cwsr_addr, cwsr_hex, size); - kunmap(kfd->cwsr_pages); - kfd->tma_offset = ALIGN(size, PAGE_SIZE); - kfd->cwsr_enabled = true; - dev_info(kfd_device, - "Reserved %d pages for cwsr.\n", - (kfd->cwsr_size >> PAGE_SHIFT)); - } - - return 0; -} - -static void kfd_cwsr_fini(struct kfd_dev *kfd) -{ - if (kfd->cwsr_pages) - __free_pages(kfd->cwsr_pages, get_order(kfd->cwsr_size)); -} - -static void kfd_ib_mem_init(struct kfd_dev *kdev) -{ - /* In certain cases we need to send IB from kernel using the GPU address - * space created by user applications. - * For example, on GFX v7, we need to flush TC associated to the VMID - * before tearing down the VMID. In order to do so, we need an address - * valid to the VMID to place the IB while this space was created on - * the user's side, not the kernel. - * Since kfd_set_process_dgpu_aperture reserves "cwsr_base + cwsr_size" - * but CWSR only uses pages above cwsr_base, we'll use one page memory - * under cwsr_base for IB submissions - */ - kdev->ib_size = PAGE_SIZE; -} bool kgd2kfd_device_init(struct kfd_dev *kfd, const struct kgd2kfd_shared_resources *gpu_resources) { unsigned int size; - unsigned int vmid_bitmap_kfd, vmid_num_kfd; - - kfd->mec_fw_version = kfd->kfd2kgd->get_fw_version(kfd->kgd, - KGD_ENGINE_MEC1); kfd->shared_resources = *gpu_resources; - vmid_bitmap_kfd = kfd->shared_resources.compute_vmid_bitmap; - kfd->vm_info.first_vmid_kfd = ffs(vmid_bitmap_kfd) - 1; - kfd->vm_info.last_vmid_kfd = fls(vmid_bitmap_kfd) - 1; - vmid_num_kfd = kfd->vm_info.last_vmid_kfd - - kfd->vm_info.first_vmid_kfd + 1; - kfd->vm_info.vmid_num_kfd = vmid_num_kfd; - - /* Verify module parameters regarding mapped process number*/ - if ((hws_max_conc_proc < 0) - || (hws_max_conc_proc > vmid_num_kfd)) { - dev_err(kfd_device, - "hws_max_conc_proc %d must be between 0 and %d, use %d instead\n", - hws_max_conc_proc, vmid_num_kfd, vmid_num_kfd); - kfd->max_proc_per_quantum = vmid_num_kfd; - } else - kfd->max_proc_per_quantum = hws_max_conc_proc; - /* calculate max size of mqds needed for queues */ size = max_num_of_queues_per_device * kfd->device_info->mqd_size_aligned; @@ -565,9 +234,9 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, * calculate max size of runlist packet. 
* There can be only 2 packets at once */ - size += (KFD_MAX_NUM_OF_PROCESSES * sizeof(struct pm4_mes_map_process) + - max_num_of_queues_per_device * sizeof(struct pm4_mes_map_queues) - + sizeof(struct pm4_mes_runlist)) * 2; + size += (KFD_MAX_NUM_OF_PROCESSES * sizeof(struct pm4_map_process) + + max_num_of_queues_per_device * + sizeof(struct pm4_map_queues) + sizeof(struct pm4_runlist)) * 2; /* Add size of HIQ & DIQ */ size += KFD_KERNEL_QUEUE_SIZE * 2; @@ -578,88 +247,89 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, if (kfd->kfd2kgd->init_gtt_mem_allocation( kfd->kgd, size, &kfd->gtt_mem, &kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr)){ - dev_err(kfd_device, "Could not allocate %d bytes\n", size); + dev_err(kfd_device, + "Could not allocate %d bytes for device (%x:%x)\n", + size, kfd->pdev->vendor, kfd->pdev->device); goto out; } - dev_info(kfd_device, "Allocated %d bytes on gart\n", size); + dev_info(kfd_device, + "Allocated %d bytes on gart for device(%x:%x)\n", + size, kfd->pdev->vendor, kfd->pdev->device); /* Initialize GTT sa with 512 byte chunk size */ if (kfd_gtt_sa_init(kfd, size, 512) != 0) { - dev_err(kfd_device, "Error initializing gtt sub-allocator\n"); + dev_err(kfd_device, + "Error initializing gtt sub-allocator\n"); goto kfd_gtt_sa_init_error; } - if (kfd_doorbell_init(kfd)) { - dev_err(kfd_device, - "Error initializing doorbell aperture\n"); - goto kfd_doorbell_error; - } + kfd_doorbell_init(kfd); - if (kfd_topology_add_device(kfd)) { - dev_err(kfd_device, "Error adding device to topology\n"); + if (kfd_topology_add_device(kfd) != 0) { + dev_err(kfd_device, + "Error adding device (%x:%x) to topology\n", + kfd->pdev->vendor, kfd->pdev->device); goto kfd_topology_add_device_error; } if (kfd_interrupt_init(kfd)) { - dev_err(kfd_device, "Error initializing interrupts\n"); + dev_err(kfd_device, + "Error initializing interrupts for device (%x:%x)\n", + kfd->pdev->vendor, kfd->pdev->device); goto kfd_interrupt_error; } + if (!device_iommu_pasid_init(kfd)) { + dev_err(kfd_device, + "Error initializing iommuv2 for device (%x:%x)\n", + kfd->pdev->vendor, kfd->pdev->device); + goto device_iommu_pasid_error; + } + amd_iommu_set_invalidate_ctx_cb(kfd->pdev, + iommu_pasid_shutdown_callback); + amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb); + kfd->dqm = device_queue_manager_init(kfd); if (!kfd->dqm) { - dev_err(kfd_device, "Error initializing queue manager\n"); + dev_err(kfd_device, + "Error initializing queue manager for device (%x:%x)\n", + kfd->pdev->vendor, kfd->pdev->device); goto device_queue_manager_error; } -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - if (kfd->device_info->is_need_iommu_device) { - if (!device_iommu_pasid_init(kfd)) { - dev_err(kfd_device, "Error initializing iommuv2\n"); - goto device_iommu_pasid_error; - } - } -#endif - - if (kfd_cwsr_init(kfd)) { - dev_err(kfd_device, "Error initializing cwsr\n"); - goto device_iommu_pasid_error; - } - - kfd_ib_mem_init(kfd); - - if (kfd_resume(kfd)) { - dev_err(kfd_device, "Error resuming kfd\n"); - goto kfd_resume_error; + if (kfd->dqm->ops.start(kfd->dqm) != 0) { + dev_err(kfd_device, + "Error starting queuen manager for device (%x:%x)\n", + kfd->pdev->vendor, kfd->pdev->device); + goto dqm_start_error; } kfd->dbgmgr = NULL; kfd->init_complete = true; - dev_info(kfd_device, "added device %x:%x\n", kfd->pdev->vendor, + dev_info(kfd_device, "added device (%x:%x)\n", kfd->pdev->vendor, kfd->pdev->device); - pr_debug("Starting kfd with the following scheduling policy %d\n", - 
kfd->dqm->sched_policy); + pr_debug("kfd: Starting kfd with the following scheduling policy %d\n", + sched_policy); goto out; -kfd_resume_error: - kfd_cwsr_fini(kfd); -device_iommu_pasid_error: +dqm_start_error: device_queue_manager_uninit(kfd->dqm); device_queue_manager_error: + amd_iommu_free_device(kfd->pdev); +device_iommu_pasid_error: kfd_interrupt_exit(kfd); kfd_interrupt_error: kfd_topology_remove_device(kfd); kfd_topology_add_device_error: - kfd_doorbell_fini(kfd); -kfd_doorbell_error: kfd_gtt_sa_fini(kfd); kfd_gtt_sa_init_error: kfd->kfd2kgd->free_gtt_mem(kfd->kgd, kfd->gtt_mem); dev_err(kfd_device, - "device %x:%x NOT added due to errors\n", + "device (%x:%x) NOT added due to errors\n", kfd->pdev->vendor, kfd->pdev->device); out: return kfd->init_complete; @@ -668,12 +338,10 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, void kgd2kfd_device_exit(struct kfd_dev *kfd) { if (kfd->init_complete) { - kgd2kfd_suspend(kfd); - kfd_cwsr_fini(kfd); device_queue_manager_uninit(kfd->dqm); + amd_iommu_free_device(kfd->pdev); kfd_interrupt_exit(kfd); kfd_topology_remove_device(kfd); - kfd_doorbell_fini(kfd); kfd_gtt_sa_fini(kfd); kfd->kfd2kgd->free_gtt_mem(kfd->kgd, kfd->gtt_mem); } @@ -683,419 +351,77 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd) void kgd2kfd_suspend(struct kfd_dev *kfd) { - if (!kfd->init_complete) - return; - - /* For first KFD device suspend all the KFD processes */ - if (atomic_inc_return(&kfd_device_suspended) == 1) - kfd_suspend_all_processes(); - - kfd->dqm->ops.stop(kfd->dqm); - -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - if (!kfd->device_info->is_need_iommu_device) - return; + BUG_ON(kfd == NULL); - kfd_unbind_processes_from_device(kfd); - - amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); - amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); - amd_iommu_free_device(kfd->pdev); -#endif + if (kfd->init_complete) { + kfd->dqm->ops.stop(kfd->dqm); + amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); + amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); + amd_iommu_free_device(kfd->pdev); + } } int kgd2kfd_resume(struct kfd_dev *kfd) { - int ret; - - if (!kfd->init_complete) - return 0; - - ret = kfd_resume(kfd); - if (ret) - return ret; - - if (atomic_dec_return(&kfd_device_suspended) == 0) - ret = kfd_resume_all_processes(); - WARN(atomic_read(&kfd_device_suspended) < 0, - "KFD suspend / resume ref. 
error\n"); - return ret; -} + unsigned int pasid_limit; + int err; -static int kfd_resume(struct kfd_dev *kfd) -{ - int err = 0; + BUG_ON(kfd == NULL); -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - if (kfd->device_info->is_need_iommu_device) { - unsigned int pasid_limit = kfd_get_pasid_limit(); + pasid_limit = kfd_get_pasid_limit(); + if (kfd->init_complete) { err = amd_iommu_init_device(kfd->pdev, pasid_limit); - if (err) { - dev_err(kfd_device, "failed to initialize iommu\n"); + if (err < 0) return -ENXIO; - } - amd_iommu_set_invalidate_ctx_cb(kfd->pdev, - iommu_pasid_shutdown_callback); - amd_iommu_set_invalid_ppr_cb(kfd->pdev, - iommu_invalid_ppr_cb); - - err = kfd_bind_processes_to_device(kfd); - if (err) { - dev_err(kfd_device, - "failed to bind process to device\n"); - return -ENXIO; - } - } -#endif - - err = kfd->dqm->ops.start(kfd->dqm); - if (err) { - dev_err(kfd_device, - "Error starting queue manager for device %x:%x\n", - kfd->pdev->vendor, kfd->pdev->device); - goto dqm_start_error; + iommu_pasid_shutdown_callback); + amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb); + kfd->dqm->ops.start(kfd->dqm); } - kfd->kfd2kgd->write_config_static_mem(kfd->kgd, true, 1, 3, 0); - - return err; - -dqm_start_error: -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - if (kfd->device_info->is_need_iommu_device) - amd_iommu_free_device(kfd->pdev); -#endif - - return err; + return 0; } /* This is called directly from KGD at ISR. */ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) { - uint32_t patched_ihre[DIV_ROUND_UP( - kfd->device_info->ih_ring_entry_size, - sizeof(uint32_t))]; - bool is_patched = false; - if (!kfd->init_complete) return; spin_lock(&kfd->interrupt_lock); - if (kfd->interrupts_active && interrupt_is_wanted(kfd, ih_ring_entry, - patched_ihre, &is_patched) - && enqueue_ih_ring_entry(kfd, - is_patched ? patched_ihre : ih_ring_entry)) - queue_work(kfd->ih_wq, &kfd->interrupt_work); + if (kfd->interrupts_active + && interrupt_is_wanted(kfd, ih_ring_entry) + && enqueue_ih_ring_entry(kfd, ih_ring_entry)) + schedule_work(&kfd->interrupt_work); spin_unlock(&kfd->interrupt_lock); } -/* quiesce_process_mm - - * Quiesce all user queues that belongs to given process p - */ -int quiesce_process_mm(struct kfd_process *p) -{ - struct kfd_process_device *pdd; - int r = 0; - unsigned int n_evicted = 0; - - list_for_each_entry(pdd, &p->per_device_data, per_device_list) { - r = process_evict_queues(pdd->dev->dqm, &pdd->qpd); - if (r != 0) { - pr_err("Failed to evict process queues\n"); - goto fail; - } - n_evicted++; - } - - return r; - -fail: - /* To keep state consistent, roll back partial eviction by - * restoring queues - */ - list_for_each_entry(pdd, &p->per_device_data, per_device_list) { - if (n_evicted == 0) - break; - if (process_restore_queues(pdd->dev->dqm, &pdd->qpd)) - pr_err("Failed to restore queues\n"); - - n_evicted--; - } - - return r; -} - -/* resume_process_mm - - * Resume all user queues that belongs to given process p. The caller must - * ensure that process p context is valid. 
- */ -static int resume_process_mm(struct kfd_process *p) -{ - struct kfd_process_device *pdd; - struct mm_struct *mm = (struct mm_struct *)p->mm; - int r, ret = 0; - - list_for_each_entry(pdd, &p->per_device_data, per_device_list) { - if (pdd->dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) - down_read(&mm->mmap_sem); - - r = process_restore_queues(pdd->dev->dqm, &pdd->qpd); - if (r != 0) { - pr_err("Failed to restore process queues\n"); - if (ret == 0) - ret = r; - } - - if (pdd->dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) - up_read(&mm->mmap_sem); - } - - return ret; -} - -int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm) -{ - struct kfd_process *p; - struct kfd_process_device *pdd; - int r; - - /* Because we are called from arbitrary context (workqueue) as opposed - * to process context, kfd_process could attempt to exit while we are - * running so the lookup function increments the process ref count. - */ - p = kfd_lookup_process_by_mm(mm); - if (!p) - return -ENODEV; - - if (kfd) { - r = -ENODEV; - pdd = kfd_get_process_device_data(kfd, p); - if (pdd) - r = process_evict_queues(kfd->dqm, &pdd->qpd); - } else { - r = quiesce_process_mm(p); - } - - kfd_unref_process(p); - return r; -} - -int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm) -{ - struct kfd_process *p; - struct kfd_process_device *pdd; - int r; - - /* Because we are called from arbitrary context (workqueue) as opposed - * to process context, kfd_process could attempt to exit while we are - * running so the lookup function increments the process ref count. - */ - p = kfd_lookup_process_by_mm(mm); - if (!p) - return -ENODEV; - - if (kfd) { - r = -ENODEV; - pdd = kfd_get_process_device_data(kfd, p); - if (pdd) - r = process_restore_queues(kfd->dqm, &pdd->qpd); - } else { - r = resume_process_mm(p); - } - - kfd_unref_process(p); - return r; -} - - -void kfd_restore_bo_worker(struct work_struct *work) -{ - struct delayed_work *dwork; - struct kfd_process *p; - struct kfd_process_device *pdd; - int ret = 0; - - dwork = to_delayed_work(work); - - /* Process termination destroys this worker thread. So during the - * lifetime of this thread, kfd_process p will be valid - */ - p = container_of(dwork, struct kfd_process, restore_work); - - /* Call restore_process_bos on the first KGD device. This function - * takes care of restoring the whole process including other devices. - * Restore can fail if enough memory is not available. If so, - * reschedule again. - */ - pdd = list_first_entry(&p->per_device_data, - struct kfd_process_device, - per_device_list); - - pr_info("Started restoring process of pasid %d\n", p->pasid); - - /* Setting last_restore_timestamp before successful restoration. - * Otherwise this would have to be set by KGD (restore_process_bos) - * before KFD BOs are unreserved. If not, the process can be evicted - * again before the timestamp is set. - * If restore fails, the timestamp will be set again in the next - * attempt. 
This would mean that the minimum GPU quanta would be - * PROCESS_ACTIVE_TIME_MS - (time to execute the following two - * functions) - */ - - p->last_restore_timestamp = get_jiffies_64(); - ret = pdd->dev->kfd2kgd->restore_process_bos(p->process_info, &p->ef); - if (ret) { - pr_info("Restore failed, try again after %d ms\n", - PROCESS_BACK_OFF_TIME_MS); - ret = schedule_delayed_work(&p->restore_work, - msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS)); - WARN(!ret, "reschedule restore work failed\n"); - return; - } - - ret = resume_process_mm(p); - if (ret) - pr_err("Failed to resume user queues\n"); - - pr_info("Finished restoring process of pasid %d\n", p->pasid); -} - -/** kgd2kfd_schedule_evict_and_restore_process - Schedules work queue that will - * prepare for safe eviction of KFD BOs that belong to the specified - * process. - * - * @mm: mm_struct that identifies the specified KFD process - * @fence: eviction fence attached to KFD process BOs - * - */ -int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, - struct dma_fence *fence) -{ - struct kfd_process *p; - unsigned long active_time; - unsigned long delay_jiffies = msecs_to_jiffies(PROCESS_ACTIVE_TIME_MS); - - if (!fence) - return -EINVAL; - - if (dma_fence_is_signaled(fence)) - return 0; - - p = kfd_lookup_process_by_mm(mm); - if (!p) - return -ENODEV; - - if (delayed_work_pending(&p->eviction_work.dwork)) { - /* It is possible has TTM has lined up couple of BOs of the same - * process to be evicted. Check if the fence is same which - * indicates that previous work item scheduled is not completed - */ - if (p->eviction_work.quiesce_fence == fence) - goto out; - else { - WARN(1, "Starting new evict with previous evict is not completed\n"); - if (cancel_delayed_work_sync(&p->eviction_work.dwork)) - dma_fence_put(p->eviction_work.quiesce_fence); - } - } - - p->eviction_work.quiesce_fence = dma_fence_get(fence); - - /* Avoid KFD process starvation. Wait for at least - * PROCESS_ACTIVE_TIME_MS before evicting the process again - */ - active_time = get_jiffies_64() - p->last_restore_timestamp; - if (delay_jiffies > active_time) - delay_jiffies -= active_time; - else - delay_jiffies = 0; - - /* During process initialization eviction_work.dwork is initialized - * to kfd_evict_bo_worker - */ - schedule_delayed_work(&p->eviction_work.dwork, delay_jiffies); -out: - kfd_unref_process(p); - return 0; -} - -void kfd_evict_bo_worker(struct work_struct *work) -{ - int ret; - struct kfd_process *p; - struct kfd_eviction_work *eviction_work; - struct delayed_work *dwork; - - dwork = to_delayed_work(work); - eviction_work = container_of(dwork, struct kfd_eviction_work, - dwork); - - /* Process termination destroys this worker thread. So during the - * lifetime of this thread, kfd_process p will be valid - */ - p = container_of(eviction_work, struct kfd_process, eviction_work); - - /* Narrow window of overlap between restore and evict work item is - * possible. Once amdgpu_amdkfd_gpuvm_restore_process_bos unreserves - * KFD BOs, it is possible to evicted again. But restore has few more - * steps of finish. 
So lets wait for the restore work to complete - */ - if (delayed_work_pending(&p->restore_work)) - flush_delayed_work(&p->restore_work); - - pr_info("Started evicting process of pasid %d\n", p->pasid); - ret = quiesce_process_mm(p); - if (!ret) { - dma_fence_signal(eviction_work->quiesce_fence); - WARN_ONCE(eviction_work->quiesce_fence != p->ef, - "Eviction fence mismatch\n"); - dma_fence_put(p->ef); - /* TODO: quiesce_fence is same as kfd_process->ef. But - * quiesce_fence is also used to avoid starting multiple - * eviction work items. This might not be necessary and - * one of the variables could be removed - */ - p->ef = NULL; - schedule_delayed_work(&p->restore_work, - msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)); - } else - pr_err("Failed to quiesce user queues. Cannot evict BOs\n"); - - dma_fence_put(eviction_work->quiesce_fence); - - pr_info("Finished evicting process of pasid %d\n", p->pasid); - -} - static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, unsigned int chunk_size) { - unsigned int num_of_longs; + unsigned int num_of_bits; - if (WARN_ON(buf_size < chunk_size)) - return -EINVAL; - if (WARN_ON(buf_size == 0)) - return -EINVAL; - if (WARN_ON(chunk_size == 0)) - return -EINVAL; + BUG_ON(!kfd); + BUG_ON(!kfd->gtt_mem); + BUG_ON(buf_size < chunk_size); + BUG_ON(buf_size == 0); + BUG_ON(chunk_size == 0); kfd->gtt_sa_chunk_size = chunk_size; kfd->gtt_sa_num_of_chunks = buf_size / chunk_size; - num_of_longs = (kfd->gtt_sa_num_of_chunks + BITS_PER_LONG - 1) / - BITS_PER_LONG; + num_of_bits = kfd->gtt_sa_num_of_chunks / BITS_PER_BYTE; + BUG_ON(num_of_bits == 0); - kfd->gtt_sa_bitmap = kcalloc(num_of_longs, sizeof(long), GFP_KERNEL); + kfd->gtt_sa_bitmap = kzalloc(num_of_bits, GFP_KERNEL); if (!kfd->gtt_sa_bitmap) return -ENOMEM; - pr_debug("gtt_sa_num_of_chunks = %d, gtt_sa_bitmap = %p\n", + pr_debug("kfd: gtt_sa_num_of_chunks = %d, gtt_sa_bitmap = %p\n", kfd->gtt_sa_num_of_chunks, kfd->gtt_sa_bitmap); mutex_init(&kfd->gtt_sa_lock); @@ -1129,17 +455,19 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, { unsigned int found, start_search, cur_size; + BUG_ON(!kfd); + if (size == 0) return -EINVAL; if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size) return -ENOMEM; - *mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO); - if (!(*mem_obj)) + *mem_obj = kmalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); + if ((*mem_obj) == NULL) return -ENOMEM; - pr_debug("Allocated mem_obj = %p for size = %d\n", *mem_obj, size); + pr_debug("kfd: allocated mem_obj = %p for size = %d\n", *mem_obj, size); start_search = 0; @@ -1151,7 +479,7 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, kfd->gtt_sa_num_of_chunks, start_search); - pr_debug("Found = %d\n", found); + pr_debug("kfd: found = %d\n", found); /* If there wasn't any free chunk, bail out */ if (found == kfd->gtt_sa_num_of_chunks) @@ -1169,12 +497,12 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, found, kfd->gtt_sa_chunk_size); - pr_debug("gpu_addr = %p, cpu_addr = %p\n", + pr_debug("kfd: gpu_addr = %p, cpu_addr = %p\n", (uint64_t *) (*mem_obj)->gpu_addr, (*mem_obj)->cpu_ptr); /* If we need only one chunk, mark it as allocated and get out */ if (size <= kfd->gtt_sa_chunk_size) { - pr_debug("Single bit\n"); + pr_debug("kfd: single bit\n"); set_bit(found, kfd->gtt_sa_bitmap); goto kfd_gtt_out; } @@ -1209,7 +537,7 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, } while (cur_size > 0); - pr_debug("range_start = %d, range_end = %d\n", + 
pr_debug("kfd: range_start = %d, range_end = %d\n", (*mem_obj)->range_start, (*mem_obj)->range_end); /* Mark the chunks as allocated */ @@ -1223,7 +551,7 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, return 0; kfd_gtt_no_free_chunk: - pr_debug("Allocation failed with mem_obj = %p\n", mem_obj); + pr_debug("kfd: allocation failed with mem_obj = %p\n", mem_obj); mutex_unlock(&kfd->gtt_sa_lock); kfree(mem_obj); return -ENOMEM; @@ -1233,11 +561,13 @@ int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj) { unsigned int bit; + BUG_ON(!kfd); + /* Act like kfree when trying to free a NULL object */ if (!mem_obj) return 0; - pr_debug("Free mem_obj = %p, range_start = %d, range_end = %d\n", + pr_debug("kfd: free mem_obj = %p, range_start = %d, range_end = %d\n", mem_obj, mem_obj->range_start, mem_obj->range_end); mutex_lock(&kfd->gtt_sa_lock); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 1abbaa0..42de22b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -44,13 +44,9 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); -static int execute_queues_cpsch(struct device_queue_manager *dqm, - bool static_queues_included); -static int unmap_queues_cpsch(struct device_queue_manager *dqm, - enum kfd_unmap_queues_filter filter, - uint32_t filter_param); - -static int map_queues_cpsch(struct device_queue_manager *dqm); +static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock); +static int destroy_queues_cpsch(struct device_queue_manager *dqm, + bool preempt_static_queues, bool lock); static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, struct queue *q, @@ -83,17 +79,20 @@ static bool is_pipe_enabled(struct device_queue_manager *dqm, int mec, int pipe) unsigned int get_queues_num(struct device_queue_manager *dqm) { + BUG_ON(!dqm || !dqm->dev); return bitmap_weight(dqm->dev->shared_resources.queue_bitmap, KGD_MAX_QUEUES); } unsigned int get_queues_per_pipe(struct device_queue_manager *dqm) { + BUG_ON(!dqm || !dqm->dev); return dqm->dev->shared_resources.num_queue_per_pipe; } unsigned int get_pipes_per_mec(struct device_queue_manager *dqm) { + BUG_ON(!dqm || !dqm->dev); return dqm->dev->shared_resources.num_pipe_per_mec; } @@ -108,57 +107,6 @@ void program_sh_mem_settings(struct device_queue_manager *dqm, qpd->sh_mem_bases); } -static int allocate_doorbell(struct qcm_process_device *qpd, struct queue *q) -{ - struct kfd_dev *dev = qpd->dqm->dev; - - if (!KFD_IS_SOC15(dev->device_info->asic_family)) { - /* On pre-SOC15 chips we need to use the queue ID to - * preserve the user mode ABI. - */ - q->doorbell_id = q->properties.queue_id; - } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { - /* For SDMA queues on SOC15, use static doorbell - * assignments based on the engine and queue. 
- */ - q->doorbell_id = dev->shared_resources.sdma_doorbell - [q->properties.sdma_engine_id] - [q->properties.sdma_queue_id]; - } else { - /* For CP queues on SOC15 reserve a free doorbell ID */ - unsigned int found; - - found = find_first_zero_bit(qpd->doorbell_bitmap, - KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); - if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) { - pr_debug("No doorbells available"); - return -EBUSY; - } - set_bit(found, qpd->doorbell_bitmap); - q->doorbell_id = found; - } - - q->properties.doorbell_off = - kfd_doorbell_id_to_offset(dev, q->process, - q->doorbell_id); - - return 0; -} - -static void deallocate_doorbell(struct qcm_process_device *qpd, - struct queue *q) -{ - unsigned int old; - struct kfd_dev *dev = qpd->dqm->dev; - - if (!KFD_IS_SOC15(dev->device_info->asic_family) || - q->properties.type == KFD_QUEUE_TYPE_SDMA) - return; - - old = test_and_clear_bit(q->doorbell_id, qpd->doorbell_bitmap); - WARN_ON(!old); -} - static int allocate_vmid(struct device_queue_manager *dqm, struct qcm_process_device *qpd, struct queue *q) @@ -168,59 +116,31 @@ static int allocate_vmid(struct device_queue_manager *dqm, if (dqm->vmid_bitmap == 0) return -ENOMEM; - bit = ffs(dqm->vmid_bitmap) - 1; - dqm->vmid_bitmap &= ~(1 << bit); + bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap, CIK_VMID_NUM); + clear_bit(bit, (unsigned long *)&dqm->vmid_bitmap); - allocated_vmid = bit + dqm->dev->vm_info.first_vmid_kfd; - pr_debug("vmid allocation %d\n", allocated_vmid); + /* Kaveri kfd vmid's starts from vmid 8 */ + allocated_vmid = bit + KFD_VMID_START_OFFSET; + pr_debug("kfd: vmid allocation %d\n", allocated_vmid); qpd->vmid = allocated_vmid; q->properties.vmid = allocated_vmid; set_pasid_vmid_mapping(dqm, q->process->pasid, q->properties.vmid); program_sh_mem_settings(dqm, qpd); - /* qpd->page_table_base is set earlier when register_process() - * is called, i.e. when the first queue is created. 
- */ - dqm->dev->kfd2kgd->set_vm_context_page_table_base(dqm->dev->kgd, - qpd->vmid, - qpd->page_table_base); - /*invalidate the VM context after pasid and vmid mapping is set up*/ - kfd_flush_tlb(dqm->dev, qpd->pqm->process->pasid); - return 0; } -static int flush_texture_cache_nocpsch(struct kfd_dev *kdev, - struct qcm_process_device *qpd) -{ - uint32_t len; - - if (!qpd->ib_kaddr) - return -ENOMEM; - - len = qpd->dqm->packets.pmf->release_mem(qpd->ib_base, - (uint32_t *)qpd->ib_kaddr); - - return kdev->kfd2kgd->submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid, - qpd->ib_base, (uint32_t *)qpd->ib_kaddr, len); -} - static void deallocate_vmid(struct device_queue_manager *dqm, struct qcm_process_device *qpd, struct queue *q) { - int bit = qpd->vmid - dqm->dev->vm_info.first_vmid_kfd; - - /* On GFX v7, CP doesn't flush TC at dequeue */ - if (q->device->device_info->asic_family == CHIP_HAWAII) - if (flush_texture_cache_nocpsch(q->device, qpd)) - pr_err("Failed to flush TC\n"); + int bit = qpd->vmid - KFD_VMID_START_OFFSET; /* Release the vmid mapping */ set_pasid_vmid_mapping(dqm, 0, qpd->vmid); - dqm->vmid_bitmap |= (1 << bit); + set_bit(bit, (unsigned long *)&dqm->vmid_bitmap); qpd->vmid = 0; q->properties.vmid = 0; } @@ -230,53 +150,47 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, struct qcm_process_device *qpd, int *allocated_vmid) { - int retval = 0; + int retval; + + BUG_ON(!dqm || !q || !qpd || !allocated_vmid); + pr_debug("kfd: In func %s\n", __func__); print_queue(q); mutex_lock(&dqm->lock); if (dqm->total_queue_count >= max_num_of_queues_per_device) { - pr_warn("Can't create new usermode queue because %d queues were already created\n", + pr_warn("amdkfd: Can't create new usermode queue because %d queues were already created\n", dqm->total_queue_count); - retval = -EPERM; - goto out_unlock; + mutex_unlock(&dqm->lock); + return -EPERM; } if (list_empty(&qpd->queues_list)) { retval = allocate_vmid(dqm, qpd, q); - if (retval) - goto out_unlock; + if (retval != 0) { + mutex_unlock(&dqm->lock); + return retval; + } } *allocated_vmid = qpd->vmid; q->properties.vmid = qpd->vmid; - /* - * Eviction state logic: we only mark active queues as evicted - * to avoid the overhead of restoring inactive queues later - */ - if (qpd->evicted) - q->properties.is_evicted = (q->properties.queue_size > 0 && - q->properties.queue_percent > 0 && - q->properties.queue_address != 0); - - q->properties.tba_addr = qpd->tba_addr; - q->properties.tma_addr = qpd->tma_addr; if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) retval = create_compute_queue_nocpsch(dqm, q, qpd); - else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) retval = create_sdma_queue_nocpsch(dqm, q, qpd); - if (retval) { + if (retval != 0) { if (list_empty(&qpd->queues_list)) { deallocate_vmid(dqm, qpd, q); *allocated_vmid = 0; } - goto out_unlock; + mutex_unlock(&dqm->lock); + return retval; } list_add(&q->list, &qpd->queues_list); - qpd->queue_count++; if (q->properties.is_active) dqm->queue_count++; @@ -291,9 +205,8 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, pr_debug("Total of %d queues are accountable so far\n", dqm->total_queue_count); -out_unlock: mutex_unlock(&dqm->lock); - return retval; + return 0; } static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q) @@ -303,16 +216,19 @@ static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q) set = false; - for (pipe = dqm->next_pipe_to_allocate, i = 0; - i < 
get_pipes_per_mec(dqm); + for (pipe = dqm->next_pipe_to_allocate, i = 0; i < get_pipes_per_mec(dqm); pipe = ((pipe + 1) % get_pipes_per_mec(dqm)), ++i) { if (!is_pipe_enabled(dqm, 0, pipe)) continue; if (dqm->allocated_queues[pipe] != 0) { - bit = ffs(dqm->allocated_queues[pipe]) - 1; - dqm->allocated_queues[pipe] &= ~(1 << bit); + bit = find_first_bit( + (unsigned long *)&dqm->allocated_queues[pipe], + get_queues_per_pipe(dqm)); + + clear_bit(bit, + (unsigned long *)&dqm->allocated_queues[pipe]); q->pipe = pipe; q->queue = bit; set = true; @@ -323,7 +239,8 @@ static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q) if (!set) return -EBUSY; - pr_debug("hqd slot - pipe %d, queue %d\n", q->pipe, q->queue); + pr_debug("kfd: DQM %s hqd slot - pipe (%d) queue(%d)\n", + __func__, q->pipe, q->queue); /* horizontal hqd allocation */ dqm->next_pipe_to_allocate = (pipe + 1) % get_pipes_per_mec(dqm); @@ -333,7 +250,7 @@ static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q) static inline void deallocate_hqd(struct device_queue_manager *dqm, struct queue *q) { - dqm->allocated_queues[q->pipe] |= (1 << q->queue); + set_bit(q->queue, (unsigned long *)&dqm->allocated_queues[q->pipe]); } static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, @@ -343,203 +260,138 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, int retval; struct mqd_manager *mqd; + BUG_ON(!dqm || !q || !qpd); + mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); - if (!mqd) + if (mqd == NULL) return -ENOMEM; retval = allocate_hqd(dqm, q); - if (retval) + if (retval != 0) return retval; - retval = allocate_doorbell(qpd, q); - if (retval) - goto out_deallocate_hqd; - retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, &q->gart_mqd_addr, &q->properties); - if (retval) - goto out_deallocate_doorbell; - - pr_debug("Loading mqd to hqd on pipe %d, queue %d\n", - q->pipe, q->queue); - - dqm->dev->kfd2kgd->alloc_memory_of_scratch( - dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid); + if (retval != 0) { + deallocate_hqd(dqm, q); + return retval; + } - if (!q->properties.is_active) - return 0; + pr_debug("kfd: loading mqd to hqd on pipe (%d) queue (%d)\n", + q->pipe, + q->queue); - retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue, &q->properties, - q->process->mm); - if (retval) - goto out_uninit_mqd; + retval = mqd->load_mqd(mqd, q->mqd, q->pipe, + q->queue, (uint32_t __user *) q->properties.write_ptr); + if (retval != 0) { + deallocate_hqd(dqm, q); + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); + return retval; + } return 0; - -out_uninit_mqd: - mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); -out_deallocate_doorbell: - deallocate_doorbell(qpd, q); -out_deallocate_hqd: - deallocate_hqd(dqm, q); - - return retval; } -/* Access to DQM has to be locked before calling destroy_queue_nocpsch_locked - * to avoid asynchronized access - */ -static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, +static int destroy_queue_nocpsch(struct device_queue_manager *dqm, struct qcm_process_device *qpd, struct queue *q) { int retval; struct mqd_manager *mqd; - mqd = dqm->ops.get_mqd_manager(dqm, - get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd) - return -ENOMEM; + BUG_ON(!dqm || !q || !q->mqd || !qpd); + + retval = 0; - deallocate_doorbell(qpd, q); + pr_debug("kfd: In Func %s\n", __func__); - if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) + mutex_lock(&dqm->lock); + + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) { + mqd = 
dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); + if (mqd == NULL) { + retval = -ENOMEM; + goto out; + } deallocate_hqd(dqm, q); - else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_SDMA); + if (mqd == NULL) { + retval = -ENOMEM; + goto out; + } dqm->sdma_queue_count--; deallocate_sdma_queue(dqm, q->sdma_id); } else { - pr_debug("q->properties.type %d is invalid\n", + pr_debug("q->properties.type is invalid (%d)\n", q->properties.type); retval = -EINVAL; + goto out; } - dqm->total_queue_count--; retval = mqd->destroy_mqd(mqd, q->mqd, KFD_PREEMPT_TYPE_WAVEFRONT_RESET, - KFD_HIQ_TIMEOUT, + QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS, q->pipe, q->queue); - if (retval == -ETIME) - qpd->reset_wavefronts = true; + + if (retval != 0) + goto out; mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); list_del(&q->list); - if (list_empty(&qpd->queues_list)) { - if (qpd->reset_wavefronts) { - pr_warn("Resetting wave fronts (nocpsch) on dev %p\n", - dqm->dev); - /* dbgdev_wave_reset_wavefronts has to be called before - * deallocate_vmid(), i.e. when vmid is still in use. - */ - dbgdev_wave_reset_wavefronts(dqm->dev, - qpd->pqm->process); - qpd->reset_wavefronts = false; - } - + if (list_empty(&qpd->queues_list)) deallocate_vmid(dqm, qpd, q); - } - qpd->queue_count--; if (q->properties.is_active) dqm->queue_count--; - return retval; -} - -static int destroy_queue_nocpsch(struct device_queue_manager *dqm, - struct qcm_process_device *qpd, - struct queue *q) -{ - int retval; + /* + * Unconditionally decrement this counter, regardless of the queue's + * type + */ + dqm->total_queue_count--; + pr_debug("Total of %d queues are accountable so far\n", + dqm->total_queue_count); - mutex_lock(&dqm->lock); - retval = destroy_queue_nocpsch_locked(dqm, qpd, q); +out: mutex_unlock(&dqm->lock); - return retval; } -static bool is_queue_nocpsch(struct device_queue_manager *dqm, struct queue *q) -{ - return (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS && - (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || - q->properties.type == KFD_QUEUE_TYPE_SDMA)); -} - static int update_queue(struct device_queue_manager *dqm, struct queue *q) { int retval; struct mqd_manager *mqd; - struct kfd_process_device *pdd; - bool prev_active = false; - mutex_lock(&dqm->lock); + BUG_ON(!dqm || !q || !q->mqd); - pdd = kfd_get_process_device_data(q->device, q->process); - if (!pdd) { - retval = -ENODEV; - goto out_unlock; - } + mutex_lock(&dqm->lock); mqd = dqm->ops.get_mqd_manager(dqm, get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd) { - retval = -ENOMEM; - goto out_unlock; - } - /* - * Eviction state logic: we only mark active queues as evicted - * to avoid the overhead of restoring inactive queues later - */ - if (pdd->qpd.evicted > 0) - q->properties.is_evicted = (q->properties.queue_size > 0 && - q->properties.queue_percent > 0 && - q->properties.queue_address != 0); - - /* save previous activity state for counters */ - prev_active = q->properties.is_active; - - /* HWS mode, unmap first to own mqd */ - if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) { - retval = unmap_queues_cpsch(dqm, - KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); - if (retval) { - pr_err("unmap queue failed"); - goto out_unlock; - } - } else if (is_queue_nocpsch(dqm, q) && prev_active) { - retval = mqd->destroy_mqd(mqd, q->mqd, - KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN, - KFD_UNMAP_LATENCY_MS, q->pipe, q->queue); - if (retval) { - pr_err("destroy mqd failed"); - 
goto out_unlock; - } + if (mqd == NULL) { + mutex_unlock(&dqm->lock); + return -ENOMEM; } - retval = mqd->update_mqd(mqd, q->mqd, &q->properties); + if (q->properties.is_active) + prev_active = true; - if (is_queue_nocpsch(dqm, q)) { - if (q->properties.is_active) - retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue, - &q->properties, q->process->mm); - } /* * * check active state vs. the previous state * and modify counter accordingly */ - if (q->properties.is_active && !prev_active) + retval = mqd->update_mqd(mqd, q->mqd, &q->properties); + if ((q->properties.is_active) && (!prev_active)) dqm->queue_count++; - else if (!q->properties.is_active && prev_active) + else if ((!q->properties.is_active) && (prev_active)) dqm->queue_count--; - if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) - retval = map_queues_cpsch(dqm); + if (sched_policy != KFD_SCHED_POLICY_NO_HWS) + retval = execute_queues_cpsch(dqm, false); -out_unlock: mutex_unlock(&dqm->lock); - return retval; } @@ -548,169 +400,41 @@ static struct mqd_manager *get_mqd_manager_nocpsch( { struct mqd_manager *mqd; - if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) - return NULL; + BUG_ON(!dqm || type >= KFD_MQD_TYPE_MAX); - pr_debug("mqd type %d\n", type); + pr_debug("kfd: In func %s mqd type %d\n", __func__, type); mqd = dqm->mqds[type]; if (!mqd) { mqd = mqd_manager_init(type, dqm->dev); - if (!mqd) - pr_err("mqd manager is NULL"); + if (mqd == NULL) + pr_err("kfd: mqd manager is NULL"); dqm->mqds[type] = mqd; } return mqd; } -int process_evict_queues(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) -{ - struct queue *q, *next; - struct mqd_manager *mqd; - struct kfd_process_device *pdd; - int retval = 0; - - mutex_lock(&dqm->lock); - if (qpd->evicted++ > 0) /* already evicted, do nothing */ - goto out; - - pdd = qpd_to_pdd(qpd); - pr_info_ratelimited("Evicting PASID %u queues\n", - pdd->process->pasid); - - /* unactivate all active queues on the qpd */ - list_for_each_entry_safe(q, next, &qpd->queues_list, list) { - mqd = dqm->ops.get_mqd_manager(dqm, - get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd) { /* should not be here */ - pr_err("Cannot evict queue, mqd is NULL\n"); - retval = -ENOMEM; - goto out; - } - /* if the queue is not active anyway, it is not evicted */ - if (q->properties.is_active) { - q->properties.is_evicted = true; - q->properties.is_active = false; - } - - if (is_queue_nocpsch(dqm, q) && - q->properties.is_evicted) - retval = mqd->destroy_mqd(mqd, q->mqd, - KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN, - KFD_UNMAP_LATENCY_MS, q->pipe, q->queue); - if (q->properties.is_evicted) - dqm->queue_count--; - } - if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) - retval = execute_queues_cpsch(dqm, qpd->is_debug); - -out: - mutex_unlock(&dqm->lock); - return retval; - -} - -int process_restore_queues(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) -{ - struct queue *q, *next; - struct mqd_manager *mqd; - int retval = 0; - struct kfd_process_device *pdd; - uint32_t pd_base; - - pdd = qpd_to_pdd(qpd); - /* Retrieve PD base */ - pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm); - - mutex_lock(&dqm->lock); - if (qpd->evicted == 0) /* already restored, do nothing */ - goto out_unlock; - - if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */ - qpd->evicted--; - goto out_unlock; - } - - pr_info_ratelimited("Restoring PASID %u queues\n", - pdd->process->pasid); - - /* Update PD Base in QPD */ - qpd->page_table_base = pd_base; - pr_debug("Updated PD address to 0x%08x\n", 
pd_base); - - if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS && - !list_empty(&qpd->queues_list)) { - dqm->dev->kfd2kgd->set_vm_context_page_table_base( - dqm->dev->kgd, - qpd->vmid, - qpd->page_table_base); - - kfd_flush_tlb(dqm->dev, pdd->process->pasid); - } - - /* activate all active queues on the qpd */ - list_for_each_entry_safe(q, next, &qpd->queues_list, list) { - mqd = dqm->ops.get_mqd_manager(dqm, - get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd) { /* should not be here */ - pr_err("Cannot restore queue, mqd is NULL\n"); - retval = -ENOMEM; - goto out_unlock; - } - if (q->properties.is_evicted) { - q->properties.is_evicted = false; - q->properties.is_active = true; - - if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS && - (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || - q->properties.type == KFD_QUEUE_TYPE_SDMA)) - retval = mqd->load_mqd(mqd, q->mqd, q->pipe, - q->queue, &q->properties, - q->process->mm); - dqm->queue_count++; - } - } - if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) - retval = execute_queues_cpsch(dqm, false); - - if (retval == 0) - qpd->evicted = 0; - -out_unlock: - mutex_unlock(&dqm->lock); - - return retval; -} - -static int register_process(struct device_queue_manager *dqm, +static int register_process_nocpsch(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { struct device_process_node *n; int retval; - struct kfd_process_device *pdd; - uint32_t pd_base; - n = kzalloc(sizeof(*n), GFP_KERNEL); + BUG_ON(!dqm || !qpd); + + pr_debug("kfd: In func %s\n", __func__); + + n = kzalloc(sizeof(struct device_process_node), GFP_KERNEL); if (!n) return -ENOMEM; n->qpd = qpd; - pdd = qpd_to_pdd(qpd); - /* Retrieve PD base */ - pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm); - mutex_lock(&dqm->lock); list_add(&n->list, &dqm->queues); - /* Update PD Base in QPD */ - qpd->page_table_base = pd_base; - pr_debug("Updated PD address to 0x%08x\n", pd_base); - - retval = dqm->asic_ops.update_qpd(dqm, qpd); + retval = dqm->ops_asic_specific.register_process(dqm, qpd); dqm->processes_count++; @@ -719,12 +443,16 @@ static int register_process(struct device_queue_manager *dqm, return retval; } -static int unregister_process(struct device_queue_manager *dqm, +static int unregister_process_nocpsch(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { int retval; struct device_process_node *cur, *next; + BUG_ON(!dqm || !qpd); + + pr_debug("In func %s\n", __func__); + pr_debug("qpd->queues_list is %s\n", list_empty(&qpd->queues_list) ? 
"empty" : "not empty"); @@ -765,41 +493,48 @@ static void init_interrupts(struct device_queue_manager *dqm) { unsigned int i; + BUG_ON(dqm == NULL); + for (i = 0 ; i < get_pipes_per_mec(dqm) ; i++) if (is_pipe_enabled(dqm, 0, i)) dqm->dev->kfd2kgd->init_interrupts(dqm->dev->kgd, i); } + static int init_scheduler(struct device_queue_manager *dqm) { - return 0; + int retval = 0; + + BUG_ON(!dqm); + + pr_debug("kfd: In %s\n", __func__); + + return retval; } static int initialize_nocpsch(struct device_queue_manager *dqm) { - int pipe, queue; + int i; - pr_debug("num of pipes: %d\n", get_pipes_per_mec(dqm)); + BUG_ON(!dqm); - dqm->allocated_queues = kcalloc(get_pipes_per_mec(dqm), - sizeof(unsigned int), GFP_KERNEL); - if (!dqm->allocated_queues) - return -ENOMEM; + pr_debug("kfd: In func %s num of pipes: %d\n", + __func__, get_pipes_per_mec(dqm)); mutex_init(&dqm->lock); INIT_LIST_HEAD(&dqm->queues); dqm->queue_count = dqm->next_pipe_to_allocate = 0; dqm->sdma_queue_count = 0; - - for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) { - int pipe_offset = pipe * get_queues_per_pipe(dqm); - - for (queue = 0; queue < get_queues_per_pipe(dqm); queue++) - if (test_bit(pipe_offset + queue, - dqm->dev->shared_resources.queue_bitmap)) - dqm->allocated_queues[pipe] |= 1 << queue; + dqm->allocated_queues = kcalloc(get_pipes_per_mec(dqm), + sizeof(unsigned int), GFP_KERNEL); + if (!dqm->allocated_queues) { + mutex_destroy(&dqm->lock); + return -ENOMEM; } - dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1; + for (i = 0; i < get_pipes_per_mec(dqm); i++) + dqm->allocated_queues[i] = (1 << get_queues_per_pipe(dqm)) - 1; + + dqm->vmid_bitmap = (1 << VMID_PER_DEVICE) - 1; dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1; init_scheduler(dqm); @@ -810,7 +545,9 @@ static void uninitialize_nocpsch(struct device_queue_manager *dqm) { int i; - WARN_ON(dqm->queue_count > 0 || dqm->processes_count > 0); + BUG_ON(!dqm); + + BUG_ON(dqm->queue_count > 0 || dqm->processes_count > 0); kfree(dqm->allocated_queues); for (i = 0 ; i < KFD_MQD_TYPE_MAX ; i++) @@ -822,12 +559,11 @@ static void uninitialize_nocpsch(struct device_queue_manager *dqm) static int start_nocpsch(struct device_queue_manager *dqm) { init_interrupts(dqm); - return pm_init(&dqm->packets, dqm, dqm->dev->mec_fw_version); + return 0; } static int stop_nocpsch(struct device_queue_manager *dqm) { - pm_uninit(&dqm->packets); return 0; } @@ -839,8 +575,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, if (dqm->sdma_bitmap == 0) return -ENOMEM; - bit = ffs(dqm->sdma_bitmap) - 1; - dqm->sdma_bitmap &= ~(1 << bit); + bit = find_first_bit((unsigned long *)&dqm->sdma_bitmap, + CIK_SDMA_QUEUES); + + clear_bit(bit, (unsigned long *)&dqm->sdma_bitmap); *sdma_queue_id = bit; return 0; @@ -851,7 +589,7 @@ static void deallocate_sdma_queue(struct device_queue_manager *dqm, { if (sdma_queue_id >= CIK_SDMA_QUEUES) return; - dqm->sdma_bitmap |= (1 << sdma_queue_id); + set_bit(sdma_queue_id, (unsigned long *)&dqm->sdma_bitmap); } static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, @@ -866,40 +604,33 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, return -ENOMEM; retval = allocate_sdma_queue(dqm, &q->sdma_id); - if (retval) + if (retval != 0) return retval; - q->properties.sdma_queue_id = q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; - q->properties.sdma_engine_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; - - retval = allocate_doorbell(qpd, q); - if (retval) - goto out_deallocate_sdma_queue; + 
q->properties.sdma_queue_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; + q->properties.sdma_engine_id = q->sdma_id / CIK_SDMA_ENGINE_NUM; - pr_debug("SDMA id is: %d\n", q->sdma_id); - pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id); - pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id); + pr_debug("kfd: sdma id is: %d\n", q->sdma_id); + pr_debug(" sdma queue id: %d\n", q->properties.sdma_queue_id); + pr_debug(" sdma engine id: %d\n", q->properties.sdma_engine_id); - dqm->asic_ops.init_sdma_vm(dqm, q, qpd); + dqm->ops_asic_specific.init_sdma_vm(dqm, q, qpd); retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, &q->gart_mqd_addr, &q->properties); - if (retval) - goto out_deallocate_doorbell; + if (retval != 0) { + deallocate_sdma_queue(dqm, q->sdma_id); + return retval; + } - retval = mqd->load_mqd(mqd, q->mqd, 0, 0, &q->properties, NULL); - if (retval) - goto out_uninit_mqd; + retval = mqd->load_mqd(mqd, q->mqd, 0, + 0, NULL); + if (retval != 0) { + deallocate_sdma_queue(dqm, q->sdma_id); + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); + return retval; + } return 0; - -out_uninit_mqd: - mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); -out_deallocate_doorbell: - deallocate_doorbell(qpd, q); -out_deallocate_sdma_queue: - deallocate_sdma_queue(dqm, q->sdma_id); - - return retval; } /* @@ -911,7 +642,12 @@ static int set_sched_resources(struct device_queue_manager *dqm) int i, mec; struct scheduling_resources res; - res.vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap; + BUG_ON(!dqm); + + pr_debug("kfd: In func %s\n", __func__); + + res.vmid_mask = (1 << VMID_PER_DEVICE) - 1; + res.vmid_mask <<= KFD_VMID_START_OFFSET; res.queue_mask = 0; for (i = 0; i < KGD_MAX_QUEUES; ++i) { @@ -927,8 +663,7 @@ static int set_sched_resources(struct device_queue_manager *dqm) /* This situation may be hit in the future if a new HW * generation exposes more than 64 queues. 
If so, the - * definition of res.queue_mask needs updating - */ + * definition of res.queue_mask needs updating */ if (WARN_ON(i >= (sizeof(res.queue_mask)*8))) { pr_err("Invalid queue enabled by amdgpu: %d\n", i); break; @@ -939,9 +674,9 @@ static int set_sched_resources(struct device_queue_manager *dqm) res.gws_mask = res.oac_mask = res.gds_heap_base = res.gds_heap_size = 0; - pr_debug("Scheduling resources:\n" - "vmid mask: 0x%8X\n" - "queue mask: 0x%8llX\n", + pr_debug("kfd: scheduling resources:\n" + " vmid mask: 0x%8X\n" + " queue mask: 0x%8llX\n", res.vmid_mask, res.queue_mask); return pm_send_set_resources(&dqm->packets, &res); @@ -951,42 +686,51 @@ static int initialize_cpsch(struct device_queue_manager *dqm) { int retval; - pr_debug("num of pipes: %d\n", get_pipes_per_mec(dqm)); + BUG_ON(!dqm); + + pr_debug("kfd: In func %s num of pipes: %d\n", + __func__, get_pipes_per_mec(dqm)); mutex_init(&dqm->lock); INIT_LIST_HEAD(&dqm->queues); dqm->queue_count = dqm->processes_count = 0; dqm->sdma_queue_count = 0; dqm->active_runlist = false; - dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1; - retval = dqm->asic_ops.init_cpsch(dqm); - if (retval) - mutex_destroy(&dqm->lock); + retval = dqm->ops_asic_specific.initialize(dqm); + if (retval != 0) + goto fail_init_pipelines; + + return 0; +fail_init_pipelines: + mutex_destroy(&dqm->lock); return retval; } static int start_cpsch(struct device_queue_manager *dqm) { + struct device_process_node *node; int retval; + BUG_ON(!dqm); + retval = 0; - retval = pm_init(&dqm->packets, dqm, dqm->dev->mec_fw_version); - if (retval) + retval = pm_init(&dqm->packets, dqm); + if (retval != 0) goto fail_packet_manager_init; retval = set_sched_resources(dqm); - if (retval) + if (retval != 0) goto fail_set_sched_resources; - pr_debug("Allocating fence memory\n"); + pr_debug("kfd: allocating fence memory\n"); /* allocate fence memory on the gart */ retval = kfd_gtt_sa_allocate(dqm->dev, sizeof(*dqm->fence_addr), &dqm->fence_mem); - if (retval) + if (retval != 0) goto fail_allocate_vidmem; dqm->fence_addr = dqm->fence_mem->cpu_ptr; @@ -994,9 +738,12 @@ static int start_cpsch(struct device_queue_manager *dqm) init_interrupts(dqm); - mutex_lock(&dqm->lock); - execute_queues_cpsch(dqm, false); - mutex_unlock(&dqm->lock); + list_for_each_entry(node, &dqm->queues, list) + if (node->qpd->pqm->process && dqm->dev) + kfd_bind_process_to_device(dqm->dev, + node->qpd->pqm->process); + + execute_queues_cpsch(dqm, true); return 0; fail_allocate_vidmem: @@ -1008,12 +755,17 @@ static int start_cpsch(struct device_queue_manager *dqm) static int stop_cpsch(struct device_queue_manager *dqm) { - mutex_lock(&dqm->lock); + struct device_process_node *node; + struct kfd_process_device *pdd; - unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); + BUG_ON(!dqm); - mutex_unlock(&dqm->lock); + destroy_queues_cpsch(dqm, true, true); + list_for_each_entry(node, &dqm->queues, list) { + pdd = qpd_to_pdd(node->qpd); + pdd->bound = false; + } kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); pm_uninit(&dqm->packets); @@ -1024,9 +776,13 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm, struct kernel_queue *kq, struct qcm_process_device *qpd) { + BUG_ON(!dqm || !kq || !qpd); + + pr_debug("kfd: In func %s\n", __func__); + mutex_lock(&dqm->lock); if (dqm->total_queue_count >= max_num_of_queues_per_device) { - pr_warn("Can't create new kernel queue because %d queues were already created\n", + pr_warn("amdkfd: Can't create new kernel queue because %d queues were already 
created\n", dqm->total_queue_count); mutex_unlock(&dqm->lock); return -EPERM; @@ -1053,12 +809,17 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm, struct kernel_queue *kq, struct qcm_process_device *qpd) { + BUG_ON(!dqm || !kq); + + pr_debug("kfd: In %s\n", __func__); + mutex_lock(&dqm->lock); /* here we actually preempt the DIQ */ + destroy_queues_cpsch(dqm, true, false); list_del(&kq->list); dqm->queue_count--; qpd->is_debug = false; - execute_queues_cpsch(dqm, true); + execute_queues_cpsch(dqm, false); /* * Unconditionally decrement this counter, regardless of the queue's * type. @@ -1069,12 +830,22 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm, mutex_unlock(&dqm->lock); } +static void select_sdma_engine_id(struct queue *q) +{ + static int sdma_id; + + q->sdma_id = sdma_id; + sdma_id = (sdma_id + 1) % 2; +} + static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd, int *allocate_vmid) { int retval; struct mqd_manager *mqd; + BUG_ON(!dqm || !q || !qpd); + retval = 0; if (allocate_vmid) @@ -1083,60 +854,37 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, mutex_lock(&dqm->lock); if (dqm->total_queue_count >= max_num_of_queues_per_device) { - pr_warn("Can't create new usermode queue because %d queues were already created\n", + pr_warn("amdkfd: Can't create new usermode queue because %d queues were already created\n", dqm->total_queue_count); retval = -EPERM; - goto out_unlock; - } - - if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { - retval = allocate_sdma_queue(dqm, &q->sdma_id); - if (retval) - goto out_unlock; - q->properties.sdma_queue_id = - q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; - q->properties.sdma_engine_id = - q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; + goto out; } - retval = allocate_doorbell(qpd, q); - if (retval) - goto out_deallocate_sdma_queue; + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + select_sdma_engine_id(q); mqd = dqm->ops.get_mqd_manager(dqm, get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd) { - retval = -ENOMEM; - goto out_deallocate_doorbell; + if (mqd == NULL) { + mutex_unlock(&dqm->lock); + return -ENOMEM; } - /* - * Eviction state logic: we only mark active queues as evicted - * to avoid the overhead of restoring inactive queues later - */ - if (qpd->evicted) - q->properties.is_evicted = (q->properties.queue_size > 0 && - q->properties.queue_percent > 0 && - q->properties.queue_address != 0); - - dqm->asic_ops.init_sdma_vm(dqm, q, qpd); - q->properties.tba_addr = qpd->tba_addr; - q->properties.tma_addr = qpd->tma_addr; + dqm->ops_asic_specific.init_sdma_vm(dqm, q, qpd); retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, &q->gart_mqd_addr, &q->properties); - if (retval) - goto out_deallocate_doorbell; + if (retval != 0) + goto out; list_add(&q->list, &qpd->queues_list); - qpd->queue_count++; if (q->properties.is_active) { dqm->queue_count++; retval = execute_queues_cpsch(dqm, false); } if (q->properties.type == KFD_QUEUE_TYPE_SDMA) - dqm->sdma_queue_count++; + dqm->sdma_queue_count++; /* * Unconditionally increment this counter, regardless of the queue's * type or whether the queue is active. 
@@ -1146,31 +894,21 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, pr_debug("Total of %d queues are accountable so far\n", dqm->total_queue_count); +out: mutex_unlock(&dqm->lock); return retval; - -out_deallocate_doorbell: - deallocate_doorbell(qpd, q); -out_deallocate_sdma_queue: - if (q->properties.type == KFD_QUEUE_TYPE_SDMA) - deallocate_sdma_queue(dqm, q->sdma_id); -out_unlock: - mutex_unlock(&dqm->lock); - - return retval; } int amdkfd_fence_wait_timeout(unsigned int *fence_addr, unsigned int fence_value, - unsigned long timeout_ms) + unsigned long timeout) { - unsigned long end_jiffies; - - end_jiffies = (timeout_ms * HZ / 1000) + jiffies; + BUG_ON(!fence_addr); + timeout += jiffies; while (*fence_addr != fence_value) { - if (time_after(jiffies, end_jiffies)) { - pr_err("qcm fence wait loop timeout expired\n"); + if (time_after(jiffies, timeout)) { + pr_err("kfd: qcm fence wait loop timeout expired\n"); return -ETIME; } schedule(); @@ -1179,63 +917,46 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr, return 0; } -static int unmap_sdma_queues(struct device_queue_manager *dqm, - unsigned int sdma_engine) +static int destroy_sdma_queues(struct device_queue_manager *dqm, + unsigned int sdma_engine) { return pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA, - KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false, + KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES, 0, false, sdma_engine); } -/* dqm->lock mutex has to be locked before calling this function */ -static int map_queues_cpsch(struct device_queue_manager *dqm) +static int destroy_queues_cpsch(struct device_queue_manager *dqm, + bool preempt_static_queues, bool lock) { int retval; + enum kfd_preempt_type_filter preempt_type; + struct kfd_process_device *pdd; - if (dqm->queue_count <= 0 || dqm->processes_count <= 0) { - retval = 0; - return retval; - } - - if (dqm->active_runlist) { - retval = 0; - return retval; - } - - retval = pm_send_runlist(&dqm->packets, &dqm->queues); - if (retval) { - pr_err("failed to execute runlist\n"); - return retval; - } - dqm->active_runlist = true; - - return retval; -} - -/* dqm->lock mutex has to be locked before calling this function */ -static int unmap_queues_cpsch(struct device_queue_manager *dqm, - enum kfd_unmap_queues_filter filter, - uint32_t filter_param) -{ - int retval; + BUG_ON(!dqm); retval = 0; + if (lock) + mutex_lock(&dqm->lock); if (!dqm->active_runlist) - return retval; + goto out; - pr_debug("Before destroying queues, sdma queue count is : %u\n", + pr_debug("kfd: Before destroying queues, sdma queue count is : %u\n", dqm->sdma_queue_count); if (dqm->sdma_queue_count > 0) { - unmap_sdma_queues(dqm, 0); - unmap_sdma_queues(dqm, 1); + destroy_sdma_queues(dqm, 0); + destroy_sdma_queues(dqm, 1); } + preempt_type = preempt_static_queues ? 
+ KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES : + KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES; + retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE, - filter, filter_param, false, 0); - if (retval) - return retval; + preempt_type, 0, false, 0); + if (retval != 0) + goto out; *dqm->fence_addr = KFD_FENCE_INIT; pm_send_query_status(&dqm->packets, dqm->fence_gpu_addr, @@ -1243,36 +964,56 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, /* should be timed out */ retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED, QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS); - if (retval) { - pr_err("Unmapping queues failed.\n"); - return retval; + if (retval != 0) { + pdd = kfd_get_process_device_data(dqm->dev, + kfd_get_process(current)); + pdd->reset_wavefronts = true; + goto out; } - pm_release_ib(&dqm->packets); dqm->active_runlist = false; +out: + if (lock) + mutex_unlock(&dqm->lock); return retval; } -/* dqm->lock mutex has to be locked before calling this function */ -static int execute_queues_cpsch(struct device_queue_manager *dqm, - bool static_queues_included) +static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock) { int retval; - enum kfd_unmap_queues_filter filter; - filter = static_queues_included ? - KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES : - KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES; + BUG_ON(!dqm); - retval = unmap_queues_cpsch(dqm, filter, 0); - if (retval) { - pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n"); - return retval; + if (lock) + mutex_lock(&dqm->lock); + + retval = destroy_queues_cpsch(dqm, false, false); + if (retval != 0) { + pr_err("kfd: the cp might be in an unrecoverable state due to an unsuccessful queues preemption"); + goto out; + } + + if (dqm->queue_count <= 0 || dqm->processes_count <= 0) { + retval = 0; + goto out; } - retval = map_queues_cpsch(dqm); + if (dqm->active_runlist) { + retval = 0; + goto out; + } + retval = pm_send_runlist(&dqm->packets, &dqm->queues); + if (retval != 0) { + pr_err("kfd: failed to execute runlist"); + goto out; + } + dqm->active_runlist = true; + +out: + if (lock) + mutex_unlock(&dqm->lock); return retval; } @@ -1284,6 +1025,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, struct mqd_manager *mqd; bool preempt_all_queues; + BUG_ON(!dqm || !qpd || !q); + preempt_all_queues = false; retval = 0; @@ -1308,21 +1051,14 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, goto failed; } - deallocate_doorbell(qpd, q); - - if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) dqm->sdma_queue_count--; - deallocate_sdma_queue(dqm, q->sdma_id); - } list_del(&q->list); - qpd->queue_count--; if (q->properties.is_active) dqm->queue_count--; - retval = execute_queues_cpsch(dqm, false); - if (retval == -ETIME) - qpd->reset_wavefronts = true; + execute_queues_cpsch(dqm, false); mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); @@ -1336,7 +1072,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, mutex_unlock(&dqm->lock); - return retval; + return 0; failed: failed_try_destroy_debugged_queue: @@ -1360,10 +1096,9 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm, void __user *alternate_aperture_base, uint64_t alternate_aperture_size) { - bool retval = true; + bool retval; - if (!dqm->asic_ops.set_cache_memory_policy) - return retval; + pr_debug("kfd: In func %s\n", __func__); mutex_lock(&dqm->lock); @@ -1385,17 +1120,20 @@ static bool 
set_cache_memory_policy(struct device_queue_manager *dqm, uint64_t base = (uintptr_t)alternate_aperture_base; uint64_t limit = base + alternate_aperture_size - 1; - if (limit <= base || (base & APE1_FIXED_BITS_MASK) != 0 || - (limit & APE1_FIXED_BITS_MASK) != APE1_LIMIT_ALIGNMENT) { - retval = false; + if (limit <= base) + goto out; + + if ((base & APE1_FIXED_BITS_MASK) != 0) + goto out; + + if ((limit & APE1_FIXED_BITS_MASK) != APE1_LIMIT_ALIGNMENT) goto out; - } qpd->sh_mem_ape1_base = base >> 16; qpd->sh_mem_ape1_limit = limit >> 16; } - retval = dqm->asic_ops.set_cache_memory_policy( + retval = dqm->ops_asic_specific.set_cache_memory_policy( dqm, qpd, default_policy, @@ -1403,199 +1141,35 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm, alternate_aperture_base, alternate_aperture_size); - if ((dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) && (qpd->vmid != 0)) + if ((sched_policy == KFD_SCHED_POLICY_NO_HWS) && (qpd->vmid != 0)) program_sh_mem_settings(dqm, qpd); - pr_debug("sh_mem_config: 0x%x, ape1_base: 0x%x, ape1_limit: 0x%x\n", + pr_debug("kfd: sh_mem_config: 0x%x, ape1_base: 0x%x, ape1_limit: 0x%x\n", qpd->sh_mem_config, qpd->sh_mem_ape1_base, qpd->sh_mem_ape1_limit); -out: mutex_unlock(&dqm->lock); return retval; -} - -static int set_trap_handler(struct device_queue_manager *dqm, - struct qcm_process_device *qpd, - uint64_t tba_addr, - uint64_t tma_addr) -{ - uint64_t *tma; - - if (dqm->dev->cwsr_enabled) { - /* Jump from CWSR trap handler to user trap */ - tma = (uint64_t *)(qpd->cwsr_kaddr + dqm->dev->tma_offset); - tma[0] = tba_addr; - tma[1] = tma_addr; - } else { - qpd->tba_addr = tba_addr; - qpd->tma_addr = tma_addr; - } - - return 0; -} - -static int process_termination_nocpsch(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) -{ - struct queue *q, *next; - struct device_process_node *cur, *next_dpn; - int retval = 0; - - mutex_lock(&dqm->lock); - - /* Clear all user mode queues */ - list_for_each_entry_safe(q, next, &qpd->queues_list, list) { - int ret; - - ret = destroy_queue_nocpsch_locked(dqm, qpd, q); - if (ret) - retval = ret; - } - - /* Unregister process */ - list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) { - if (qpd == cur->qpd) { - list_del(&cur->list); - kfree(cur); - dqm->processes_count--; - break; - } - } - - mutex_unlock(&dqm->lock); - return retval; -} - -static int get_wave_state(struct device_queue_manager *dqm, - struct queue *q, - void __user *ctl_stack, - u32 *ctl_stack_used_size, - u32 *save_area_used_size) -{ - struct mqd_manager *mqd; - int r; - - mutex_lock(&dqm->lock); - - if (q->properties.type != KFD_QUEUE_TYPE_COMPUTE || - q->properties.is_active || !q->device->cwsr_enabled) { - r = -EINVAL; - goto dqm_unlock; - } - - mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); - if (!mqd) { - r = -ENOMEM; - goto dqm_unlock; - } - - if (!mqd->get_wave_state) { - r = -EINVAL; - goto dqm_unlock; - } - - r = mqd->get_wave_state(mqd, q->mqd, ctl_stack, ctl_stack_used_size, - save_area_used_size); - -dqm_unlock: - mutex_unlock(&dqm->lock); - return r; -} - -static int process_termination_cpsch(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) -{ - int retval; - struct queue *q, *next; - struct kernel_queue *kq, *kq_next; - struct mqd_manager *mqd; - struct device_process_node *cur, *next_dpn; - bool unmap_static_queues = false; - - retval = 0; - - mutex_lock(&dqm->lock); - - /* Clean all kernel queues */ - list_for_each_entry_safe(kq, kq_next, &qpd->priv_queue_list, list) { - 
list_del(&kq->list); - dqm->queue_count--; - qpd->is_debug = false; - dqm->total_queue_count--; - unmap_static_queues = true; - } - - /* Clear all user mode queues */ - list_for_each_entry(q, &qpd->queues_list, list) { - if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { - dqm->sdma_queue_count--; - deallocate_sdma_queue(dqm, q->sdma_id); - } - - if (q->properties.is_active) - dqm->queue_count--; - - dqm->total_queue_count--; - } - - /* Unregister process */ - list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) { - if (qpd == cur->qpd) { - list_del(&cur->list); - kfree(cur); - dqm->processes_count--; - break; - } - } - - retval = execute_queues_cpsch(dqm, unmap_static_queues); - if (retval || qpd->reset_wavefronts) { - pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev); - dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process); - qpd->reset_wavefronts = false; - } - - /* lastly, free mqd resources */ - list_for_each_entry_safe(q, next, &qpd->queues_list, list) { - mqd = dqm->ops.get_mqd_manager(dqm, - get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd) { - retval = -ENOMEM; - goto out; - } - list_del(&q->list); - qpd->queue_count--; - mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); - } out: mutex_unlock(&dqm->lock); - return retval; + return false; } struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) { struct device_queue_manager *dqm; - pr_debug("Loading device queue manager\n"); + BUG_ON(!dev); + + pr_debug("kfd: loading device queue manager\n"); - dqm = kzalloc(sizeof(*dqm), GFP_KERNEL); + dqm = kzalloc(sizeof(struct device_queue_manager), GFP_KERNEL); if (!dqm) return NULL; - switch (dev->device_info->asic_family) { - case CHIP_HAWAII: - case CHIP_TONGA: - dqm->sched_policy = KFD_SCHED_POLICY_NO_HWS; - break; - default: - dqm->sched_policy = sched_policy; - break; - } - dqm->dev = dev; - switch (dqm->sched_policy) { + switch (sched_policy) { case KFD_SCHED_POLICY_HWS: case KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION: /* initialize dqm for cp scheduling */ @@ -1606,15 +1180,12 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) dqm->ops.destroy_queue = destroy_queue_cpsch; dqm->ops.update_queue = update_queue; dqm->ops.get_mqd_manager = get_mqd_manager_nocpsch; - dqm->ops.register_process = register_process; - dqm->ops.unregister_process = unregister_process; + dqm->ops.register_process = register_process_nocpsch; + dqm->ops.unregister_process = unregister_process_nocpsch; dqm->ops.uninitialize = uninitialize_nocpsch; dqm->ops.create_kernel_queue = create_kernel_queue_cpsch; dqm->ops.destroy_kernel_queue = destroy_kernel_queue_cpsch; dqm->ops.set_cache_memory_policy = set_cache_memory_policy; - dqm->ops.set_trap_handler = set_trap_handler; - dqm->ops.process_termination = process_termination_cpsch; - dqm->ops.get_wave_state = get_wave_state; break; case KFD_SCHED_POLICY_NO_HWS: /* initialize dqm for no cp scheduling */ @@ -1624,142 +1195,39 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) dqm->ops.destroy_queue = destroy_queue_nocpsch; dqm->ops.update_queue = update_queue; dqm->ops.get_mqd_manager = get_mqd_manager_nocpsch; - dqm->ops.register_process = register_process; - dqm->ops.unregister_process = unregister_process; + dqm->ops.register_process = register_process_nocpsch; + dqm->ops.unregister_process = unregister_process_nocpsch; dqm->ops.initialize = initialize_nocpsch; dqm->ops.uninitialize = uninitialize_nocpsch; dqm->ops.set_cache_memory_policy = set_cache_memory_policy; 
- dqm->ops.set_trap_handler = set_trap_handler; - dqm->ops.process_termination = process_termination_nocpsch; - dqm->ops.get_wave_state = get_wave_state; break; default: - WARN(1, "Invalid scheduling policy %d", dqm->sched_policy); - goto out_free; + BUG(); + break; } switch (dev->device_info->asic_family) { case CHIP_CARRIZO: - device_queue_manager_init_vi(&dqm->asic_ops); + device_queue_manager_init_vi(&dqm->ops_asic_specific); break; case CHIP_KAVERI: - device_queue_manager_init_cik(&dqm->asic_ops); - break; - - case CHIP_HAWAII: - device_queue_manager_init_cik_hawaii(&dqm->asic_ops); - break; - - case CHIP_TONGA: - case CHIP_FIJI: - case CHIP_POLARIS10: - case CHIP_POLARIS11: - device_queue_manager_init_vi_tonga(&dqm->asic_ops); + device_queue_manager_init_cik(&dqm->ops_asic_specific); break; - - case CHIP_VEGA10: - case CHIP_RAVEN: - device_queue_manager_init_v9_vega10(&dqm->asic_ops); - break; - default: - BUG(); } - if (!dqm->ops.initialize(dqm)) - return dqm; + if (dqm->ops.initialize(dqm) != 0) { + kfree(dqm); + return NULL; + } -out_free: - kfree(dqm); - return NULL; + return dqm; } void device_queue_manager_uninit(struct device_queue_manager *dqm) { + BUG_ON(!dqm); + dqm->ops.uninitialize(dqm); kfree(dqm); } - -int kfd_process_vm_fault(struct device_queue_manager *dqm, - unsigned int pasid) -{ - struct kfd_process_device *pdd; - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); - int ret = 0; - - if (!p) - return -EINVAL; - pdd = kfd_get_process_device_data(dqm->dev, p); - if (pdd) - ret = process_evict_queues(dqm, &pdd->qpd); - kfd_unref_process(p); - - return ret; -} - -static void seq_reg_dump(struct seq_file *m, - uint32_t (*dump)[2], uint32_t n_regs) -{ - uint32_t i, count; - - for (i = 0, count = 0; i < n_regs; i++) { - if (count == 0 || - dump[i-1][0] + sizeof(uint32_t) != dump[i][0]) { - seq_printf(m, "%s %08x: %08x", - i ? 
"\n" : "", - dump[i][0], dump[i][1]); - count = 7; - } else { - seq_printf(m, " %08x", dump[i][1]); - count--; - } - } - - seq_puts(m, "\n"); -} - -int device_queue_manager_debugfs_hqds(struct seq_file *m, void *data) -{ - struct device_queue_manager *dqm = data; - uint32_t (*dump)[2], n_regs; - int pipe, queue; - int r = 0; - - for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) { - int pipe_offset = pipe * get_queues_per_pipe(dqm); - - for (queue = 0; queue < get_queues_per_pipe(dqm); queue++) { - if (!test_bit(pipe_offset + queue, - dqm->dev->shared_resources.queue_bitmap)) - continue; - - r = dqm->dev->kfd2kgd->hqd_dump( - dqm->dev->kgd, pipe, queue, &dump, &n_regs); - if (r) - break; - - seq_printf(m, " CP Pipe %d, Queue %d\n", - pipe, queue); - seq_reg_dump(m, dump, n_regs); - - kfree(dump); - } - } - - for (pipe = 0; pipe < CIK_SDMA_ENGINE_NUM; pipe++) { - for (queue = 0; queue < CIK_SDMA_QUEUES_PER_ENGINE; queue++) { - r = dqm->dev->kfd2kgd->hqd_sdma_dump( - dqm->dev->kgd, pipe, queue, &dump, &n_regs); - if (r) - break; - - seq_printf(m, " SDMA Engine %d, RLC %d\n", - pipe, queue); - seq_reg_dump(m, dump, n_regs); - - kfree(dump); - } - } - - return r; -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 841283a..faf820a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -29,10 +29,10 @@ #include "kfd_priv.h" #include "kfd_mqd_manager.h" -#define KFD_HIQ_TIMEOUT (500) -#define KFD_UNMAP_LATENCY_MS (4000) -#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000) - +#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (500) +#define CIK_VMID_NUM (8) +#define KFD_VMID_START_OFFSET (8) +#define VMID_PER_DEVICE CIK_VMID_NUM #define KFD_DQM_FIRST_PIPE (0) #define CIK_SDMA_QUEUES (4) #define CIK_SDMA_QUEUES_PER_ENGINE (2) @@ -79,14 +79,6 @@ struct device_process_node { * @set_cache_memory_policy: Sets memory policy (cached/ non cached) for the * memory apertures. * - * @set_page_directory_base: Sets the PD base address (GPU local memory) - * in all the queues of the relevant process running on the specified device. - * It preempts the queues, updates the value and execute the runlist again. - * - * @process_termination: Clears all process queues belongs to that device. - * - * @get_wave_state: Retrieves context save state and optionally copies the - * control stack, if kept in the MQD, to the given userspace address. 
*/ struct device_queue_manager_ops { @@ -130,26 +122,12 @@ struct device_queue_manager_ops { enum cache_policy alternate_policy, void __user *alternate_aperture_base, uint64_t alternate_aperture_size); - - int (*set_trap_handler)(struct device_queue_manager *dqm, - struct qcm_process_device *qpd, - uint64_t tba_addr, - uint64_t tma_addr); - - int (*process_termination)(struct device_queue_manager *dqm, - struct qcm_process_device *qpd); - - int (*get_wave_state)(struct device_queue_manager *dqm, - struct queue *q, - void __user *ctl_stack, - u32 *ctl_stack_used_size, - u32 *save_area_used_size); }; struct device_queue_manager_asic_ops { - int (*update_qpd)(struct device_queue_manager *dqm, + int (*register_process)(struct device_queue_manager *dqm, struct qcm_process_device *qpd); - int (*init_cpsch)(struct device_queue_manager *dqm); + int (*initialize)(struct device_queue_manager *dqm); bool (*set_cache_memory_policy)(struct device_queue_manager *dqm, struct qcm_process_device *qpd, enum cache_policy default_policy, @@ -175,7 +153,7 @@ struct device_queue_manager_asic_ops { struct device_queue_manager { struct device_queue_manager_ops ops; - struct device_queue_manager_asic_ops asic_ops; + struct device_queue_manager_asic_ops ops_asic_specific; struct mqd_manager *mqds[KFD_MQD_TYPE_MAX]; struct packet_manager packets; @@ -196,37 +174,21 @@ struct device_queue_manager { unsigned int *fence_addr; struct kfd_mem_obj *fence_mem; bool active_runlist; - int sched_policy; }; -void device_queue_manager_init_cik( - struct device_queue_manager_asic_ops *asic_ops); -void device_queue_manager_init_cik_hawaii( - struct device_queue_manager_asic_ops *asic_ops); -void device_queue_manager_init_vi( - struct device_queue_manager_asic_ops *asic_ops); -void device_queue_manager_init_vi_tonga( - struct device_queue_manager_asic_ops *asic_ops); -void device_queue_manager_init_v9_vega10( - struct device_queue_manager_asic_ops *asic_ops); +void device_queue_manager_init_cik(struct device_queue_manager_asic_ops *ops); +void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops); void program_sh_mem_settings(struct device_queue_manager *dqm, struct qcm_process_device *qpd); unsigned int get_queues_num(struct device_queue_manager *dqm); unsigned int get_queues_per_pipe(struct device_queue_manager *dqm); unsigned int get_pipes_per_mec(struct device_queue_manager *dqm); -int process_evict_queues(struct device_queue_manager *dqm, - struct qcm_process_device *qpd); -int process_restore_queues(struct device_queue_manager *dqm, - struct qcm_process_device *qpd); - - static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) { return (pdd->lds_base >> 16) & 0xFF; } -/* This function is only useful for GFXv7 and v8 */ static inline unsigned int get_sh_mem_bases_nybble_64(struct kfd_process_device *pdd) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c index 8e1eb24..48dc056 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c @@ -24,7 +24,6 @@ #include "kfd_device_queue_manager.h" #include "cik_regs.h" #include "oss/oss_2_4_sh_mask.h" -#include "gca/gfx_7_2_sh_mask.h" static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, struct qcm_process_device *qpd, @@ -32,33 +31,18 @@ static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, enum cache_policy alternate_policy, void __user 
*alternate_aperture_base, uint64_t alternate_aperture_size); -static int update_qpd_cik(struct device_queue_manager *dqm, - struct qcm_process_device *qpd); -static int update_qpd_cik_hawaii(struct device_queue_manager *dqm, +static int register_process_cik(struct device_queue_manager *dqm, struct qcm_process_device *qpd); static int initialize_cpsch_cik(struct device_queue_manager *dqm); static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); -static void init_sdma_vm_hawaii(struct device_queue_manager *dqm, - struct queue *q, - struct qcm_process_device *qpd); - -void device_queue_manager_init_cik( - struct device_queue_manager_asic_ops *asic_ops) -{ - asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik; - asic_ops->update_qpd = update_qpd_cik; - asic_ops->init_cpsch = initialize_cpsch_cik; - asic_ops->init_sdma_vm = init_sdma_vm; -} -void device_queue_manager_init_cik_hawaii( - struct device_queue_manager_asic_ops *asic_ops) +void device_queue_manager_init_cik(struct device_queue_manager_asic_ops *ops) { - asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik; - asic_ops->update_qpd = update_qpd_cik_hawaii; - asic_ops->init_cpsch = initialize_cpsch_cik; - asic_ops->init_sdma_vm = init_sdma_vm_hawaii; + ops->set_cache_memory_policy = set_cache_memory_policy_cik; + ops->register_process = register_process_cik; + ops->initialize = initialize_cpsch_cik; + ops->init_sdma_vm = init_sdma_vm; } static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) @@ -81,7 +65,7 @@ static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) * for LDS/Scratch and GPUVM. */ - WARN_ON((top_address_nybble & 1) || top_address_nybble > 0xE || + BUG_ON((top_address_nybble & 1) || top_address_nybble > 0xE || top_address_nybble == 0); return PRIVATE_BASE(top_address_nybble << 12) | @@ -114,12 +98,14 @@ static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, return true; } -static int update_qpd_cik(struct device_queue_manager *dqm, +static int register_process_cik(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { struct kfd_process_device *pdd; unsigned int temp; + BUG_ON(!dqm || !qpd); + pdd = qpd_to_pdd(qpd); /* check if sh_mem_config register already configured */ @@ -139,40 +125,9 @@ static int update_qpd_cik(struct device_queue_manager *dqm, } else { temp = get_sh_mem_bases_nybble_64(pdd); qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); - qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__PRIVATE_ATC__SHIFT; } - pr_debug("is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", - qpd->pqm->process->is_32bit_user_mode, temp, qpd->sh_mem_bases); - - return 0; -} - -static int update_qpd_cik_hawaii(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) -{ - struct kfd_process_device *pdd; - unsigned int temp; - - pdd = qpd_to_pdd(qpd); - - /* check if sh_mem_config register already configured */ - if (qpd->sh_mem_config == 0) { - qpd->sh_mem_config = - ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) | - DEFAULT_MTYPE(MTYPE_NONCACHED) | - APE1_MTYPE(MTYPE_NONCACHED); - qpd->sh_mem_ape1_limit = 0; - qpd->sh_mem_ape1_base = 0; - } - - /* On dGPU we're always in GPUVM64 addressing mode with 64-bit - * aperture addresses. 
- */ - temp = get_sh_mem_bases_nybble_64(pdd); - qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); - - pr_debug("is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", + pr_debug("kfd: is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", qpd->pqm->process->is_32bit_user_mode, temp, qpd->sh_mem_bases); return 0; @@ -194,19 +149,6 @@ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, q->properties.sdma_vm_addr = value; } -static void init_sdma_vm_hawaii(struct device_queue_manager *dqm, - struct queue *q, - struct qcm_process_device *qpd) -{ - /* On dGPU we're always in GPUVM64 addressing mode with 64-bit - * aperture addresses. - */ - q->properties.sdma_vm_addr = - ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) << - SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) & - SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK; -} - static int initialize_cpsch_cik(struct device_queue_manager *dqm) { return 0; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c deleted file mode 100644 index dde5882..0000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright 2016 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- * - */ - -#include "kfd_device_queue_manager.h" -#include "vega10/vega10_enum.h" -#include "vega10/GC/gc_9_0_offset.h" -#include "vega10/GC/gc_9_0_sh_mask.h" -#include "vega10/SDMA0/sdma0_4_0_sh_mask.h" - -static int update_qpd_v9(struct device_queue_manager *dqm, - struct qcm_process_device *qpd); -static int initialize_cpsch_v9(struct device_queue_manager *dqm); -static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, - struct qcm_process_device *qpd); - -void device_queue_manager_init_v9_vega10( - struct device_queue_manager_asic_ops *asic_ops) -{ - asic_ops->update_qpd = update_qpd_v9; - asic_ops->init_cpsch = initialize_cpsch_v9; - asic_ops->init_sdma_vm = init_sdma_vm_v9; -} - -static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) -{ - uint32_t shared_base = pdd->lds_base >> 48; - uint32_t private_base = pdd->scratch_base >> 48; - - return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) | - private_base; -} - -static int update_qpd_v9(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) -{ - struct kfd_process_device *pdd; - - pdd = qpd_to_pdd(qpd); - - /* check if sh_mem_config register already configured */ - if (qpd->sh_mem_config == 0) { - qpd->sh_mem_config = - SH_MEM_ALIGNMENT_MODE_UNALIGNED << - SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT; - if (vega10_noretry) - qpd->sh_mem_config |= - 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT; - - qpd->sh_mem_ape1_limit = 0; - qpd->sh_mem_ape1_base = 0; - } - - qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd); - - pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases); - - return 0; -} - -static int initialize_cpsch_v9(struct device_queue_manager *dqm) -{ - return 0; -} - -static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, - struct qcm_process_device *qpd) -{ - /* Not needed on SDMAv4 any more */ - q->properties.sdma_vm_addr = 0; -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c index ac8d852..7e9cae9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c @@ -33,44 +33,18 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, enum cache_policy alternate_policy, void __user *alternate_aperture_base, uint64_t alternate_aperture_size); -static int update_qpd_vi(struct device_queue_manager *dqm, +static int register_process_vi(struct device_queue_manager *dqm, struct qcm_process_device *qpd); static int initialize_cpsch_vi(struct device_queue_manager *dqm); static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); -/* - * Tonga device queue manager functions - */ -static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm, - struct qcm_process_device *qpd, - enum cache_policy default_policy, - enum cache_policy alternate_policy, - void __user *alternate_aperture_base, - uint64_t alternate_aperture_size); -static int update_qpd_vi_tonga(struct device_queue_manager *dqm, - struct qcm_process_device *qpd); -static void init_sdma_vm_tonga(struct device_queue_manager *dqm, - struct queue *q, - struct qcm_process_device *qpd); - -void device_queue_manager_init_vi_tonga( - struct device_queue_manager_asic_ops *asic_ops) -{ - asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi_tonga; - asic_ops->update_qpd = update_qpd_vi_tonga; - asic_ops->init_cpsch = initialize_cpsch_vi; - asic_ops->init_sdma_vm = 
init_sdma_vm_tonga; -} - - -void device_queue_manager_init_vi( - struct device_queue_manager_asic_ops *asic_ops) +void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops) { - asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi; - asic_ops->update_qpd = update_qpd_vi; - asic_ops->init_cpsch = initialize_cpsch_vi; - asic_ops->init_sdma_vm = init_sdma_vm; + ops->set_cache_memory_policy = set_cache_memory_policy_vi; + ops->register_process = register_process_vi; + ops->initialize = initialize_cpsch_vi; + ops->init_sdma_vm = init_sdma_vm; } static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) @@ -93,7 +67,7 @@ static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) * for LDS/Scratch and GPUVM. */ - WARN_ON((top_address_nybble & 1) || top_address_nybble > 0xE || + BUG_ON((top_address_nybble & 1) || top_address_nybble > 0xE || top_address_nybble == 0); return top_address_nybble << 12 | @@ -130,39 +104,14 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, return true; } -static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm, - struct qcm_process_device *qpd, - enum cache_policy default_policy, - enum cache_policy alternate_policy, - void __user *alternate_aperture_base, - uint64_t alternate_aperture_size) -{ - uint32_t default_mtype; - uint32_t ape1_mtype; - - default_mtype = (default_policy == cache_policy_coherent) ? - MTYPE_UC : - MTYPE_NC; - - ape1_mtype = (alternate_policy == cache_policy_coherent) ? - MTYPE_UC : - MTYPE_NC; - - qpd->sh_mem_config = - SH_MEM_ALIGNMENT_MODE_UNALIGNED << - SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | - default_mtype << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | - ape1_mtype << SH_MEM_CONFIG__APE1_MTYPE__SHIFT; - - return true; -} - -static int update_qpd_vi(struct device_queue_manager *dqm, +static int register_process_vi(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { struct kfd_process_device *pdd; unsigned int temp; + BUG_ON(!dqm || !qpd); + pdd = qpd_to_pdd(qpd); /* check if sh_mem_config register already configured */ @@ -188,50 +137,14 @@ static int update_qpd_vi(struct device_queue_manager *dqm, qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); qpd->sh_mem_config |= SH_MEM_ADDRESS_MODE_HSA64 << SH_MEM_CONFIG__ADDRESS_MODE__SHIFT; - qpd->sh_mem_config |= 1 << - SH_MEM_CONFIG__PRIVATE_ATC__SHIFT; } - pr_debug("is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", + pr_debug("kfd: is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", qpd->pqm->process->is_32bit_user_mode, temp, qpd->sh_mem_bases); return 0; } -static int update_qpd_vi_tonga(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) -{ - struct kfd_process_device *pdd; - unsigned int temp; - - pdd = qpd_to_pdd(qpd); - - /* check if sh_mem_config register already configured */ - if (qpd->sh_mem_config == 0) { - qpd->sh_mem_config = - SH_MEM_ALIGNMENT_MODE_UNALIGNED << - SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | - MTYPE_UC << - SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | - MTYPE_UC << - SH_MEM_CONFIG__APE1_MTYPE__SHIFT; - - qpd->sh_mem_ape1_limit = 0; - qpd->sh_mem_ape1_base = 0; - } - - /* On dGPU we're always in GPUVM64 addressing mode with 64-bit - * aperture addresses. 
- */ - temp = get_sh_mem_bases_nybble_64(pdd); - qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); - - pr_debug("sh_mem_bases nybble: 0x%X and register 0x%X\n", - temp, qpd->sh_mem_bases); - - return 0; -} - static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd) { @@ -248,20 +161,6 @@ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, q->properties.sdma_vm_addr = value; } -static void init_sdma_vm_tonga(struct device_queue_manager *dqm, - struct queue *q, - struct qcm_process_device *qpd) -{ - /* On dGPU we're always in GPUVM64 addressing mode with 64-bit - * aperture addresses. - */ - q->properties.sdma_vm_addr = - ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) << - SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) & - SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK; -} - - static int initialize_cpsch_vi(struct device_queue_manager *dqm) { return 0; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c index 008d258..453c5d6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c @@ -34,6 +34,7 @@ */ #define KERNEL_DOORBELL_PASID 1 +#define KFD_SIZE_OF_DOORBELL_IN_BYTES 4 /* * Each device exposes a doorbell aperture, a PCI MMIO aperture that @@ -50,15 +51,15 @@ */ /* # of doorbell bytes allocated for each process. */ -size_t kfd_doorbell_process_slice(struct kfd_dev *kfd) +static inline size_t doorbell_process_allocation(void) { - return roundup(kfd->device_info->doorbell_size * + return roundup(KFD_SIZE_OF_DOORBELL_IN_BYTES * KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, PAGE_SIZE); } /* Doorbell calculations for device init. */ -int kfd_doorbell_init(struct kfd_dev *kfd) +void kfd_doorbell_init(struct kfd_dev *kfd) { size_t doorbell_start_offset; size_t doorbell_aperture_size; @@ -72,16 +73,16 @@ int kfd_doorbell_init(struct kfd_dev *kfd) doorbell_start_offset = roundup(kfd->shared_resources.doorbell_start_offset, - kfd_doorbell_process_slice(kfd)); + doorbell_process_allocation()); doorbell_aperture_size = rounddown(kfd->shared_resources.doorbell_aperture_size, - kfd_doorbell_process_slice(kfd)); + doorbell_process_allocation()); if (doorbell_aperture_size > doorbell_start_offset) doorbell_process_limit = (doorbell_aperture_size - doorbell_start_offset) / - kfd_doorbell_process_slice(kfd); + doorbell_process_allocation(); else doorbell_process_limit = 0; @@ -92,49 +93,45 @@ int kfd_doorbell_init(struct kfd_dev *kfd) kfd->doorbell_process_limit = doorbell_process_limit - 1; kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base, - kfd_doorbell_process_slice(kfd)); + doorbell_process_allocation()); - if (!kfd->doorbell_kernel_ptr) - return -ENOMEM; + BUG_ON(!kfd->doorbell_kernel_ptr); - pr_debug("Doorbell initialization:\n"); - pr_debug("doorbell base == 0x%08lX\n", + pr_debug("kfd: doorbell initialization:\n"); + pr_debug("kfd: doorbell base == 0x%08lX\n", (uintptr_t)kfd->doorbell_base); - pr_debug("doorbell_id_offset == 0x%08lX\n", + pr_debug("kfd: doorbell_id_offset == 0x%08lX\n", kfd->doorbell_id_offset); - pr_debug("doorbell_process_limit == 0x%08lX\n", + pr_debug("kfd: doorbell_process_limit == 0x%08lX\n", doorbell_process_limit); - pr_debug("doorbell_kernel_offset == 0x%08lX\n", + pr_debug("kfd: doorbell_kernel_offset == 0x%08lX\n", (uintptr_t)kfd->doorbell_base); - pr_debug("doorbell aperture size == 0x%08lX\n", + pr_debug("kfd: doorbell aperture size == 0x%08lX\n", kfd->shared_resources.doorbell_aperture_size); - 
pr_debug("doorbell kernel address == 0x%08lX\n", + pr_debug("kfd: doorbell kernel address == 0x%08lX\n", (uintptr_t)kfd->doorbell_kernel_ptr); - - return 0; -} - -void kfd_doorbell_fini(struct kfd_dev *kfd) -{ - if (kfd->doorbell_kernel_ptr) - iounmap(kfd->doorbell_kernel_ptr); } -int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, - struct vm_area_struct *vma) +int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma) { phys_addr_t address; + struct kfd_dev *dev; /* * For simplicitly we only allow mapping of the entire doorbell * allocation of a single device & process. */ - if (vma->vm_end - vma->vm_start != kfd_doorbell_process_slice(dev)) + if (vma->vm_end - vma->vm_start != doorbell_process_allocation()) + return -EINVAL; + + /* Find kfd device according to gpu id */ + dev = kfd_device_by_id(vma->vm_pgoff); + if (dev == NULL) return -EINVAL; /* Calculate physical address of doorbell */ @@ -145,29 +142,32 @@ int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - pr_debug("Mapping doorbell page\n" + pr_debug("kfd: mapping doorbell page in %s\n" " target user address == 0x%08llX\n" " physical address == 0x%08llX\n" " vm_flags == 0x%04lX\n" " size == 0x%04lX\n", + __func__, (unsigned long long) vma->vm_start, address, vma->vm_flags, - kfd_doorbell_process_slice(dev)); + doorbell_process_allocation()); return io_remap_pfn_range(vma, vma->vm_start, address >> PAGE_SHIFT, - kfd_doorbell_process_slice(dev), + doorbell_process_allocation(), vma->vm_page_prot); } /* get kernel iomem pointer for a doorbell */ -void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, +u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, unsigned int *doorbell_off) { u32 inx; + BUG_ON(!kfd || !doorbell_off); + mutex_lock(&kfd->doorbell_mutex); inx = find_first_zero_bit(kfd->doorbell_available_index, KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); @@ -178,17 +178,14 @@ void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) return NULL; - inx *= kfd->device_info->doorbell_size / sizeof(u32); - /* * Calculating the kernel doorbell offset using "faked" kernel - * pasid that allocated for kernel queues only. Offset is in - * dword units regardless of the ASIC-dependent doorbell size. 
+ * pasid that allocated for kernel queues only */ - *doorbell_off = KERNEL_DOORBELL_PASID * - (kfd_doorbell_process_slice(kfd) / sizeof(u32)) + inx; + *doorbell_off = KERNEL_DOORBELL_PASID * (doorbell_process_allocation() / + sizeof(u32)) + inx; - pr_debug("Get kernel queue doorbell\n" + pr_debug("kfd: get kernel queue doorbell\n" " doorbell offset == 0x%08X\n" " kernel address == 0x%08lX\n", *doorbell_off, (uintptr_t)(kfd->doorbell_kernel_ptr + inx)); @@ -200,6 +197,8 @@ void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr) { unsigned int inx; + BUG_ON(!kfd || !db_addr); + inx = (unsigned int)(db_addr - kfd->doorbell_kernel_ptr); mutex_lock(&kfd->doorbell_mutex); @@ -207,21 +206,11 @@ void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr) mutex_unlock(&kfd->doorbell_mutex); } -void write_kernel_doorbell(void __iomem *db, u32 value) +inline void write_kernel_doorbell(u32 __iomem *db, u32 value) { if (db) { writel(value, db); - pr_debug("Writing %d to doorbell address 0x%p\n", value, db); - } -} - -void write_kernel_doorbell64(void __iomem *db, u64 value) -{ - if (db) { - WARN(((unsigned long)db & 7) != 0, - "Unaligned 64-bit doorbell"); - writeq(value, (u64 __iomem *)db); - pr_debug("writing %llu to doorbell address 0x%p\n", value, db); + pr_debug("writing %d to doorbell address 0x%p\n", value, db); } } @@ -229,26 +218,25 @@ void write_kernel_doorbell64(void __iomem *db, u64 value) * queue_ids are in the range [0,MAX_PROCESS_QUEUES) and are mapped 1:1 * to doorbells with the process's doorbell page */ -unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd, +unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, struct kfd_process *process, - unsigned int doorbell_id) + unsigned int queue_id) { /* * doorbell_id_offset accounts for doorbells taken by KGD. - * pasid * kfd_doorbell_process_slice/sizeof(u32) adjusts to - * the process's doorbells. The offset returned is in dword - * units regardless of the ASIC-dependent doorbell size. + * pasid * doorbell_process_allocation/sizeof(u32) adjusts + * to the process's doorbells */ return kfd->doorbell_id_offset + - process->pasid * (kfd_doorbell_process_slice(kfd)/sizeof(u32)) + - doorbell_id * kfd->device_info->doorbell_size / sizeof(u32); + process->pasid * (doorbell_process_allocation()/sizeof(u32)) + + queue_id; } uint64_t kfd_get_number_elems(struct kfd_dev *kfd) { uint64_t num_of_elems = (kfd->shared_resources.doorbell_aperture_size - kfd->shared_resources.doorbell_start_offset) / - kfd_doorbell_process_slice(kfd) + 1; + doorbell_process_allocation() + 1; return num_of_elems; @@ -258,5 +246,5 @@ phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, struct kfd_process *process) { return dev->doorbell_base + - process->pasid * kfd_doorbell_process_slice(dev); + process->pasid * doorbell_process_allocation(); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 7eacf42..d1ce83d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -23,9 +23,9 @@ #include #include #include -#include -#include #include +#include +#include #include #include #include "kfd_priv.h" @@ -52,9 +52,6 @@ struct kfd_event_waiter { uint32_t input_index; }; -#define SLOTS_PER_PAGE KFD_SIGNAL_EVENT_LIMIT -#define SLOT_BITMAP_LONGS BITS_TO_LONGS(SLOTS_PER_PAGE) - /* * Over-complicated pooled allocator for event notification slots. 
* @@ -68,19 +65,24 @@ struct kfd_event_waiter { struct signal_page { struct list_head event_pages; /* kfd_process.signal_event_pages */ uint64_t *kernel_address; - uint64_t handle; uint64_t __user *user_address; uint32_t page_index; /* Index into the mmap aperture. */ unsigned int free_slots; - unsigned long used_slot_bitmap[SLOT_BITMAP_LONGS]; + unsigned long used_slot_bitmap[0]; }; +#define SLOTS_PER_PAGE KFD_SIGNAL_EVENT_LIMIT +#define SLOT_BITMAP_SIZE BITS_TO_LONGS(SLOTS_PER_PAGE) +#define BITS_PER_PAGE (ilog2(SLOTS_PER_PAGE)+1) +#define SIGNAL_PAGE_SIZE (sizeof(struct signal_page) + \ + SLOT_BITMAP_SIZE * sizeof(long)) + /* * For signal events, the event ID is used as the interrupt user data. * For SQ s_sendmsg interrupts, this is limited to 8 bits. */ -#define INTERRUPT_DATA_BITS 12 +#define INTERRUPT_DATA_BITS 8 #define SIGNAL_EVENT_ID_SLOT_SHIFT 0 static uint64_t *page_slots(struct signal_page *page) @@ -108,7 +110,7 @@ static bool allocate_free_slot(struct kfd_process *process, *out_page = page; *out_slot_index = slot; - pr_debug("Allocated event signal slot in page %p, slot %d\n", + pr_debug("allocated event signal slot in page %p, slot %d\n", page, slot); return true; @@ -129,7 +131,7 @@ static bool allocate_signal_page(struct file *devkfd, struct kfd_process *p) void *backing_store; struct signal_page *page; - page = kzalloc(sizeof(*page), GFP_KERNEL); + page = kzalloc(SIGNAL_PAGE_SIZE, GFP_KERNEL); if (!page) goto fail_alloc_signal_page; @@ -153,9 +155,9 @@ static bool allocate_signal_page(struct file *devkfd, struct kfd_process *p) struct signal_page, event_pages)->page_index + 1; - pr_debug("Allocated new event signal page at %p, for process %p\n", + pr_debug("allocated new event signal page at %p, for process %p\n", page, p); - pr_debug("Page index is %d\n", page->page_index); + pr_debug("page index is %d\n", page->page_index); list_add(&page->event_pages, &p->signal_event_pages); @@ -184,53 +186,6 @@ static bool allocate_event_notification_slot(struct file *devkfd, return ret; } -static bool allocate_signal_page_dgpu(struct kfd_process *p, - uint64_t *kernel_address, uint64_t handle) -{ - struct signal_page *my_page; - - my_page = kzalloc(sizeof(*my_page), GFP_KERNEL); - if (!my_page) - return false; - - /* prevent user-mode info leaks */ - memset(kernel_address, (uint8_t) UNSIGNALED_EVENT_SLOT, - KFD_SIGNAL_EVENT_LIMIT * 8); - - my_page->kernel_address = kernel_address; - my_page->handle = handle; - my_page->user_address = NULL; - my_page->free_slots = SLOTS_PER_PAGE; - if (list_empty(&p->signal_event_pages)) - my_page->page_index = 0; - else - my_page->page_index = list_tail_entry(&p->signal_event_pages, - struct signal_page, - event_pages)->page_index + 1; - - pr_debug("Allocated new event signal page at %p, for process %p\n", - my_page, p); - pr_debug("Page index is %d\n", my_page->page_index); - - list_add(&my_page->event_pages, &p->signal_event_pages); - - return true; -} - -void kfd_free_signal_page_dgpu(struct kfd_process *p, uint64_t handle) -{ - struct signal_page *page, *tmp; - - list_for_each_entry_safe(page, tmp, &p->signal_event_pages, - event_pages) { - if (page->handle == handle) { - list_del(&page->event_pages); - kfree(page); - break; - } - } -} - /* Assumes that the process's event_mutex is locked. 
*/ static void release_event_notification_slot(struct signal_page *page, size_t slot_index) @@ -239,8 +194,7 @@ static void release_event_notification_slot(struct signal_page *page, page->free_slots++; /* We don't free signal pages, they are retained by the process - * and reused until it exits. - */ + * and reused until it exits. */ } static struct signal_page *lookup_signal_page_by_index(struct kfd_process *p, @@ -292,7 +246,7 @@ static u32 make_nonsignal_event_id(struct kfd_process *p) for (id = p->next_nonsignal_event_id; id < KFD_LAST_NONSIGNAL_EVENT_ID && - lookup_event_by_id(p, id); + lookup_event_by_id(p, id) != NULL; id++) ; @@ -311,7 +265,7 @@ static u32 make_nonsignal_event_id(struct kfd_process *p) for (id = KFD_FIRST_NONSIGNAL_EVENT_ID; id < KFD_LAST_NONSIGNAL_EVENT_ID && - lookup_event_by_id(p, id); + lookup_event_by_id(p, id) != NULL; id++) ; @@ -337,16 +291,13 @@ static int create_signal_event(struct file *devkfd, struct kfd_event *ev) { if (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT) { - if (!p->signal_event_limit_reached) { - pr_warn("Signal event wasn't created because limit was reached\n"); - p->signal_event_limit_reached = true; - } + pr_warn("amdkfd: Signal event wasn't created because limit was reached\n"); return -ENOMEM; } if (!allocate_event_notification_slot(devkfd, p, &ev->signal_page, - &ev->signal_slot_index)) { - pr_warn("Signal event wasn't created because out of kernel memory\n"); + &ev->signal_slot_index)) { + pr_warn("amdkfd: Signal event wasn't created because out of kernel memory\n"); return -ENOMEM; } @@ -358,7 +309,11 @@ static int create_signal_event(struct file *devkfd, ev->event_id = make_signal_event_id(ev->signal_page, ev->signal_slot_index); - pr_debug("Signal event number %zu created with id %d, address %p\n", + pr_debug("signal event number %zu created with id %d, address %p\n", + p->signal_event_count, ev->event_id, + ev->user_signal_address); + + pr_debug("signal event number %zu created with id %d, address %p\n", p->signal_event_count, ev->event_id, ev->user_signal_address); @@ -390,7 +345,7 @@ void kfd_event_init_process(struct kfd_process *p) static void destroy_event(struct kfd_process *p, struct kfd_event *ev) { - if (ev->signal_page) { + if (ev->signal_page != NULL) { release_event_notification_slot(ev->signal_page, ev->signal_slot_index); p->signal_event_count--; @@ -426,9 +381,8 @@ static void shutdown_signal_pages(struct kfd_process *p) list_for_each_entry_safe(page, tmp, &p->signal_event_pages, event_pages) { - if (page->user_address) - free_pages((unsigned long)page->kernel_address, - get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); + free_pages((unsigned long)page->kernel_address, + get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); kfree(page); } } @@ -453,8 +407,7 @@ static bool event_can_be_cpu_signaled(const struct kfd_event *ev) int kfd_event_create(struct file *devkfd, struct kfd_process *p, uint32_t event_type, bool auto_reset, uint32_t node_id, uint32_t *event_id, uint32_t *event_trigger_data, - uint64_t *event_page_offset, uint32_t *event_slot_index, - void *kern_addr) + uint64_t *event_page_offset, uint32_t *event_slot_index) { int ret = 0; struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL); @@ -468,20 +421,17 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, INIT_LIST_HEAD(&ev->waiters); - mutex_lock(&p->event_mutex); - - if (kern_addr && list_empty(&p->signal_event_pages)) - allocate_signal_page_dgpu(p, kern_addr, *event_page_offset); - *event_page_offset = 0; + mutex_lock(&p->event_mutex); + switch 
(event_type) { case KFD_EVENT_TYPE_SIGNAL: case KFD_EVENT_TYPE_DEBUG: ret = create_signal_event(devkfd, p, ev); if (!ret) { *event_page_offset = (ev->signal_page->page_index | - KFD_MMAP_TYPE_EVENTS); + KFD_MMAP_EVENTS_MASK); *event_page_offset <<= PAGE_SHIFT; *event_slot_index = ev->signal_slot_index; } @@ -614,7 +564,7 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, /* * Because we are called from arbitrary context (workqueue) as opposed * to process context, kfd_process could attempt to exit while we are - * running so the lookup function increments the process ref count. + * running so the lookup function returns a locked process. */ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); @@ -634,7 +584,7 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, * search faster. */ struct signal_page *page; - unsigned int i; + unsigned i; list_for_each_entry(page, &p->signal_event_pages, event_pages) for (i = 0; i < SLOTS_PER_PAGE; i++) @@ -646,7 +596,7 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, } mutex_unlock(&p->event_mutex); - kfd_unref_process(p); + mutex_unlock(&p->mutex); } static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) @@ -667,7 +617,7 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) return event_waiters; } -static int init_event_waiter_get_status(struct kfd_process *p, +static int init_event_waiter(struct kfd_process *p, struct kfd_event_waiter *waiter, uint32_t event_id, uint32_t input_index) @@ -682,18 +632,9 @@ static int init_event_waiter_get_status(struct kfd_process *p, waiter->activated = ev->signaled; ev->signaled = ev->signaled && !ev->auto_reset; - return 0; -} + list_add(&waiter->waiters, &ev->waiters); -static void init_event_waiter_add_to_waitlist(struct kfd_event_waiter *waiter) -{ - struct kfd_event *ev = waiter->event; - - /* Only add to the wait list if we actually need to - * wait on this event. - */ - if (!waiter->activated) - list_add(&waiter->waiters, &ev->waiters); + return 0; } static bool test_event_condition(bool all, uint32_t num_events, @@ -788,11 +729,6 @@ int kfd_wait_on_events(struct kfd_process *p, mutex_lock(&p->event_mutex); - /* Set to something unreasonable - this is really - * just a bool for now. - */ - *wait_result = KFD_WAIT_TIMEOUT; - event_waiters = alloc_event_waiters(num_events); if (!event_waiters) { ret = -ENOMEM; @@ -808,34 +744,14 @@ int kfd_wait_on_events(struct kfd_process *p, goto fail; } - ret = init_event_waiter_get_status(p, &event_waiters[i], + ret = init_event_waiter(p, &event_waiters[i], event_data.event_id, i); if (ret) goto fail; } - /* Check condition once. */ - if (test_event_condition(all, num_events, event_waiters)) { - if (copy_signaled_event_data(num_events, - event_waiters, events)) - *wait_result = KFD_WAIT_COMPLETE; - else - *wait_result = KFD_WAIT_ERROR; - free_waiters(num_events, event_waiters); - } else { - /* Add to wait lists if we need to wait. */ - for (i = 0; i < num_events; i++) - init_event_waiter_add_to_waitlist(&event_waiters[i]); - } - mutex_unlock(&p->event_mutex); - /* Return if all waits were already satisfied. */ - if (*wait_result != KFD_WAIT_TIMEOUT) { - __set_current_state(TASK_RUNNING); - return ret; - } - while (true) { if (fatal_signal_pending(current)) { ret = -EINTR; @@ -855,17 +771,6 @@ int kfd_wait_on_events(struct kfd_process *p, break; } - /* Set task state to interruptible sleep before - * checking wake-up conditions. 
A concurrent wake-up - * will put the task back into runnable state. In that - * case schedule_timeout will not put the task to - * sleep and we'll get a chance to re-check the - * updated conditions almost immediately. Otherwise, - * this race condition would lead to a soft hang or a - * very long sleep. - */ - set_current_state(TASK_INTERRUPTIBLE); - if (test_event_condition(all, num_events, event_waiters)) { if (copy_signaled_event_data(num_events, event_waiters, events)) @@ -880,7 +785,7 @@ int kfd_wait_on_events(struct kfd_process *p, break; } - timeout = schedule_timeout(timeout); + timeout = schedule_timeout_interruptible(timeout); } __set_current_state(TASK_RUNNING); @@ -911,7 +816,7 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) /* check required size is logical */ if (get_order(KFD_SIGNAL_EVENT_LIMIT * 8) != get_order(vma->vm_end - vma->vm_start)) { - pr_err("Event page mmap requested illegal size\n"); + pr_err("amdkfd: event page mmap requested illegal size\n"); return -EINVAL; } @@ -920,7 +825,7 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) page = lookup_signal_page_by_index(p, page_index); if (!page) { /* Probably KFD bug, but mmap is user-accessible. */ - pr_debug("Signal page could not be found for page_index %u\n", + pr_debug("signal page could not be found for page_index %u\n", page_index); return -EINVAL; } @@ -931,7 +836,7 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP; - pr_debug("Mapping signal page\n"); + pr_debug("mapping signal page\n"); pr_debug(" start user address == 0x%08lx\n", vma->vm_start); pr_debug(" end user address == 0x%08lx\n", vma->vm_end); pr_debug(" pfn == 0x%016lX\n", pfn); @@ -971,13 +876,6 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, ev->memory_exception_data = *ev_data; } - if (type == KFD_EVENT_TYPE_MEMORY) { - dev_warn(kfd_device, - "Sending SIGSEGV to HSA Process with PID %d ", - p->lead_thread->pid); - send_sig(SIGSEGV, p->lead_thread, 0); - } - /* Send SIGTERM no event of type "type" has been found*/ if (send_signal) { if (send_sigterm) { @@ -993,7 +891,6 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, } } -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, unsigned long address, bool is_write_requested, bool is_execute_requested) @@ -1004,27 +901,17 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, /* * Because we are called from arbitrary context (workqueue) as opposed * to process context, kfd_process could attempt to exit while we are - * running so the lookup function increments the process ref count. + * running so the lookup function returns a locked process. */ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); - struct mm_struct *mm; if (!p) return; /* Presumably process exited. */ - /* Take a safe reference to the mm_struct, which may otherwise - * disappear even while the kfd_process is still referenced. 
- */ - mm = get_task_mm(p->lead_thread); - if (!mm) { - kfd_unref_process(p); - return; /* Process is exiting */ - } - memset(&memory_exception_data, 0, sizeof(memory_exception_data)); - down_read(&mm->mmap_sem); - vma = find_vma(mm, address); + down_read(&p->mm->mmap_sem); + vma = find_vma(p->mm, address); memory_exception_data.gpu_id = dev->id; memory_exception_data.va = address; @@ -1050,8 +937,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, } } - up_read(&mm->mmap_sem); - mmput(mm); + up_read(&p->mm->mmap_sem); mutex_lock(&p->event_mutex); @@ -1060,17 +946,15 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, &memory_exception_data); mutex_unlock(&p->event_mutex); - - kfd_unref_process(p); + mutex_unlock(&p->mutex); } -#endif /* CONFIG_AMD_IOMMU_V2_MODULE */ void kfd_signal_hw_exception_event(unsigned int pasid) { /* * Because we are called from arbitrary context (workqueue) as opposed * to process context, kfd_process could attempt to exit while we are - * running so the lookup function increments the process ref count. + * running so the lookup function returns a locked process. */ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); @@ -1083,42 +967,5 @@ void kfd_signal_hw_exception_event(unsigned int pasid) lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_HW_EXCEPTION, NULL); mutex_unlock(&p->event_mutex); - kfd_unref_process(p); -} - -void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, - struct kfd_vm_fault_info *info) -{ - struct kfd_event *ev; - int bkt; - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); - struct kfd_hsa_memory_exception_data memory_exception_data; - - if (!p) - return; /* Presumably process exited. */ - memset(&memory_exception_data, 0, sizeof(memory_exception_data)); - memory_exception_data.gpu_id = dev->id; - memory_exception_data.failure.imprecise = true; - /* Set failure reason */ - if (info) { - memory_exception_data.va = (info->page_addr) << PAGE_SHIFT; - memory_exception_data.failure.NotPresent = - info->prot_valid ? 1 : 0; - memory_exception_data.failure.NoExecute = - info->prot_exec ? 1 : 0; - memory_exception_data.failure.ReadOnly = - info->prot_write ? 1 : 0; - memory_exception_data.failure.imprecise = 0; - } - mutex_lock(&p->event_mutex); - - hash_for_each(p->events, bkt, ev, events) { - if (ev->type == KFD_EVENT_TYPE_MEMORY) { - ev->memory_exception_data = memory_exception_data; - set_event(ev); - } - } - - mutex_unlock(&p->event_mutex); - kfd_unref_process(p); + mutex_unlock(&p->mutex); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c index 09595a9..2b65510 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c @@ -275,80 +275,24 @@ * for FLAT_* / S_LOAD operations. 
*/ -#define MAKE_GPUVM_APP_BASE_VI(gpu_num) \ +#define MAKE_GPUVM_APP_BASE(gpu_num) \ (((uint64_t)(gpu_num) << 61) + 0x1000000000000L) -#define MAKE_GPUVM_APP_LIMIT(base, size) \ - (((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1) +#define MAKE_GPUVM_APP_LIMIT(base) \ + (((uint64_t)(base) & \ + 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL) -#define MAKE_SCRATCH_APP_BASE_VI() \ - (((uint64_t)(0x1UL) << 61) + 0x100000000L) +#define MAKE_SCRATCH_APP_BASE(gpu_num) \ + (((uint64_t)(gpu_num) << 61) + 0x100000000L) #define MAKE_SCRATCH_APP_LIMIT(base) \ (((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) -#define MAKE_LDS_APP_BASE_VI() \ - (((uint64_t)(0x1UL) << 61) + 0x0) - +#define MAKE_LDS_APP_BASE(gpu_num) \ + (((uint64_t)(gpu_num) << 61) + 0x0) #define MAKE_LDS_APP_LIMIT(base) \ (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) -/* On GFXv9 the LDS and scratch apertures are programmed independently - * using the high 16 bits of the 64-bit virtual address. They must be - * in the hole, which will be the case as long as the high 16 bits are - * not 0. - * - * The aperture sizes are still 4GB implicitly. - * - * A GPUVM aperture is not applicable on GFXv9. - */ -#define MAKE_LDS_APP_BASE_V9() ((uint64_t)(0x1UL) << 48) -#define MAKE_SCRATCH_APP_BASE_V9() ((uint64_t)(0x2UL) << 48) - -/* Some VM address space reserved for kernel use (CWSR trap handlers - * and kernel IBs) - */ -#define DGPU_VM_BASE_DEFAULT 0x100000 -#define DGPU_IB_BASE_DEFAULT (DGPU_VM_BASE_DEFAULT - PAGE_SIZE) - -int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd, - uint64_t base, uint64_t limit) -{ - if (base < (pdd->qpd.cwsr_base + pdd->dev->cwsr_size)) { - pr_err("Set dgpu vm base 0x%llx failed.\n", base); - return -EINVAL; - } - pdd->dgpu_base = base; - pdd->dgpu_limit = limit; - return 0; -} - -void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id) -{ - /* - * node id couldn't be 0 - the three MSB bits of - * aperture shoudn't be 0 - */ - pdd->lds_base = MAKE_LDS_APP_BASE_VI(); - pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); - - pdd->gpuvm_base = MAKE_GPUVM_APP_BASE_VI(id + 1); - pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT( - pdd->gpuvm_base, pdd->dev->shared_resources.gpuvm_size); - - pdd->scratch_base = MAKE_SCRATCH_APP_BASE_VI(); - pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); -} - -void kfd_init_apertures_v9(struct kfd_process_device *pdd, uint8_t id) -{ - pdd->lds_base = MAKE_LDS_APP_BASE_V9(); - pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); - - pdd->scratch_base = MAKE_SCRATCH_APP_BASE_V9(); - pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); -} - int kfd_init_apertures(struct kfd_process *process) { uint8_t id = 0; @@ -356,14 +300,11 @@ int kfd_init_apertures(struct kfd_process *process) struct kfd_process_device *pdd; /*Iterating over all devices*/ - while (kfd_topology_enum_kfd_devices(id, &dev) == 0) { - if (!dev) { - id++; /* Skip non GPU devices */ - continue; - } + while ((dev = kfd_topology_enum_kfd_devices(id)) != NULL && + id < NUM_OF_SUPPORTED_GPUS) { pdd = kfd_create_process_device_data(dev, process); - if (!pdd) { + if (pdd == NULL) { pr_err("Failed to create process device data\n"); return -1; } @@ -377,29 +318,23 @@ int kfd_init_apertures(struct kfd_process *process) pdd->gpuvm_base = pdd->gpuvm_limit = 0; pdd->scratch_base = pdd->scratch_limit = 0; } else { - switch (dev->device_info->asic_family) { - case CHIP_KAVERI: - case CHIP_HAWAII: - case CHIP_CARRIZO: - case CHIP_TONGA: - case CHIP_FIJI: - case CHIP_POLARIS10: - 
case CHIP_POLARIS11: - kfd_init_apertures_vi(pdd, id); - break; - case CHIP_VEGA10: - case CHIP_RAVEN: - kfd_init_apertures_v9(pdd, id); - break; - default: - pr_err("Unknown chip in kfd_init_apertures\n"); - return -1; - } + /* + * node id couldn't be 0 - the three MSB bits of + * aperture shoudn't be 0 + */ + pdd->lds_base = MAKE_LDS_APP_BASE(id + 1); + + pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); + + pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1); + + pdd->gpuvm_limit = + MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base); - if (!dev->device_info->is_need_iommu_device) { - pdd->qpd.cwsr_base = DGPU_VM_BASE_DEFAULT; - pdd->qpd.ib_base = DGPU_IB_BASE_DEFAULT; - } + pdd->scratch_base = MAKE_SCRATCH_APP_BASE(id + 1); + + pdd->scratch_limit = + MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); } dev_dbg(kfd_device, "node id %u\n", id); @@ -417,9 +352,4 @@ int kfd_init_apertures(struct kfd_process *process) return 0; } -void kfd_flush_tlb(struct kfd_dev *dev, uint32_t pasid) -{ - const struct kfd2kgd_calls *f2g = dev->kfd2kgd; - f2g->invalidate_tlbs(dev->kgd, pasid); -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c deleted file mode 100644 index b2c6b52..0000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright 2016 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include "kfd_priv.h" -#include "kfd_events.h" -#include "soc15_int.h" - - -static uint32_t kfd_get_pasid_from_vmid(struct kfd_dev *dev, uint8_t vmid) -{ - uint32_t pasid = 0; - const struct kfd2kgd_calls *f2g = dev->kfd2kgd; - - if (f2g->get_atc_vmid_pasid_mapping_valid(dev->kgd, vmid)) - pasid = f2g->get_atc_vmid_pasid_mapping_pasid(dev->kgd, vmid); - - return pasid; -} - -static bool event_interrupt_isr_v9(struct kfd_dev *dev, - const uint32_t *ih_ring_entry, - uint32_t *patched_ihre, - bool *patched_flag) -{ - uint16_t source_id, client_id, pasid, vmid; - bool result = false; - - source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); - client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); - pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); - vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); - - if (pasid) { - const uint32_t *data = ih_ring_entry; - - pr_debug("client id 0x%x, source id %d, pasid 0x%x. 
raw data:\n", - client_id, source_id, pasid); - pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", - data[0], data[1], data[2], data[3], - data[4], data[5], data[6], data[7]); - } - - if ((vmid >= dev->vm_info.first_vmid_kfd && - vmid <= dev->vm_info.last_vmid_kfd) && - (source_id == SOC15_INTSRC_CP_END_OF_PIPE || - source_id == SOC15_INTSRC_SDMA_TRAP || - source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG || - source_id == SOC15_INTSRC_CP_BAD_OPCODE || - client_id == SOC15_IH_CLIENTID_VMC || - client_id == SOC15_IH_CLIENTID_UTCL2)) { - - /* - * KFD want to handle this INT, but MEC firmware did - * not send pasid. Try to get it from vmid mapping - * and patch the ih entry. It's a temp workaround. - */ - WARN_ONCE((!pasid), "Fix me.\n"); - if (!pasid) { - uint32_t temp = le32_to_cpu(ih_ring_entry[3]); - - pasid = kfd_get_pasid_from_vmid(dev, vmid); - memcpy(patched_ihre, ih_ring_entry, - dev->device_info->ih_ring_entry_size); - patched_ihre[3] = cpu_to_le32(temp | pasid); - *patched_flag = true; - } - result = pasid ? true : false; - } - - /* Do not process in ISR, just request it to be forwarded to WQ. */ - return result; - -} - -static void event_interrupt_wq_v9(struct kfd_dev *dev, - const uint32_t *ih_ring_entry) -{ - uint16_t source_id, client_id, pasid, vmid; - - source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); - client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); - pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); - vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); - - if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) - kfd_signal_event_interrupt(pasid, 0, 0); - else if (source_id == SOC15_INTSRC_SDMA_TRAP) - kfd_signal_event_interrupt(pasid, 0, 0); - else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) - kfd_signal_event_interrupt(pasid, 0, 0); /*todo */ - else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) - kfd_signal_hw_exception_event(pasid); - else if (client_id == SOC15_IH_CLIENTID_VMC || - client_id == SOC15_IH_CLIENTID_UTCL2) { - struct kfd_vm_fault_info info = {0}; - uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); - - info.vmid = vmid; - info.mc_id = client_id; - info.page_addr = ih_ring_entry[4] | - (uint64_t)(ih_ring_entry[5] & 0xf) << 32; - info.prot_valid = ring_id & 0x08; - info.prot_read = ring_id & 0x10; - info.prot_write = ring_id & 0x20; - - kfd_process_vm_fault(dev->dqm, pasid); - kfd_signal_vm_fault_event(dev, pasid, &info); - } -} - -const struct kfd_event_interrupt_class event_interrupt_class_v9 = { - .interrupt_isr = event_interrupt_isr_v9, - .interrupt_wq = event_interrupt_wq_v9, -}; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c index 47dcf4a..7f134aa 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c @@ -44,24 +44,24 @@ #include #include "kfd_priv.h" -#define KFD_IH_NUM_ENTRIES 8192 +#define KFD_INTERRUPT_RING_SIZE 1024 static void interrupt_wq(struct work_struct *); int kfd_interrupt_init(struct kfd_dev *kfd) { - int r; - - r = kfifo_alloc(&kfd->ih_fifo, - KFD_IH_NUM_ENTRIES * - kfd->device_info->ih_ring_entry_size, - GFP_KERNEL); - if (r) { - dev_err(kfd_chardev(), "Failed to allocate IH fifo\n"); - return r; - } + void *interrupt_ring = kmalloc_array(KFD_INTERRUPT_RING_SIZE, + kfd->device_info->ih_ring_entry_size, + GFP_KERNEL); + if (!interrupt_ring) + return -ENOMEM; + + kfd->interrupt_ring = interrupt_ring; + kfd->interrupt_ring_size = + KFD_INTERRUPT_RING_SIZE * kfd->device_info->ih_ring_entry_size; + 
atomic_set(&kfd->interrupt_ring_wptr, 0); + atomic_set(&kfd->interrupt_ring_rptr, 0); - kfd->ih_wq = alloc_workqueue("KFD IH", WQ_HIGHPRI, 1); spin_lock_init(&kfd->interrupt_lock); INIT_WORK(&kfd->interrupt_work, interrupt_wq); @@ -92,47 +92,74 @@ void kfd_interrupt_exit(struct kfd_dev *kfd) spin_unlock_irqrestore(&kfd->interrupt_lock, flags); /* - * flush_work ensures that there are no outstanding + * Flush_scheduled_work ensures that there are no outstanding * work-queue items that will access interrupt_ring. New work items * can't be created because we stopped interrupt handling above. */ - flush_workqueue(kfd->ih_wq); + flush_scheduled_work(); - kfifo_free(&kfd->ih_fifo); + kfree(kfd->interrupt_ring); } /* - * Assumption: single reader/writer. This function is not re-entrant + * This assumes that it can't be called concurrently with itself + * but only with dequeue_ih_ring_entry. */ bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry) { - int count; + unsigned int rptr = atomic_read(&kfd->interrupt_ring_rptr); + unsigned int wptr = atomic_read(&kfd->interrupt_ring_wptr); - count = kfifo_in(&kfd->ih_fifo, ih_ring_entry, - kfd->device_info->ih_ring_entry_size); - if (count != kfd->device_info->ih_ring_entry_size) { + if ((rptr - wptr) % kfd->interrupt_ring_size == + kfd->device_info->ih_ring_entry_size) { + /* This is very bad, the system is likely to hang. */ dev_err_ratelimited(kfd_chardev(), - "Interrupt ring overflow, dropping interrupt %d\n", - count); + "Interrupt ring overflow, dropping interrupt.\n"); return false; } + memcpy(kfd->interrupt_ring + wptr, ih_ring_entry, + kfd->device_info->ih_ring_entry_size); + + wptr = (wptr + kfd->device_info->ih_ring_entry_size) % + kfd->interrupt_ring_size; + smp_wmb(); /* Ensure memcpy'd data is visible before wptr update. */ + atomic_set(&kfd->interrupt_ring_wptr, wptr); + return true; } /* - * Assumption: single reader/writer. This function is not re-entrant + * This assumes that it can't be called concurrently with itself + * but only with enqueue_ih_ring_entry. */ static bool dequeue_ih_ring_entry(struct kfd_dev *kfd, void *ih_ring_entry) { - int count; + /* + * Assume that wait queues have an implicit barrier, i.e. anything that + * happened in the ISR before it queued work is visible. + */ + + unsigned int wptr = atomic_read(&kfd->interrupt_ring_wptr); + unsigned int rptr = atomic_read(&kfd->interrupt_ring_rptr); - count = kfifo_out(&kfd->ih_fifo, ih_ring_entry, - kfd->device_info->ih_ring_entry_size); + if (rptr == wptr) + return false; - WARN_ON(count && count != kfd->device_info->ih_ring_entry_size); + memcpy(ih_ring_entry, kfd->interrupt_ring + rptr, + kfd->device_info->ih_ring_entry_size); - return count == kfd->device_info->ih_ring_entry_size; + rptr = (rptr + kfd->device_info->ih_ring_entry_size) % + kfd->interrupt_ring_size; + + /* + * Ensure the rptr write update is not visible until + * memcpy has finished reading. 
+ */ + smp_mb(); + atomic_set(&kfd->interrupt_ring_rptr, rptr); + + return true; } static void interrupt_wq(struct work_struct *work) @@ -149,15 +176,13 @@ static void interrupt_wq(struct work_struct *work) ih_ring_entry); } -bool interrupt_is_wanted(struct kfd_dev *dev, - const uint32_t *ih_ring_entry, - uint32_t *patched_ihre, bool *flag) +bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry) { /* integer and bitwise OR so there is no boolean short-circuiting */ - unsigned int wanted = 0; + unsigned wanted = 0; wanted |= dev->device_info->event_interrupt_class->interrupt_isr(dev, - ih_ring_entry, patched_ihre, flag); + ih_ring_entry); return wanted != 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c deleted file mode 100644 index e67eb9f..0000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c +++ /dev/null @@ -1,275 +0,0 @@ -/* - * Copyright 2014 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include -#include -#include - -#include "kfd_ipc.h" -#include "kfd_priv.h" - -#define KFD_IPC_HASH_TABLE_SIZE_SHIFT 4 -#define KFD_IPC_HASH_TABLE_SIZE_MASK ((1 << KFD_IPC_HASH_TABLE_SIZE_SHIFT) - 1) - -static struct kfd_ipc_handles { - DECLARE_HASHTABLE(handles, KFD_IPC_HASH_TABLE_SIZE_SHIFT); - struct mutex lock; -} kfd_ipc_handles; - -/* Since, handles are random numbers, it can be used directly as hashing key. - * The least 4 bits of the handle are used as key. However, during import all - * 128 bits of the handle are checked to prevent handle snooping. - */ -#define HANDLE_TO_KEY(sh) ((*(uint64_t *)sh) & KFD_IPC_HASH_TABLE_SIZE_MASK) - -static int ipc_store_insert(void *val, void *sh, struct kfd_ipc_obj **ipc_obj) -{ - struct kfd_ipc_obj *obj; - - obj = kmalloc(sizeof(*obj), GFP_KERNEL); - if (!obj) - return -ENOMEM; - - /* The initial ref belongs to the allocator process. - * The IPC object store itself does not hold a ref since - * there is no specific moment in time where that ref should - * be dropped, except "when there are no more userspace processes - * holding a ref to the object". Therefore the removal from IPC - * storage happens at ipc_obj release time. 
- */ - kref_init(&obj->ref); - obj->data = val; - get_random_bytes(obj->share_handle, sizeof(obj->share_handle)); - - memcpy(sh, obj->share_handle, sizeof(obj->share_handle)); - - mutex_lock(&kfd_ipc_handles.lock); - hlist_add_head(&obj->node, - &kfd_ipc_handles.handles[HANDLE_TO_KEY(obj->share_handle)]); - mutex_unlock(&kfd_ipc_handles.lock); - - if (ipc_obj) - *ipc_obj = obj; - - return 0; -} - -static void ipc_obj_release(struct kref *r) -{ - struct kfd_ipc_obj *obj; - - obj = container_of(r, struct kfd_ipc_obj, ref); - - mutex_lock(&kfd_ipc_handles.lock); - hash_del(&obj->node); - mutex_unlock(&kfd_ipc_handles.lock); - - dma_buf_put(obj->data); - kfree(obj); -} - -void ipc_obj_get(struct kfd_ipc_obj *obj) -{ - kref_get(&obj->ref); -} - -void ipc_obj_put(struct kfd_ipc_obj **obj) -{ - kref_put(&(*obj)->ref, ipc_obj_release); - *obj = NULL; -} - -int kfd_ipc_init(void) -{ - mutex_init(&kfd_ipc_handles.lock); - hash_init(kfd_ipc_handles.handles); - return 0; -} - -static int kfd_import_dmabuf_create_kfd_bo(struct kfd_dev *dev, - struct kfd_process *p, - uint32_t gpu_id, struct dma_buf *dmabuf, - uint64_t va_addr, uint64_t *handle, - uint64_t *mmap_offset, - struct kfd_ipc_obj *ipc_obj) -{ - int r; - void *mem; - uint64_t size; - int idr_handle; - struct kfd_process_device *pdd = NULL; - uint64_t kfd_mmap_flags = KFD_MMAP_TYPE_MAP_BO | - KFD_MMAP_GPU_ID(gpu_id); - - if (!handle) - return -EINVAL; - - if (!dev || !dev->kfd2kgd->import_dmabuf) - return -EINVAL; - - mutex_lock(&p->mutex); - - pdd = kfd_bind_process_to_device(dev, p); - if (IS_ERR(pdd)) { - r = PTR_ERR(pdd); - goto err_unlock; - } - - r = dev->kfd2kgd->import_dmabuf(dev->kgd, dmabuf, - va_addr, pdd->vm, - (struct kgd_mem **)&mem, &size, - mmap_offset); - if (r) - goto err_unlock; - - idr_handle = kfd_process_device_create_obj_handle(pdd, mem, - va_addr, size, - ipc_obj); - if (idr_handle < 0) { - r = -EFAULT; - goto err_free; - } - - mutex_unlock(&p->mutex); - - *handle = MAKE_HANDLE(gpu_id, idr_handle); - if (mmap_offset) - *mmap_offset = (kfd_mmap_flags << PAGE_SHIFT) | *mmap_offset; - - return 0; - -err_free: - dev->kfd2kgd->free_memory_of_gpu(dev->kgd, - (struct kgd_mem *)mem, - pdd->vm); -err_unlock: - mutex_unlock(&p->mutex); - return r; -} - -int kfd_ipc_import_dmabuf(struct kfd_dev *dev, - struct kfd_process *p, - uint32_t gpu_id, int dmabuf_fd, - uint64_t va_addr, uint64_t *handle, - uint64_t *mmap_offset) -{ - int r; - struct dma_buf *dmabuf = dma_buf_get(dmabuf_fd); - - if (!dmabuf) - return -EINVAL; - - r = kfd_import_dmabuf_create_kfd_bo(dev, p, gpu_id, dmabuf, - va_addr, handle, mmap_offset, - NULL); - dma_buf_put(dmabuf); - return r; -} - -int kfd_ipc_import_handle(struct kfd_dev *dev, struct kfd_process *p, - uint32_t gpu_id, uint32_t *share_handle, - uint64_t va_addr, uint64_t *handle, - uint64_t *mmap_offset) -{ - int r; - struct kfd_ipc_obj *entry, *found = NULL; - - mutex_lock(&kfd_ipc_handles.lock); - /* Convert the user provided handle to hash key and search only in that - * bucket - */ - hlist_for_each_entry(entry, - &kfd_ipc_handles.handles[HANDLE_TO_KEY(share_handle)], node) { - if (!memcmp(entry->share_handle, share_handle, - sizeof(entry->share_handle))) { - found = entry; - break; - } - } - mutex_unlock(&kfd_ipc_handles.lock); - - if (!found) - return -EINVAL; - ipc_obj_get(found); - - pr_debug("Found ipc_dma_buf: %p\n", found->data); - - r = kfd_import_dmabuf_create_kfd_bo(dev, p, gpu_id, found->data, - va_addr, handle, mmap_offset, - found); - if (r) - goto error_unref; - - return r; - 
-error_unref: - ipc_obj_put(&found); - return r; -} - -int kfd_ipc_export_as_handle(struct kfd_dev *dev, struct kfd_process *p, - uint64_t handle, uint32_t *ipc_handle) -{ - struct kfd_process_device *pdd = NULL; - struct kfd_ipc_obj *obj; - struct kfd_bo *kfd_bo = NULL; - struct dma_buf *dmabuf; - int r; - - if (!dev || !ipc_handle) - return -EINVAL; - - mutex_lock(&p->mutex); - pdd = kfd_bind_process_to_device(dev, p); - if (IS_ERR(pdd)) { - mutex_unlock(&p->mutex); - pr_err("Failed to get pdd\n"); - return PTR_ERR(pdd); - } - - kfd_bo = kfd_process_device_find_bo(pdd, GET_IDR_HANDLE(handle)); - mutex_unlock(&p->mutex); - - if (!kfd_bo) { - pr_err("Failed to get bo"); - return -EINVAL; - } - if (kfd_bo->kfd_ipc_obj) { - memcpy(ipc_handle, kfd_bo->kfd_ipc_obj->share_handle, - sizeof(kfd_bo->kfd_ipc_obj->share_handle)); - return 0; - } - - r = dev->kfd2kgd->export_dmabuf(dev->kgd, pdd->vm, - (struct kgd_mem *)kfd_bo->mem, - &dmabuf); - if (r) - return r; - - r = ipc_store_insert(dmabuf, ipc_handle, &obj); - if (r) - return r; - - kfd_bo->kfd_ipc_obj = obj; - - return r; -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.h b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.h deleted file mode 100644 index 9ee8627..0000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright 2014 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
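
The comment in ipc_store_insert() above spells out the lifetime rule: the handle store holds no reference of its own, the initial reference belongs to the exporting process, and the object is unhashed from its release path once the last user drops it. A stripped-down, single-threaded model of that rule (the driver uses a kref plus the store mutex; the names here are made up):

#include <stdlib.h>

struct demo_ipc_obj {
        unsigned int refcount;
        void *payload;
};

static void demo_store_remove(struct demo_ipc_obj *obj)
{
        /* hash_del() + dma_buf_put() in the real release path */
        (void)obj;
}

static struct demo_ipc_obj *demo_obj_export(void *payload)
{
        struct demo_ipc_obj *obj = malloc(sizeof(*obj));

        if (!obj)
                return NULL;
        obj->refcount = 1;                      /* owned by the exporter      */
        obj->payload = payload;
        return obj;                             /* store keeps no extra ref   */
}

static void demo_obj_get(struct demo_ipc_obj *obj)      /* importer attaches */
{
        obj->refcount++;
}

static void demo_obj_put(struct demo_ipc_obj *obj)      /* any user detaches */
{
        if (--obj->refcount == 0) {
                demo_store_remove(obj);
                free(obj);
        }
}

Because removal happens only in the final put, there is no separate moment at which the store has to drop the object, which is exactly the point the original comment makes.
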
- * - */ - -#ifndef KFD_IPC_H_ -#define KFD_IPC_H_ - -#include -#include "kfd_priv.h" - -struct kfd_ipc_obj { - struct hlist_node node; - struct kref ref; - void *data; - uint32_t share_handle[4]; -}; - -int kfd_ipc_import_handle(struct kfd_dev *dev, struct kfd_process *p, - uint32_t gpu_id, uint32_t *share_handle, - uint64_t va_addr, uint64_t *handle, - uint64_t *mmap_offset); -int kfd_ipc_import_dmabuf(struct kfd_dev *kfd, struct kfd_process *p, - uint32_t gpu_id, int dmabuf_fd, - uint64_t va_addr, uint64_t *handle, - uint64_t *mmap_offset); -int kfd_ipc_export_as_handle(struct kfd_dev *dev, struct kfd_process *p, - uint64_t handle, uint32_t *ipc_handle); - -void ipc_obj_get(struct kfd_ipc_obj *obj); -void ipc_obj_put(struct kfd_ipc_obj **obj); - -#endif /* KFD_IPC_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index 903ef25..d135cd0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -41,8 +41,11 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, int retval; union PM4_MES_TYPE_3_HEADER nop; - pr_debug("Initializing queue type %d size %d\n", KFD_QUEUE_TYPE_HIQ, - queue_size); + BUG_ON(!kq || !dev); + BUG_ON(type != KFD_QUEUE_TYPE_DIQ && type != KFD_QUEUE_TYPE_HIQ); + + pr_debug("amdkfd: In func %s initializing queue type %d size %d\n", + __func__, KFD_QUEUE_TYPE_HIQ, queue_size); memset(&prop, 0, sizeof(prop)); memset(&nop, 0, sizeof(nop)); @@ -60,23 +63,23 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, KFD_MQD_TYPE_HIQ); break; default: - pr_err("Invalid queue type %d\n", type); - return false; + BUG(); + break; } - if (!kq->mqd) + if (kq->mqd == NULL) return false; prop.doorbell_ptr = kfd_get_kernel_doorbell(dev, &prop.doorbell_off); - if (!prop.doorbell_ptr) { - pr_err("Failed to initialize doorbell"); + if (prop.doorbell_ptr == NULL) { + pr_err("amdkfd: error init doorbell"); goto err_get_kernel_doorbell; } retval = kfd_gtt_sa_allocate(dev, queue_size, &kq->pq); if (retval != 0) { - pr_err("Failed to init pq queues size %d\n", queue_size); + pr_err("amdkfd: error init pq queues size (%d)\n", queue_size); goto err_pq_allocate_vidmem; } @@ -84,7 +87,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, kq->pq_gpu_addr = kq->pq->gpu_addr; retval = kq->ops_asic_specific.initialize(kq, dev, type, queue_size); - if (!retval) + if (retval == false) goto err_eop_allocate_vidmem; retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->rptr_kernel), @@ -96,7 +99,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, kq->rptr_kernel = kq->rptr_mem->cpu_ptr; kq->rptr_gpu_addr = kq->rptr_mem->gpu_addr; - retval = kfd_gtt_sa_allocate(dev, dev->device_info->doorbell_size, + retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->wptr_kernel), &kq->wptr_mem); if (retval != 0) @@ -120,7 +123,6 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, prop.write_ptr = (uint32_t *) kq->wptr_gpu_addr; prop.eop_ring_buffer_address = kq->eop_gpu_addr; prop.eop_ring_buffer_size = PAGE_SIZE; - prop.cu_mask = NULL; if (init_queue(&kq->queue, &prop) != 0) goto err_init_queue; @@ -137,12 +139,11 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, /* assign HIQ to HQD */ if (type == KFD_QUEUE_TYPE_HIQ) { - pr_debug("Assigning hiq to hqd\n"); + pr_debug("assigning hiq to hqd\n"); kq->queue->pipe = KFD_CIK_HIQ_PIPE; kq->queue->queue = KFD_CIK_HIQ_QUEUE; kq->mqd->load_mqd(kq->mqd, 
kq->queue->mqd, kq->queue->pipe, - kq->queue->queue, &kq->queue->properties, - NULL); + kq->queue->queue, NULL); } else { /* allocate fence for DIQ */ @@ -179,10 +180,12 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, static void uninitialize(struct kernel_queue *kq) { + BUG_ON(!kq); + if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ) kq->mqd->destroy_mqd(kq->mqd, - kq->queue->mqd, - KFD_PREEMPT_TYPE_WAVEFRONT_RESET, + NULL, + false, QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS, kq->queue->pipe, kq->queue->queue); @@ -206,17 +209,12 @@ static int acquire_packet_buffer(struct kernel_queue *kq, size_t available_size; size_t queue_size_dwords; uint32_t wptr, rptr; - uint64_t wptr64; unsigned int *queue_address; - /* When rptr == wptr, the buffer is empty. - * When rptr == wptr + 1, the buffer is full. - * It is always rptr that advances to the position of wptr, rather than - * the opposite. So we can only use up to queue_size_dwords - 1 dwords. - */ + BUG_ON(!kq || !buffer_ptr); + rptr = *kq->rptr_kernel; - wptr = kq->pending_wptr; - wptr64 = kq->pending_wptr64; + wptr = *kq->wptr_kernel; queue_address = (unsigned int *)kq->pq_kernel_addr; queue_size_dwords = kq->queue->properties.queue_size / sizeof(uint32_t); @@ -224,72 +222,28 @@ static int acquire_packet_buffer(struct kernel_queue *kq, pr_debug("wptr: %d\n", wptr); pr_debug("queue_address 0x%p\n", queue_address); - available_size = (rptr + queue_size_dwords - 1 - wptr) % + available_size = (rptr - 1 - wptr + queue_size_dwords) % queue_size_dwords; - if (packet_size_in_dwords > available_size) { + if (packet_size_in_dwords >= queue_size_dwords || + packet_size_in_dwords >= available_size) { /* * make sure calling functions know * acquire_packet_buffer() failed */ - goto err_no_space; + *buffer_ptr = NULL; + return -ENOMEM; } if (wptr + packet_size_in_dwords >= queue_size_dwords) { - /* make sure after rolling back to position 0, there is - * still enough space. - */ - if (packet_size_in_dwords >= rptr) - goto err_no_space; - - /* fill nops, roll back and start at position 0 */ while (wptr > 0) { queue_address[wptr] = kq->nop_packet; wptr = (wptr + 1) % queue_size_dwords; - wptr64++; } } *buffer_ptr = &queue_address[wptr]; kq->pending_wptr = wptr + packet_size_in_dwords; - kq->pending_wptr64 = wptr64 + packet_size_in_dwords; - - return 0; - -err_no_space: - *buffer_ptr = NULL; - return -ENOMEM; -} - -static int acquire_inline_ib(struct kernel_queue *kq, - size_t size_in_dwords, - unsigned int **buffer_ptr, - uint64_t *gpu_addr) -{ - int ret; - unsigned int *buf; - union PM4_MES_TYPE_3_HEADER nop; - - if (size_in_dwords >= (1 << 14)) - return -EINVAL; - - /* Allocate size_in_dwords on the ring, plus an extra dword - * for a NOP packet header - */ - ret = acquire_packet_buffer(kq, size_in_dwords + 1, &buf); - if (ret) - return ret; - - /* Build a NOP packet that contains the IB as "payload". 
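
The removed comment in acquire_packet_buffer() above states the ring convention: rptr == wptr means empty, so one dword always stays unused, and a packet that would run past the end of the ring is preceded by NOP fill and restarted at offset 0. The arithmetic, pulled out into a standalone sketch (sizes in dwords, caller-supplied pointers simplified away):

#include <stddef.h>
#include <stdint.h>

/* Usable dwords in a ring of `size` dwords where rptr chases wptr and one
 * slot is always left empty so that rptr == wptr means "empty".  Same
 * expression as acquire_packet_buffer() above. */
static size_t ring_space(uint32_t rptr, uint32_t wptr, size_t size)
{
        return (rptr + size - 1 - wptr) % size;
}

/* Returns the dword offset at which `len` contiguous dwords can be written,
 * or -1 if there is no room.  The caller is expected to fill [*wptr, size)
 * with NOP packets whenever this wraps *wptr back to 0. */
static int ring_acquire(uint32_t rptr, uint32_t *wptr, size_t size, size_t len)
{
        if (len > ring_space(rptr, *wptr, size))
                return -1;

        if (*wptr + len >= size) {              /* would run off the end       */
                if (len >= rptr)                /* no room at the start either */
                        return -1;
                *wptr = 0;                      /* NOP-fill the tail and wrap  */
        }
        return (int)*wptr;                      /* write here, then advance    */
}

For an 8-dword ring with rptr = 2 and wptr = 5, ring_space() gives (2 + 8 - 1 - 5) % 8 = 4: three dwords are in flight and one stays empty so the empty and full states remain distinguishable.
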
*/ - nop.u32all = 0; - nop.opcode = IT_NOP; - nop.count = size_in_dwords - 1; - nop.type = PM4_TYPE_3; - - *buf = nop.u32all; - *buffer_ptr = buf + 1; - *gpu_addr = kq->pq_gpu_addr + ((unsigned long)*buffer_ptr - - (unsigned long)kq->pq_kernel_addr); return 0; } @@ -298,7 +252,11 @@ static void submit_packet(struct kernel_queue *kq) { #ifdef DEBUG int i; +#endif + + BUG_ON(!kq); +#ifdef DEBUG for (i = *kq->wptr_kernel; i < kq->pending_wptr; i++) { pr_debug("0x%2X ", kq->pq_kernel_addr[i]); if (i % 15 == 0) @@ -307,11 +265,14 @@ static void submit_packet(struct kernel_queue *kq) pr_debug("\n"); #endif - kq->ops_asic_specific.submit_packet(kq); + *kq->wptr_kernel = kq->pending_wptr; + write_kernel_doorbell(kq->queue->properties.doorbell_ptr, + kq->pending_wptr); } static void rollback_packet(struct kernel_queue *kq) { + BUG_ON(!kq); kq->pending_wptr = *kq->queue->properties.write_ptr; } @@ -320,41 +281,30 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, { struct kernel_queue *kq; - kq = kzalloc(sizeof(*kq), GFP_KERNEL); + BUG_ON(!dev); + + kq = kzalloc(sizeof(struct kernel_queue), GFP_KERNEL); if (!kq) return NULL; kq->ops.initialize = initialize; kq->ops.uninitialize = uninitialize; kq->ops.acquire_packet_buffer = acquire_packet_buffer; - kq->ops.acquire_inline_ib = acquire_inline_ib; kq->ops.submit_packet = submit_packet; kq->ops.rollback_packet = rollback_packet; switch (dev->device_info->asic_family) { case CHIP_CARRIZO: - case CHIP_TONGA: - case CHIP_FIJI: - case CHIP_POLARIS10: - case CHIP_POLARIS11: kernel_queue_init_vi(&kq->ops_asic_specific); break; case CHIP_KAVERI: - case CHIP_HAWAII: kernel_queue_init_cik(&kq->ops_asic_specific); break; - - case CHIP_VEGA10: - case CHIP_RAVEN: - kernel_queue_init_v9(&kq->ops_asic_specific); - break; - default: - BUG(); } if (!kq->ops.initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE)) { - pr_err("Failed to init kernel queue\n"); + pr_err("amdkfd: failed to init kernel queue\n"); kfree(kq); return NULL; } @@ -363,37 +313,32 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, void kernel_queue_uninit(struct kernel_queue *kq) { + BUG_ON(!kq); + kq->ops.uninitialize(kq); kfree(kq); } -/* FIXME: Can this test be removed? */ static __attribute__((unused)) void test_kq(struct kfd_dev *dev) { struct kernel_queue *kq; uint32_t *buffer, i; int retval; - pr_err("Starting kernel queue test\n"); + BUG_ON(!dev); + + pr_err("amdkfd: starting kernel queue test\n"); kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_HIQ); - if (unlikely(!kq)) { - pr_err(" Failed to initialize HIQ\n"); - pr_err("Kernel queue test failed\n"); - return; - } + BUG_ON(!kq); retval = kq->ops.acquire_packet_buffer(kq, 5, &buffer); - if (unlikely(retval != 0)) { - pr_err(" Failed to acquire packet buffer\n"); - pr_err("Kernel queue test failed\n"); - return; - } + BUG_ON(retval != 0); for (i = 0; i < 5; i++) buffer[i] = kq->nop_packet; kq->ops.submit_packet(kq); - pr_err("Ending kernel queue test\n"); + pr_err("amdkfd: ending kernel queue test\n"); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h index 82c94a6..5940531 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h @@ -42,12 +42,6 @@ * pending write pointer to that location so subsequent calls to * acquire_packet_buffer will get a correct write pointer * - * @acquire_inline_ib: Returns a pointer to the location in the kernel - * queue ring buffer where the calling function can write an inline IB. 
It is - * Guaranteed that there is enough space for that IB. It also updates the - * pending write pointer to that location so subsequent calls to - * acquire_packet_buffer will get a correct write pointer - * * @submit_packet: Update the write pointer and doorbell of a kernel queue. * * @sync_with_hw: Wait until the write pointer and the read pointer of a kernel @@ -65,10 +59,6 @@ struct kernel_queue_ops { int (*acquire_packet_buffer)(struct kernel_queue *kq, size_t packet_size_in_dwords, unsigned int **buffer_ptr); - int (*acquire_inline_ib)(struct kernel_queue *kq, - size_t packet_size_in_dwords, - unsigned int **buffer_ptr, - uint64_t *gpu_addr); void (*submit_packet)(struct kernel_queue *kq); void (*rollback_packet)(struct kernel_queue *kq); @@ -82,7 +72,6 @@ struct kernel_queue { struct kfd_dev *dev; struct mqd_manager *mqd; struct queue *queue; - uint64_t pending_wptr64; uint32_t pending_wptr; unsigned int nop_packet; @@ -90,10 +79,7 @@ struct kernel_queue { uint32_t *rptr_kernel; uint64_t rptr_gpu_addr; struct kfd_mem_obj *wptr_mem; - union { - uint64_t *wptr64_kernel; - uint32_t *wptr_kernel; - }; + uint32_t *wptr_kernel; uint64_t wptr_gpu_addr; struct kfd_mem_obj *pq; uint64_t pq_gpu_addr; @@ -111,6 +97,5 @@ struct kernel_queue { void kernel_queue_init_cik(struct kernel_queue_ops *ops); void kernel_queue_init_vi(struct kernel_queue_ops *ops); -void kernel_queue_init_v9(struct kernel_queue_ops *ops); #endif /* KFD_KERNEL_QUEUE_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c index 2808422..a90eb44 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c @@ -22,19 +22,15 @@ */ #include "kfd_kernel_queue.h" -#include "kfd_pm4_headers.h" -#include "kfd_pm4_opcodes.h" static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, enum kfd_queue_type type, unsigned int queue_size); static void uninitialize_cik(struct kernel_queue *kq); -static void submit_packet_cik(struct kernel_queue *kq); void kernel_queue_init_cik(struct kernel_queue_ops *ops) { ops->initialize = initialize_cik; ops->uninitialize = uninitialize_cik; - ops->submit_packet = submit_packet_cik; } static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, @@ -46,127 +42,3 @@ static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, static void uninitialize_cik(struct kernel_queue *kq) { } - -static void submit_packet_cik(struct kernel_queue *kq) -{ - *kq->wptr_kernel = kq->pending_wptr; - write_kernel_doorbell(kq->queue->properties.doorbell_ptr, - kq->pending_wptr); -} - -static int pm_map_process_cik(struct packet_manager *pm, uint32_t *buffer, - struct qcm_process_device *qpd) -{ - struct pm4_map_process *packet; - - packet = (struct pm4_map_process *)buffer; - - memset(buffer, 0, sizeof(struct pm4_map_process)); - - packet->header.u32all = pm_build_pm4_header(IT_MAP_PROCESS, - sizeof(struct pm4_map_process)); - packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; - packet->bitfields2.process_quantum = 1; - packet->bitfields2.pasid = qpd->pqm->process->pasid; - packet->bitfields3.page_table_base = qpd->page_table_base; - packet->bitfields10.gds_size = qpd->gds_size; - packet->bitfields10.num_gws = qpd->num_gws; - packet->bitfields10.num_oac = qpd->num_oac; - packet->bitfields10.num_queues = (qpd->is_debug) ? 
0 : qpd->queue_count; - - packet->sh_mem_config = qpd->sh_mem_config; - packet->sh_mem_bases = qpd->sh_mem_bases; - packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; - packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; - - packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); - packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); - - return 0; -} - -static int pm_map_process_scratch_cik(struct packet_manager *pm, - uint32_t *buffer, struct qcm_process_device *qpd) -{ - struct pm4_map_process_scratch_kv *packet; - - packet = (struct pm4_map_process_scratch_kv *)buffer; - - memset(buffer, 0, sizeof(struct pm4_map_process_scratch_kv)); - - packet->header.u32all = pm_build_pm4_header(IT_MAP_PROCESS, - sizeof(struct pm4_map_process_scratch_kv)); - packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; - packet->bitfields2.process_quantum = 1; - packet->bitfields2.pasid = qpd->pqm->process->pasid; - packet->bitfields3.page_table_base = qpd->page_table_base; - packet->bitfields14.gds_size = qpd->gds_size; - packet->bitfields14.num_gws = qpd->num_gws; - packet->bitfields14.num_oac = qpd->num_oac; - packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; - - packet->sh_mem_config = qpd->sh_mem_config; - packet->sh_mem_bases = qpd->sh_mem_bases; - packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; - packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; - - packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; - - packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); - packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); - - return 0; -} - -static uint32_t pm_get_map_process_packet_size_cik(void) -{ - return sizeof(struct pm4_map_process); -} -static uint32_t pm_get_map_process_scratch_packet_size_cik(void) -{ - return sizeof(struct pm4_map_process_scratch_kv); -} - - -static struct packet_manager_funcs kfd_cik_pm_funcs = { - .map_process = pm_map_process_cik, - .runlist = pm_runlist_vi, - .set_resources = pm_set_resources_vi, - .map_queues = pm_map_queues_vi, - .unmap_queues = pm_unmap_queues_vi, - .query_status = pm_query_status_vi, - .release_mem = pm_release_mem_vi, - .get_map_process_packet_size = pm_get_map_process_packet_size_cik, - .get_runlist_packet_size = pm_get_runlist_packet_size_vi, - .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, - .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi, - .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi, - .get_query_status_packet_size = pm_get_query_status_packet_size_vi, - .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi, -}; - -static struct packet_manager_funcs kfd_cik_scratch_pm_funcs = { - .map_process = pm_map_process_scratch_cik, - .runlist = pm_runlist_vi, - .set_resources = pm_set_resources_vi, - .map_queues = pm_map_queues_vi, - .unmap_queues = pm_unmap_queues_vi, - .query_status = pm_query_status_vi, - .release_mem = pm_release_mem_vi, - .get_map_process_packet_size = - pm_get_map_process_scratch_packet_size_cik, - .get_runlist_packet_size = pm_get_runlist_packet_size_vi, - .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, - .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi, - .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi, - .get_query_status_packet_size = pm_get_query_status_packet_size_vi, - .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi, -}; - -void kfd_pm_func_init_cik(struct packet_manager *pm, uint16_t fw_ver) -{ - if (fw_ver >= 
KFD_SCRATCH_KV_FW_VER) - pm->pmf = &kfd_cik_scratch_pm_funcs; - else - pm->pmf = &kfd_cik_pm_funcs; -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c deleted file mode 100644 index 5fe4f60..0000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c +++ /dev/null @@ -1,377 +0,0 @@ -/* - * Copyright 2016 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - */ - -#include "kfd_kernel_queue.h" -#include "kfd_device_queue_manager.h" -#include "kfd_pm4_headers_ai.h" -#include "kfd_pm4_opcodes.h" - -static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev, - enum kfd_queue_type type, unsigned int queue_size); -static void uninitialize_v9(struct kernel_queue *kq); -static void submit_packet_v9(struct kernel_queue *kq); - -void kernel_queue_init_v9(struct kernel_queue_ops *ops) -{ - ops->initialize = initialize_v9; - ops->uninitialize = uninitialize_v9; - ops->submit_packet = submit_packet_v9; -} - -static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev, - enum kfd_queue_type type, unsigned int queue_size) -{ - int retval; - - retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem); - if (retval != 0) - return false; - - kq->eop_gpu_addr = kq->eop_mem->gpu_addr; - kq->eop_kernel_addr = kq->eop_mem->cpu_ptr; - - memset(kq->eop_kernel_addr, 0, PAGE_SIZE); - - return true; -} - -static void uninitialize_v9(struct kernel_queue *kq) -{ - kfd_gtt_sa_free(kq->dev, kq->eop_mem); -} - -static void submit_packet_v9(struct kernel_queue *kq) -{ - *kq->wptr64_kernel = kq->pending_wptr64; - write_kernel_doorbell64(kq->queue->properties.doorbell_ptr, - kq->pending_wptr64); -} - -static int pm_map_process_v9(struct packet_manager *pm, - uint32_t *buffer, struct qcm_process_device *qpd) -{ - struct pm4_mes_map_process *packet; - uint64_t vm_page_table_base_addr = - (uint64_t)(qpd->page_table_base) << 12; - - packet = (struct pm4_mes_map_process *)buffer; - memset(buffer, 0, sizeof(struct pm4_mes_map_process)); - - packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, - sizeof(struct pm4_mes_map_process)); - packet->bitfields2.diq_enable = (qpd->is_debug) ? 
1 : 0; - packet->bitfields2.process_quantum = 1; - packet->bitfields2.pasid = qpd->pqm->process->pasid; - packet->bitfields14.gds_size = qpd->gds_size; - packet->bitfields14.num_gws = qpd->num_gws; - packet->bitfields14.num_oac = qpd->num_oac; - packet->bitfields14.sdma_enable = 1; - packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; - - packet->sh_mem_config = qpd->sh_mem_config; - packet->sh_mem_bases = qpd->sh_mem_bases; - packet->sq_shader_tba_lo = lower_32_bits(qpd->tba_addr >> 8); - packet->sq_shader_tba_hi = upper_32_bits(qpd->tba_addr >> 8); - packet->sq_shader_tma_lo = lower_32_bits(qpd->tma_addr >> 8); - packet->sq_shader_tma_hi = upper_32_bits(qpd->tma_addr >> 8); - - packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); - packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); - - packet->vm_context_page_table_base_addr_lo32 = - lower_32_bits(vm_page_table_base_addr); - packet->vm_context_page_table_base_addr_hi32 = - upper_32_bits(vm_page_table_base_addr); - - return 0; -} - -static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer, - uint64_t ib, size_t ib_size_in_dwords, bool chain) -{ - struct pm4_mes_runlist *packet; - - int concurrent_proc_cnt = 0; - struct kfd_dev *kfd = pm->dqm->dev; - - /* Determine the number of processes to map together to HW: - * it can not exceed the number of VMIDs available to the - * scheduler, and it is determined by the smaller of the number - * of processes in the runlist and kfd module parameter - * hws_max_conc_proc. - * Note: the arbitration between the number of VMIDs and - * hws_max_conc_proc has been done in - * kgd2kfd_device_init(). - */ - concurrent_proc_cnt = min(pm->dqm->processes_count, - kfd->max_proc_per_quantum); - - - packet = (struct pm4_mes_runlist *)buffer; - - memset(buffer, 0, sizeof(struct pm4_mes_runlist)); - packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, - sizeof(struct pm4_mes_runlist)); - - packet->bitfields4.ib_size = ib_size_in_dwords; - packet->bitfields4.chain = chain ? 
1 : 0; - packet->bitfields4.offload_polling = 0; - packet->bitfields4.valid = 1; - packet->bitfields4.process_cnt = concurrent_proc_cnt; - packet->ordinal2 = lower_32_bits(ib); - packet->ib_base_hi = upper_32_bits(ib); - - return 0; -} - -static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, - struct queue *q, bool is_static) -{ - struct pm4_mes_map_queues *packet; - bool use_static = is_static; - - packet = (struct pm4_mes_map_queues *)buffer; - memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); - - packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, - sizeof(struct pm4_mes_map_queues)); - packet->bitfields2.alloc_format = - alloc_format__mes_map_queues__one_per_pipe_vi; - packet->bitfields2.num_queues = 1; - packet->bitfields2.queue_sel = - queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; - - packet->bitfields2.engine_sel = - engine_sel__mes_map_queues__compute_vi; - packet->bitfields2.queue_type = - queue_type__mes_map_queues__normal_compute_vi; - - switch (q->properties.type) { - case KFD_QUEUE_TYPE_COMPUTE: - if (use_static) - packet->bitfields2.queue_type = - queue_type__mes_map_queues__normal_latency_static_queue_vi; - break; - case KFD_QUEUE_TYPE_DIQ: - packet->bitfields2.queue_type = - queue_type__mes_map_queues__debug_interface_queue_vi; - break; - case KFD_QUEUE_TYPE_SDMA: - packet->bitfields2.engine_sel = q->properties.sdma_engine_id + - engine_sel__mes_map_queues__sdma0_vi; - use_static = false; /* no static queues under SDMA */ - break; - default: - WARN(1, "queue type %d", q->properties.type); - return -EINVAL; - } - packet->bitfields3.doorbell_offset = - q->properties.doorbell_off; - - packet->mqd_addr_lo = - lower_32_bits(q->gart_mqd_addr); - - packet->mqd_addr_hi = - upper_32_bits(q->gart_mqd_addr); - - packet->wptr_addr_lo = - lower_32_bits((uint64_t)q->properties.write_ptr); - - packet->wptr_addr_hi = - upper_32_bits((uint64_t)q->properties.write_ptr); - - return 0; -} - -static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer, - enum kfd_queue_type type, - enum kfd_unmap_queues_filter filter, - uint32_t filter_param, bool reset, - unsigned int sdma_engine) -{ - struct pm4_mes_unmap_queues *packet; - - packet = (struct pm4_mes_unmap_queues *)buffer; - memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); - - packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, - sizeof(struct pm4_mes_unmap_queues)); - switch (type) { - case KFD_QUEUE_TYPE_COMPUTE: - case KFD_QUEUE_TYPE_DIQ: - packet->bitfields2.engine_sel = - engine_sel__mes_unmap_queues__compute; - break; - case KFD_QUEUE_TYPE_SDMA: - packet->bitfields2.engine_sel = - engine_sel__mes_unmap_queues__sdma0 + sdma_engine; - break; - default: - WARN(1, "queue type %d", type); - return -EINVAL; - } - - if (reset) - packet->bitfields2.action = - action__mes_unmap_queues__reset_queues; - else - packet->bitfields2.action = - action__mes_unmap_queues__preempt_queues; - - switch (filter) { - case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: - packet->bitfields2.queue_sel = - queue_sel__mes_unmap_queues__perform_request_on_specified_queues; - packet->bitfields2.num_queues = 1; - packet->bitfields3b.doorbell_offset0 = filter_param; - break; - case KFD_UNMAP_QUEUES_FILTER_BY_PASID: - packet->bitfields2.queue_sel = - queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; - packet->bitfields3a.pasid = filter_param; - break; - case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: - packet->bitfields2.queue_sel = - queue_sel__mes_unmap_queues__unmap_all_queues; - break; - case 
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: - /* in this case, we do not preempt static queues */ - packet->bitfields2.queue_sel = - queue_sel__mes_unmap_queues__unmap_all_non_static_queues; - break; - default: - WARN(1, "filter %d", filter); - return -EINVAL; - } - - return 0; - -} - -static int pm_query_status_v9(struct packet_manager *pm, uint32_t *buffer, - uint64_t fence_address, uint32_t fence_value) -{ - struct pm4_mes_query_status *packet; - - packet = (struct pm4_mes_query_status *)buffer; - memset(buffer, 0, sizeof(struct pm4_mes_query_status)); - - - packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, - sizeof(struct pm4_mes_query_status)); - - packet->bitfields2.context_id = 0; - packet->bitfields2.interrupt_sel = - interrupt_sel__mes_query_status__completion_status; - packet->bitfields2.command = - command__mes_query_status__fence_only_after_write_ack; - - packet->addr_hi = upper_32_bits((uint64_t)fence_address); - packet->addr_lo = lower_32_bits((uint64_t)fence_address); - packet->data_hi = upper_32_bits((uint64_t)fence_value); - packet->data_lo = lower_32_bits((uint64_t)fence_value); - - return 0; -} - - -static uint32_t pm_release_mem_v9(uint64_t gpu_addr, uint32_t *buffer) -{ - struct pm4_mec_release_mem *packet; - - packet = (struct pm4_mec_release_mem *)buffer; - memset(buffer, 0, sizeof(struct pm4_mec_release_mem)); - - packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, - sizeof(struct pm4_mec_release_mem)); - - packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; - packet->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe; - packet->bitfields2.tcl1_action_ena = 1; - packet->bitfields2.tc_action_ena = 1; - packet->bitfields2.cache_policy = cache_policy__mec_release_mem__lru; - - packet->bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low; - packet->bitfields3.int_sel = - int_sel__mec_release_mem__send_interrupt_after_write_confirm; - - packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; - packet->address_hi = upper_32_bits(gpu_addr); - - packet->data_lo = 0; - - return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int); -} - -static uint32_t pm_get_map_process_packet_size_v9(void) -{ - return sizeof(struct pm4_mes_map_process); -} - -static uint32_t pm_get_runlist_packet_size_v9(void) -{ - return sizeof(struct pm4_mes_runlist); -} - -static uint32_t pm_get_map_queues_packet_size_v9(void) -{ - return sizeof(struct pm4_mes_map_queues); -} - -static uint32_t pm_get_unmap_queues_packet_size_v9(void) -{ - return sizeof(struct pm4_mes_unmap_queues); -} - -static uint32_t pm_get_query_status_packet_size_v9(void) -{ - return sizeof(struct pm4_mes_query_status); -} - -static uint32_t pm_get_release_mem_packet_size_v9(void) -{ - return sizeof(struct pm4_mec_release_mem); -} - -static struct packet_manager_funcs kfd_v9_pm_funcs = { - .map_process = pm_map_process_v9, - .runlist = pm_runlist_v9, - .set_resources = pm_set_resources_vi, - .map_queues = pm_map_queues_v9, - .unmap_queues = pm_unmap_queues_v9, - .query_status = pm_query_status_v9, - .release_mem = pm_release_mem_v9, - .get_map_process_packet_size = pm_get_map_process_packet_size_v9, - .get_runlist_packet_size = pm_get_runlist_packet_size_v9, - .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, - .get_map_queues_packet_size = pm_get_map_queues_packet_size_v9, - .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_v9, - .get_query_status_packet_size = pm_get_query_status_packet_size_v9, - .get_release_mem_packet_size = 
pm_get_release_mem_packet_size_v9, -}; - -void kfd_pm_func_init_v9(struct packet_manager *pm, uint16_t fw_ver) -{ - pm->pmf = &kfd_v9_pm_funcs; -} - diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c index ecf4a33..f1d4828 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c @@ -22,20 +22,15 @@ */ #include "kfd_kernel_queue.h" -#include "kfd_device_queue_manager.h" -#include "kfd_pm4_headers_vi.h" -#include "kfd_pm4_opcodes.h" static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, enum kfd_queue_type type, unsigned int queue_size); static void uninitialize_vi(struct kernel_queue *kq); -static void submit_packet_vi(struct kernel_queue *kq); void kernel_queue_init_vi(struct kernel_queue_ops *ops) { ops->initialize = initialize_vi; ops->uninitialize = uninitialize_vi; - ops->submit_packet = submit_packet_vi; } static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, @@ -59,359 +54,3 @@ static void uninitialize_vi(struct kernel_queue *kq) { kfd_gtt_sa_free(kq->dev, kq->eop_mem); } - -static void submit_packet_vi(struct kernel_queue *kq) -{ - *kq->wptr_kernel = kq->pending_wptr; - write_kernel_doorbell(kq->queue->properties.doorbell_ptr, - kq->pending_wptr); -} - -static int pm_map_process_vi(struct packet_manager *pm, - uint32_t *buffer, struct qcm_process_device *qpd) -{ - struct pm4_mes_map_process *packet; - - packet = (struct pm4_mes_map_process *)buffer; - memset(buffer, 0, sizeof(struct pm4_mes_map_process)); - - packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, - sizeof(struct pm4_mes_map_process)); - packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; - packet->bitfields2.process_quantum = 1; - packet->bitfields2.pasid = qpd->pqm->process->pasid; - packet->bitfields3.page_table_base = qpd->page_table_base; - packet->bitfields10.gds_size = qpd->gds_size; - packet->bitfields10.num_gws = qpd->num_gws; - packet->bitfields10.num_oac = qpd->num_oac; - packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; - - packet->sh_mem_config = qpd->sh_mem_config; - packet->sh_mem_bases = qpd->sh_mem_bases; - packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; - packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; - - packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; - - packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); - packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); - - return 0; -} - - -unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size) -{ - union PM4_MES_TYPE_3_HEADER header; - - header.u32All = 0; - header.opcode = opcode; - header.count = packet_size/sizeof(uint32_t) - 2; - header.type = PM4_TYPE_3; - - return header.u32All; -} - -int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer, - uint64_t ib, size_t ib_size_in_dwords, bool chain) -{ - struct pm4_mes_runlist *packet; - - int concurrent_proc_cnt = 0; - struct kfd_dev *kfd = pm->dqm->dev; - - /* Determine the number of processes to map together to HW: - * it can not exceed the number of VMIDs available to the - * scheduler, and it is determined by the smaller of the number - * of processes in the runlist and kfd module parameter - * hws_max_conc_proc. - * Note: the arbitration between the number of VMIDs and - * hws_max_conc_proc has been done in - * kgd2kfd_device_init(). 
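
pm_build_pm4_header() above encodes the PM4 type-3 rule that COUNT is the number of payload dwords minus one; since the header itself occupies one more dword, the value written is total-dwords-minus-two. The same packing, spelled out with explicit shifts instead of the PM4_MES_TYPE_3_HEADER bitfields (bit positions as in that union: opcode in 15:8, count in 29:16, type in 31:30):

#include <stddef.h>
#include <stdint.h>

#define PM4_TYPE_3 3u

static uint32_t build_pm4_header(uint32_t opcode, size_t packet_size_bytes)
{
        uint32_t count = (uint32_t)(packet_size_bytes / sizeof(uint32_t)) - 2;

        return (PM4_TYPE_3 << 30) | ((count & 0x3fff) << 16) |
               ((opcode & 0xff) << 8);
}

A 60-byte packet is 15 dwords, so COUNT is 13: one header dword plus 14 payload dwords, with the payload length encoded minus one.
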
- */ - concurrent_proc_cnt = min(pm->dqm->processes_count, - kfd->max_proc_per_quantum); - - - packet = (struct pm4_mes_runlist *)buffer; - - memset(buffer, 0, sizeof(struct pm4_mes_runlist)); - packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, - sizeof(struct pm4_mes_runlist)); - - packet->bitfields4.ib_size = ib_size_in_dwords; - packet->bitfields4.chain = chain ? 1 : 0; - packet->bitfields4.offload_polling = 0; - packet->bitfields4.valid = 1; - packet->bitfields4.process_cnt = concurrent_proc_cnt; - packet->ordinal2 = lower_32_bits(ib); - packet->bitfields3.ib_base_hi = upper_32_bits(ib); - - return 0; -} - -int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, - struct queue *q, bool is_static) -{ - struct pm4_mes_map_queues *packet; - bool use_static = is_static; - - packet = (struct pm4_mes_map_queues *)buffer; - memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); - - packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, - sizeof(struct pm4_mes_map_queues)); - packet->bitfields2.alloc_format = - alloc_format__mes_map_queues__one_per_pipe_vi; - packet->bitfields2.num_queues = 1; - packet->bitfields2.queue_sel = - queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; - - packet->bitfields2.engine_sel = - engine_sel__mes_map_queues__compute_vi; - packet->bitfields2.queue_type = - queue_type__mes_map_queues__normal_compute_vi; - - switch (q->properties.type) { - case KFD_QUEUE_TYPE_COMPUTE: - if (use_static) - packet->bitfields2.queue_type = - queue_type__mes_map_queues__normal_latency_static_queue_vi; - break; - case KFD_QUEUE_TYPE_DIQ: - packet->bitfields2.queue_type = - queue_type__mes_map_queues__debug_interface_queue_vi; - break; - case KFD_QUEUE_TYPE_SDMA: - packet->bitfields2.engine_sel = q->properties.sdma_engine_id + - engine_sel__mes_map_queues__sdma0_vi; - use_static = false; /* no static queues under SDMA */ - break; - default: - WARN(1, "queue type %d", q->properties.type); - return -EINVAL; - } - packet->bitfields3.doorbell_offset = - q->properties.doorbell_off; - - packet->mqd_addr_lo = - lower_32_bits(q->gart_mqd_addr); - - packet->mqd_addr_hi = - upper_32_bits(q->gart_mqd_addr); - - packet->wptr_addr_lo = - lower_32_bits((uint64_t)q->properties.write_ptr); - - packet->wptr_addr_hi = - upper_32_bits((uint64_t)q->properties.write_ptr); - - return 0; -} - -int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, - struct scheduling_resources *res) -{ - struct pm4_mes_set_resources *packet; - - packet = (struct pm4_mes_set_resources *)buffer; - memset(buffer, 0, sizeof(struct pm4_mes_set_resources)); - - packet->header.u32All = pm_build_pm4_header(IT_SET_RESOURCES, - sizeof(struct pm4_mes_set_resources)); - - packet->bitfields2.queue_type = - queue_type__mes_set_resources__hsa_interface_queue_hiq; - packet->bitfields2.vmid_mask = res->vmid_mask; - packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100; - packet->bitfields7.oac_mask = res->oac_mask; - packet->bitfields8.gds_heap_base = res->gds_heap_base; - packet->bitfields8.gds_heap_size = res->gds_heap_size; - - packet->gws_mask_lo = lower_32_bits(res->gws_mask); - packet->gws_mask_hi = upper_32_bits(res->gws_mask); - - packet->queue_mask_lo = lower_32_bits(res->queue_mask); - packet->queue_mask_hi = upper_32_bits(res->queue_mask); - - return 0; -} - -int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, - enum kfd_queue_type type, - enum kfd_unmap_queues_filter filter, - uint32_t filter_param, bool reset, - unsigned int sdma_engine) -{ - struct 
pm4_mes_unmap_queues *packet; - - packet = (struct pm4_mes_unmap_queues *)buffer; - memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); - - packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, - sizeof(struct pm4_mes_unmap_queues)); - switch (type) { - case KFD_QUEUE_TYPE_COMPUTE: - case KFD_QUEUE_TYPE_DIQ: - packet->bitfields2.engine_sel = - engine_sel__mes_unmap_queues__compute; - break; - case KFD_QUEUE_TYPE_SDMA: - packet->bitfields2.engine_sel = - engine_sel__mes_unmap_queues__sdma0 + sdma_engine; - break; - default: - WARN(1, "queue type %d", type); - return -EINVAL; - } - - if (reset) - packet->bitfields2.action = - action__mes_unmap_queues__reset_queues; - else - packet->bitfields2.action = - action__mes_unmap_queues__preempt_queues; - - switch (filter) { - case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: - packet->bitfields2.queue_sel = - queue_sel__mes_unmap_queues__perform_request_on_specified_queues; - packet->bitfields2.num_queues = 1; - packet->bitfields3b.doorbell_offset0 = filter_param; - break; - case KFD_UNMAP_QUEUES_FILTER_BY_PASID: - packet->bitfields2.queue_sel = - queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; - packet->bitfields3a.pasid = filter_param; - break; - case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: - packet->bitfields2.queue_sel = - queue_sel__mes_unmap_queues__unmap_all_queues; - break; - case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: - /* in this case, we do not preempt static queues */ - packet->bitfields2.queue_sel = - queue_sel__mes_unmap_queues__unmap_all_non_static_queues; - break; - default: - WARN(1, "filter %d", filter); - return -EINVAL; - } - - return 0; - -} - -int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer, - uint64_t fence_address, uint32_t fence_value) -{ - struct pm4_mes_query_status *packet; - - packet = (struct pm4_mes_query_status *)buffer; - memset(buffer, 0, sizeof(struct pm4_mes_query_status)); - - - packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, - sizeof(struct pm4_mes_query_status)); - - packet->bitfields2.context_id = 0; - packet->bitfields2.interrupt_sel = - interrupt_sel__mes_query_status__completion_status; - packet->bitfields2.command = - command__mes_query_status__fence_only_after_write_ack; - - packet->addr_hi = upper_32_bits((uint64_t)fence_address); - packet->addr_lo = lower_32_bits((uint64_t)fence_address); - packet->data_hi = upper_32_bits((uint64_t)fence_value); - packet->data_lo = lower_32_bits((uint64_t)fence_value); - - return 0; -} - - -uint32_t pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer) -{ - struct pm4_mec_release_mem *packet; - - packet = (struct pm4_mec_release_mem *)buffer; - memset(buffer, 0, sizeof(struct pm4_mec_release_mem)); - - packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, - sizeof(struct pm4_mec_release_mem)); - - packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; - packet->bitfields2.event_index = event_index___release_mem__end_of_pipe; - packet->bitfields2.tcl1_action_ena = 1; - packet->bitfields2.tc_action_ena = 1; - packet->bitfields2.cache_policy = cache_policy___release_mem__lru; - packet->bitfields2.atc = 0; - - packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low; - packet->bitfields3.int_sel = - int_sel___release_mem__send_interrupt_after_write_confirm; - - packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; - packet->address_hi = upper_32_bits(gpu_addr); - - packet->data_lo = 0; - - return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int); -} - -uint32_t 
pm_get_map_process_packet_size_vi(void) -{ - return sizeof(struct pm4_mes_map_process); -} - -uint32_t pm_get_runlist_packet_size_vi(void) -{ - return sizeof(struct pm4_mes_runlist); -} - -uint32_t pm_get_set_resources_packet_size_vi(void) -{ - return sizeof(struct pm4_mes_set_resources); -} - -uint32_t pm_get_map_queues_packet_size_vi(void) -{ - return sizeof(struct pm4_mes_map_queues); -} - -uint32_t pm_get_unmap_queues_packet_size_vi(void) -{ - return sizeof(struct pm4_mes_unmap_queues); -} - -uint32_t pm_get_query_status_packet_size_vi(void) -{ - return sizeof(struct pm4_mes_query_status); -} - -uint32_t pm_get_release_mem_packet_size_vi(void) -{ - return sizeof(struct pm4_mec_release_mem); -} - - -static struct packet_manager_funcs kfd_vi_pm_funcs = { - .map_process = pm_map_process_vi, - .runlist = pm_runlist_vi, - .set_resources = pm_set_resources_vi, - .map_queues = pm_map_queues_vi, - .unmap_queues = pm_unmap_queues_vi, - .query_status = pm_query_status_vi, - .release_mem = pm_release_mem_vi, - .get_map_process_packet_size = pm_get_map_process_packet_size_vi, - .get_runlist_packet_size = pm_get_runlist_packet_size_vi, - .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, - .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi, - .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi, - .get_query_status_packet_size = pm_get_query_status_packet_size_vi, - .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi, -}; - -void kfd_pm_func_init_vi(struct packet_manager *pm, uint16_t fw_ver) -{ - pm->pmf = &kfd_vi_pm_funcs; -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c index ba4d5de..850a562 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c @@ -29,10 +29,10 @@ #define KFD_DRIVER_AUTHOR "AMD Inc. 
and others" #define KFD_DRIVER_DESC "Standalone HSA driver for AMD's GPUs" -#define KFD_DRIVER_DATE "20160408" -#define KFD_DRIVER_MAJOR 2 -#define KFD_DRIVER_MINOR 0 -#define KFD_DRIVER_PATCHLEVEL 0 +#define KFD_DRIVER_DATE "20150421" +#define KFD_DRIVER_MAJOR 0 +#define KFD_DRIVER_MINOR 7 +#define KFD_DRIVER_PATCHLEVEL 2 static const struct kgd2kfd_calls kgd2kfd = { .exit = kgd2kfd_exit, @@ -42,10 +42,6 @@ static const struct kgd2kfd_calls kgd2kfd = { .interrupt = kgd2kfd_interrupt, .suspend = kgd2kfd_suspend, .resume = kgd2kfd_resume, - .quiesce_mm = kgd2kfd_quiesce_mm, - .resume_mm = kgd2kfd_resume_mm, - .schedule_evict_and_restore_process = - kgd2kfd_schedule_evict_and_restore_process, }; int sched_policy = KFD_SCHED_POLICY_HWS; @@ -53,15 +49,6 @@ module_param(sched_policy, int, 0444); MODULE_PARM_DESC(sched_policy, "Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)"); -int hws_max_conc_proc = 8; -module_param(hws_max_conc_proc, int, 0444); -MODULE_PARM_DESC(hws_max_conc_proc, - "Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency, #VMIDs for KFD = Maximum(default))"); - -int cwsr_enable = 1; -module_param(cwsr_enable, int, 0444); -MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))"); - int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT; module_param(max_num_of_queues_per_device, int, 0444); MODULE_PARM_DESC(max_num_of_queues_per_device, @@ -74,28 +61,7 @@ MODULE_PARM_DESC(send_sigterm, static int amdkfd_init_completed; -int debug_largebar; -module_param(debug_largebar, int, 0444); -MODULE_PARM_DESC(debug_largebar, - "Debug large-bar flag used to simulate large-bar capability on non-large bar machine (0 = disable, 1 = enable)"); - -int ignore_crat; -module_param(ignore_crat, int, 0444); -MODULE_PARM_DESC(ignore_crat, - "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)"); - -int vega10_noretry; -module_param_named(noretry, vega10_noretry, int, 0644); -MODULE_PARM_DESC(noretry, - "Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)"); - -int priv_cp_queues; -module_param(priv_cp_queues, int, 0644); -MODULE_PARM_DESC(priv_cp_queues, - "Enable privileged mode for CP queues (0 = off (default), 1 = on)"); - -int kgd2kfd_init(unsigned int interface_version, - const struct kgd2kfd_calls **g2f) +int kgd2kfd_init(unsigned interface_version, const struct kgd2kfd_calls **g2f) { if (!amdkfd_init_completed) return -EPROBE_DEFER; @@ -124,7 +90,7 @@ static int __init kfd_module_init(void) /* Verify module parameters */ if ((sched_policy < KFD_SCHED_POLICY_HWS) || (sched_policy > KFD_SCHED_POLICY_NO_HWS)) { - pr_err("sched_policy has invalid value\n"); + pr_err("kfd: sched_policy has invalid value\n"); return -1; } @@ -132,13 +98,13 @@ static int __init kfd_module_init(void) if ((max_num_of_queues_per_device < 1) || (max_num_of_queues_per_device > KFD_MAX_NUM_OF_QUEUES_PER_DEVICE)) { - pr_err("max_num_of_queues_per_device must be between 1 to KFD_MAX_NUM_OF_QUEUES_PER_DEVICE\n"); + pr_err("kfd: max_num_of_queues_per_device must be between 1 to KFD_MAX_NUM_OF_QUEUES_PER_DEVICE\n"); return -1; } err = kfd_pasid_init(); if (err < 0) - return err; + goto err_pasid; err = kfd_chardev_init(); if (err < 0) @@ -148,16 +114,8 @@ static int __init kfd_module_init(void) if (err < 0) goto err_topology; - err = kfd_ipc_init(); - if (err < 0) - goto err_topology; - kfd_process_create_wq(); - 
kfd_init_peer_direct(); - - kfd_debugfs_init(); - amdkfd_init_completed = 1; dev_info(kfd_device, "Initialized module\n"); @@ -168,6 +126,7 @@ static int __init kfd_module_init(void) kfd_chardev_exit(); err_ioctl: kfd_pasid_exit(); +err_pasid: return err; } @@ -175,8 +134,6 @@ static void __exit kfd_module_exit(void) { amdkfd_init_completed = 0; - kfd_debugfs_fini(); - kfd_close_peer_direct(); kfd_process_destroy_wq(); kfd_topology_shutdown(); kfd_chardev_exit(); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c index 9eb2d54..b1ef136 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -23,68 +23,14 @@ #include "kfd_priv.h" -/* Mapping queue priority to pipe priority, indexed by queue priority */ -int pipe_priority_map[] = { - KFD_PIPE_PRIORITY_CS_LOW, - KFD_PIPE_PRIORITY_CS_LOW, - KFD_PIPE_PRIORITY_CS_LOW, - KFD_PIPE_PRIORITY_CS_LOW, - KFD_PIPE_PRIORITY_CS_LOW, - KFD_PIPE_PRIORITY_CS_LOW, - KFD_PIPE_PRIORITY_CS_LOW, - KFD_PIPE_PRIORITY_CS_MEDIUM, - KFD_PIPE_PRIORITY_CS_MEDIUM, - KFD_PIPE_PRIORITY_CS_MEDIUM, - KFD_PIPE_PRIORITY_CS_MEDIUM, - KFD_PIPE_PRIORITY_CS_HIGH, - KFD_PIPE_PRIORITY_CS_HIGH, - KFD_PIPE_PRIORITY_CS_HIGH, - KFD_PIPE_PRIORITY_CS_HIGH, - KFD_PIPE_PRIORITY_CS_HIGH -}; - -/* Mapping queue priority to SPI priority, indexed by queue priority - * SPI priority 2 and 3 are reserved for trap handler context save - */ -int spi_priority_map[] = { - KFD_SPI_PRIORITY_EXTRA_LOW, - KFD_SPI_PRIORITY_EXTRA_LOW, - KFD_SPI_PRIORITY_EXTRA_LOW, - KFD_SPI_PRIORITY_EXTRA_LOW, - KFD_SPI_PRIORITY_EXTRA_LOW, - KFD_SPI_PRIORITY_EXTRA_LOW, - KFD_SPI_PRIORITY_EXTRA_LOW, - KFD_SPI_PRIORITY_EXTRA_LOW, - KFD_SPI_PRIORITY_LOW, - KFD_SPI_PRIORITY_LOW, - KFD_SPI_PRIORITY_LOW, - KFD_SPI_PRIORITY_LOW, - KFD_SPI_PRIORITY_LOW, - KFD_SPI_PRIORITY_LOW, - KFD_SPI_PRIORITY_LOW, - KFD_SPI_PRIORITY_LOW -}; - struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, struct kfd_dev *dev) { switch (dev->device_info->asic_family) { case CHIP_KAVERI: return mqd_manager_init_cik(type, dev); - case CHIP_HAWAII: - return mqd_manager_init_cik_hawaii(type, dev); case CHIP_CARRIZO: return mqd_manager_init_vi(type, dev); - case CHIP_TONGA: - case CHIP_FIJI: - case CHIP_POLARIS10: - case CHIP_POLARIS11: - return mqd_manager_init_vi_tonga(type, dev); - case CHIP_VEGA10: - case CHIP_RAVEN: - return mqd_manager_init_v9(type, dev); - default: - BUG(); } return NULL; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h index dcaeda8..213a71e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h @@ -43,9 +43,6 @@ * * @is_occupied: Checks if the relevant HQD slot is occupied. * - * @get_wave_state: Retrieves context save state and optionally copies the - * control stack, if kept in the MQD, to the given userspace address. - * * @mqd_mutex: Mqd manager mutex. * * @dev: The kfd device structure coupled with this module. @@ -62,8 +59,7 @@ * per KFD_MQD_TYPE for each device. 
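
The deleted pipe_priority_map[] and spi_priority_map[] tables collapse the sixteen user-visible queue priorities onto a handful of hardware levels (the accompanying comment also notes that SPI priorities 2 and 3 stay reserved for trap-handler context save). Read as code rather than a table, the pipe mapping is simply the following (enum values are stand-ins for the KFD_PIPE_PRIORITY_CS_* constants referenced above):

#include <stdio.h>

enum { CS_LOW, CS_MEDIUM, CS_HIGH };

/* Queue priorities 0-6 map to low, 7-10 to medium, 11-15 to high,
 * matching the sixteen entries of pipe_priority_map[]. */
static int queue_prio_to_pipe_prio(int queue_priority)
{
        if (queue_priority <= 6)
                return CS_LOW;
        if (queue_priority <= 10)
                return CS_MEDIUM;
        return CS_HIGH;
}

int main(void)
{
        int q;

        for (q = 0; q < 16; q++)
                printf("queue priority %2d -> pipe priority %d\n",
                       q, queue_prio_to_pipe_prio(q));
        return 0;
}
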
* */ -extern int pipe_priority_map[]; -extern int spi_priority_map[]; + struct mqd_manager { int (*init_mqd)(struct mqd_manager *mm, void **mqd, struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, @@ -71,8 +67,7 @@ struct mqd_manager { int (*load_mqd)(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, uint32_t queue_id, - struct queue_properties *p, - struct mm_struct *mms); + uint32_t __user *wptr); int (*update_mqd)(struct mqd_manager *mm, void *mqd, struct queue_properties *q); @@ -89,15 +84,6 @@ struct mqd_manager { uint64_t queue_address, uint32_t pipe_id, uint32_t queue_id); - int (*get_wave_state)(struct mqd_manager *mm, void *mqd, - void __user *ctl_stack, - u32 *ctl_stack_used_size, - u32 *save_area_used_size); - -#if defined(CONFIG_DEBUG_FS) - int (*debugfs_show_mqd)(struct seq_file *m, void *data); -#endif - struct mutex mqd_mutex; struct kfd_dev *dev; }; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c index 5724d33..6acc431 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c @@ -30,80 +30,12 @@ #include "cik_regs.h" #include "cik_structs.h" #include "oss/oss_2_4_sh_mask.h" -#include "gca/gfx_7_2_sh_mask.h" static inline struct cik_mqd *get_mqd(void *mqd) { return (struct cik_mqd *)mqd; } -static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) -{ - return (struct cik_sdma_rlc_registers *)mqd; -} - -static void update_cu_mask(struct mqd_manager *mm, void *mqd, - struct queue_properties *q) -{ - struct cik_mqd *m; - struct kfd_cu_info cu_info; - uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ - uint32_t cu_mask_count = q->cu_mask_count; - const uint32_t *cu_mask = q->cu_mask; - int se, cu_per_sh, cu_index, i; - - if (cu_mask_count == 0) - return; - - m = get_mqd(mqd); - m->compute_static_thread_mgmt_se0 = 0; - m->compute_static_thread_mgmt_se1 = 0; - m->compute_static_thread_mgmt_se2 = 0; - m->compute_static_thread_mgmt_se3 = 0; - - mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); - - /* If # CU mask bits > # CUs, set it to the # of CUs */ - if (cu_mask_count > cu_info.cu_active_number) - cu_mask_count = cu_info.cu_active_number; - - cu_index = 0; - for (se = 0; se < cu_info.num_shader_engines; se++) { - cu_per_sh = 0; - - /* Get the number of CUs on this Shader Engine */ - for (i = 0; i < 4; i++) - cu_per_sh += hweight32(cu_info.cu_bitmap[se][i]); - - se_mask[se] = cu_mask[cu_index / 32] >> (cu_index % 32); - if ((cu_per_sh + (cu_index % 32)) > 32) - se_mask[se] |= cu_mask[(cu_index / 32) + 1] - << (32 - (cu_index % 32)); - se_mask[se] &= (1 << cu_per_sh) - 1; - cu_index += cu_per_sh; - } - m->compute_static_thread_mgmt_se0 = se_mask[0]; - m->compute_static_thread_mgmt_se1 = se_mask[1]; - m->compute_static_thread_mgmt_se2 = se_mask[2]; - m->compute_static_thread_mgmt_se3 = se_mask[3]; - - pr_debug("Update cu mask to %#x %#x %#x %#x\n", - m->compute_static_thread_mgmt_se0, - m->compute_static_thread_mgmt_se1, - m->compute_static_thread_mgmt_se2, - m->compute_static_thread_mgmt_se3); -} - -static void set_priority(struct cik_mqd *m, struct queue_properties *q) -{ - m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; - m->cp_hqd_queue_priority = q->priority; - m->compute_pgm_rsrc1 = (m->compute_pgm_rsrc1 & - (~COMPUTE_PGM_RSRC1__PRIORITY_MASK)) | - (spi_priority_map[q->priority] << - COMPUTE_PGM_RSRC1__PRIORITY__SHIFT); -} - static int init_mqd(struct mqd_manager *mm, void **mqd, struct kfd_mem_obj **mqd_mem_obj, 
uint64_t *gart_addr, struct queue_properties *q) @@ -112,6 +44,10 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, struct cik_mqd *m; int retval; + BUG_ON(!mm || !q || !mqd); + + pr_debug("kfd: In func %s\n", __func__); + retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct cik_mqd), mqd_mem_obj); @@ -142,6 +78,10 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, m->cp_mqd_base_addr_lo = lower_32_bits(addr); m->cp_mqd_base_addr_hi = upper_32_bits(addr); + m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE | IB_ATC_EN; + /* Although WinKFD writes this, I suspect it should not be necessary */ + m->cp_hqd_ib_control = IB_ATC_EN | DEFAULT_MIN_IB_AVAIL_SIZE; + m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS | QUANTUM_DURATION(10); @@ -154,17 +94,14 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, * 1 = CS_MEDIUM (typically between HP3D and GFX * 2 = CS_HIGH (typically above HP3D) */ - set_priority(m, q); + m->cp_hqd_pipe_priority = 1; + m->cp_hqd_queue_priority = 15; if (q->format == KFD_QUEUE_FORMAT_AQL) m->cp_hqd_iq_rptr = AQL_ENABLE; - if (priv_cp_queues) - m->cp_hqd_pq_control |= - 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT; - *mqd = m; - if (gart_addr) + if (gart_addr != NULL) *gart_addr = addr; retval = mm->update_mqd(mm, m, q); @@ -178,6 +115,8 @@ static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, int retval; struct cik_sdma_rlc_registers *m; + BUG_ON(!mm || !mqd || !mqd_mem_obj); + retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct cik_sdma_rlc_registers), mqd_mem_obj); @@ -190,7 +129,7 @@ static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, memset(m, 0, sizeof(struct cik_sdma_rlc_registers)); *mqd = m; - if (gart_addr) + if (gart_addr != NULL) *gart_addr = (*mqd_mem_obj)->gpu_addr; retval = mm->update_mqd(mm, m, q); @@ -201,50 +140,43 @@ static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, static void uninit_mqd(struct mqd_manager *mm, void *mqd, struct kfd_mem_obj *mqd_mem_obj) { + BUG_ON(!mm || !mqd); kfd_gtt_sa_free(mm->dev, mqd_mem_obj); } static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, struct kfd_mem_obj *mqd_mem_obj) { + BUG_ON(!mm || !mqd); kfd_gtt_sa_free(mm->dev, mqd_mem_obj); } static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, - uint32_t queue_id, struct queue_properties *p, - struct mm_struct *mms) + uint32_t queue_id, uint32_t __user *wptr) { - /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ - uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 
4 : 0); - uint32_t wptr_mask = (uint32_t)((p->queue_size / sizeof(uint32_t)) - 1); - - return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, - (uint32_t __user *)p->write_ptr, - wptr_shift, wptr_mask, mms); + return mm->dev->kfd2kgd->hqd_load + (mm->dev->kgd, mqd, pipe_id, queue_id, wptr); } static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, - uint32_t pipe_id, uint32_t queue_id, - struct queue_properties *p, struct mm_struct *mms) + uint32_t pipe_id, uint32_t queue_id, + uint32_t __user *wptr) { - return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, - (uint32_t __user *)p->write_ptr, - mms); + return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd); } -static int __update_mqd(struct mqd_manager *mm, void *mqd, - struct queue_properties *q, unsigned int atc_bit) +static int update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) { struct cik_mqd *m; + BUG_ON(!mm || !q || !mqd); + + pr_debug("kfd: In func %s\n", __func__); + m = get_mqd(mqd); m->cp_hqd_pq_control = DEFAULT_RPTR_BLOCK_SIZE | - DEFAULT_MIN_AVAIL_SIZE; - m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE; - if (atc_bit) { - m->cp_hqd_pq_control |= PQ_ATC_EN; - m->cp_hqd_ib_control |= IB_ATC_EN; - } + DEFAULT_MIN_AVAIL_SIZE | PQ_ATC_EN; /* * Calculating queue size which is log base 2 of actual queue size -1 @@ -256,47 +188,37 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); - m->cp_hqd_pq_doorbell_control = DOORBELL_OFFSET(q->doorbell_off); + m->cp_hqd_pq_doorbell_control = DOORBELL_EN | + DOORBELL_OFFSET(q->doorbell_off); m->cp_hqd_vmid = q->vmid; - if (q->format == KFD_QUEUE_FORMAT_AQL) + if (q->format == KFD_QUEUE_FORMAT_AQL) { m->cp_hqd_pq_control |= NO_UPDATE_RPTR; + } - update_cu_mask(mm, mqd, q); - set_priority(m, q); - + m->cp_hqd_active = 0; q->is_active = false; if (q->queue_size > 0 && q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted) { + q->queue_percent > 0) { + m->cp_hqd_active = 1; q->is_active = true; } return 0; } -static int update_mqd(struct mqd_manager *mm, void *mqd, - struct queue_properties *q) -{ - return __update_mqd(mm, mqd, q, 1); -} - -static int update_mqd_hawaii(struct mqd_manager *mm, void *mqd, - struct queue_properties *q) -{ - return __update_mqd(mm, mqd, q, 0); -} - static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { struct cik_sdma_rlc_registers *m; + BUG_ON(!mm || !mqd || !q); + m = get_sdma_mqd(mqd); - m->sdma_rlc_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) - << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | + m->sdma_rlc_rb_cntl = ffs(q->queue_size / sizeof(unsigned int)) << + SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; @@ -305,8 +227,9 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, m->sdma_rlc_rb_base_hi = upper_32_bits(q->queue_address >> 8); m->sdma_rlc_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); m->sdma_rlc_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); - m->sdma_rlc_doorbell = - q->doorbell_off << SDMA0_RLC0_DOORBELL__OFFSET__SHIFT; + m->sdma_rlc_doorbell = q->doorbell_off << + SDMA0_RLC0_DOORBELL__OFFSET__SHIFT | + 1 << SDMA0_RLC0_DOORBELL__ENABLE__SHIFT; 
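/*
 * Aside (illustration only, not part of the revert): the restored RB_CNTL
 * write above encodes the SDMA ring length as log2(dwords) via ffs(), while
 * the removed newer code used ffs(...) - 1 for the same field.  A minimal,
 * self-contained userspace sketch of the restored encoding, assuming a
 * power-of-two queue_size in bytes:
 *
 *   #include <strings.h>            // ffs()
 *
 *   static unsigned int rb_size_field(unsigned int queue_size_bytes)
 *   {
 *           // e.g. an 8 KiB ring: 0x2000 / 4 = 0x800 dwords, ffs(0x800) = 12
 *           return ffs(queue_size_bytes / sizeof(unsigned int));
 *   }
 */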
m->sdma_rlc_virtual_addr = q->sdma_vm_addr; @@ -316,8 +239,10 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, q->is_active = false; if (q->queue_size > 0 && q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted) { + q->queue_percent > 0) { + m->sdma_rlc_rb_cntl |= + 1 << SDMA0_RLC0_RB_CNTL__RB_ENABLE__SHIFT; + q->is_active = true; } @@ -329,7 +254,7 @@ static int destroy_mqd(struct mqd_manager *mm, void *mqd, unsigned int timeout, uint32_t pipe_id, uint32_t queue_id) { - return mm->dev->kfd2kgd->hqd_destroy(mm->dev->kgd, mqd, type, timeout, + return mm->dev->kfd2kgd->hqd_destroy(mm->dev->kgd, type, timeout, pipe_id, queue_id); } @@ -376,6 +301,10 @@ static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, struct cik_mqd *m; int retval; + BUG_ON(!mm || !q || !mqd || !mqd_mem_obj); + + pr_debug("kfd: In func %s\n", __func__); + retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct cik_mqd), mqd_mem_obj); @@ -414,7 +343,8 @@ static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, * 1 = CS_MEDIUM (typically between HP3D and GFX * 2 = CS_HIGH (typically above HP3D) */ - set_priority(m, q); + m->cp_hqd_pipe_priority = 1; + m->cp_hqd_queue_priority = 15; *mqd = m; if (gart_addr) @@ -429,6 +359,10 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, { struct cik_mqd *m; + BUG_ON(!mm || !q || !mqd); + + pr_debug("kfd: In func %s\n", __func__); + m = get_mqd(mqd); m->cp_hqd_pq_control = DEFAULT_RPTR_BLOCK_SIZE | DEFAULT_MIN_AVAIL_SIZE | @@ -445,50 +379,45 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); - m->cp_hqd_pq_doorbell_control = DOORBELL_OFFSET(q->doorbell_off); + m->cp_hqd_pq_doorbell_control = DOORBELL_EN | + DOORBELL_OFFSET(q->doorbell_off); m->cp_hqd_vmid = q->vmid; + m->cp_hqd_active = 0; q->is_active = false; if (q->queue_size > 0 && q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted) { + q->queue_percent > 0) { + m->cp_hqd_active = 1; q->is_active = true; } - set_priority(m, q); return 0; } -#if defined(CONFIG_DEBUG_FS) - -static int debugfs_show_mqd(struct seq_file *m, void *data) +struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) { - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct cik_mqd), false); - return 0; -} + struct cik_sdma_rlc_registers *m; -static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) -{ - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct cik_sdma_rlc_registers), false); - return 0; -} + BUG_ON(!mqd); -#endif + m = (struct cik_sdma_rlc_registers *)mqd; + return m; +} struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, struct kfd_dev *dev) { struct mqd_manager *mqd; - if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) - return NULL; + BUG_ON(!dev); + BUG_ON(type >= KFD_MQD_TYPE_MAX); - mqd = kzalloc(sizeof(*mqd), GFP_NOIO); + pr_debug("kfd: In func %s\n", __func__); + + mqd = kzalloc(sizeof(struct mqd_manager), GFP_KERNEL); if (!mqd) return NULL; @@ -503,9 +432,6 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; -#if defined(CONFIG_DEBUG_FS) - mqd->debugfs_show_mqd = debugfs_show_mqd; -#endif break; case KFD_MQD_TYPE_HIQ: mqd->init_mqd = init_mqd_hiq; @@ -514,9 +440,6 @@ struct mqd_manager 
*mqd_manager_init_cik(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd_hiq; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; -#if defined(CONFIG_DEBUG_FS) - mqd->debugfs_show_mqd = debugfs_show_mqd; -#endif break; case KFD_MQD_TYPE_SDMA: mqd->init_mqd = init_mqd_sdma; @@ -525,9 +448,6 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd_sdma; mqd->destroy_mqd = destroy_mqd_sdma; mqd->is_occupied = is_occupied_sdma; -#if defined(CONFIG_DEBUG_FS) - mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; -#endif break; default: kfree(mqd); @@ -537,15 +457,3 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, return mqd; } -struct mqd_manager *mqd_manager_init_cik_hawaii(enum KFD_MQD_TYPE type, - struct kfd_dev *dev) -{ - struct mqd_manager *mqd; - - mqd = mqd_manager_init_cik(type, dev); - if (!mqd) - return NULL; - if ((type == KFD_MQD_TYPE_CP) || (type == KFD_MQD_TYPE_COMPUTE)) - mqd->update_mqd = update_mqd_hawaii; - return mqd; -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c deleted file mode 100644 index 6c302d2..0000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ /dev/null @@ -1,528 +0,0 @@ -/* - * Copyright 2016 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- * - */ - -#include -#include -#include -#include "kfd_priv.h" -#include "kfd_mqd_manager.h" -#include "v9_structs.h" -#include "vega10/GC/gc_9_0_offset.h" -#include "vega10/GC/gc_9_0_sh_mask.h" -#include "vega10/SDMA0/sdma0_4_0_sh_mask.h" - -static inline struct v9_mqd *get_mqd(void *mqd) -{ - return (struct v9_mqd *)mqd; -} - -static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) -{ - return (struct v9_sdma_mqd *)mqd; -} - -static void update_cu_mask(struct mqd_manager *mm, void *mqd, - struct queue_properties *q) -{ - struct v9_mqd *m; - struct kfd_cu_info cu_info; - uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ - uint32_t cu_mask_count = q->cu_mask_count; - const uint32_t *cu_mask = q->cu_mask; - int se, cu_per_sh, cu_index, i; - - if (cu_mask_count == 0) - return; - - m = get_mqd(mqd); - m->compute_static_thread_mgmt_se0 = 0; - m->compute_static_thread_mgmt_se1 = 0; - m->compute_static_thread_mgmt_se2 = 0; - m->compute_static_thread_mgmt_se3 = 0; - - mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); - - /* If # CU mask bits > # CUs, set it to the # of CUs */ - if (cu_mask_count > cu_info.cu_active_number) - cu_mask_count = cu_info.cu_active_number; - - cu_index = 0; - for (se = 0; se < cu_info.num_shader_engines; se++) { - cu_per_sh = 0; - - /* Get the number of CUs on this Shader Engine */ - for (i = 0; i < 4; i++) - cu_per_sh += hweight32(cu_info.cu_bitmap[se][i]); - - se_mask[se] = cu_mask[cu_index / 32] >> (cu_index % 32); - if ((cu_per_sh + (cu_index % 32)) > 32) - se_mask[se] |= cu_mask[(cu_index / 32) + 1] - << (32 - (cu_index % 32)); - se_mask[se] &= (1 << cu_per_sh) - 1; - cu_index += cu_per_sh; - } - m->compute_static_thread_mgmt_se0 = se_mask[0]; - m->compute_static_thread_mgmt_se1 = se_mask[1]; - m->compute_static_thread_mgmt_se2 = se_mask[2]; - m->compute_static_thread_mgmt_se3 = se_mask[3]; - - pr_debug("update cu mask to %#x %#x %#x %#x\n", - m->compute_static_thread_mgmt_se0, - m->compute_static_thread_mgmt_se1, - m->compute_static_thread_mgmt_se2, - m->compute_static_thread_mgmt_se3); -} - -static int init_mqd(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, - struct queue_properties *q) -{ - int retval; - uint64_t addr; - struct v9_mqd *m; - struct kfd_dev *kfd = mm->dev; - - /* From V9, for CWSR, the control stack is located on the next page - * boundary after the mqd, we will use the gtt allocation function - * instead of sub-allocation function. 
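 * (Illustration only, not part of this patch: with 4 KiB pages and an MQD
 *  that fits in a single page, the allocation below works out to
 *  ALIGN(ctl_stack_size, PAGE_SIZE) + PAGE_SIZE bytes -- e.g. a 0x4000-byte
 *  control stack gives 0x4000 + 0x1000 = 0x5000 bytes of GTT, with the MQD
 *  in the first page and the control stack starting at mqd + PAGE_SIZE, which
 *  is the layout get_wave_state() further down relies on.)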
- */ - if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) { - *mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO); - retval = kfd->kfd2kgd->init_gtt_mem_allocation(kfd->kgd, - ALIGN(q->ctl_stack_size, PAGE_SIZE) + - ALIGN(sizeof(struct v9_mqd), PAGE_SIZE), - &((*mqd_mem_obj)->gtt_mem), - &((*mqd_mem_obj)->gpu_addr), - (void *)&((*mqd_mem_obj)->cpu_ptr)); - } else - retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd), - mqd_mem_obj); - if (retval != 0) - return -ENOMEM; - - m = (struct v9_mqd *) (*mqd_mem_obj)->cpu_ptr; - addr = (*mqd_mem_obj)->gpu_addr; - - memset(m, 0, sizeof(struct v9_mqd)); - - m->header = 0xC0310800; - m->compute_pipelinestat_enable = 1; - m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; - m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; - m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; - m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; - - m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | - 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; - - m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; - - m->cp_mqd_base_addr_lo = lower_32_bits(addr); - m->cp_mqd_base_addr_hi = upper_32_bits(addr); - - m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | - 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | - 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; - - m->cp_hqd_pipe_priority = 1; - m->cp_hqd_queue_priority = 15; - - if (q->format == KFD_QUEUE_FORMAT_AQL) { - m->cp_hqd_aql_control = - 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT; - } - - if (q->tba_addr) { - m->compute_pgm_rsrc2 |= - (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); - } - - if (mm->dev->cwsr_enabled) { - m->cp_hqd_persistent_state |= - (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); - m->cp_hqd_ctx_save_base_addr_lo = - lower_32_bits(q->ctx_save_restore_area_address); - m->cp_hqd_ctx_save_base_addr_hi = - upper_32_bits(q->ctx_save_restore_area_address); - m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; - m->cp_hqd_cntl_stack_size = q->ctl_stack_size; - m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; - m->cp_hqd_wg_state_offset = q->ctl_stack_size; - } - - if (priv_cp_queues) - m->cp_hqd_pq_control |= - 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT; - - *mqd = m; - if (gart_addr) - *gart_addr = addr; - retval = mm->update_mqd(mm, m, q); - - return retval; -} - -static int load_mqd(struct mqd_manager *mm, void *mqd, - uint32_t pipe_id, uint32_t queue_id, - struct queue_properties *p, struct mm_struct *mms) -{ - /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ - uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 
4 : 0); - - return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, - (uint32_t __user *)p->write_ptr, - wptr_shift, 0, mms); -} - -static int update_mqd(struct mqd_manager *mm, void *mqd, - struct queue_properties *q) -{ - struct v9_mqd *m; - - m = get_mqd(mqd); - - m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; - m->cp_hqd_pq_control |= - ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; - pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); - - m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); - m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); - - m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); - m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); - m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); - m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); - - m->cp_hqd_pq_doorbell_control = - q->doorbell_off << - CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; - pr_debug("cp_hqd_pq_doorbell_control 0x%x\n", - m->cp_hqd_pq_doorbell_control); - - m->cp_hqd_ib_control = 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT; - - /* - * HW does not clamp this field correctly. Maximum EOP queue size - * is constrained by per-SE EOP done signal count, which is 8-bit. - * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit - * more than (EOP entry count - 1) so a queue size of 0x800 dwords - * is safe, giving a maximum field value of 0xA. - */ - m->cp_hqd_eop_control = min(0xA, - ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1); - m->cp_hqd_eop_base_addr_lo = - lower_32_bits(q->eop_ring_buffer_address >> 8); - m->cp_hqd_eop_base_addr_hi = - upper_32_bits(q->eop_ring_buffer_address >> 8); - - m->cp_hqd_iq_timer = 0; - - m->cp_hqd_vmid = q->vmid; - - if (q->format == KFD_QUEUE_FORMAT_AQL) { - m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | - 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT | - 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT | - 1 << CP_HQD_PQ_CONTROL__WPP_CLAMP_EN__SHIFT; - m->cp_hqd_pq_doorbell_control |= - 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT; - } - if (mm->dev->cwsr_enabled) - m->cp_hqd_ctx_save_control = 0; - - update_cu_mask(mm, mqd, q); - - q->is_active = false; - if (q->queue_size > 0 && - q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted) { - q->is_active = true; - } - - return 0; -} - - -static int destroy_mqd(struct mqd_manager *mm, void *mqd, - enum kfd_preempt_type type, - unsigned int timeout, uint32_t pipe_id, - uint32_t queue_id) -{ - return mm->dev->kfd2kgd->hqd_destroy - (mm->dev->kgd, mqd, type, timeout, - pipe_id, queue_id); -} - -static void uninit_mqd(struct mqd_manager *mm, void *mqd, - struct kfd_mem_obj *mqd_mem_obj) -{ - struct kfd_dev *kfd = mm->dev; - - if (mqd_mem_obj->gtt_mem) { - kfd->kfd2kgd->free_gtt_mem(kfd->kgd, mqd_mem_obj->gtt_mem); - kfree(mqd_mem_obj); - } else { - kfd_gtt_sa_free(mm->dev, mqd_mem_obj); - } -} - -static bool is_occupied(struct mqd_manager *mm, void *mqd, - uint64_t queue_address, uint32_t pipe_id, - uint32_t queue_id) -{ - return mm->dev->kfd2kgd->hqd_is_occupied( - mm->dev->kgd, queue_address, - pipe_id, queue_id); -} - -static int get_wave_state(struct mqd_manager *mm, void *mqd, - void __user *ctl_stack, - u32 *ctl_stack_used_size, - u32 *save_area_used_size) -{ - struct v9_mqd *m; - - /* Control stack is located one page after MQD. 
*/ - void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE); - - m = get_mqd(mqd); - - *ctl_stack_used_size = m->cp_hqd_cntl_stack_size - - m->cp_hqd_cntl_stack_offset; - *save_area_used_size = m->cp_hqd_wg_state_offset - - m->cp_hqd_cntl_stack_size; - - if (copy_to_user(ctl_stack, mqd_ctl_stack, m->cp_hqd_cntl_stack_size)) - return -EFAULT; - - return 0; -} - -static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, - struct queue_properties *q) -{ - struct v9_mqd *m; - int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); - - if (retval != 0) - return retval; - - m = get_mqd(*mqd); - - m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | - 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; - - return retval; -} - -static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, - struct queue_properties *q) -{ - struct v9_mqd *m; - int retval = update_mqd(mm, mqd, q); - - if (retval != 0) - return retval; - - /* TODO: what's the point? update_mqd already does this. */ - m = get_mqd(mqd); - m->cp_hqd_vmid = q->vmid; - return retval; -} - -static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, - struct queue_properties *q) -{ - int retval; - struct v9_sdma_mqd *m; - - - retval = kfd_gtt_sa_allocate(mm->dev, - sizeof(struct v9_sdma_mqd), - mqd_mem_obj); - - if (retval != 0) - return -ENOMEM; - - m = (struct v9_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; - - memset(m, 0, sizeof(struct v9_sdma_mqd)); - - *mqd = m; - if (gart_addr) - *gart_addr = (*mqd_mem_obj)->gpu_addr; - - retval = mm->update_mqd(mm, m, q); - - return retval; -} - -static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, - struct kfd_mem_obj *mqd_mem_obj) -{ - kfd_gtt_sa_free(mm->dev, mqd_mem_obj); -} - -static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, - uint32_t pipe_id, uint32_t queue_id, - struct queue_properties *p, struct mm_struct *mms) -{ - return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, - (uint32_t __user *)p->write_ptr, - mms); -} - -#define SDMA_RLC_DUMMY_DEFAULT 0xf - -static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, - struct queue_properties *q) -{ - struct v9_sdma_mqd *m; - - m = get_sdma_mqd(mqd); - m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) - << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | - q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | - 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | - 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; - - m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); - m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); - m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); - m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); - m->sdmax_rlcx_doorbell_offset = - q->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT; - - m->sdma_engine_id = q->sdma_engine_id; - m->sdma_queue_id = q->sdma_queue_id; - m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT; - - q->is_active = false; - if (q->queue_size > 0 && - q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted) { - q->is_active = true; - } - - return 0; -} - -/* - * * preempt type here is ignored because there is only one way - * * to preempt sdma queue - */ -static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, - enum kfd_preempt_type type, - unsigned int timeout, uint32_t pipe_id, - uint32_t queue_id) -{ - return 
mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); -} - -static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, - uint64_t queue_address, uint32_t pipe_id, - uint32_t queue_id) -{ - return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); -} - -#if defined(CONFIG_DEBUG_FS) - -static int debugfs_show_mqd(struct seq_file *m, void *data) -{ - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct v9_mqd), false); - return 0; -} - -static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) -{ - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct v9_sdma_mqd), false); - return 0; -} - -#endif - -struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, - struct kfd_dev *dev) -{ - struct mqd_manager *mqd; - - if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) - return NULL; - - mqd = kzalloc(sizeof(*mqd), GFP_NOIO); - if (!mqd) - return NULL; - - mqd->dev = dev; - - switch (type) { - case KFD_MQD_TYPE_CP: - case KFD_MQD_TYPE_COMPUTE: - mqd->init_mqd = init_mqd; - mqd->uninit_mqd = uninit_mqd; - mqd->load_mqd = load_mqd; - mqd->update_mqd = update_mqd; - mqd->destroy_mqd = destroy_mqd; - mqd->is_occupied = is_occupied; - mqd->get_wave_state = get_wave_state; -#if defined(CONFIG_DEBUG_FS) - mqd->debugfs_show_mqd = debugfs_show_mqd; -#endif - break; - case KFD_MQD_TYPE_HIQ: - mqd->init_mqd = init_mqd_hiq; - mqd->uninit_mqd = uninit_mqd; - mqd->load_mqd = load_mqd; - mqd->update_mqd = update_mqd_hiq; - mqd->destroy_mqd = destroy_mqd; - mqd->is_occupied = is_occupied; -#if defined(CONFIG_DEBUG_FS) - mqd->debugfs_show_mqd = debugfs_show_mqd; -#endif - break; - case KFD_MQD_TYPE_SDMA: - mqd->init_mqd = init_mqd_sdma; - mqd->uninit_mqd = uninit_mqd_sdma; - mqd->load_mqd = load_mqd_sdma; - mqd->update_mqd = update_mqd_sdma; - mqd->destroy_mqd = destroy_mqd_sdma; - mqd->is_occupied = is_occupied_sdma; -#if defined(CONFIG_DEBUG_FS) - mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; -#endif - break; - default: - kfree(mqd); - return NULL; - } - - return mqd; -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c index 5c26e5a..a9b9882 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c @@ -30,7 +30,6 @@ #include "vi_structs.h" #include "gca/gfx_8_0_sh_mask.h" #include "gca/gfx_8_0_enum.h" -#include "oss/oss_3_0_sh_mask.h" #define CP_MQD_CONTROL__PRIV_STATE__SHIFT 0x8 @@ -39,73 +38,6 @@ static inline struct vi_mqd *get_mqd(void *mqd) return (struct vi_mqd *)mqd; } -static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) -{ - return (struct vi_sdma_mqd *)mqd; -} - -static void update_cu_mask(struct mqd_manager *mm, void *mqd, - struct queue_properties *q) -{ - struct vi_mqd *m; - struct kfd_cu_info cu_info; - uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ - uint32_t cu_mask_count = q->cu_mask_count; - const uint32_t *cu_mask = q->cu_mask; - int se, cu_per_sh, cu_index, i; - - if (cu_mask_count == 0) - return; - - m = get_mqd(mqd); - m->compute_static_thread_mgmt_se0 = 0; - m->compute_static_thread_mgmt_se1 = 0; - m->compute_static_thread_mgmt_se2 = 0; - m->compute_static_thread_mgmt_se3 = 0; - - mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); - - /* If # CU mask bits > # CUs, set it to the # of CUs */ - if (cu_mask_count > cu_info.cu_active_number) - cu_mask_count = cu_info.cu_active_number; - - cu_index = 0; - for (se = 0; se < cu_info.num_shader_engines; se++) { - cu_per_sh = 0; - - /* Get the number of CUs on 
this Shader Engine */ - for (i = 0; i < 4; i++) - cu_per_sh += hweight32(cu_info.cu_bitmap[se][i]); - - se_mask[se] = cu_mask[cu_index / 32] >> (cu_index % 32); - if ((cu_per_sh + (cu_index % 32)) > 32) - se_mask[se] |= cu_mask[(cu_index / 32) + 1] - << (32 - (cu_index % 32)); - se_mask[se] &= (1 << cu_per_sh) - 1; - cu_index += cu_per_sh; - } - m->compute_static_thread_mgmt_se0 = se_mask[0]; - m->compute_static_thread_mgmt_se1 = se_mask[1]; - m->compute_static_thread_mgmt_se2 = se_mask[2]; - m->compute_static_thread_mgmt_se3 = se_mask[3]; - - pr_debug("Update cu mask to %#x %#x %#x %#x\n", - m->compute_static_thread_mgmt_se0, - m->compute_static_thread_mgmt_se1, - m->compute_static_thread_mgmt_se2, - m->compute_static_thread_mgmt_se3); -} - -static void set_priority(struct vi_mqd *m, struct queue_properties *q) -{ - m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; - m->cp_hqd_queue_priority = q->priority; - m->compute_pgm_rsrc1 = (m->compute_pgm_rsrc1 & - (~COMPUTE_PGM_RSRC1__PRIORITY_MASK)) | - (spi_priority_map[q->priority] << - COMPUTE_PGM_RSRC1__PRIORITY__SHIFT); -} - static int init_mqd(struct mqd_manager *mm, void **mqd, struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) @@ -144,40 +76,16 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; - set_priority(m, q); + m->cp_hqd_pipe_priority = 1; + m->cp_hqd_queue_priority = 15; + m->cp_hqd_eop_rptr = 1 << CP_HQD_EOP_RPTR__INIT_FETCHER__SHIFT; if (q->format == KFD_QUEUE_FORMAT_AQL) m->cp_hqd_iq_rptr = 1; - if (q->tba_addr) { - m->compute_tba_lo = lower_32_bits(q->tba_addr >> 8); - m->compute_tba_hi = upper_32_bits(q->tba_addr >> 8); - m->compute_tma_lo = lower_32_bits(q->tma_addr >> 8); - m->compute_tma_hi = upper_32_bits(q->tma_addr >> 8); - m->compute_pgm_rsrc2 |= - (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); - } - - if (mm->dev->cwsr_enabled) { - m->cp_hqd_persistent_state |= - (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); - m->cp_hqd_ctx_save_base_addr_lo = - lower_32_bits(q->ctx_save_restore_area_address); - m->cp_hqd_ctx_save_base_addr_hi = - upper_32_bits(q->ctx_save_restore_area_address); - m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; - m->cp_hqd_cntl_stack_size = q->ctl_stack_size; - m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; - m->cp_hqd_wg_state_offset = q->ctl_stack_size; - } - - if (priv_cp_queues) - m->cp_hqd_pq_control |= - 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT; - *mqd = m; - if (gart_addr) + if (gart_addr != NULL) *gart_addr = addr; retval = mm->update_mqd(mm, m, q); @@ -186,15 +94,10 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, uint32_t queue_id, - struct queue_properties *p, struct mm_struct *mms) + uint32_t __user *wptr) { - /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ - uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 
4 : 0); - uint32_t wptr_mask = (uint32_t)((p->queue_size / sizeof(uint32_t)) - 1); - - return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, - (uint32_t __user *)p->write_ptr, - wptr_shift, wptr_mask, mms); + return mm->dev->kfd2kgd->hqd_load + (mm->dev->kgd, mqd, pipe_id, queue_id, wptr); } static int __update_mqd(struct mqd_manager *mm, void *mqd, @@ -203,6 +106,10 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, { struct vi_mqd *m; + BUG_ON(!mm || !q || !mqd); + + pr_debug("kfd: In func %s\n", __func__); + m = get_mqd(mqd); m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT | @@ -210,20 +117,19 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, mtype << CP_HQD_PQ_CONTROL__MTYPE__SHIFT; m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; - pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); + pr_debug("kfd: cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); - m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); - m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); m->cp_hqd_pq_doorbell_control = + 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_EN__SHIFT | q->doorbell_off << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; - pr_debug("cp_hqd_pq_doorbell_control 0x%x\n", + pr_debug("kfd: cp_hqd_pq_doorbell_control 0x%x\n", m->cp_hqd_pq_doorbell_control); m->cp_hqd_eop_control = atc_bit << CP_HQD_EOP_CONTROL__EOP_ATC__SHIFT | @@ -233,15 +139,8 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT | mtype << CP_HQD_IB_CONTROL__MTYPE__SHIFT; - /* - * HW does not clamp this field correctly. Maximum EOP queue size - * is constrained by per-SE EOP done signal count, which is 8-bit. - * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit - * more than (EOP entry count - 1) so a queue size of 0x800 dwords - * is safe, giving a maximum field value of 0xA. 
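 * (Worked numbers, illustration only: 0xFF EOP entries at 8 dwords each is
 *  0x7F8 dwords; the next power of two, 0x800 dwords = 0x2000 bytes, encodes
 *  as ffs(0x2000 / 4) - 1 - 1 = 12 - 2 = 10 = 0xA, hence the min(0xA, ...)
 *  clamp in the line removed below.)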
- */ - m->cp_hqd_eop_control |= min(0xA, - ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1); + m->cp_hqd_eop_control |= + ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1; m->cp_hqd_eop_base_addr_lo = lower_32_bits(q->eop_ring_buffer_address >> 8); m->cp_hqd_eop_base_addr_hi = @@ -256,19 +155,13 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT; } - if (mm->dev->cwsr_enabled) - m->cp_hqd_ctx_save_control = - atc_bit << CP_HQD_CTX_SAVE_CONTROL__ATC__SHIFT | - mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT; - - update_cu_mask(mm, mqd, q); - set_priority(m, q); + m->cp_hqd_active = 0; q->is_active = false; if (q->queue_size > 0 && q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted) { + q->queue_percent > 0) { + m->cp_hqd_active = 1; q->is_active = true; } @@ -282,25 +175,20 @@ static int update_mqd(struct mqd_manager *mm, void *mqd, return __update_mqd(mm, mqd, q, MTYPE_CC, 1); } -static int update_mqd_tonga(struct mqd_manager *mm, void *mqd, - struct queue_properties *q) -{ - return __update_mqd(mm, mqd, q, MTYPE_UC, 0); -} - static int destroy_mqd(struct mqd_manager *mm, void *mqd, enum kfd_preempt_type type, unsigned int timeout, uint32_t pipe_id, uint32_t queue_id) { return mm->dev->kfd2kgd->hqd_destroy - (mm->dev->kgd, mqd, type, timeout, + (mm->dev->kgd, type, timeout, pipe_id, queue_id); } static void uninit_mqd(struct mqd_manager *mm, void *mqd, struct kfd_mem_obj *mqd_mem_obj) { + BUG_ON(!mm || !mqd); kfd_gtt_sa_free(mm->dev, mqd_mem_obj); } @@ -313,28 +201,6 @@ static bool is_occupied(struct mqd_manager *mm, void *mqd, pipe_id, queue_id); } -static int get_wave_state(struct mqd_manager *mm, void *mqd, - void __user *ctl_stack, - u32 *ctl_stack_used_size, - u32 *save_area_used_size) -{ - struct vi_mqd *m; - - m = get_mqd(mqd); - - *ctl_stack_used_size = m->cp_hqd_cntl_stack_size - - m->cp_hqd_cntl_stack_offset; - *save_area_used_size = m->cp_hqd_wg_state_offset - - m->cp_hqd_cntl_stack_size; - - /* Control stack is not copied to user mode for GFXv8 because - * it's part of the context save area that is already - * accessible to user mode - */ - - return 0; -} - static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) @@ -367,130 +233,17 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, return retval; } -static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, - struct queue_properties *q) -{ - int retval; - struct vi_sdma_mqd *m; - - - retval = kfd_gtt_sa_allocate(mm->dev, - sizeof(struct vi_sdma_mqd), - mqd_mem_obj); - - if (retval != 0) - return -ENOMEM; - - m = (struct vi_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; - - memset(m, 0, sizeof(struct vi_sdma_mqd)); - - *mqd = m; - if (gart_addr) - *gart_addr = (*mqd_mem_obj)->gpu_addr; - - retval = mm->update_mqd(mm, m, q); - - return retval; -} - -static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, - struct kfd_mem_obj *mqd_mem_obj) -{ - kfd_gtt_sa_free(mm->dev, mqd_mem_obj); -} - -static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, - uint32_t pipe_id, uint32_t queue_id, - struct queue_properties *p, struct mm_struct *mms) -{ - return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, - (uint32_t __user *)p->write_ptr, - mms); -} - -static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, - struct 
queue_properties *q) -{ - struct vi_sdma_mqd *m; - - m = get_sdma_mqd(mqd); - m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) - << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | - q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | - 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | - 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; - - m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); - m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); - m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); - m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); - m->sdmax_rlcx_doorbell = - q->doorbell_off << SDMA0_RLC0_DOORBELL__OFFSET__SHIFT; - - m->sdmax_rlcx_virtual_addr = q->sdma_vm_addr; - - m->sdma_engine_id = q->sdma_engine_id; - m->sdma_queue_id = q->sdma_queue_id; - - q->is_active = false; - if (q->queue_size > 0 && - q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted) { - q->is_active = true; - } - - return 0; -} - -/* - * * preempt type here is ignored because there is only one way - * * to preempt sdma queue - */ -static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, - enum kfd_preempt_type type, - unsigned int timeout, uint32_t pipe_id, - uint32_t queue_id) -{ - return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); -} - -static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, - uint64_t queue_address, uint32_t pipe_id, - uint32_t queue_id) -{ - return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); -} - -#if defined(CONFIG_DEBUG_FS) - -static int debugfs_show_mqd(struct seq_file *m, void *data) -{ - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct vi_mqd), false); - return 0; -} - -static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) -{ - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct vi_sdma_mqd), false); - return 0; -} - -#endif - struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, struct kfd_dev *dev) { struct mqd_manager *mqd; - if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) - return NULL; + BUG_ON(!dev); + BUG_ON(type >= KFD_MQD_TYPE_MAX); + + pr_debug("kfd: In func %s\n", __func__); - mqd = kzalloc(sizeof(*mqd), GFP_NOIO); + mqd = kzalloc(sizeof(struct mqd_manager), GFP_KERNEL); if (!mqd) return NULL; @@ -505,10 +258,6 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; - mqd->get_wave_state = get_wave_state; -#if defined(CONFIG_DEBUG_FS) - mqd->debugfs_show_mqd = debugfs_show_mqd; -#endif break; case KFD_MQD_TYPE_HIQ: mqd->init_mqd = init_mqd_hiq; @@ -517,20 +266,8 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd_hiq; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; -#if defined(CONFIG_DEBUG_FS) - mqd->debugfs_show_mqd = debugfs_show_mqd; -#endif break; case KFD_MQD_TYPE_SDMA: - mqd->init_mqd = init_mqd_sdma; - mqd->uninit_mqd = uninit_mqd_sdma; - mqd->load_mqd = load_mqd_sdma; - mqd->update_mqd = update_mqd_sdma; - mqd->destroy_mqd = destroy_mqd_sdma; - mqd->is_occupied = is_occupied_sdma; -#if defined(CONFIG_DEBUG_FS) - mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; -#endif break; default: kfree(mqd); @@ -539,17 +276,3 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, return mqd; } - -struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, - struct kfd_dev *dev) -{ - struct mqd_manager 
*mqd; - - mqd = mqd_manager_init_vi(type, dev); - if (!mqd) - return NULL; - if ((type == KFD_MQD_TYPE_CP) || (type == KFD_MQD_TYPE_COMPUTE)) - mqd->update_mqd = update_mqd_tonga; - return mqd; -} - diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index 7cca7b4..7e92921 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -26,6 +26,8 @@ #include "kfd_device_queue_manager.h" #include "kfd_kernel_queue.h" #include "kfd_priv.h" +#include "kfd_pm4_headers.h" +#include "kfd_pm4_headers_vi.h" #include "kfd_pm4_opcodes.h" static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, @@ -33,45 +35,47 @@ static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, { unsigned int temp = *wptr + increment_bytes / sizeof(uint32_t); - WARN((temp * sizeof(uint32_t)) > buffer_size_bytes, - "Runlist IB overflow"); + BUG_ON((temp * sizeof(uint32_t)) > buffer_size_bytes); *wptr = temp; } +static unsigned int build_pm4_header(unsigned int opcode, size_t packet_size) +{ + union PM4_MES_TYPE_3_HEADER header; + + header.u32all = 0; + header.opcode = opcode; + header.count = packet_size/sizeof(uint32_t) - 2; + header.type = PM4_TYPE_3; + + return header.u32all; +} + static void pm_calc_rlib_size(struct packet_manager *pm, unsigned int *rlib_size, bool *over_subscription) { - unsigned int process_count, queue_count, compute_queue_count; + unsigned int process_count, queue_count; unsigned int map_queue_size; - unsigned int max_proc_per_quantum = 1; - struct kfd_dev *dev = pm->dqm->dev; + BUG_ON(!pm || !rlib_size || !over_subscription); process_count = pm->dqm->processes_count; queue_count = pm->dqm->queue_count; - compute_queue_count = queue_count - pm->dqm->sdma_queue_count; - - /* check if there is over subscription - * Note: the arbitration between the number of VMIDs and - * hws_max_conc_proc has been done in - * kgd2kfd_device_init(). - */ + /* check if there is over subscription*/ *over_subscription = false; - - if (dev->max_proc_per_quantum > 1) - max_proc_per_quantum = dev->max_proc_per_quantum; - - if ((process_count > max_proc_per_quantum) || - compute_queue_count > get_queues_num(pm->dqm)) { + if ((process_count > 1) || queue_count > get_queues_num(pm->dqm)) { *over_subscription = true; - pr_debug("Over subscribed runlist\n"); + pr_debug("kfd: over subscribed runlist\n"); } - map_queue_size = pm->pmf->get_map_queues_packet_size(); + map_queue_size = + (pm->dqm->dev->device_info->asic_family == CHIP_CARRIZO) ? 
+ sizeof(struct pm4_mes_map_queues) : + sizeof(struct pm4_map_queues); /* calculate run list ib allocation size */ - *rlib_size = process_count * pm->pmf->get_map_process_packet_size() + + *rlib_size = process_count * sizeof(struct pm4_map_process) + queue_count * map_queue_size; /* @@ -79,9 +83,9 @@ static void pm_calc_rlib_size(struct packet_manager *pm, * when over subscription */ if (*over_subscription) - *rlib_size += pm->pmf->get_runlist_packet_size(); + *rlib_size += sizeof(struct pm4_runlist); - pr_debug("runlist ib size %d\n", *rlib_size); + pr_debug("kfd: runlist ib size %d\n", *rlib_size); } static int pm_allocate_runlist_ib(struct packet_manager *pm, @@ -92,19 +96,18 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, { int retval; - if (WARN_ON(pm->allocated)) - return -EINVAL; + BUG_ON(!pm); + BUG_ON(pm->allocated); + BUG_ON(is_over_subscription == NULL); pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription); - mutex_lock(&pm->lock); - retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size, &pm->ib_buffer_obj); - if (retval) { - pr_err("Failed to allocate runlist IB\n"); - goto out; + if (retval != 0) { + pr_err("kfd: failed to allocate runlist IB\n"); + return retval; } *(void **)rl_buffer = pm->ib_buffer_obj->cpu_ptr; @@ -112,12 +115,198 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, memset(*rl_buffer, 0, *rl_buffer_size); pm->allocated = true; - -out: - mutex_unlock(&pm->lock); return retval; } +static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer, + uint64_t ib, size_t ib_size_in_dwords, bool chain) +{ + struct pm4_runlist *packet; + + BUG_ON(!pm || !buffer || !ib); + + packet = (struct pm4_runlist *)buffer; + + memset(buffer, 0, sizeof(struct pm4_runlist)); + packet->header.u32all = build_pm4_header(IT_RUN_LIST, + sizeof(struct pm4_runlist)); + + packet->bitfields4.ib_size = ib_size_in_dwords; + packet->bitfields4.chain = chain ? 1 : 0; + packet->bitfields4.offload_polling = 0; + packet->bitfields4.valid = 1; + packet->ordinal2 = lower_32_bits(ib); + packet->bitfields3.ib_base_hi = upper_32_bits(ib); + + return 0; +} + +static int pm_create_map_process(struct packet_manager *pm, uint32_t *buffer, + struct qcm_process_device *qpd) +{ + struct pm4_map_process *packet; + struct queue *cur; + uint32_t num_queues; + + BUG_ON(!pm || !buffer || !qpd); + + packet = (struct pm4_map_process *)buffer; + + pr_debug("kfd: In func %s\n", __func__); + + memset(buffer, 0, sizeof(struct pm4_map_process)); + + packet->header.u32all = build_pm4_header(IT_MAP_PROCESS, + sizeof(struct pm4_map_process)); + packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; + packet->bitfields2.process_quantum = 1; + packet->bitfields2.pasid = qpd->pqm->process->pasid; + packet->bitfields3.page_table_base = qpd->page_table_base; + packet->bitfields10.gds_size = qpd->gds_size; + packet->bitfields10.num_gws = qpd->num_gws; + packet->bitfields10.num_oac = qpd->num_oac; + num_queues = 0; + list_for_each_entry(cur, &qpd->queues_list, list) + num_queues++; + packet->bitfields10.num_queues = (qpd->is_debug) ? 
0 : num_queues; + + packet->sh_mem_config = qpd->sh_mem_config; + packet->sh_mem_bases = qpd->sh_mem_bases; + packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; + packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; + + packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); + packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); + + return 0; +} + +static int pm_create_map_queue_vi(struct packet_manager *pm, uint32_t *buffer, + struct queue *q, bool is_static) +{ + struct pm4_mes_map_queues *packet; + bool use_static = is_static; + + BUG_ON(!pm || !buffer || !q); + + pr_debug("kfd: In func %s\n", __func__); + + packet = (struct pm4_mes_map_queues *)buffer; + memset(buffer, 0, sizeof(struct pm4_map_queues)); + + packet->header.u32all = build_pm4_header(IT_MAP_QUEUES, + sizeof(struct pm4_map_queues)); + packet->bitfields2.alloc_format = + alloc_format__mes_map_queues__one_per_pipe_vi; + packet->bitfields2.num_queues = 1; + packet->bitfields2.queue_sel = + queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; + + packet->bitfields2.engine_sel = + engine_sel__mes_map_queues__compute_vi; + packet->bitfields2.queue_type = + queue_type__mes_map_queues__normal_compute_vi; + + switch (q->properties.type) { + case KFD_QUEUE_TYPE_COMPUTE: + if (use_static) + packet->bitfields2.queue_type = + queue_type__mes_map_queues__normal_latency_static_queue_vi; + break; + case KFD_QUEUE_TYPE_DIQ: + packet->bitfields2.queue_type = + queue_type__mes_map_queues__debug_interface_queue_vi; + break; + case KFD_QUEUE_TYPE_SDMA: + packet->bitfields2.engine_sel = + engine_sel__mes_map_queues__sdma0_vi; + use_static = false; /* no static queues under SDMA */ + break; + default: + pr_err("kfd: in %s queue type %d\n", __func__, + q->properties.type); + BUG(); + break; + } + packet->bitfields3.doorbell_offset = + q->properties.doorbell_off; + + packet->mqd_addr_lo = + lower_32_bits(q->gart_mqd_addr); + + packet->mqd_addr_hi = + upper_32_bits(q->gart_mqd_addr); + + packet->wptr_addr_lo = + lower_32_bits((uint64_t)q->properties.write_ptr); + + packet->wptr_addr_hi = + upper_32_bits((uint64_t)q->properties.write_ptr); + + return 0; +} + +static int pm_create_map_queue(struct packet_manager *pm, uint32_t *buffer, + struct queue *q, bool is_static) +{ + struct pm4_map_queues *packet; + bool use_static = is_static; + + BUG_ON(!pm || !buffer || !q); + + pr_debug("kfd: In func %s\n", __func__); + + packet = (struct pm4_map_queues *)buffer; + memset(buffer, 0, sizeof(struct pm4_map_queues)); + + packet->header.u32all = build_pm4_header(IT_MAP_QUEUES, + sizeof(struct pm4_map_queues)); + packet->bitfields2.alloc_format = + alloc_format__mes_map_queues__one_per_pipe; + packet->bitfields2.num_queues = 1; + packet->bitfields2.queue_sel = + queue_sel__mes_map_queues__map_to_hws_determined_queue_slots; + + packet->bitfields2.vidmem = (q->properties.is_interop) ? + vidmem__mes_map_queues__uses_video_memory : + vidmem__mes_map_queues__uses_no_video_memory; + + switch (q->properties.type) { + case KFD_QUEUE_TYPE_COMPUTE: + case KFD_QUEUE_TYPE_DIQ: + packet->bitfields2.engine_sel = + engine_sel__mes_map_queues__compute; + break; + case KFD_QUEUE_TYPE_SDMA: + packet->bitfields2.engine_sel = + engine_sel__mes_map_queues__sdma0; + use_static = false; /* no static queues under SDMA */ + break; + default: + BUG(); + break; + } + + packet->mes_map_queues_ordinals[0].bitfields3.doorbell_offset = + q->properties.doorbell_off; + + packet->mes_map_queues_ordinals[0].bitfields3.is_static = + (use_static) ? 
1 : 0; + + packet->mes_map_queues_ordinals[0].mqd_addr_lo = + lower_32_bits(q->gart_mqd_addr); + + packet->mes_map_queues_ordinals[0].mqd_addr_hi = + upper_32_bits(q->gart_mqd_addr); + + packet->mes_map_queues_ordinals[0].wptr_addr_lo = + lower_32_bits((uint64_t)q->properties.write_ptr); + + packet->mes_map_queues_ordinals[0].wptr_addr_hi = + upper_32_bits((uint64_t)q->properties.write_ptr); + + return 0; +} static int pm_create_runlist_ib(struct packet_manager *pm, struct list_head *queues, @@ -133,17 +322,19 @@ static int pm_create_runlist_ib(struct packet_manager *pm, struct kernel_queue *kq; bool is_over_subscription; + BUG_ON(!pm || !queues || !rl_size_bytes || !rl_gpu_addr); + rl_wptr = retval = proccesses_mapped = 0; retval = pm_allocate_runlist_ib(pm, &rl_buffer, rl_gpu_addr, &alloc_size_bytes, &is_over_subscription); - if (retval) + if (retval != 0) return retval; *rl_size_bytes = alloc_size_bytes; - pm->ib_size_bytes = alloc_size_bytes; - pr_debug("Building runlist ib process count: %d queues count %d\n", + pr_debug("kfd: In func %s\n", __func__); + pr_debug("kfd: building runlist ib process count: %d queues count %d\n", pm->dqm->processes_count, pm->dqm->queue_count); /* build the run list ib packet */ @@ -151,35 +342,42 @@ static int pm_create_runlist_ib(struct packet_manager *pm, qpd = cur->qpd; /* build map process packet */ if (proccesses_mapped >= pm->dqm->processes_count) { - pr_debug("Not enough space left in runlist IB\n"); + pr_debug("kfd: not enough space left in runlist IB\n"); pm_release_ib(pm); return -ENOMEM; } - retval = pm->pmf->map_process(pm, &rl_buffer[rl_wptr], qpd); - if (retval) + retval = pm_create_map_process(pm, &rl_buffer[rl_wptr], qpd); + if (retval != 0) return retval; proccesses_mapped++; - inc_wptr(&rl_wptr, pm->pmf->get_map_process_packet_size(), + inc_wptr(&rl_wptr, sizeof(struct pm4_map_process), alloc_size_bytes); list_for_each_entry(kq, &qpd->priv_queue_list, list) { if (!kq->queue->properties.is_active) continue; - pr_debug("static_queue, mapping kernel q %d, is debug status %d\n", + pr_debug("kfd: static_queue, mapping kernel q %d, is debug status %d\n", kq->queue->queue, qpd->is_debug); - retval = pm->pmf->map_queues(pm, + if (pm->dqm->dev->device_info->asic_family == + CHIP_CARRIZO) + retval = pm_create_map_queue_vi(pm, + &rl_buffer[rl_wptr], + kq->queue, + qpd->is_debug); + else + retval = pm_create_map_queue(pm, &rl_buffer[rl_wptr], kq->queue, qpd->is_debug); - if (retval) + if (retval != 0) return retval; inc_wptr(&rl_wptr, - pm->pmf->get_map_queues_packet_size(), + sizeof(struct pm4_map_queues), alloc_size_bytes); } @@ -187,74 +385,63 @@ static int pm_create_runlist_ib(struct packet_manager *pm, if (!q->properties.is_active) continue; - pr_debug("static_queue, mapping user queue %d, is debug status %d\n", + pr_debug("kfd: static_queue, mapping user queue %d, is debug status %d\n", q->queue, qpd->is_debug); - retval = pm->pmf->map_queues(pm, + if (pm->dqm->dev->device_info->asic_family == + CHIP_CARRIZO) + retval = pm_create_map_queue_vi(pm, &rl_buffer[rl_wptr], q, qpd->is_debug); - if (retval) + else + retval = pm_create_map_queue(pm, + &rl_buffer[rl_wptr], + q, + qpd->is_debug); + + if (retval != 0) return retval; inc_wptr(&rl_wptr, - pm->pmf->get_map_queues_packet_size(), + sizeof(struct pm4_map_queues), alloc_size_bytes); } } - pr_debug("Finished map process and queues to runlist\n"); + pr_debug("kfd: finished map process and queues to runlist\n"); if (is_over_subscription) - retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr], - 
*rl_gpu_addr, - alloc_size_bytes / sizeof(uint32_t), - true); + pm_create_runlist(pm, &rl_buffer[rl_wptr], *rl_gpu_addr, + alloc_size_bytes / sizeof(uint32_t), true); for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++) pr_debug("0x%2X ", rl_buffer[i]); pr_debug("\n"); - return retval; + return 0; } -int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, - uint16_t fw_ver) +int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) { + BUG_ON(!dqm); + pm->dqm = dqm; mutex_init(&pm->lock); pm->priv_queue = kernel_queue_init(dqm->dev, KFD_QUEUE_TYPE_HIQ); - if (!pm->priv_queue) { + if (pm->priv_queue == NULL) { mutex_destroy(&pm->lock); return -ENOMEM; } pm->allocated = false; - switch (pm->dqm->dev->device_info->asic_family) { - case CHIP_KAVERI: - case CHIP_HAWAII: - kfd_pm_func_init_cik(pm, fw_ver); - break; - case CHIP_CARRIZO: - case CHIP_TONGA: - case CHIP_FIJI: - case CHIP_POLARIS10: - case CHIP_POLARIS11: - kfd_pm_func_init_vi(pm, fw_ver); - break; - case CHIP_VEGA10: - case CHIP_RAVEN: - kfd_pm_func_init_v9(pm, fw_ver); - break; - default: - BUG(); - } - return 0; } void pm_uninit(struct packet_manager *pm) { + BUG_ON(!pm); + mutex_destroy(&pm->lock); kernel_queue_uninit(pm->priv_queue); } @@ -262,30 +449,45 @@ void pm_uninit(struct packet_manager *pm) int pm_send_set_resources(struct packet_manager *pm, struct scheduling_resources *res) { - uint32_t *buffer, size; - int retval = 0; + struct pm4_set_resources *packet; + + BUG_ON(!pm || !res); + + pr_debug("kfd: In func %s\n", __func__); - size = pm->pmf->get_set_resources_packet_size(); mutex_lock(&pm->lock); pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, - size / sizeof(uint32_t), - (unsigned int **)&buffer); - if (!buffer) { - pr_err("Failed to allocate buffer on kernel queue\n"); - retval = -ENOMEM; - goto out; + sizeof(*packet) / sizeof(uint32_t), + (unsigned int **)&packet); + if (packet == NULL) { + mutex_unlock(&pm->lock); + pr_err("kfd: failed to allocate buffer on kernel queue\n"); + return -ENOMEM; } - retval = pm->pmf->set_resources(pm, buffer, res); - if (!retval) - pm->priv_queue->ops.submit_packet(pm->priv_queue); - else - pm->priv_queue->ops.rollback_packet(pm->priv_queue); + memset(packet, 0, sizeof(struct pm4_set_resources)); + packet->header.u32all = build_pm4_header(IT_SET_RESOURCES, + sizeof(struct pm4_set_resources)); + + packet->bitfields2.queue_type = + queue_type__mes_set_resources__hsa_interface_queue_hiq; + packet->bitfields2.vmid_mask = res->vmid_mask; + packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY; + packet->bitfields7.oac_mask = res->oac_mask; + packet->bitfields8.gds_heap_base = res->gds_heap_base; + packet->bitfields8.gds_heap_size = res->gds_heap_size; + + packet->gws_mask_lo = lower_32_bits(res->gws_mask); + packet->gws_mask_hi = upper_32_bits(res->gws_mask); + + packet->queue_mask_lo = lower_32_bits(res->queue_mask); + packet->queue_mask_hi = upper_32_bits(res->queue_mask); + + pm->priv_queue->ops.submit_packet(pm->priv_queue); -out: mutex_unlock(&pm->lock); - return retval; + return 0; } int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) @@ -295,25 +497,26 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) size_t rl_ib_size, packet_size_dwords; int retval; + BUG_ON(!pm || !dqm_queues); + retval = pm_create_runlist_ib(pm, dqm_queues, &rl_gpu_ib_addr, &rl_ib_size); - if (retval) + if (retval != 0) goto fail_create_runlist_ib; - pr_debug("runlist IB address: 0x%llX\n", rl_gpu_ib_addr); + 
pr_debug("kfd: runlist IB address: 0x%llX\n", rl_gpu_ib_addr); - packet_size_dwords = pm->pmf->get_runlist_packet_size() / - sizeof(uint32_t); + packet_size_dwords = sizeof(struct pm4_runlist) / sizeof(uint32_t); mutex_lock(&pm->lock); retval = pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, packet_size_dwords, &rl_buffer); - if (retval) + if (retval != 0) goto fail_acquire_packet_buffer; - retval = pm->pmf->runlist(pm, rl_buffer, rl_gpu_ib_addr, - rl_ib_size / sizeof(uint32_t), false); - if (retval) + retval = pm_create_runlist(pm, rl_buffer, rl_gpu_ib_addr, + rl_ib_size / sizeof(uint32_t), false); + if (retval != 0) goto fail_create_runlist; pm->priv_queue->ops.submit_packet(pm->priv_queue); @@ -327,72 +530,138 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) fail_acquire_packet_buffer: mutex_unlock(&pm->lock); fail_create_runlist_ib: - pm_release_ib(pm); + if (pm->allocated) + pm_release_ib(pm); return retval; } int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, uint32_t fence_value) { - uint32_t *buffer, size; - int retval = 0; + int retval; + struct pm4_query_status *packet; - if (WARN_ON(!fence_address)) - return -EFAULT; + BUG_ON(!pm || !fence_address); - size = pm->pmf->get_query_status_packet_size(); mutex_lock(&pm->lock); - pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, - size / sizeof(uint32_t), (unsigned int **)&buffer); - if (!buffer) { - pr_err("Failed to allocate buffer on kernel queue\n"); - retval = -ENOMEM; - goto out; - } + retval = pm->priv_queue->ops.acquire_packet_buffer( + pm->priv_queue, + sizeof(struct pm4_query_status) / sizeof(uint32_t), + (unsigned int **)&packet); + if (retval != 0) + goto fail_acquire_packet_buffer; - retval = pm->pmf->query_status(pm, buffer, fence_address, fence_value); - if (!retval) - pm->priv_queue->ops.submit_packet(pm->priv_queue); - else - pm->priv_queue->ops.rollback_packet(pm->priv_queue); + packet->header.u32all = build_pm4_header(IT_QUERY_STATUS, + sizeof(struct pm4_query_status)); + + packet->bitfields2.context_id = 0; + packet->bitfields2.interrupt_sel = + interrupt_sel__mes_query_status__completion_status; + packet->bitfields2.command = + command__mes_query_status__fence_only_after_write_ack; + + packet->addr_hi = upper_32_bits((uint64_t)fence_address); + packet->addr_lo = lower_32_bits((uint64_t)fence_address); + packet->data_hi = upper_32_bits((uint64_t)fence_value); + packet->data_lo = lower_32_bits((uint64_t)fence_value); + + pm->priv_queue->ops.submit_packet(pm->priv_queue); + mutex_unlock(&pm->lock); + + return 0; -out: +fail_acquire_packet_buffer: mutex_unlock(&pm->lock); return retval; } int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, - enum kfd_unmap_queues_filter filter, + enum kfd_preempt_type_filter mode, uint32_t filter_param, bool reset, unsigned int sdma_engine) { - uint32_t *buffer, size; - int retval = 0; + int retval; + uint32_t *buffer; + struct pm4_unmap_queues *packet; + + BUG_ON(!pm); - size = pm->pmf->get_unmap_queues_packet_size(); mutex_lock(&pm->lock); - pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, - size / sizeof(uint32_t), (unsigned int **)&buffer); - if (!buffer) { - pr_err("Failed to allocate buffer on kernel queue\n"); - retval = -ENOMEM; - goto out; + retval = pm->priv_queue->ops.acquire_packet_buffer( + pm->priv_queue, + sizeof(struct pm4_unmap_queues) / sizeof(uint32_t), + &buffer); + if (retval != 0) + goto err_acquire_packet_buffer; + + packet = (struct pm4_unmap_queues 
*)buffer; + memset(buffer, 0, sizeof(struct pm4_unmap_queues)); + pr_debug("kfd: static_queue: unmapping queues: mode is %d , reset is %d , type is %d\n", + mode, reset, type); + packet->header.u32all = build_pm4_header(IT_UNMAP_QUEUES, + sizeof(struct pm4_unmap_queues)); + switch (type) { + case KFD_QUEUE_TYPE_COMPUTE: + case KFD_QUEUE_TYPE_DIQ: + packet->bitfields2.engine_sel = + engine_sel__mes_unmap_queues__compute; + break; + case KFD_QUEUE_TYPE_SDMA: + packet->bitfields2.engine_sel = + engine_sel__mes_unmap_queues__sdma0 + sdma_engine; + break; + default: + BUG(); + break; } - retval = pm->pmf->unmap_queues(pm, buffer, type, filter, filter_param, - reset, sdma_engine); - if (!retval) - pm->priv_queue->ops.submit_packet(pm->priv_queue); + if (reset) + packet->bitfields2.action = + action__mes_unmap_queues__reset_queues; else - pm->priv_queue->ops.rollback_packet(pm->priv_queue); + packet->bitfields2.action = + action__mes_unmap_queues__preempt_queues; + + switch (mode) { + case KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_specified_queues; + packet->bitfields2.num_queues = 1; + packet->bitfields3b.doorbell_offset0 = filter_param; + break; + case KFD_PREEMPT_TYPE_FILTER_BY_PASID: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; + packet->bitfields3a.pasid = filter_param; + break; + case KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_all_active_queues; + break; + case KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES: + /* in this case, we do not preempt static queues */ + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_dynamic_queues_only; + break; + default: + BUG(); + break; + } + + pm->priv_queue->ops.submit_packet(pm->priv_queue); -out: + mutex_unlock(&pm->lock); + return 0; + +err_acquire_packet_buffer: mutex_unlock(&pm->lock); return retval; } void pm_release_ib(struct packet_manager *pm) { + BUG_ON(!pm); + mutex_lock(&pm->lock); if (pm->allocated) { kfd_gtt_sa_free(pm->dqm->dev, pm->ib_buffer_obj); @@ -400,18 +669,3 @@ void pm_release_ib(struct packet_manager *pm) } mutex_unlock(&pm->lock); } - -int pm_debugfs_runlist(struct seq_file *m, void *data) -{ - struct packet_manager *pm = data; - - if (!pm->allocated) { - seq_puts(m, " No active runlist\n"); - return 0; - } - - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - pm->ib_buffer_obj->cpu_ptr, pm->ib_size_bytes, false); - - return 0; -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c index 1e06de0..6cfe7f1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c @@ -32,8 +32,7 @@ int kfd_pasid_init(void) { pasid_limit = KFD_MAX_NUM_OF_PROCESSES; - pasid_bitmap = kcalloc(BITS_TO_LONGS(pasid_limit), sizeof(long), - GFP_KERNEL); + pasid_bitmap = kcalloc(BITS_TO_LONGS(pasid_limit), sizeof(long), GFP_KERNEL); if (!pasid_bitmap) return -ENOMEM; @@ -92,6 +91,6 @@ unsigned int kfd_pasid_alloc(void) void kfd_pasid_free(unsigned int pasid) { - if (!WARN_ON(pasid == 0 || pasid >= pasid_limit)) - clear_bit(pasid, pasid_bitmap); + BUG_ON(pasid == 0 || pasid >= pasid_limit); + clear_bit(pasid, pasid_bitmap); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c b/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c deleted file mode 100644 index 543ed83..0000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c +++ /dev/null @@ -1,513 +0,0 @@ 
-/* - * Copyright 2016 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - - -/* NOTE: - * - * This file contains logic to dynamically detect and enable PeerDirect - * suppor. PeerDirect support is delivered e.g. as part of OFED - * from Mellanox. Because we are not able to rely on the fact that the - * corresponding OFED will be installed we should: - * - copy PeerDirect definitions locally to avoid dependency on - * corresponding header file - * - try dynamically detect address of PeerDirect function - * pointers. - * - * If dynamic detection failed then PeerDirect support should be - * enabled using the standard PeerDirect bridge driver from: - * https://github.com/RadeonOpenCompute/ROCnRDMA - * - * - * Logic to support PeerDirect relies only on official public API to be - * non-intrusive as much as possible. - * - **/ - -#include -#include -#include -#include -#include -#include -#include - -#include "kfd_priv.h" - - - -/* ----------------------- PeerDirect interface ------------------------------*/ - -/* - * Copyright (c) 2013, Mellanox Technologies. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#define IB_PEER_MEMORY_NAME_MAX 64 -#define IB_PEER_MEMORY_VER_MAX 16 - -struct peer_memory_client { - char name[IB_PEER_MEMORY_NAME_MAX]; - char version[IB_PEER_MEMORY_VER_MAX]; - /* acquire return code: 1-mine, 0-not mine */ - int (*acquire)(unsigned long addr, size_t size, - void *peer_mem_private_data, - char *peer_mem_name, - void **client_context); - int (*get_pages)(unsigned long addr, - size_t size, int write, int force, - struct sg_table *sg_head, - void *client_context, void *core_context); - int (*dma_map)(struct sg_table *sg_head, void *client_context, - struct device *dma_device, int dmasync, int *nmap); - int (*dma_unmap)(struct sg_table *sg_head, void *client_context, - struct device *dma_device); - void (*put_pages)(struct sg_table *sg_head, void *client_context); - unsigned long (*get_page_size)(void *client_context); - void (*release)(void *client_context); - void* (*get_context_private_data)(u64 peer_id); - void (*put_context_private_data)(void *context); -}; - -typedef int (*invalidate_peer_memory)(void *reg_handle, - void *core_context); - -void *ib_register_peer_memory_client(struct peer_memory_client *peer_client, - invalidate_peer_memory *invalidate_callback); -void ib_unregister_peer_memory_client(void *reg_handle); - - -/*------------------- PeerDirect bridge driver ------------------------------*/ - -#define AMD_PEER_BRIDGE_DRIVER_VERSION "1.0" -#define AMD_PEER_BRIDGE_DRIVER_NAME "amdkfd" - - -static void* (*pfn_ib_register_peer_memory_client)(struct peer_memory_client - *peer_client, - invalidate_peer_memory - *invalidate_callback); - -static void (*pfn_ib_unregister_peer_memory_client)(void *reg_handle); - -static const struct amd_rdma_interface *rdma_interface; - -static invalidate_peer_memory ib_invalidate_callback; -static void *ib_reg_handle; - -struct amd_mem_context { - uint64_t va; - uint64_t size; - struct pid *pid; - - struct amd_p2p_info *p2p_info; - - /* Flag that free callback was called */ - int free_callback_called; - - /* Context received from PeerDirect call */ - void *core_context; -}; - - -static void free_callback(void *client_priv) -{ - struct amd_mem_context *mem_context = - (struct amd_mem_context *)client_priv; - - pr_debug("data 0x%p\n", mem_context); - - if (!mem_context) { - pr_warn("Invalid client context\n"); - return; - } - - pr_debug("mem_context->core_context 0x%p\n", mem_context->core_context); - - /* Call back IB stack asking to invalidate memory */ - (*ib_invalidate_callback) (ib_reg_handle, mem_context->core_context); - - /* amdkfd will free resources when we return from this callback. - * Set flag to inform that there is nothing to do on "put_pages", etc. 
- */ - ACCESS_ONCE(mem_context->free_callback_called) = 1; -} - - -static int amd_acquire(unsigned long addr, size_t size, - void *peer_mem_private_data, - char *peer_mem_name, void **client_context) -{ - int ret; - struct amd_mem_context *mem_context; - struct pid *pid; - - /* Get pointer to structure describing current process */ - pid = get_task_pid(current, PIDTYPE_PID); - - pr_debug("addr:0x%lx,size:0x%x, pid 0x%p\n", - addr, (unsigned int)size, pid); - - /* Check if address is handled by AMD GPU driver */ - ret = rdma_interface->is_gpu_address(addr, pid); - - if (!ret) { - pr_debug("Not GPU Address\n"); - /* This is not GPU address */ - return 0; - } - - pr_debug("GPU address\n"); - - /* Initialize context used for operation with given address */ - mem_context = kzalloc(sizeof(*mem_context), GFP_KERNEL); - - if (!mem_context) - return 0; /* Error case handled as not GPU address */ - - mem_context->free_callback_called = 0; - mem_context->va = addr; - mem_context->size = size; - - /* Save PID. It is guaranteed that the function will be - * called in the correct process context as opposite to others. - */ - mem_context->pid = pid; - - pr_debug("Client context %p\n", mem_context); - - /* Return pointer to allocated context */ - *client_context = mem_context; - - /* Return 1 to inform that this address which will be handled - * by AMD GPU driver - */ - return 1; -} - -static int amd_get_pages(unsigned long addr, size_t size, int write, int force, - struct sg_table *sg_head, - void *client_context, void *core_context) -{ - int ret; - struct amd_mem_context *mem_context = - (struct amd_mem_context *)client_context; - - pr_debug("addr:0x%lx,size:0x%x, core_context:%p\n", - addr, (unsigned int)size, core_context); - - if (!mem_context) { - pr_warn("Invalid client context"); - return -EINVAL; - } - - pr_debug("pid :0x%p\n", mem_context->pid); - - - if (addr != mem_context->va) { - pr_warn("Context address (0x%llx) is not the same\n", - mem_context->va); - return -EINVAL; - } - - if (size != mem_context->size) { - pr_warn("Context size (0x%llx) is not the same\n", - mem_context->size); - return -EINVAL; - } - - ret = rdma_interface->get_pages(addr, - size, - mem_context->pid, - &mem_context->p2p_info, - free_callback, - mem_context); - - if (ret || !mem_context->p2p_info) { - pr_err("Could not rdma::get_pages failure: %d\n", ret); - return ret; - } - - mem_context->core_context = core_context; - - /* Note: At this stage it is OK not to fill sg_table */ - return 0; -} - - -static int amd_dma_map(struct sg_table *sg_head, void *client_context, - struct device *dma_device, int dmasync, int *nmap) -{ - /* - * NOTE/TODO: - * We could have potentially three cases for real memory - * location: - * - all memory in the local - * - all memory in the system (RAM) - * - memory is spread (s/g) between local and system. - * - * In the case of all memory in the system we could use - * iommu driver to build DMA addresses but not in the case - * of local memory because currently iommu driver doesn't - * deal with local/device memory addresses (it requires "struct - * page"). - * - * Accordingly returning assumes that iommu funcutionality - * should be disabled so we can assume that sg_table already - * contains DMA addresses. 
- * - */ - struct amd_mem_context *mem_context = - (struct amd_mem_context *)client_context; - - pr_debug("Context 0x%p, sg_head 0x%p\n", - client_context, sg_head); - - pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n", - mem_context->pid, - mem_context->va, - mem_context->size); - - if (!mem_context->p2p_info) { - pr_err("No sg table were allocated\n"); - return -EINVAL; - } - - /* Copy information about previosly allocated sg_table */ - *sg_head = *mem_context->p2p_info->pages; - - /* Return number of pages */ - *nmap = mem_context->p2p_info->pages->nents; - - return 0; -} - -static int amd_dma_unmap(struct sg_table *sg_head, void *client_context, - struct device *dma_device) -{ - struct amd_mem_context *mem_context = - (struct amd_mem_context *)client_context; - - pr_debug("Context 0x%p, sg_table 0x%p\n", - client_context, sg_head); - - pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n", - mem_context->pid, - mem_context->va, - mem_context->size); - - /* Assume success */ - return 0; -} -static void amd_put_pages(struct sg_table *sg_head, void *client_context) -{ - int ret = 0; - struct amd_mem_context *mem_context = - (struct amd_mem_context *)client_context; - - pr_debug("sg_head %p client_context: 0x%p\n", - sg_head, client_context); - pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n", - mem_context->pid, - mem_context->va, - mem_context->size); - - pr_debug("mem_context->p2p_info %p\n", - mem_context->p2p_info); - - if (ACCESS_ONCE(mem_context->free_callback_called)) { - pr_debug("Free callback was called\n"); - return; - } - - if (mem_context->p2p_info) { - ret = rdma_interface->put_pages(&mem_context->p2p_info); - mem_context->p2p_info = NULL; - - if (ret) - pr_err("Failure: %d (callback status %d)\n", - ret, mem_context->free_callback_called); - } else - pr_err("Pointer to p2p info is null\n"); -} -static unsigned long amd_get_page_size(void *client_context) -{ - unsigned long page_size; - int result; - struct amd_mem_context *mem_context = - (struct amd_mem_context *)client_context; - - pr_debug("context: %p\n", client_context); - pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n", - mem_context->pid, - mem_context->va, - mem_context->size); - - - result = rdma_interface->get_page_size( - mem_context->va, - mem_context->size, - mem_context->pid, - &page_size); - - if (result) { - pr_err("Could not get page size. %d\n", result); - /* If we failed to get page size then do not know what to do. - * Let's return some default value - */ - return PAGE_SIZE; - } - - return page_size; -} - -static void amd_release(void *client_context) -{ - struct amd_mem_context *mem_context = - (struct amd_mem_context *)client_context; - - pr_debug("context: 0x%p\n", client_context); - pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n", - mem_context->pid, - mem_context->va, - mem_context->size); - - kfree(mem_context); -} - - -static struct peer_memory_client amd_mem_client = { - .acquire = amd_acquire, - .get_pages = amd_get_pages, - .dma_map = amd_dma_map, - .dma_unmap = amd_dma_unmap, - .put_pages = amd_put_pages, - .get_page_size = amd_get_page_size, - .release = amd_release, - .get_context_private_data = NULL, - .put_context_private_data = NULL, -}; - -/** Initialize PeerDirect interface with RDMA Network stack. - * - * Because network stack could potentially be loaded later we check - * presence of PeerDirect when HSA process is created. If PeerDirect was - * already initialized we do nothing otherwise try to detect and register. 
- */ -void kfd_init_peer_direct(void) -{ - int result; - - if (pfn_ib_unregister_peer_memory_client) { - pr_debug("PeerDirect support was already initialized\n"); - return; - } - - pr_debug("Try to initialize PeerDirect support\n"); - - pfn_ib_register_peer_memory_client = - (void *(*)(struct peer_memory_client *, - invalidate_peer_memory *)) - symbol_request(ib_register_peer_memory_client); - - pfn_ib_unregister_peer_memory_client = (void (*)(void *)) - symbol_request(ib_unregister_peer_memory_client); - - if (!pfn_ib_register_peer_memory_client || - !pfn_ib_unregister_peer_memory_client) { - pr_debug("PeerDirect interface was not detected\n"); - /* Do cleanup */ - kfd_close_peer_direct(); - return; - } - - result = amdkfd_query_rdma_interface(&rdma_interface); - - if (result < 0) { - pr_err("Cannot get RDMA Interface (result = %d)\n", result); - return; - } - - strcpy(amd_mem_client.name, AMD_PEER_BRIDGE_DRIVER_NAME); - strcpy(amd_mem_client.version, AMD_PEER_BRIDGE_DRIVER_VERSION); - - ib_reg_handle = pfn_ib_register_peer_memory_client(&amd_mem_client, - &ib_invalidate_callback); - - if (!ib_reg_handle) { - pr_err("Cannot register peer memory client\n"); - /* Do cleanup */ - kfd_close_peer_direct(); - return; - } - - pr_info("PeerDirect support was initialized successfully\n"); -} - -/** - * Close connection with PeerDirect interface with RDMA Network stack. - * - */ -void kfd_close_peer_direct(void) -{ - if (pfn_ib_unregister_peer_memory_client) { - if (ib_reg_handle) - pfn_ib_unregister_peer_memory_client(ib_reg_handle); - - symbol_put(ib_unregister_peer_memory_client); - } - - if (pfn_ib_register_peer_memory_client) - symbol_put(ib_register_peer_memory_client); - - - /* Reset pointers to be safe */ - pfn_ib_unregister_peer_memory_client = NULL; - pfn_ib_register_peer_memory_client = NULL; - ib_reg_handle = NULL; -} - diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h index e50f73d..5b393f3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h @@ -28,19 +28,112 @@ #define PM4_MES_HEADER_DEFINED union PM4_MES_TYPE_3_HEADER { struct { - /* reserved */ - uint32_t reserved1:8; - /* IT opcode */ - uint32_t opcode:8; - /* number of DWORDs - 1 in the information body */ - uint32_t count:14; - /* packet identifier. It should be 3 for type 3 packets */ - uint32_t type:2; + uint32_t reserved1:8; /* < reserved */ + uint32_t opcode:8; /* < IT opcode */ + uint32_t count:14; /* < number of DWORDs - 1 + * in the information body. + */ + uint32_t type:2; /* < packet identifier. 
+ * It should be 3 for type 3 packets + */ }; uint32_t u32all; }; #endif /* PM4_MES_HEADER_DEFINED */ +/* --------------------MES_SET_RESOURCES-------------------- */ + +#ifndef PM4_MES_SET_RESOURCES_DEFINED +#define PM4_MES_SET_RESOURCES_DEFINED +enum set_resources_queue_type_enum { + queue_type__mes_set_resources__kernel_interface_queue_kiq = 0, + queue_type__mes_set_resources__hsa_interface_queue_hiq = 1, + queue_type__mes_set_resources__hsa_debug_interface_queue = 4 +}; + +struct pm4_set_resources { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t vmid_mask:16; + uint32_t unmap_latency:8; + uint32_t reserved1:5; + enum set_resources_queue_type_enum queue_type:3; + } bitfields2; + uint32_t ordinal2; + }; + + uint32_t queue_mask_lo; + uint32_t queue_mask_hi; + uint32_t gws_mask_lo; + uint32_t gws_mask_hi; + + union { + struct { + uint32_t oac_mask:16; + uint32_t reserved2:16; + } bitfields7; + uint32_t ordinal7; + }; + + union { + struct { + uint32_t gds_heap_base:6; + uint32_t reserved3:5; + uint32_t gds_heap_size:6; + uint32_t reserved4:15; + } bitfields8; + uint32_t ordinal8; + }; + +}; +#endif + +/*--------------------MES_RUN_LIST-------------------- */ + +#ifndef PM4_MES_RUN_LIST_DEFINED +#define PM4_MES_RUN_LIST_DEFINED + +struct pm4_runlist { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t reserved1:2; + uint32_t ib_base_lo:30; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t ib_base_hi:16; + uint32_t reserved2:16; + } bitfields3; + uint32_t ordinal3; + }; + + union { + struct { + uint32_t ib_size:20; + uint32_t chain:1; + uint32_t offload_polling:1; + uint32_t reserved3:1; + uint32_t valid:1; + uint32_t reserved4:8; + } bitfields4; + uint32_t ordinal4; + }; + +}; +#endif /*--------------------MES_MAP_PROCESS-------------------- */ @@ -93,58 +186,217 @@ struct pm4_map_process { }; #endif -#ifndef PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH -#define PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH +/*--------------------MES_MAP_QUEUES--------------------*/ + +#ifndef PM4_MES_MAP_QUEUES_DEFINED +#define PM4_MES_MAP_QUEUES_DEFINED +enum map_queues_queue_sel_enum { + queue_sel__mes_map_queues__map_to_specified_queue_slots = 0, + queue_sel__mes_map_queues__map_to_hws_determined_queue_slots = 1, + queue_sel__mes_map_queues__enable_process_queues = 2 +}; -struct pm4_map_process_scratch_kv { +enum map_queues_vidmem_enum { + vidmem__mes_map_queues__uses_no_video_memory = 0, + vidmem__mes_map_queues__uses_video_memory = 1 +}; + +enum map_queues_alloc_format_enum { + alloc_format__mes_map_queues__one_per_pipe = 0, + alloc_format__mes_map_queues__all_on_one_pipe = 1 +}; + +enum map_queues_engine_sel_enum { + engine_sel__mes_map_queues__compute = 0, + engine_sel__mes_map_queues__sdma0 = 2, + engine_sel__mes_map_queues__sdma1 = 3 +}; + +struct pm4_map_queues { union { - union PM4_MES_TYPE_3_HEADER header; /* header */ - uint32_t ordinal1; + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; }; union { struct { - uint32_t pasid:16; - uint32_t reserved1:8; - uint32_t diq_enable:1; - uint32_t process_quantum:7; + uint32_t reserved1:4; + enum map_queues_queue_sel_enum queue_sel:2; + uint32_t reserved2:2; + uint32_t vmid:4; + uint32_t reserved3:4; + enum map_queues_vidmem_enum vidmem:2; + uint32_t reserved4:6; + enum map_queues_alloc_format_enum alloc_format:2; + enum map_queues_engine_sel_enum engine_sel:3; + uint32_t 
num_queues:3; } bitfields2; uint32_t ordinal2; }; + struct { + union { + struct { + uint32_t is_static:1; + uint32_t reserved5:1; + uint32_t doorbell_offset:21; + uint32_t reserved6:3; + uint32_t queue:6; + } bitfields3; + uint32_t ordinal3; + }; + + uint32_t mqd_addr_lo; + uint32_t mqd_addr_hi; + uint32_t wptr_addr_lo; + uint32_t wptr_addr_hi; + + } mes_map_queues_ordinals[1]; /* 1..N of these ordinal groups */ + +}; +#endif + +/*--------------------MES_QUERY_STATUS--------------------*/ + +#ifndef PM4_MES_QUERY_STATUS_DEFINED +#define PM4_MES_QUERY_STATUS_DEFINED +enum query_status_interrupt_sel_enum { + interrupt_sel__mes_query_status__completion_status = 0, + interrupt_sel__mes_query_status__process_status = 1, + interrupt_sel__mes_query_status__queue_status = 2 +}; + +enum query_status_command_enum { + command__mes_query_status__interrupt_only = 0, + command__mes_query_status__fence_only_immediate = 1, + command__mes_query_status__fence_only_after_write_ack = 2, + command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3 +}; + +enum query_status_engine_sel_enum { + engine_sel__mes_query_status__compute = 0, + engine_sel__mes_query_status__sdma0_queue = 2, + engine_sel__mes_query_status__sdma1_queue = 3 +}; + +struct pm4_query_status { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + union { struct { - uint32_t page_table_base:28; - uint32_t reserved2:4; - } bitfields3; + uint32_t context_id:28; + enum query_status_interrupt_sel_enum interrupt_sel:2; + enum query_status_command_enum command:2; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved1:16; + } bitfields3a; + struct { + uint32_t reserved2:2; + uint32_t doorbell_offset:21; + uint32_t reserved3:3; + enum query_status_engine_sel_enum engine_sel:3; + uint32_t reserved4:3; + } bitfields3b; uint32_t ordinal3; }; - uint32_t reserved3; - uint32_t sh_mem_bases; - uint32_t sh_mem_config; - uint32_t sh_mem_ape1_base; - uint32_t sh_mem_ape1_limit; - uint32_t sh_hidden_private_base_vmid; - uint32_t reserved4; - uint32_t reserved5; - uint32_t gds_addr_lo; - uint32_t gds_addr_hi; + uint32_t addr_lo; + uint32_t addr_hi; + uint32_t data_lo; + uint32_t data_hi; +}; +#endif + +/*--------------------MES_UNMAP_QUEUES--------------------*/ + +#ifndef PM4_MES_UNMAP_QUEUES_DEFINED +#define PM4_MES_UNMAP_QUEUES_DEFINED +enum unmap_queues_action_enum { + action__mes_unmap_queues__preempt_queues = 0, + action__mes_unmap_queues__reset_queues = 1, + action__mes_unmap_queues__disable_process_queues = 2 +}; + +enum unmap_queues_queue_sel_enum { + queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0, + queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1, + queue_sel__mes_unmap_queues__perform_request_on_all_active_queues = 2, + queue_sel__mes_unmap_queues__perform_request_on_dynamic_queues_only = 3 +}; + +enum unmap_queues_engine_sel_enum { + engine_sel__mes_unmap_queues__compute = 0, + engine_sel__mes_unmap_queues__sdma0 = 2, + engine_sel__mes_unmap_queues__sdma1 = 3 +}; + +struct pm4_unmap_queues { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + enum unmap_queues_action_enum action:2; + uint32_t reserved1:2; + enum unmap_queues_queue_sel_enum queue_sel:2; + uint32_t reserved2:20; + enum unmap_queues_engine_sel_enum engine_sel:3; + uint32_t num_queues:3; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t 
reserved3:16; + } bitfields3a; + struct { + uint32_t reserved4:2; + uint32_t doorbell_offset0:21; + uint32_t reserved5:9; + } bitfields3b; + uint32_t ordinal3; + }; union { struct { - uint32_t num_gws:6; uint32_t reserved6:2; - uint32_t num_oac:4; - uint32_t reserved7:4; - uint32_t gds_size:6; - uint32_t num_queues:10; - } bitfields14; - uint32_t ordinal14; + uint32_t doorbell_offset1:21; + uint32_t reserved7:9; + } bitfields4; + uint32_t ordinal4; + }; + + union { + struct { + uint32_t reserved8:2; + uint32_t doorbell_offset2:21; + uint32_t reserved9:9; + } bitfields5; + uint32_t ordinal5; + }; + + union { + struct { + uint32_t reserved10:2; + uint32_t doorbell_offset3:21; + uint32_t reserved11:9; + } bitfields6; + uint32_t ordinal6; }; - uint32_t completion_signal_lo32; -uint32_t completion_signal_hi32; }; #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h deleted file mode 100644 index ddad9be..0000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h +++ /dev/null @@ -1,583 +0,0 @@ -/* - * Copyright 2016 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - */ - -#ifndef F32_MES_PM4_PACKETS_H -#define F32_MES_PM4_PACKETS_H - -#ifndef PM4_MES_HEADER_DEFINED -#define PM4_MES_HEADER_DEFINED -union PM4_MES_TYPE_3_HEADER { - struct { - uint32_t reserved1 : 8; /* < reserved */ - uint32_t opcode : 8; /* < IT opcode */ - uint32_t count : 14;/* < number of DWORDs - 1 in the - * information body. - */ - uint32_t type : 2; /* < packet identifier. 
- * It should be 3 for type 3 packets - */ - }; - uint32_t u32All; -}; -#endif /* PM4_MES_HEADER_DEFINED */ - -/*--------------------MES_SET_RESOURCES--------------------*/ - -#ifndef PM4_MES_SET_RESOURCES_DEFINED -#define PM4_MES_SET_RESOURCES_DEFINED -enum mes_set_resources_queue_type_enum { - queue_type__mes_set_resources__kernel_interface_queue_kiq = 0, - queue_type__mes_set_resources__hsa_interface_queue_hiq = 1, - queue_type__mes_set_resources__hsa_debug_interface_queue = 4 -}; - - -struct pm4_mes_set_resources { - union { - union PM4_MES_TYPE_3_HEADER header; /* header */ - uint32_t ordinal1; - }; - - union { - struct { - uint32_t vmid_mask:16; - uint32_t unmap_latency:8; - uint32_t reserved1:5; - enum mes_set_resources_queue_type_enum queue_type:3; - } bitfields2; - uint32_t ordinal2; - }; - - uint32_t queue_mask_lo; - uint32_t queue_mask_hi; - uint32_t gws_mask_lo; - uint32_t gws_mask_hi; - - union { - struct { - uint32_t oac_mask:16; - uint32_t reserved2:16; - } bitfields7; - uint32_t ordinal7; - }; - - union { - struct { - uint32_t gds_heap_base:6; - uint32_t reserved3:5; - uint32_t gds_heap_size:6; - uint32_t reserved4:15; - } bitfields8; - uint32_t ordinal8; - }; - -}; -#endif - -/*--------------------MES_RUN_LIST--------------------*/ - -#ifndef PM4_MES_RUN_LIST_DEFINED -#define PM4_MES_RUN_LIST_DEFINED - -struct pm4_mes_runlist { - union { - union PM4_MES_TYPE_3_HEADER header; /* header */ - uint32_t ordinal1; - }; - - union { - struct { - uint32_t reserved1:2; - uint32_t ib_base_lo:30; - } bitfields2; - uint32_t ordinal2; - }; - - uint32_t ib_base_hi; - - union { - struct { - uint32_t ib_size:20; - uint32_t chain:1; - uint32_t offload_polling:1; - uint32_t reserved2:1; - uint32_t valid:1; - uint32_t process_cnt:4; - uint32_t reserved3:4; - } bitfields4; - uint32_t ordinal4; - }; - -}; -#endif - -/*--------------------MES_MAP_PROCESS--------------------*/ - -#ifndef PM4_MES_MAP_PROCESS_DEFINED -#define PM4_MES_MAP_PROCESS_DEFINED - -struct pm4_mes_map_process { - union { - union PM4_MES_TYPE_3_HEADER header; /* header */ - uint32_t ordinal1; - }; - - union { - struct { - uint32_t pasid:16; - uint32_t reserved1:8; - uint32_t diq_enable:1; - uint32_t process_quantum:7; - } bitfields2; - uint32_t ordinal2; - }; - - uint32_t vm_context_page_table_base_addr_lo32; - - uint32_t vm_context_page_table_base_addr_hi32; - - uint32_t sh_mem_bases; - - uint32_t sh_mem_config; - - uint32_t sq_shader_tba_lo; - - uint32_t sq_shader_tba_hi; - - uint32_t sq_shader_tma_lo; - - uint32_t sq_shader_tma_hi; - - uint32_t reserved6; - - uint32_t gds_addr_lo; - - uint32_t gds_addr_hi; - - union { - struct { - uint32_t num_gws:6; - uint32_t reserved7:1; - uint32_t sdma_enable:1; - uint32_t num_oac:4; - uint32_t reserved8:4; - uint32_t gds_size:6; - uint32_t num_queues:10; - } bitfields14; - uint32_t ordinal14; - }; - - uint32_t completion_signal_lo; - - uint32_t completion_signal_hi; - -}; - -#endif - -/*--------------------MES_MAP_PROCESS_VM--------------------*/ - -#ifndef PM4_MES_MAP_PROCESS_VM_DEFINED -#define PM4_MES_MAP_PROCESS_VM_DEFINED - -struct PM4_MES_MAP_PROCESS_VM { - union { - union PM4_MES_TYPE_3_HEADER header; /* header */ - uint32_t ordinal1; - }; - - uint32_t reserved1; - - uint32_t vm_context_cntl; - - uint32_t reserved2; - - uint32_t vm_context_page_table_end_addr_lo32; - - uint32_t vm_context_page_table_end_addr_hi32; - - uint32_t vm_context_page_table_start_addr_lo32; - - uint32_t vm_context_page_table_start_addr_hi32; - - uint32_t reserved3; - - uint32_t reserved4; - - uint32_t 
reserved5; - - uint32_t reserved6; - - uint32_t reserved7; - - uint32_t reserved8; - - uint32_t completion_signal_lo32; - - uint32_t completion_signal_hi32; - -}; -#endif - -/*--------------------MES_MAP_QUEUES--------------------*/ - -#ifndef PM4_MES_MAP_QUEUES_VI_DEFINED -#define PM4_MES_MAP_QUEUES_VI_DEFINED -enum mes_map_queues_queue_sel_enum { - queue_sel__mes_map_queues__map_to_specified_queue_slots_vi = 0, -queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi = 1 -}; - -enum mes_map_queues_queue_type_enum { - queue_type__mes_map_queues__normal_compute_vi = 0, - queue_type__mes_map_queues__debug_interface_queue_vi = 1, - queue_type__mes_map_queues__normal_latency_static_queue_vi = 2, -queue_type__mes_map_queues__low_latency_static_queue_vi = 3 -}; - -enum mes_map_queues_alloc_format_enum { - alloc_format__mes_map_queues__one_per_pipe_vi = 0, -alloc_format__mes_map_queues__all_on_one_pipe_vi = 1 -}; - -enum mes_map_queues_engine_sel_enum { - engine_sel__mes_map_queues__compute_vi = 0, - engine_sel__mes_map_queues__sdma0_vi = 2, - engine_sel__mes_map_queues__sdma1_vi = 3 -}; - - -struct pm4_mes_map_queues { - union { - union PM4_MES_TYPE_3_HEADER header; /* header */ - uint32_t ordinal1; - }; - - union { - struct { - uint32_t reserved1:4; - enum mes_map_queues_queue_sel_enum queue_sel:2; - uint32_t reserved2:15; - enum mes_map_queues_queue_type_enum queue_type:3; - enum mes_map_queues_alloc_format_enum alloc_format:2; - enum mes_map_queues_engine_sel_enum engine_sel:3; - uint32_t num_queues:3; - } bitfields2; - uint32_t ordinal2; - }; - - union { - struct { - uint32_t reserved3:1; - uint32_t check_disable:1; - uint32_t doorbell_offset:26; - uint32_t reserved4:4; - } bitfields3; - uint32_t ordinal3; - }; - - uint32_t mqd_addr_lo; - uint32_t mqd_addr_hi; - uint32_t wptr_addr_lo; - uint32_t wptr_addr_hi; -}; -#endif - -/*--------------------MES_QUERY_STATUS--------------------*/ - -#ifndef PM4_MES_QUERY_STATUS_DEFINED -#define PM4_MES_QUERY_STATUS_DEFINED -enum mes_query_status_interrupt_sel_enum { - interrupt_sel__mes_query_status__completion_status = 0, - interrupt_sel__mes_query_status__process_status = 1, - interrupt_sel__mes_query_status__queue_status = 2 -}; - -enum mes_query_status_command_enum { - command__mes_query_status__interrupt_only = 0, - command__mes_query_status__fence_only_immediate = 1, - command__mes_query_status__fence_only_after_write_ack = 2, - command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3 -}; - -enum mes_query_status_engine_sel_enum { - engine_sel__mes_query_status__compute = 0, - engine_sel__mes_query_status__sdma0_queue = 2, - engine_sel__mes_query_status__sdma1_queue = 3 -}; - -struct pm4_mes_query_status { - union { - union PM4_MES_TYPE_3_HEADER header; /* header */ - uint32_t ordinal1; - }; - - union { - struct { - uint32_t context_id:28; - enum mes_query_status_interrupt_sel_enum interrupt_sel:2; - enum mes_query_status_command_enum command:2; - } bitfields2; - uint32_t ordinal2; - }; - - union { - struct { - uint32_t pasid:16; - uint32_t reserved1:16; - } bitfields3a; - struct { - uint32_t reserved2:2; - uint32_t doorbell_offset:26; - enum mes_query_status_engine_sel_enum engine_sel:3; - uint32_t reserved3:1; - } bitfields3b; - uint32_t ordinal3; - }; - - uint32_t addr_lo; - uint32_t addr_hi; - uint32_t data_lo; - uint32_t data_hi; -}; -#endif - -/*--------------------MES_UNMAP_QUEUES--------------------*/ - -#ifndef PM4_MES_UNMAP_QUEUES_DEFINED -#define PM4_MES_UNMAP_QUEUES_DEFINED -enum mes_unmap_queues_action_enum { - 
action__mes_unmap_queues__preempt_queues = 0, - action__mes_unmap_queues__reset_queues = 1, - action__mes_unmap_queues__disable_process_queues = 2, - action__mes_unmap_queues__reserved = 3 -}; - -enum mes_unmap_queues_queue_sel_enum { - queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0, - queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1, - queue_sel__mes_unmap_queues__unmap_all_queues = 2, - queue_sel__mes_unmap_queues__unmap_all_non_static_queues = 3 -}; - -enum mes_unmap_queues_engine_sel_enum { - engine_sel__mes_unmap_queues__compute = 0, - engine_sel__mes_unmap_queues__sdma0 = 2, - engine_sel__mes_unmap_queues__sdmal = 3 -}; - -struct pm4_mes_unmap_queues { - union { - union PM4_MES_TYPE_3_HEADER header; /* header */ - uint32_t ordinal1; - }; - - union { - struct { - enum mes_unmap_queues_action_enum action:2; - uint32_t reserved1:2; - enum mes_unmap_queues_queue_sel_enum queue_sel:2; - uint32_t reserved2:20; - enum mes_unmap_queues_engine_sel_enum engine_sel:3; - uint32_t num_queues:3; - } bitfields2; - uint32_t ordinal2; - }; - - union { - struct { - uint32_t pasid:16; - uint32_t reserved3:16; - } bitfields3a; - struct { - uint32_t reserved4:2; - uint32_t doorbell_offset0:26; - int32_t reserved5:4; - } bitfields3b; - uint32_t ordinal3; - }; - - union { - struct { - uint32_t reserved6:2; - uint32_t doorbell_offset1:26; - uint32_t reserved7:4; - } bitfields4; - uint32_t ordinal4; - }; - - union { - struct { - uint32_t reserved8:2; - uint32_t doorbell_offset2:26; - uint32_t reserved9:4; - } bitfields5; - uint32_t ordinal5; - }; - - union { - struct { - uint32_t reserved10:2; - uint32_t doorbell_offset3:26; - uint32_t reserved11:4; - } bitfields6; - uint32_t ordinal6; - }; -}; -#endif - -#ifndef PM4_MEC_RELEASE_MEM_DEFINED -#define PM4_MEC_RELEASE_MEM_DEFINED - -enum mec_release_mem_event_index_enum { - event_index__mec_release_mem__end_of_pipe = 5, - event_index__mec_release_mem__shader_done = 6 -}; - -enum mec_release_mem_cache_policy_enum { - cache_policy__mec_release_mem__lru = 0, - cache_policy__mec_release_mem__stream = 1 -}; - -enum mec_release_mem_pq_exe_status_enum { - pq_exe_status__mec_release_mem__default = 0, - pq_exe_status__mec_release_mem__phase_update = 1 -}; - -enum mec_release_mem_dst_sel_enum { - dst_sel__mec_release_mem__memory_controller = 0, - dst_sel__mec_release_mem__tc_l2 = 1, - dst_sel__mec_release_mem__queue_write_pointer_register = 2, - dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit = 3 -}; - -enum mec_release_mem_int_sel_enum { - int_sel__mec_release_mem__none = 0, - int_sel__mec_release_mem__send_interrupt_only = 1, - int_sel__mec_release_mem__send_interrupt_after_write_confirm = 2, - int_sel__mec_release_mem__send_data_after_write_confirm = 3, - int_sel__mec_release_mem__unconditionally_send_int_ctxid = 4, - int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare = 5, - int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare = 6 -}; - -enum mec_release_mem_data_sel_enum { - data_sel__mec_release_mem__none = 0, - data_sel__mec_release_mem__send_32_bit_low = 1, - data_sel__mec_release_mem__send_64_bit_data = 2, - data_sel__mec_release_mem__send_gpu_clock_counter = 3, - data_sel__mec_release_mem__send_cp_perfcounter_hi_lo = 4, - data_sel__mec_release_mem__store_gds_data_to_memory = 5 -}; - -struct pm4_mec_release_mem { - union { - union PM4_MES_TYPE_3_HEADER header; /*header */ - unsigned int ordinal1; - }; - - union { - struct { - unsigned int event_type:6; - 
unsigned int reserved1:2; - enum mec_release_mem_event_index_enum event_index:4; - unsigned int tcl1_vol_action_ena:1; - unsigned int tc_vol_action_ena:1; - unsigned int reserved2:1; - unsigned int tc_wb_action_ena:1; - unsigned int tcl1_action_ena:1; - unsigned int tc_action_ena:1; - uint32_t reserved3:1; - uint32_t tc_nc_action_ena:1; - uint32_t tc_wc_action_ena:1; - uint32_t tc_md_action_ena:1; - uint32_t reserved4:3; - enum mec_release_mem_cache_policy_enum cache_policy:2; - uint32_t reserved5:2; - enum mec_release_mem_pq_exe_status_enum pq_exe_status:1; - uint32_t reserved6:2; - } bitfields2; - unsigned int ordinal2; - }; - - union { - struct { - uint32_t reserved7:16; - enum mec_release_mem_dst_sel_enum dst_sel:2; - uint32_t reserved8:6; - enum mec_release_mem_int_sel_enum int_sel:3; - uint32_t reserved9:2; - enum mec_release_mem_data_sel_enum data_sel:3; - } bitfields3; - unsigned int ordinal3; - }; - - union { - struct { - uint32_t reserved10:2; - unsigned int address_lo_32b:30; - } bitfields4; - struct { - uint32_t reserved11:3; - uint32_t address_lo_64b:29; - } bitfields4b; - uint32_t reserved12; - unsigned int ordinal4; - }; - - union { - uint32_t address_hi; - uint32_t reserved13; - uint32_t ordinal5; - }; - - union { - uint32_t data_lo; - uint32_t cmp_data_lo; - struct { - uint32_t dw_offset:16; - uint32_t num_dwords:16; - } bitfields6c; - uint32_t reserved14; - uint32_t ordinal6; - }; - - union { - uint32_t data_hi; - uint32_t cmp_data_hi; - uint32_t reserved15; - uint32_t reserved16; - uint32_t ordinal7; - }; - - uint32_t int_ctxid; - -}; - -#endif - -enum { - CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 -}; -#endif - diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h index 0b314a8..a0ff348 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h @@ -77,6 +77,103 @@ struct pm4__indirect_buffer_pasid { #endif +/*--------------------_RELEASE_MEM-------------------- */ + +#ifndef _PM4__RELEASE_MEM_DEFINED +#define _PM4__RELEASE_MEM_DEFINED +enum _RELEASE_MEM_event_index_enum { + event_index___release_mem__end_of_pipe = 5, + event_index___release_mem__shader_done = 6 +}; + +enum _RELEASE_MEM_cache_policy_enum { + cache_policy___release_mem__lru = 0, + cache_policy___release_mem__stream = 1, + cache_policy___release_mem__bypass = 2 +}; + +enum _RELEASE_MEM_dst_sel_enum { + dst_sel___release_mem__memory_controller = 0, + dst_sel___release_mem__tc_l2 = 1, + dst_sel___release_mem__queue_write_pointer_register = 2, + dst_sel___release_mem__queue_write_pointer_poll_mask_bit = 3 +}; + +enum _RELEASE_MEM_int_sel_enum { + int_sel___release_mem__none = 0, + int_sel___release_mem__send_interrupt_only = 1, + int_sel___release_mem__send_interrupt_after_write_confirm = 2, + int_sel___release_mem__send_data_after_write_confirm = 3 +}; + +enum _RELEASE_MEM_data_sel_enum { + data_sel___release_mem__none = 0, + data_sel___release_mem__send_32_bit_low = 1, + data_sel___release_mem__send_64_bit_data = 2, + data_sel___release_mem__send_gpu_clock_counter = 3, + data_sel___release_mem__send_cp_perfcounter_hi_lo = 4, + data_sel___release_mem__store_gds_data_to_memory = 5 +}; + +struct pm4__release_mem { + union { + union PM4_MES_TYPE_3_HEADER header; /*header */ + unsigned int ordinal1; + }; + + union { + struct { + unsigned int event_type:6; + unsigned int reserved1:2; + enum _RELEASE_MEM_event_index_enum event_index:4; + unsigned int tcl1_vol_action_ena:1; + unsigned int 
tc_vol_action_ena:1; + unsigned int reserved2:1; + unsigned int tc_wb_action_ena:1; + unsigned int tcl1_action_ena:1; + unsigned int tc_action_ena:1; + unsigned int reserved3:6; + unsigned int atc:1; + enum _RELEASE_MEM_cache_policy_enum cache_policy:2; + unsigned int reserved4:5; + } bitfields2; + unsigned int ordinal2; + }; + + union { + struct { + unsigned int reserved5:16; + enum _RELEASE_MEM_dst_sel_enum dst_sel:2; + unsigned int reserved6:6; + enum _RELEASE_MEM_int_sel_enum int_sel:3; + unsigned int reserved7:2; + enum _RELEASE_MEM_data_sel_enum data_sel:3; + } bitfields3; + unsigned int ordinal3; + }; + + union { + struct { + unsigned int reserved8:2; + unsigned int address_lo_32b:30; + } bitfields4; + struct { + unsigned int reserved9:3; + unsigned int address_lo_64b:29; + } bitfields5; + unsigned int ordinal4; + }; + + unsigned int address_hi; + + unsigned int data_lo; + + unsigned int data_hi; + +}; +#endif + + /*--------------------_SET_CONFIG_REG-------------------- */ #ifndef _PM4__SET_CONFIG_REG_DEFINED diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h index 7c8d9b3..08c7219 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h @@ -30,12 +30,10 @@ union PM4_MES_TYPE_3_HEADER { struct { uint32_t reserved1 : 8; /* < reserved */ uint32_t opcode : 8; /* < IT opcode */ - uint32_t count : 14;/* < Number of DWORDS - 1 in the - * information body - */ - uint32_t type : 2; /* < packet identifier - * It should be 3 for type 3 packets - */ + uint32_t count : 14;/* < number of DWORDs - 1 in the + information body. */ + uint32_t type : 2; /* < packet identifier. + It should be 3 for type 3 packets */ }; uint32_t u32All; }; @@ -126,10 +124,9 @@ struct pm4_mes_runlist { uint32_t ib_size:20; uint32_t chain:1; uint32_t offload_polling:1; - uint32_t reserved2:1; + uint32_t reserved3:1; uint32_t valid:1; - uint32_t process_cnt:4; - uint32_t reserved3:4; + uint32_t reserved4:8; } bitfields4; uint32_t ordinal4; }; @@ -144,8 +141,8 @@ struct pm4_mes_runlist { struct pm4_mes_map_process { union { - union PM4_MES_TYPE_3_HEADER header; /* header */ - uint32_t ordinal1; + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; }; union { @@ -156,48 +153,36 @@ struct pm4_mes_map_process { uint32_t process_quantum:7; } bitfields2; uint32_t ordinal2; - }; +}; union { struct { uint32_t page_table_base:28; - uint32_t reserved3:4; + uint32_t reserved2:4; } bitfields3; uint32_t ordinal3; }; - uint32_t reserved; - uint32_t sh_mem_bases; - uint32_t sh_mem_config; uint32_t sh_mem_ape1_base; uint32_t sh_mem_ape1_limit; - - uint32_t sh_hidden_private_base_vmid; - - uint32_t reserved2; - uint32_t reserved3; - + uint32_t sh_mem_config; uint32_t gds_addr_lo; uint32_t gds_addr_hi; union { struct { uint32_t num_gws:6; - uint32_t reserved4:2; + uint32_t reserved3:2; uint32_t num_oac:4; - uint32_t reserved5:4; + uint32_t reserved4:4; uint32_t gds_size:6; uint32_t num_queues:10; } bitfields10; uint32_t ordinal10; }; - uint32_t completion_signal_lo; - uint32_t completion_signal_hi; - }; - #endif /*--------------------MES_MAP_QUEUES--------------------*/ @@ -350,7 +335,7 @@ enum mes_unmap_queues_engine_sel_enum { engine_sel__mes_unmap_queues__sdmal = 3 }; -struct pm4_mes_unmap_queues { +struct PM4_MES_UNMAP_QUEUES { union { union PM4_MES_TYPE_3_HEADER header; /* header */ uint32_t ordinal1; @@ -410,101 +395,4 @@ struct pm4_mes_unmap_queues { }; #endif -#ifndef PM4_MEC_RELEASE_MEM_DEFINED 
-#define PM4_MEC_RELEASE_MEM_DEFINED -enum RELEASE_MEM_event_index_enum { - event_index___release_mem__end_of_pipe = 5, - event_index___release_mem__shader_done = 6 -}; - -enum RELEASE_MEM_cache_policy_enum { - cache_policy___release_mem__lru = 0, - cache_policy___release_mem__stream = 1, - cache_policy___release_mem__bypass = 2 -}; - -enum RELEASE_MEM_dst_sel_enum { - dst_sel___release_mem__memory_controller = 0, - dst_sel___release_mem__tc_l2 = 1, - dst_sel___release_mem__queue_write_pointer_register = 2, - dst_sel___release_mem__queue_write_pointer_poll_mask_bit = 3 -}; - -enum RELEASE_MEM_int_sel_enum { - int_sel___release_mem__none = 0, - int_sel___release_mem__send_interrupt_only = 1, - int_sel___release_mem__send_interrupt_after_write_confirm = 2, - int_sel___release_mem__send_data_after_write_confirm = 3 -}; - -enum RELEASE_MEM_data_sel_enum { - data_sel___release_mem__none = 0, - data_sel___release_mem__send_32_bit_low = 1, - data_sel___release_mem__send_64_bit_data = 2, - data_sel___release_mem__send_gpu_clock_counter = 3, - data_sel___release_mem__send_cp_perfcounter_hi_lo = 4, - data_sel___release_mem__store_gds_data_to_memory = 5 -}; - -struct pm4_mec_release_mem { - union { - union PM4_MES_TYPE_3_HEADER header; /*header */ - unsigned int ordinal1; - }; - - union { - struct { - unsigned int event_type:6; - unsigned int reserved1:2; - enum RELEASE_MEM_event_index_enum event_index:4; - unsigned int tcl1_vol_action_ena:1; - unsigned int tc_vol_action_ena:1; - unsigned int reserved2:1; - unsigned int tc_wb_action_ena:1; - unsigned int tcl1_action_ena:1; - unsigned int tc_action_ena:1; - unsigned int reserved3:6; - unsigned int atc:1; - enum RELEASE_MEM_cache_policy_enum cache_policy:2; - unsigned int reserved4:5; - } bitfields2; - unsigned int ordinal2; - }; - - union { - struct { - unsigned int reserved5:16; - enum RELEASE_MEM_dst_sel_enum dst_sel:2; - unsigned int reserved6:6; - enum RELEASE_MEM_int_sel_enum int_sel:3; - unsigned int reserved7:2; - enum RELEASE_MEM_data_sel_enum data_sel:3; - } bitfields3; - unsigned int ordinal3; - }; - - union { - struct { - unsigned int reserved8:2; - unsigned int address_lo_32b:30; - } bitfields4; - struct { - unsigned int reserved9:3; - unsigned int address_lo_64b:29; - } bitfields5; - unsigned int ordinal4; - }; - - unsigned int address_hi; - - unsigned int data_lo; - - unsigned int data_hi; -}; -#endif - -enum { - CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 -}; - #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h old mode 100755 new mode 100644 index 88fdfc9..4750cab --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -30,49 +30,13 @@ #include #include #include -#include #include -#include -#include -#include -#include -#include #include -#include -#include "amd_shared.h" - #define KFD_SYSFS_FILE_MODE 0444 -/* GPU ID hash width in bits */ -#define KFD_GPU_ID_HASH_WIDTH 16 - -/* Use upper bits of mmap offset to store KFD driver specific information. - * BITS[63:62] - Encode MMAP type - * BITS[61:46] - Encode gpu_id. To identify to which GPU the offset belongs to - * BITS[45:40] - Reserved. Not Used. - * BITS[39:0] - MMAP offset value. Used by TTM. - * - * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. 
Hence, these - * defines are w.r.t to PAGE_SIZE - */ -#define KFD_MMAP_TYPE_SHIFT (62 - PAGE_SHIFT) -#define KFD_MMAP_TYPE_MASK (0x3ULL << KFD_MMAP_TYPE_SHIFT) -#define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT) -#define KFD_MMAP_TYPE_EVENTS (0x2ULL << KFD_MMAP_TYPE_SHIFT) -#define KFD_MMAP_TYPE_MAP_BO (0x1ULL << KFD_MMAP_TYPE_SHIFT) -#define KFD_MMAP_TYPE_RESERVED_MEM (0x0ULL << KFD_MMAP_TYPE_SHIFT) - -#define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT) -#define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \ - << KFD_MMAP_GPU_ID_SHIFT) -#define KFD_MMAP_GPU_ID(gpu_id) ((((uint64_t)gpu_id) << KFD_MMAP_GPU_ID_SHIFT)\ - & KFD_MMAP_GPU_ID_MASK) -#define KFD_MMAP_GPU_ID_GET(offset) ((offset & KFD_MMAP_GPU_ID_MASK) \ - >> KFD_MMAP_GPU_ID_SHIFT) - -#define KFD_MMAP_OFFSET_VALUE_MASK (0xFFFFFFFFFFULL >> PAGE_SHIFT) -#define KFD_MMAP_OFFSET_VALUE_GET(offset) (offset & KFD_MMAP_OFFSET_VALUE_MASK) +#define KFD_MMAP_DOORBELL_MASK 0x8000000000000 +#define KFD_MMAP_EVENTS_MASK 0x4000000000000 /* * When working with cp scheduler we should assign the HIQ manually or via @@ -84,6 +48,8 @@ #define KFD_CIK_HIQ_PIPE 4 #define KFD_CIK_HIQ_QUEUE 0 +/* GPU ID hash width in bits */ +#define KFD_GPU_ID_HASH_WIDTH 16 /* Macro for allocating structures */ #define kfd_alloc_struct(ptr_to_struct) \ @@ -108,42 +74,12 @@ extern int max_num_of_queues_per_device; /* Kernel module parameter to specify the scheduling policy */ extern int sched_policy; -extern int cwsr_enable; - -/* - * Kernel module parameter to specify the maximum process - * number per HW scheduler - */ -extern int hws_max_conc_proc; - /* * Kernel module parameter to specify whether to send sigterm to HSA process on * unhandled exception */ extern int send_sigterm; -/* - * This kernel module is used to simulate large bar machine on non-large bar - * enabled machines. 
- */ -extern int debug_largebar; - -/* - * Ignore CRAT table during KFD initialization, can be used to work around - * broken CRAT tables on some AMD systems - */ -extern int ignore_crat; - -/* - * Set sh_mem_config.retry_disable on Vega10 - */ -extern int vega10_noretry; - -/* - * Enable privileged mode for all CP queues including user queues - */ -extern int priv_cp_queues; - /** * enum kfd_sched_policy * @@ -176,28 +112,26 @@ enum cache_policy { cache_policy_noncoherent }; -#define KFD_IS_SOC15(chip) ((chip) >= CHIP_VEGA10) +enum asic_family_type { + CHIP_KAVERI = 0, + CHIP_CARRIZO +}; struct kfd_event_interrupt_class { bool (*interrupt_isr)(struct kfd_dev *dev, - const uint32_t *ih_ring_entry, uint32_t *patched_ihre, - bool *patched_flag); + const uint32_t *ih_ring_entry); void (*interrupt_wq)(struct kfd_dev *dev, - const uint32_t *ih_ring_entry); + const uint32_t *ih_ring_entry); }; struct kfd_device_info { - enum amd_asic_type asic_family; + unsigned int asic_family; const struct kfd_event_interrupt_class *event_interrupt_class; unsigned int max_pasid_bits; unsigned int max_no_of_hqd; - unsigned int doorbell_size; size_t ih_ring_entry_size; uint8_t num_of_watch_points; uint16_t mqd_size_aligned; - bool is_need_iommu_device; - bool supports_cwsr; - bool needs_pci_atomics; }; struct kfd_mem_obj { @@ -205,13 +139,6 @@ struct kfd_mem_obj { uint32_t range_end; uint64_t gpu_addr; uint32_t *cpu_ptr; - void *gtt_mem; -}; - -struct kfd_vmid_info { - uint32_t first_vmid_kfd; - uint32_t last_vmid_kfd; - uint32_t vmid_num_kfd; }; struct kfd_dev { @@ -238,12 +165,11 @@ struct kfd_dev { */ struct kgd2kfd_shared_resources shared_resources; - struct kfd_vmid_info vm_info; const struct kfd2kgd_calls *kfd2kgd; struct mutex doorbell_mutex; - unsigned long doorbell_available_index[DIV_ROUND_UP( - KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, BITS_PER_LONG)]; + DECLARE_BITMAP(doorbell_available_index, + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); void *gtt_mem; uint64_t gtt_start_gpu_addr; @@ -253,17 +179,18 @@ struct kfd_dev { unsigned int gtt_sa_chunk_size; unsigned int gtt_sa_num_of_chunks; - /* QCM Device instance */ - struct device_queue_manager *dqm; - - bool init_complete; - /* Interrupts */ - struct kfifo ih_fifo; - struct workqueue_struct *ih_wq; + void *interrupt_ring; + size_t interrupt_ring_size; + atomic_t interrupt_ring_rptr; + atomic_t interrupt_ring_wptr; struct work_struct interrupt_work; spinlock_t interrupt_lock; + /* QCM Device instance */ + struct device_queue_manager *dqm; + + bool init_complete; /* * Interrupts of interest to KFD are copied * from the HW ring into a SW ring. @@ -271,32 +198,7 @@ struct kfd_dev { bool interrupts_active; /* Debug manager */ - struct kfd_dbgmgr *dbgmgr; - - /* MEC firmware version*/ - uint16_t mec_fw_version; - - /* Maximum process number mapped to HW scheduler */ - unsigned int max_proc_per_quantum; - - /* cwsr */ - bool cwsr_enabled; - struct page *cwsr_pages; - uint32_t cwsr_size; - uint32_t tma_offset; /*Offset for TMA from the start of cwsr_mem*/ - - /* IB usage */ - uint32_t ib_size; -}; - -struct kfd_ipc_obj; - -struct kfd_bo { - void *mem; - struct interval_tree_node it; - struct kfd_dev *dev; - struct list_head cb_data_head; - struct kfd_ipc_obj *kfd_ipc_obj; + struct kfd_dbgmgr *dbgmgr; }; /* KGD2KFD callbacks */ @@ -319,22 +221,27 @@ void kfd_chardev_exit(void); struct device *kfd_chardev(void); /** - * enum kfd_unmap_queues_filter + * enum kfd_preempt_type_filter * - * @KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: Preempts single queue. 
+ * @KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE: Preempts single queue. * - * @KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: Preempts all queues in the + * @KFD_PRERMPT_TYPE_FILTER_ALL_QUEUES: Preempts all queues in the * running queues list. * - * @KFD_UNMAP_QUEUES_FILTER_BY_PASID: Preempts queues that belongs to + * @KFD_PRERMPT_TYPE_FILTER_BY_PASID: Preempts queues that belongs to * specific process. * */ -enum kfd_unmap_queues_filter { - KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE, - KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, - KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, - KFD_UNMAP_QUEUES_FILTER_BY_PASID +enum kfd_preempt_type_filter { + KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE, + KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES, + KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES, + KFD_PREEMPT_TYPE_FILTER_BY_PASID +}; + +enum kfd_preempt_type { + KFD_PREEMPT_TYPE_WAVEFRONT, + KFD_PREEMPT_TYPE_WAVEFRONT_RESET }; /** @@ -360,11 +267,6 @@ enum kfd_queue_format { KFD_QUEUE_FORMAT_AQL }; -enum KFD_QUEUE_PRIORITY { - KFD_QUEUE_PRIORITY_MINIMUM = 0, - KFD_QUEUE_PRIORITY_MAXIMUM = 15 -}; - /** * struct queue_properties * @@ -392,13 +294,13 @@ enum KFD_QUEUE_PRIORITY { * @write_ptr: Defines the number of dwords written to the ring buffer. * * @doorbell_ptr: This field aim is to notify the H/W of new packet written to - * the queue ring buffer. This field should be similar to write_ptr and the - * user should update this field after he updated the write_ptr. + * the queue ring buffer. This field should be similar to write_ptr and the user + * should update this field after he updated the write_ptr. * * @doorbell_off: The doorbell offset in the doorbell pci-bar. * - * @is_interop: Defines if this is a interop queue. Interop queue means that - * the queue can access both graphics and compute resources. + * @is_interop: Defines if this is a interop queue. Interop queue means that the + * queue can access both graphics and compute resources. * * @is_active: Defines if the queue is active or not. * @@ -419,10 +321,9 @@ struct queue_properties { uint32_t queue_percent; uint32_t *read_ptr; uint32_t *write_ptr; - void __iomem *doorbell_ptr; + uint32_t __iomem *doorbell_ptr; uint32_t doorbell_off; bool is_interop; - bool is_evicted; /* true -> queue is evicted */ bool is_active; /* Not relevant for user mode queues in cp scheduling */ unsigned int vmid; @@ -435,12 +336,6 @@ struct queue_properties { uint32_t eop_ring_buffer_size; uint64_t ctx_save_restore_area_address; uint32_t ctx_save_restore_area_size; - uint32_t ctl_stack_size; - uint64_t tba_addr; - uint64_t tma_addr; - /* Relevant for CU */ - uint32_t cu_mask_count; /* Must be a multiple of 32 */ - uint32_t *cu_mask; }; /** @@ -457,10 +352,9 @@ struct queue_properties { * @properties: The queue properties. * * @mec: Used only in no cp scheduling mode and identifies to micro engine id - * that the queue should be execute on. + * that the queue should be execute on. * - * @pipe: Used only in no cp scheduling mode and identifies the queue's pipe - * id. + * @pipe: Used only in no cp scheduling mode and identifies the queue's pipe id. * * @queue: Used only in no cp scheduliong mode and identifies the queue's slot. 
* @@ -485,7 +379,6 @@ struct queue { uint32_t queue; unsigned int sdma_id; - unsigned int doorbell_id; struct kfd_process *process; struct kfd_dev *device; @@ -502,19 +395,6 @@ enum KFD_MQD_TYPE { KFD_MQD_TYPE_MAX }; -enum KFD_PIPE_PRIORITY { - KFD_PIPE_PRIORITY_CS_LOW = 0, - KFD_PIPE_PRIORITY_CS_MEDIUM, - KFD_PIPE_PRIORITY_CS_HIGH -}; - -enum KFD_SPI_PRIORITY { - KFD_SPI_PRIORITY_EXTRA_LOW = 0, - KFD_SPI_PRIORITY_LOW, - KFD_SPI_PRIORITY_MEDIUM, - KFD_SPI_PRIORITY_HIGH -}; - struct scheduling_resources { unsigned int vmid_mask; enum kfd_queue_type type; @@ -528,6 +408,7 @@ struct scheduling_resources { struct process_queue_manager { /* data */ struct kfd_process *process; + unsigned int num_concurrent_processes; struct list_head queues; unsigned long *queue_slot_bitmap; }; @@ -543,13 +424,6 @@ struct qcm_process_device { unsigned int queue_count; unsigned int vmid; bool is_debug; - unsigned int evicted; /* eviction counter, 0=active */ - - /* This flag tells if we should reset all wavefronts on - * process termination - */ - bool reset_wavefronts; - /* * All the memory management data should be here too */ @@ -562,55 +436,6 @@ struct qcm_process_device { uint32_t gds_size; uint32_t num_gws; uint32_t num_oac; - uint32_t sh_hidden_private_base; - - /*cwsr memory*/ - uint64_t cwsr_base; - uint64_t tba_addr; - uint64_t tma_addr; - void *cwsr_kaddr; - struct page *cwsr_pages; - - /* IB memory */ - uint64_t ib_base; /* ib_base+ib_size must be below cwsr_base */ - void *ib_kaddr; - - /*doorbell resources per process per device*/ - unsigned long *doorbell_bitmap; -}; - -/* KFD Memory Eviction */ -struct kfd_eviction_work { - struct delayed_work dwork; - struct dma_fence *quiesce_fence; -}; - -/* Approx. wait time before attempting to restore evicted BOs */ -#define PROCESS_RESTORE_TIME_MS 100 -/* Approx. back off time if restore fails due to lack of memory */ -#define PROCESS_BACK_OFF_TIME_MS 100 -/* Approx. time before evicting the process again */ -#define PROCESS_ACTIVE_TIME_MS 10 - -void kfd_evict_bo_worker(struct work_struct *work); -void kfd_restore_bo_worker(struct work_struct *work); -int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, - struct dma_fence *fence); -int quiesce_process_mm(struct kfd_process *p); - - -/* 8 byte handle containing GPU ID in the most significant 4 bytes and - * idr_handle in the least significant 4 bytes - */ -#define MAKE_HANDLE(gpu_id, idr_handle) \ - (((uint64_t)(gpu_id) << 32) + idr_handle) -#define GET_GPU_ID(handle) (handle >> 32) -#define GET_IDR_HANDLE(handle) (handle & 0xFFFFFFFF) - -enum kfd_pdd_bound { - PDD_UNBOUND = 0, - PDD_BOUND, - PDD_BOUND_SUSPENDED, }; /* Data that is per-process-per device. */ @@ -624,8 +449,6 @@ struct kfd_process_device { /* The device that owns this data. */ struct kfd_dev *dev; - /* The process that owns this kfd_process_device. */ - struct kfd_process *process; /* per-process-per device QCM data structure */ struct qcm_process_device qpd; @@ -637,27 +460,14 @@ struct kfd_process_device { uint64_t gpuvm_limit; uint64_t scratch_base; uint64_t scratch_limit; - uint64_t dgpu_base; - uint64_t dgpu_limit; - - uint64_t sh_hidden_private_base_vmid; - - /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) - */ - enum kfd_pdd_bound bound; - /* VM context for GPUVM allocations */ - void *vm; + /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */ + bool bound; - /* GPUVM allocations storage */ - struct idr alloc_idr; - - /* Flag used to tell the pdd has dequeued from the dqm. 
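The MAKE_HANDLE/GET_GPU_ID/GET_IDR_HANDLE macros removed in this hunk pack a GPU ID and an IDR handle into a single 64-bit value. A standalone round-trip check of that packing (userspace sketch, not from the driver):

#include <assert.h>
#include <stdint.h>

#define MAKE_HANDLE(gpu_id, idr_handle) \
	(((uint64_t)(gpu_id) << 32) + idr_handle)
#define GET_GPU_ID(handle) (handle >> 32)
#define GET_IDR_HANDLE(handle) (handle & 0xFFFFFFFF)

int main(void)
{
	uint64_t handle = MAKE_HANDLE(0x1002u, 42);

	assert(GET_GPU_ID(handle) == 0x1002u);
	assert(GET_IDR_HANDLE(handle) == 42);
	return 0;
}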
- * This is used to prevent dev->dqm->ops.process_termination() from - * being called twice when it is already called in IOMMU callback - * function. + /* This flag tells if we should reset all + * wavefronts on process termination */ - bool already_dequeued; + bool reset_wavefronts; }; #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) @@ -670,15 +480,7 @@ struct kfd_process { */ struct hlist_node kfd_processes; - /* - * Opaque pointer to mm_struct. We don't hold a reference to - * it so it should never be dereferenced from here. This is - * only used for looking up processes by their mm. - */ - void *mm; - - struct kref ref; - struct work_struct release_work; + struct mm_struct *mm; struct mutex mutex; @@ -686,8 +488,6 @@ struct kfd_process { * In any process, the thread that started main() is the lead * thread and outlives the rest. * It is here because amd_iommu_bind_pasid wants a task_struct. - * It can also be used for safely getting a reference to the - * mm_struct of the process. */ struct task_struct *lead_thread; @@ -707,8 +507,11 @@ struct kfd_process { struct process_queue_manager pqm; - unsigned long allocated_queue_bitmap[DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, - BITS_PER_LONG)]; + /* The process's queues. */ + size_t queue_array_size; + + /* Size is queue_array_size, up to MAX_PROCESS_QUEUES. */ + struct kfd_queue **queues; /*Is the user space process 32 bit?*/ bool is_32bit_user_mode; @@ -717,29 +520,10 @@ struct kfd_process { struct mutex event_mutex; /* All events in process hashed by ID, linked on kfd_event.events. */ DECLARE_HASHTABLE(events, 4); - /* struct slot_page_header.event_pages */ - struct list_head signal_event_pages; + struct list_head signal_event_pages; /* struct slot_page_header. + event_pages */ u32 next_nonsignal_event_id; size_t signal_event_count; - bool signal_event_limit_reached; - - struct rb_root_cached bo_interval_tree; - - /* Information used for memory eviction */ - void *process_info; - /* Eviction fence that is attached to all the BOs of this process. The - * fence will be triggered during eviction and new one will be created - * during restore - */ - struct dma_fence *ef; - - /* Work items for evicting and restoring BOs */ - struct kfd_eviction_work eviction_work; - struct delayed_work restore_work; - /* Approx. 
the last timestamp (in jiffies) when the process was - * restored after an eviction - */ - unsigned long last_restore_timestamp; }; /** @@ -762,55 +546,21 @@ struct amdkfd_ioctl_desc { void kfd_process_create_wq(void); void kfd_process_destroy_wq(void); -struct kfd_process *kfd_create_process(struct file *filep); -struct kfd_process *kfd_get_process(const struct task_struct *task); +struct kfd_process *kfd_create_process(const struct task_struct *); +struct kfd_process *kfd_get_process(const struct task_struct *); struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); -struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm); -void kfd_unref_process(struct kfd_process *p); -void kfd_suspend_all_processes(void); -int kfd_resume_all_processes(void); struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, struct kfd_process *p); -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) -int kfd_bind_processes_to_device(struct kfd_dev *dev); -void kfd_unbind_processes_from_device(struct kfd_dev *dev); -#endif -void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid); +void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid); struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, struct kfd_process *p); struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, struct kfd_process *p); -int kfd_reserved_mem_mmap(struct kfd_process *process, - struct vm_area_struct *vma); - -/* KFD process API for creating and translating handles */ -int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, - void *mem, uint64_t start, - uint64_t length, - struct kfd_ipc_obj *ipc_obj); -void *kfd_process_device_translate_handle(struct kfd_process_device *p, - int handle); -struct kfd_bo *kfd_process_device_find_bo(struct kfd_process_device *pdd, - int handle); -void *kfd_process_find_bo_from_interval(struct kfd_process *p, - uint64_t start_addr, - uint64_t last_addr); -void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, - int handle); - -void run_rdma_free_callback(struct kfd_bo *buf_obj); -struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid); - -/* kfd dgpu memory */ -int kfd_unmap_memory_from_gpu(void *mem, struct kfd_process_device *pdd); - /* Process device data iterator */ -struct kfd_process_device *kfd_get_first_process_device_data( - struct kfd_process *p); -struct kfd_process_device *kfd_get_next_process_device_data( - struct kfd_process *p, +struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p); +struct kfd_process_device *kfd_get_next_process_device_data(struct kfd_process *p, struct kfd_process_device *pdd); bool kfd_has_process_device_data(struct kfd_process *p); @@ -823,20 +573,16 @@ unsigned int kfd_pasid_alloc(void); void kfd_pasid_free(unsigned int pasid); /* Doorbells */ -size_t kfd_doorbell_process_slice(struct kfd_dev *kfd); -int kfd_doorbell_init(struct kfd_dev *kfd); -void kfd_doorbell_fini(struct kfd_dev *kfd); -int kfd_doorbell_mmap(struct kfd_dev *kfd, struct kfd_process *process, - struct vm_area_struct *vma); -void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, +void kfd_doorbell_init(struct kfd_dev *kfd); +int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma); +u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, unsigned int *doorbell_off); void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr); u32 
read_kernel_doorbell(u32 __iomem *db); -void write_kernel_doorbell(void __iomem *db, u32 value); -void write_kernel_doorbell64(void __iomem *db, u64 value); -unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd, +void write_kernel_doorbell(u32 __iomem *db, u32 value); +unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, struct kfd_process *process, - unsigned int doorbell_id); + unsigned int queue_id); /* GTT Sub-Allocator */ @@ -852,22 +598,16 @@ int kfd_topology_init(void); void kfd_topology_shutdown(void); int kfd_topology_add_device(struct kfd_dev *gpu); int kfd_topology_remove_device(struct kfd_dev *gpu); -struct kfd_topology_device *kfd_topology_device_by_proximity_domain( - uint32_t proximity_domain); struct kfd_dev *kfd_device_by_id(uint32_t gpu_id); struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev); -struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd); -int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev); -int kfd_numa_node_to_apic_id(int numa_node_id); +struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx); /* Interrupts */ int kfd_interrupt_init(struct kfd_dev *dev); void kfd_interrupt_exit(struct kfd_dev *dev); void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry); bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry); -bool interrupt_is_wanted(struct kfd_dev *dev, - const uint32_t *ih_ring_entry, - uint32_t *patched_ihre, bool *flag); +bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry); /* Power Management */ void kgd2kfd_suspend(struct kfd_dev *kfd); @@ -875,10 +615,10 @@ int kgd2kfd_resume(struct kfd_dev *kfd); /* amdkfd Apertures */ int kfd_init_apertures(struct kfd_process *process); -int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd, - uint64_t base, uint64_t limit); /* Queue Context Management */ +struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd); + int init_queue(struct queue **q, const struct queue_properties *properties); void uninit_queue(struct queue *q); void print_queue_properties(struct queue_properties *q); @@ -888,20 +628,13 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, struct kfd_dev *dev); struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, struct kfd_dev *dev); -struct mqd_manager *mqd_manager_init_cik_hawaii(enum KFD_MQD_TYPE type, - struct kfd_dev *dev); struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, struct kfd_dev *dev); -struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, - struct kfd_dev *dev); -struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, - struct kfd_dev *dev); struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); void device_queue_manager_uninit(struct device_queue_manager *dqm); struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, enum kfd_queue_type type); void kernel_queue_uninit(struct kernel_queue *kq); -int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid); /* Process Queue Manager */ struct process_queue_node { @@ -910,36 +643,32 @@ struct process_queue_node { struct list_head process_queue_list; }; -void kfd_process_dequeue_from_device(struct kfd_process_device *pdd); -void kfd_process_dequeue_from_all_devices(struct kfd_process *p); int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p); void pqm_uninit(struct process_queue_manager *pqm); int pqm_create_queue(struct process_queue_manager *pqm, struct kfd_dev *dev, struct file *f, struct 
queue_properties *properties, + unsigned int flags, + enum kfd_queue_type type, unsigned int *qid); int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid); int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, struct queue_properties *p); -int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, - struct queue_properties *p); struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm, unsigned int qid); -int pqm_get_wave_state(struct process_queue_manager *pqm, - unsigned int qid, - void __user *ctl_stack, - u32 *ctl_stack_used_size, - u32 *save_area_used_size); -int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm); -int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm); + +int amdkfd_fence_wait_timeout(unsigned int *fence_addr, + unsigned int fence_value, + unsigned long timeout); /* Packet Manager */ +#define KFD_HIQ_TIMEOUT (500) + #define KFD_FENCE_COMPLETED (100) #define KFD_FENCE_INIT (10) - -struct packet_manager_func; +#define KFD_UNMAP_LATENCY (150) struct packet_manager { struct device_queue_manager *dqm; @@ -947,42 +676,9 @@ struct packet_manager { struct mutex lock; bool allocated; struct kfd_mem_obj *ib_buffer_obj; - unsigned int ib_size_bytes; - - struct packet_manager_funcs *pmf; -}; - -struct packet_manager_funcs { - /* Support different firmware versions for PM4 packets */ - int (*map_process)(struct packet_manager *pm, uint32_t *buffer, - struct qcm_process_device *qpd); - int (*runlist)(struct packet_manager *pm, uint32_t *buffer, - uint64_t ib, size_t ib_size_in_dwords, bool chain); - int (*set_resources)(struct packet_manager *pm, uint32_t *buffer, - struct scheduling_resources *res); - int (*map_queues)(struct packet_manager *pm, uint32_t *buffer, - struct queue *q, bool is_static); - int (*unmap_queues)(struct packet_manager *pm, uint32_t *buffer, - enum kfd_queue_type type, - enum kfd_unmap_queues_filter mode, - uint32_t filter_param, bool reset, - unsigned int sdma_engine); - int (*query_status)(struct packet_manager *pm, uint32_t *buffer, - uint64_t fence_address, uint32_t fence_value); - uint32_t (*release_mem)(uint64_t gpu_addr, uint32_t *buffer); - - uint32_t (*get_map_process_packet_size)(void); - uint32_t (*get_runlist_packet_size)(void); - uint32_t (*get_set_resources_packet_size)(void); - uint32_t (*get_map_queues_packet_size)(void); - uint32_t (*get_unmap_queues_packet_size)(void); - uint32_t (*get_query_status_packet_size)(void); - uint32_t (*get_release_mem_packet_size)(void); - }; -int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, - uint16_t fw_ver); +int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); void pm_uninit(struct packet_manager *pm); int pm_send_set_resources(struct packet_manager *pm, struct scheduling_resources *res); @@ -991,55 +687,18 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, uint32_t fence_value); int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, - enum kfd_unmap_queues_filter mode, + enum kfd_preempt_type_filter mode, uint32_t filter_param, bool reset, unsigned int sdma_engine); void pm_release_ib(struct packet_manager *pm); -/* Following PM funcs can be shared among CIK and VI */ -unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size); -int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer, - uint64_t ib, size_t ib_size_in_dwords, bool chain); -int pm_map_queues_vi(struct packet_manager *pm, uint32_t 
*buffer, - struct queue *q, bool is_static); -int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, - struct scheduling_resources *res); -int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, - enum kfd_queue_type type, - enum kfd_unmap_queues_filter filter, - uint32_t filter_param, bool reset, - unsigned int sdma_engine); -int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer, - uint64_t fence_address, uint32_t fence_value); -uint32_t pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer); - -uint32_t pm_get_map_process_packet_size_vi(void); -uint32_t pm_get_runlist_packet_size_vi(void); -uint32_t pm_get_set_resources_packet_size_vi(void); -uint32_t pm_get_map_queues_packet_size_vi(void); -uint32_t pm_get_unmap_queues_packet_size_vi(void); -uint32_t pm_get_query_status_packet_size_vi(void); -uint32_t pm_get_release_mem_packet_size_vi(void); - - -void kfd_pm_func_init_vi(struct packet_manager *pm, uint16_t fw_ver); -void kfd_pm_func_init_cik(struct packet_manager *pm, uint16_t fw_ver); - -void kfd_pm_func_init_v9(struct packet_manager *pm, uint16_t fw_ver); - - uint64_t kfd_get_number_elems(struct kfd_dev *kfd); phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, struct kfd_process *process); -int amdkfd_fence_wait_timeout(unsigned int *fence_addr, - unsigned int fence_value, - unsigned long timeout_ms); /* Events */ extern const struct kfd_event_interrupt_class event_interrupt_class_cik; -extern const struct kfd_event_interrupt_class event_interrupt_class_v9; - extern const struct kfd_device_global_init_class device_global_init_class_cik; enum kfd_event_wait_result { @@ -1057,55 +716,18 @@ int kfd_wait_on_events(struct kfd_process *p, enum kfd_event_wait_result *wait_result); void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, uint32_t valid_id_bits); -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, unsigned long address, bool is_write_requested, bool is_execute_requested); -#endif void kfd_signal_hw_exception_event(unsigned int pasid); int kfd_set_event(struct kfd_process *p, uint32_t event_id); int kfd_reset_event(struct kfd_process *p, uint32_t event_id); int kfd_event_create(struct file *devkfd, struct kfd_process *p, uint32_t event_type, bool auto_reset, uint32_t node_id, uint32_t *event_id, uint32_t *event_trigger_data, - uint64_t *event_page_offset, uint32_t *event_slot_index, - void *kern_addr); + uint64_t *event_page_offset, uint32_t *event_slot_index); int kfd_event_destroy(struct kfd_process *p, uint32_t event_id); -void kfd_free_signal_page_dgpu(struct kfd_process *p, uint64_t handle); - -void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, - struct kfd_vm_fault_info *info); - -void kfd_flush_tlb(struct kfd_dev *dev, uint32_t pasid); int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); -#define KFD_SCRATCH_KV_FW_VER 413 - -/* PeerDirect support */ -void kfd_init_peer_direct(void); -void kfd_close_peer_direct(void); - -/* IPC Support */ -int kfd_ipc_init(void); - -/* Debugfs */ -#if defined(CONFIG_DEBUG_FS) - -void kfd_debugfs_init(void); -void kfd_debugfs_fini(void); -int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data); -int pqm_debugfs_mqds(struct seq_file *m, void *data); -int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data); -int device_queue_manager_debugfs_hqds(struct seq_file *m, void *data); -int kfd_debugfs_rls_by_device(struct seq_file *m, 
void *data); -int pm_debugfs_runlist(struct seq_file *m, void *data); - -#else - -static inline void kfd_debugfs_init(void) {} -static inline void kfd_debugfs_fini(void) {} - -#endif - #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index c798fa3..035bbc9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -24,16 +24,10 @@ #include #include #include -#include #include -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) #include -#endif #include #include -#include -#include -#include "kfd_ipc.h" struct mm_struct; @@ -41,6 +35,13 @@ struct mm_struct; #include "kfd_dbgmgr.h" /* + * Initial size for the array of queues. + * The allocated size is doubled each time + * it is exceeded up to MAX_PROCESS_QUEUES. + */ +#define INITIAL_QUEUE_ARRAY_SIZE 16 + +/* * List of struct kfd_process (field kfd_process). * Unique/indexed by mm_struct* */ @@ -52,16 +53,13 @@ DEFINE_STATIC_SRCU(kfd_processes_srcu); static struct workqueue_struct *kfd_process_wq; -#define MIN_IDR_ID 1 -#define MAX_IDR_ID 0 /*0 - for unlimited*/ - -static struct kfd_process *find_process(const struct task_struct *thread, - bool ref); -static void kfd_process_ref_release(struct kref *ref); -static struct kfd_process *create_process(const struct task_struct *thread, - struct file *filep); -static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep); +struct kfd_process_release_work { + struct work_struct kfd_work; + struct kfd_process *p; +}; +static struct kfd_process *find_process(const struct task_struct *thread); +static struct kfd_process *create_process(const struct task_struct *thread); void kfd_process_create_wq(void) { @@ -77,144 +75,22 @@ void kfd_process_destroy_wq(void) } } -static void kfd_process_free_gpuvm(struct kgd_mem *mem, - struct kfd_process_device *pdd) -{ - kfd_unmap_memory_from_gpu(mem, pdd); - pdd->dev->kfd2kgd->free_memory_of_gpu(pdd->dev->kgd, mem, pdd->vm); -} - -/* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process - * This function should be only called right after the process - * is created and when kfd_processes_mutex is still being held - * to avoid concurrency. Because of that exclusiveness, we do - * not need to take p->mutex. - */ -static int kfd_process_alloc_gpuvm(struct kfd_process *p, - struct kfd_dev *kdev, uint64_t gpu_va, uint32_t size, - void **kptr, struct kfd_process_device *pdd, uint32_t flags) -{ - int err; - void *mem = NULL; - int handle; - - err = kdev->kfd2kgd->alloc_memory_of_gpu(kdev->kgd, gpu_va, size, - pdd->vm, - (struct kgd_mem **)&mem, NULL, flags); - if (err) - goto err_alloc_mem; - - err = kdev->kfd2kgd->map_memory_to_gpu( - kdev->kgd, (struct kgd_mem *)mem, pdd->vm); - if (err) - goto err_map_mem; - - err = kdev->kfd2kgd->sync_memory(kdev->kgd, (struct kgd_mem *) mem, - true); - if (err) { - pr_debug("Sync memory failed, wait interrupted by user signal\n"); - goto sync_memory_failed; - } - - kfd_flush_tlb(kdev, p->pasid); - - /* Create an obj handle so kfd_process_device_remove_obj_handle - * will take care of the bo removal when the process finishes. - * We do not need to take p->mutex, because the process is just - * created and the ioctls have not had the chance to run. 
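kfd_processes_table and kfd_processes_srcu, kept by this file in both versions, implement a lockless lookup of a process by its mm_struct pointer. A minimal sketch of that SRCU-protected hash-lookup pattern, with stand-in names for the table, entry type and key:

#include <linux/hashtable.h>
#include <linux/srcu.h>
#include <linux/types.h>

#define EX_HASH_BITS 8

static DEFINE_HASHTABLE(ex_table, EX_HASH_BITS);
DEFINE_STATIC_SRCU(ex_srcu);

struct ex_entry {
	struct hlist_node node;
	const void *key;
};

static struct ex_entry *ex_lookup(const void *key)
{
	struct ex_entry *entry, *found = NULL;
	int idx = srcu_read_lock(&ex_srcu);

	/* Readers walk the RCU-protected bucket without a lock; writers use
	 * hash_add_rcu()/hash_del_rcu() and wait for an SRCU grace period. */
	hash_for_each_possible_rcu(ex_table, entry, node, (uintptr_t)key) {
		if (entry->key == key) {
			found = entry;
			break;
		}
	}

	srcu_read_unlock(&ex_srcu, idx);
	return found;
}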
- */ - handle = kfd_process_device_create_obj_handle( - pdd, mem, gpu_va, size, NULL); - - if (handle < 0) { - err = handle; - goto free_gpuvm; - } - - if (kptr) { - err = kdev->kfd2kgd->map_gtt_bo_to_kernel(kdev->kgd, - (struct kgd_mem *)mem, kptr); - if (err) { - pr_debug("Map GTT BO to kernel failed\n"); - goto free_obj_handle; - } - } - - return err; - -free_obj_handle: - kfd_process_device_remove_obj_handle(pdd, handle); -free_gpuvm: -sync_memory_failed: - kfd_process_free_gpuvm(mem, pdd); - return err; - -err_map_mem: - kdev->kfd2kgd->free_memory_of_gpu(kdev->kgd, mem, pdd->vm); -err_alloc_mem: - *kptr = NULL; - return err; -} - -/* kfd_process_reserve_ib_mem - Reserve memory inside the process for IB usage - * The memory reserved is for KFD to submit IB to AMDGPU from kernel. - * If the memory is reserved successfully, ib_kaddr_assigned will have - * the CPU/kernel address. Check ib_kaddr_assigned before accessing the - * memory. - */ -static int kfd_process_reserve_ib_mem(struct kfd_process *p) -{ - int ret = 0; - struct kfd_process_device *temp, *pdd = NULL; - struct kfd_dev *kdev = NULL; - struct qcm_process_device *qpd = NULL; - void *kaddr; - uint32_t flags = ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_NONPAGED | - ALLOC_MEM_FLAGS_NO_SUBSTITUTE | - ALLOC_MEM_FLAGS_EXECUTE_ACCESS; - - list_for_each_entry_safe(pdd, temp, &p->per_device_data, - per_device_list) { - kdev = pdd->dev; - qpd = &pdd->qpd; - if (!kdev->ib_size || qpd->ib_kaddr) - continue; - - if (qpd->ib_base) { /* is dGPU */ - ret = kfd_process_alloc_gpuvm(p, kdev, - qpd->ib_base, kdev->ib_size, - &kaddr, pdd, flags); - if (!ret) - qpd->ib_kaddr = kaddr; - else - /* In case of error, the kfd_bos for some pdds - * which are already allocated successfully - * will be freed in upper level function - * i.e. create_process(). - */ - return ret; - } else { - /* FIXME: Support APU */ - continue; - } - } - - return 0; -} - -struct kfd_process *kfd_create_process(struct file *filep) +struct kfd_process *kfd_create_process(const struct task_struct *thread) { struct kfd_process *process; - struct task_struct *thread = current; + BUG_ON(!kfd_process_wq); - if (!thread->mm) + if (thread->mm == NULL) return ERR_PTR(-EINVAL); /* Only the pthreads threading model is supported. */ if (thread->group_leader->mm != thread->mm) return ERR_PTR(-EINVAL); + /* Take mmap_sem because we call __mmu_notifier_register inside */ + down_write(&thread->mm->mmap_sem); + /* * take kfd processes mutex before starting of process creation * so there won't be a case where two threads of the same process @@ -223,14 +99,17 @@ struct kfd_process *kfd_create_process(struct file *filep) mutex_lock(&kfd_processes_mutex); /* A prior open of /dev/kfd could have already created the process. */ - process = find_process(thread, false); + process = find_process(thread); if (process) - pr_debug("Process already found\n"); - else - process = create_process(thread, filep); + pr_debug("kfd: process already found\n"); + + if (!process) + process = create_process(thread); mutex_unlock(&kfd_processes_mutex); + up_write(&thread->mm->mmap_sem); + return process; } @@ -238,14 +117,14 @@ struct kfd_process *kfd_get_process(const struct task_struct *thread) { struct kfd_process *process; - if (!thread->mm) + if (thread->mm == NULL) return ERR_PTR(-EINVAL); /* Only the pthreads threading model is supported. 
*/ if (thread->group_leader->mm != thread->mm) return ERR_PTR(-EINVAL); - process = find_process(thread, false); + process = find_process(thread); return process; } @@ -262,158 +141,81 @@ static struct kfd_process *find_process_by_mm(const struct mm_struct *mm) return NULL; } -static struct kfd_process *find_process(const struct task_struct *thread, - bool ref) +static struct kfd_process *find_process(const struct task_struct *thread) { struct kfd_process *p; int idx; idx = srcu_read_lock(&kfd_processes_srcu); p = find_process_by_mm(thread->mm); - if (p && ref) - kref_get(&p->ref); srcu_read_unlock(&kfd_processes_srcu, idx); return p; } -void kfd_unref_process(struct kfd_process *p) -{ - kref_put(&p->ref, kfd_process_ref_release); -} - -/* This increments the process->ref counter. */ -struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid) +static void kfd_process_wq_release(struct work_struct *work) { - struct task_struct *task = NULL; - struct kfd_process *p = NULL; - - if (!pid) - task = current; - else - task = get_pid_task(pid, PIDTYPE_PID); + struct kfd_process_release_work *my_work; + struct kfd_process_device *pdd, *temp; + struct kfd_process *p; - if (task) - p = find_process(task, true); + my_work = (struct kfd_process_release_work *) work; - return p; -} + p = my_work->p; -static void kfd_process_free_outstanding_kfd_bos(struct kfd_process *p) -{ - struct kfd_process_device *pdd, *peer_pdd; - struct kfd_bo *buf_obj; - int id; - - list_for_each_entry(pdd, &p->per_device_data, per_device_list) { - /* - * Remove all handles from idr and release appropriate - * local memory object - */ - idr_for_each_entry(&pdd->alloc_idr, buf_obj, id) { - list_for_each_entry(peer_pdd, &p->per_device_data, - per_device_list) { - peer_pdd->dev->kfd2kgd->unmap_memory_to_gpu( - peer_pdd->dev->kgd, - buf_obj->mem, peer_pdd->vm); - } - - run_rdma_free_callback(buf_obj); - pdd->dev->kfd2kgd->free_memory_of_gpu( - pdd->dev->kgd, buf_obj->mem, pdd->vm); - kfd_process_device_remove_obj_handle(pdd, id); - } - } -} + pr_debug("Releasing process (pasid %d) in workqueue\n", + p->pasid); -static void kfd_process_destroy_pdds(struct kfd_process *p) -{ - struct kfd_process_device *pdd, *temp; + mutex_lock(&p->mutex); list_for_each_entry_safe(pdd, temp, &p->per_device_data, - per_device_list) { - kfd_flush_tlb(pdd->dev, p->pasid); - /* Destroy the GPUVM VM context */ - if (pdd->vm) { - dma_fence_put(p->ef); - pdd->dev->kfd2kgd->destroy_process_vm( - pdd->dev->kgd, pdd->vm); - } - list_del(&pdd->per_device_list); + per_device_list) { + pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", + pdd->dev->id, p->pasid); - if (pdd->qpd.cwsr_pages) { - kunmap(pdd->qpd.cwsr_pages); - __free_pages(pdd->qpd.cwsr_pages, - get_order(pdd->dev->cwsr_size)); - } + if (pdd->reset_wavefronts) + dbgdev_wave_reset_wavefronts(pdd->dev, p); - kfree(pdd->qpd.doorbell_bitmap); - idr_destroy(&pdd->alloc_idr); + amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); + list_del(&pdd->per_device_list); kfree(pdd); } -} - -/* No process locking is needed in this function, because the process - * is not findable any more. We must assume that no other thread is - * using it any more, otherwise we couldn't safely free the process - * structure in the end. 
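The reference counting removed around here is the usual kref pattern: the final kfd_unref_process() drops the last reference, and the release callback defers teardown to a workqueue. A generic sketch of that pattern with stand-in names (the real code uses struct kfd_process and kfd_process_wq):

#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct ex_object {
	struct kref ref;
	struct work_struct release_work;
};

static void ex_init(struct ex_object *obj)
{
	kref_init(&obj->ref);	/* refcount starts at 1 */
}

static void ex_release_worker(struct work_struct *work)
{
	struct ex_object *obj = container_of(work, struct ex_object,
					     release_work);

	kfree(obj);		/* heavy, sleepable teardown belongs here */
}

static void ex_ref_release(struct kref *ref)
{
	struct ex_object *obj = container_of(ref, struct ex_object, ref);

	/* The final kref_put() may run in a context that must not sleep,
	 * so defer the actual free to process context. */
	INIT_WORK(&obj->release_work, ex_release_worker);
	queue_work(system_wq, &obj->release_work);
}

static void ex_put(struct ex_object *obj)
{
	kref_put(&obj->ref, ex_ref_release);
}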
- */ -static void kfd_process_wq_release(struct work_struct *work) -{ - struct kfd_process *p = container_of(work, struct kfd_process, - release_work); -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - struct kfd_process_device *pdd; - - pr_debug("Releasing process (pasid %d)\n", - p->pasid); - - list_for_each_entry(pdd, &p->per_device_data, per_device_list) { - pr_debug("Releasing pdd (topology id %d) for process (pasid %d)\n", - pdd->dev->id, p->pasid); - - if (pdd->dev->device_info->is_need_iommu_device) { - if (pdd->bound == PDD_BOUND) { - amd_iommu_unbind_pasid(pdd->dev->pdev, - p->pasid); - pdd->bound = PDD_UNBOUND; - } - } - } -#endif - - kfd_process_free_outstanding_kfd_bos(p); - - kfd_process_destroy_pdds(p); kfd_event_free_process(p); kfd_pasid_free(p->pasid); + mutex_unlock(&p->mutex); + mutex_destroy(&p->mutex); - put_task_struct(p->lead_thread); + kfree(p->queues); kfree(p); + + kfree(work); } -static void kfd_process_ref_release(struct kref *ref) +static void kfd_process_destroy_delayed(struct rcu_head *rcu) { - struct kfd_process *p = container_of(ref, struct kfd_process, ref); + struct kfd_process_release_work *work; + struct kfd_process *p; - if (WARN_ON(!kfd_process_wq)) - return; + BUG_ON(!kfd_process_wq); - INIT_WORK(&p->release_work, kfd_process_wq_release); - queue_work(kfd_process_wq, &p->release_work); -} + p = container_of(rcu, struct kfd_process, rcu); + BUG_ON(atomic_read(&p->mm->mm_count) <= 0); -static void kfd_process_destroy_delayed(struct rcu_head *rcu) -{ - struct kfd_process *p = container_of(rcu, struct kfd_process, rcu); + mmdrop(p->mm); + + work = kmalloc(sizeof(struct kfd_process_release_work), GFP_ATOMIC); - kfd_unref_process(p); + if (work) { + INIT_WORK((struct work_struct *) work, kfd_process_wq_release); + work->p = p; + queue_work(kfd_process_wq, (struct work_struct *) work); + } } static void kfd_process_notifier_release(struct mmu_notifier *mn, @@ -421,19 +223,13 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, { struct kfd_process *p; struct kfd_process_device *pdd = NULL; - struct kfd_dev *dev = NULL; - long status = -EFAULT; /* * The kfd_process structure can not be free because the * mmu_notifier srcu is read locked */ p = container_of(mn, struct kfd_process, mmu_notifier); - if (WARN_ON(p->mm != mm)) - return; - - cancel_delayed_work_sync(&p->eviction_work.dwork); - cancel_delayed_work_sync(&p->restore_work); + BUG_ON(p->mm != mm); mutex_lock(&kfd_processes_mutex); hash_del_rcu(&p->kfd_processes); @@ -442,46 +238,33 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, mutex_lock(&p->mutex); - /* Iterate over all process device data structures and if the pdd is in - * debug mode,we should first force unregistration, then we will be - * able to destroy the queues - */ - list_for_each_entry(pdd, &p->per_device_data, per_device_list) { - dev = pdd->dev; - mutex_lock(kfd_get_dbgmgr_mutex()); - - if (dev && dev->dbgmgr && (dev->dbgmgr->pasid == p->pasid)) { - - status = kfd_dbgmgr_unregister(dev->dbgmgr, p); - if (status == 0) { - kfd_dbgmgr_destroy(dev->dbgmgr); - dev->dbgmgr = NULL; - } - } - mutex_unlock(kfd_get_dbgmgr_mutex()); - } - - kfd_process_dequeue_from_all_devices(p); - - /* now we can uninit the pqm: */ + /* In case our notifier is called before IOMMU notifier */ pqm_uninit(&p->pqm); /* Iterate over all process device data structure and check - * if we should delete debug managers + * if we should delete debug managers and reset all wavefronts */ list_for_each_entry(pdd, 
&p->per_device_data, per_device_list) { if ((pdd->dev->dbgmgr) && (pdd->dev->dbgmgr->pasid == p->pasid)) kfd_dbgmgr_destroy(pdd->dev->dbgmgr); + if (pdd->reset_wavefronts) { + pr_warn("amdkfd: Resetting all wave fronts\n"); + dbgdev_wave_reset_wavefronts(pdd->dev, p); + pdd->reset_wavefronts = false; + } } - /* Indicate to other users that MM is no longer valid */ - p->mm = NULL; - mutex_unlock(&p->mutex); - mmu_notifier_unregister_no_release(&p->mmu_notifier, mm); + /* + * Because we drop mm_count inside kfd_process_destroy_delayed + * and because the mmu_notifier_unregister function also drop + * mm_count we need to take an extra count here. + */ + mmgrab(p->mm); + mmu_notifier_unregister_no_release(&p->mmu_notifier, p->mm); mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed); } @@ -489,68 +272,7 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = { .release = kfd_process_notifier_release, }; -static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep) -{ - int ret; - unsigned long offset; - struct kfd_process_device *temp, *pdd = NULL; - struct kfd_dev *dev = NULL; - struct qcm_process_device *qpd = NULL; - void *kaddr; - uint32_t flags = ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_NONPAGED | - ALLOC_MEM_FLAGS_NO_SUBSTITUTE | - ALLOC_MEM_FLAGS_READONLY | - ALLOC_MEM_FLAGS_EXECUTE_ACCESS; - - list_for_each_entry_safe(pdd, temp, &p->per_device_data, - per_device_list) { - dev = pdd->dev; - qpd = &pdd->qpd; - if (!dev->cwsr_enabled || qpd->cwsr_kaddr) - continue; - if (qpd->cwsr_base) { - /* cwsr_base is only set for DGPU */ - ret = kfd_process_alloc_gpuvm(p, dev, qpd->cwsr_base, - dev->cwsr_size, &kaddr, pdd, flags); - if (!ret) { - qpd->cwsr_kaddr = kaddr; - qpd->tba_addr = qpd->cwsr_base; - } else - /* In case of error, the kfd_bos for some pdds - * which are already allocated successfully - * will be freed in upper level function - * i.e. create_process(). - */ - return ret; - } else { - offset = (dev->id | - KFD_MMAP_TYPE_RESERVED_MEM) << PAGE_SHIFT; - qpd->tba_addr = (uint64_t)vm_mmap(filep, 0, - dev->cwsr_size, PROT_READ | PROT_EXEC, - MAP_SHARED, offset); - - if (IS_ERR_VALUE(qpd->tba_addr)) { - pr_err("Failure to set tba address. 
error -%d.\n", - (int)qpd->tba_addr); - qpd->tba_addr = 0; - qpd->cwsr_kaddr = NULL; - return -ENOMEM; - } - } - - memcpy(qpd->cwsr_kaddr, kmap(dev->cwsr_pages), PAGE_SIZE); - kunmap(dev->cwsr_pages); - - qpd->tma_addr = qpd->tba_addr + dev->tma_offset; - pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n", - qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr); - } - - return 0; -} - -static struct kfd_process *create_process(const struct task_struct *thread, - struct file *filep) +static struct kfd_process *create_process(const struct task_struct *thread) { struct kfd_process *process; int err = -ENOMEM; @@ -560,20 +282,22 @@ static struct kfd_process *create_process(const struct task_struct *thread, if (!process) goto err_alloc_process; - process->bo_interval_tree = RB_ROOT_CACHED; + process->queues = kmalloc_array(INITIAL_QUEUE_ARRAY_SIZE, + sizeof(process->queues[0]), GFP_KERNEL); + if (!process->queues) + goto err_alloc_queues; process->pasid = kfd_pasid_alloc(); if (process->pasid == 0) goto err_alloc_pasid; - kref_init(&process->ref); mutex_init(&process->mutex); process->mm = thread->mm; /* register notifier */ process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops; - err = mmu_notifier_register(&process->mmu_notifier, process->mm); + err = __mmu_notifier_register(&process->mmu_notifier, process->mm); if (err) goto err_mmu_notifier; @@ -581,7 +305,8 @@ static struct kfd_process *create_process(const struct task_struct *thread, (uintptr_t)process->mm); process->lead_thread = thread->group_leader; - get_task_struct(process->lead_thread); + + process->queue_array_size = INITIAL_QUEUE_ARRAY_SIZE; INIT_LIST_HEAD(&process->per_device_data); @@ -597,28 +322,8 @@ static struct kfd_process *create_process(const struct task_struct *thread, if (err != 0) goto err_init_apertures; - err = kfd_process_reserve_ib_mem(process); - if (err) - goto err_reserve_ib_mem; - err = kfd_process_init_cwsr(process, filep); - if (err) - goto err_init_cwsr; - - INIT_DELAYED_WORK(&process->eviction_work.dwork, kfd_evict_bo_worker); - INIT_DELAYED_WORK(&process->restore_work, kfd_restore_bo_worker); - process->last_restore_timestamp = get_jiffies_64(); - - /* If PeerDirect interface was not detected try to detect it again - * in case if network driver was loaded later. 
- */ - kfd_init_peer_direct(); - return process; -err_init_cwsr: -err_reserve_ib_mem: - kfd_process_free_outstanding_kfd_bos(process); - kfd_process_destroy_pdds(process); err_init_apertures: pqm_uninit(&process->pqm); err_process_pqm_init: @@ -629,36 +334,13 @@ static struct kfd_process *create_process(const struct task_struct *thread, mutex_destroy(&process->mutex); kfd_pasid_free(process->pasid); err_alloc_pasid: + kfree(process->queues); +err_alloc_queues: kfree(process); err_alloc_process: return ERR_PTR(err); } -static int init_doorbell_bitmap(struct qcm_process_device *qpd, - struct kfd_dev *dev) -{ - unsigned int i; - - if (!KFD_IS_SOC15(dev->device_info->asic_family)) - return 0; - - qpd->doorbell_bitmap = - kzalloc(DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, - BITS_PER_BYTE), GFP_KERNEL); - if (!qpd->doorbell_bitmap) - return -ENOMEM; - - /* Mask out any reserved doorbells */ - for (i = 0; i < KFD_MAX_NUM_OF_QUEUES_PER_PROCESS; i++) - if ((dev->shared_resources.reserved_doorbell_mask & i) == - dev->shared_resources.reserved_doorbell_val) { - set_bit(i, qpd->doorbell_bitmap); - pr_debug("reserved doorbell 0x%03x\n", i); - } - - return 0; -} - struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, struct kfd_process *p) { @@ -666,9 +348,9 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, list_for_each_entry(pdd, &p->per_device_data, per_device_list) if (pdd->dev == dev) - return pdd; + break; - return NULL; + return pdd; } struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, @@ -677,41 +359,16 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, struct kfd_process_device *pdd = NULL; pdd = kzalloc(sizeof(*pdd), GFP_KERNEL); - if (!pdd) - return NULL; - - pdd->dev = dev; - INIT_LIST_HEAD(&pdd->qpd.queues_list); - INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); - pdd->qpd.dqm = dev->dqm; - pdd->qpd.pqm = &p->pqm; - pdd->qpd.evicted = 0; - pdd->process = p; - pdd->bound = PDD_UNBOUND; - pdd->already_dequeued = false; - list_add(&pdd->per_device_list, &p->per_device_data); - - /* Init idr used for memory handle translation */ - idr_init(&pdd->alloc_idr); - if (init_doorbell_bitmap(&pdd->qpd, dev)) { - pr_err("Failed to init doorbell for process\n"); - goto err_create_pdd; + if (pdd != NULL) { + pdd->dev = dev; + INIT_LIST_HEAD(&pdd->qpd.queues_list); + INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); + pdd->qpd.dqm = dev->dqm; + pdd->reset_wavefronts = false; + list_add(&pdd->per_device_list, &p->per_device_data); } - /* Create the GPUVM context for this specific device */ - if (dev->kfd2kgd->create_process_vm(dev->kgd, &pdd->vm, - &p->process_info, &p->ef)) { - pr_err("Failed to create process VM object\n"); - goto err_create_pdd; - } return pdd; - -err_create_pdd: - kfree(pdd->qpd.doorbell_bitmap); - idr_destroy(&pdd->alloc_idr); - list_del(&pdd->per_device_list); - kfree(pdd); - return NULL; } /* @@ -725,6 +382,7 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, struct kfd_process *p) { struct kfd_process_device *pdd; + int err; pdd = kfd_get_process_device_data(dev, p); if (!pdd) { @@ -732,89 +390,24 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, return ERR_PTR(-ENOMEM); } - if (pdd->bound == PDD_BOUND) + if (pdd->bound) return pdd; - if (pdd->bound == PDD_BOUND_SUSPENDED) { - pr_err("Binding PDD_BOUND_SUSPENDED pdd is unexpected!\n"); - return ERR_PTR(-EINVAL); - } + err = amd_iommu_bind_pasid(dev->pdev, p->pasid, 
p->lead_thread); + if (err < 0) + return ERR_PTR(err); -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - if (dev->device_info->is_need_iommu_device) { - int err = amd_iommu_bind_pasid(dev->pdev, p->pasid, - p->lead_thread); - if (err < 0) - return ERR_PTR(err); - } -#endif - - pdd->bound = PDD_BOUND; + pdd->bound = true; return pdd; } -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) -int kfd_bind_processes_to_device(struct kfd_dev *dev) +void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid) { - struct kfd_process_device *pdd; struct kfd_process *p; - unsigned int temp; - int err = 0; - - int idx = srcu_read_lock(&kfd_processes_srcu); - - hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { - mutex_lock(&p->mutex); - pdd = kfd_get_process_device_data(dev, p); - if (pdd->bound != PDD_BOUND_SUSPENDED) { - mutex_unlock(&p->mutex); - continue; - } - - err = amd_iommu_bind_pasid(dev->pdev, p->pasid, - p->lead_thread); - if (err < 0) { - pr_err("Unexpected pasid %d binding failure\n", - p->pasid); - mutex_unlock(&p->mutex); - break; - } - - pdd->bound = PDD_BOUND; - mutex_unlock(&p->mutex); - } - - srcu_read_unlock(&kfd_processes_srcu, idx); - - return err; -} - -void kfd_unbind_processes_from_device(struct kfd_dev *dev) -{ struct kfd_process_device *pdd; - struct kfd_process *p; - unsigned int temp; - - int idx = srcu_read_lock(&kfd_processes_srcu); - - - hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { - mutex_lock(&p->mutex); - pdd = kfd_get_process_device_data(dev, p); - if (pdd->bound == PDD_BOUND) - pdd->bound = PDD_BOUND_SUSPENDED; - mutex_unlock(&p->mutex); - } - - srcu_read_unlock(&kfd_processes_srcu, idx); -} - -void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid) -{ - struct kfd_process *p; - struct kfd_process_device *pdd; + BUG_ON(dev == NULL); /* * Look for the process that matches the pasid. If there is no such @@ -827,43 +420,43 @@ void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid) pr_debug("Unbinding process %d from IOMMU\n", pasid); - mutex_lock(kfd_get_dbgmgr_mutex()); + if ((dev->dbgmgr) && (dev->dbgmgr->pasid == p->pasid)) + kfd_dbgmgr_destroy(dev->dbgmgr); - if (dev->dbgmgr && (dev->dbgmgr->pasid == p->pasid)) { + pqm_uninit(&p->pqm); - if (kfd_dbgmgr_unregister(dev->dbgmgr, p) == 0) { - kfd_dbgmgr_destroy(dev->dbgmgr); - dev->dbgmgr = NULL; - } - } + pdd = kfd_get_process_device_data(dev, p); - mutex_unlock(kfd_get_dbgmgr_mutex()); + if (!pdd) { + mutex_unlock(&p->mutex); + return; + } - mutex_lock(&p->mutex); + if (pdd->reset_wavefronts) { + dbgdev_wave_reset_wavefronts(pdd->dev, p); + pdd->reset_wavefronts = false; + } - pdd = kfd_get_process_device_data(dev, p); - if (pdd) - /* For GPU relying on IOMMU, we need to dequeue here - * when PASID is still bound. - */ - kfd_process_dequeue_from_device(pdd); + /* + * Just mark pdd as unbound, because we still need it + * to call amd_iommu_unbind_pasid() in when the + * process exits. + * We don't call amd_iommu_unbind_pasid() here + * because the IOMMU called us. 
+ */ + pdd->bound = false; mutex_unlock(&p->mutex); - - kfd_unref_process(p); } -#endif /* CONFIG_AMD_IOMMU_V2 */ -struct kfd_process_device *kfd_get_first_process_device_data( - struct kfd_process *p) +struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p) { return list_first_entry(&p->per_device_data, struct kfd_process_device, per_device_list); } -struct kfd_process_device *kfd_get_next_process_device_data( - struct kfd_process *p, +struct kfd_process_device *kfd_get_next_process_device_data(struct kfd_process *p, struct kfd_process_device *pdd) { if (list_is_last(&pdd->per_device_list, &p->per_device_data)) @@ -876,272 +469,22 @@ bool kfd_has_process_device_data(struct kfd_process *p) return !(list_empty(&p->per_device_data)); } -/* Create specific handle mapped to mem from process local memory idr - * Assumes that the process lock is held. - */ -int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, - void *mem, uint64_t start, - uint64_t length, - struct kfd_ipc_obj *ipc_obj) -{ - int handle; - struct kfd_bo *buf_obj; - struct kfd_process *p; - - p = pdd->process; - - buf_obj = kzalloc(sizeof(*buf_obj), GFP_KERNEL); - - if (!buf_obj) - return -ENOMEM; - - buf_obj->it.start = start; - buf_obj->it.last = start + length - 1; - interval_tree_insert(&buf_obj->it, &p->bo_interval_tree); - - buf_obj->mem = mem; - buf_obj->dev = pdd->dev; - buf_obj->kfd_ipc_obj = ipc_obj; - - INIT_LIST_HEAD(&buf_obj->cb_data_head); - - idr_preload(GFP_KERNEL); - - handle = idr_alloc(&pdd->alloc_idr, buf_obj, MIN_IDR_ID, MAX_IDR_ID, - GFP_NOWAIT); - - idr_preload_end(); - - if (handle < 0) - kfree(buf_obj); - - return handle; -} - -struct kfd_bo *kfd_process_device_find_bo(struct kfd_process_device *pdd, - int handle) -{ - if (handle < 0) - return NULL; - - return (struct kfd_bo *)idr_find(&pdd->alloc_idr, handle); -} - -/* Translate specific handle from process local memory idr - * Assumes that the process lock is held. - */ -void *kfd_process_device_translate_handle(struct kfd_process_device *pdd, - int handle) -{ - struct kfd_bo *buf_obj; - - buf_obj = kfd_process_device_find_bo(pdd, handle); - - return buf_obj->mem; -} - -void *kfd_process_find_bo_from_interval(struct kfd_process *p, - uint64_t start_addr, - uint64_t last_addr) -{ - struct interval_tree_node *it_node; - struct kfd_bo *buf_obj; - - it_node = interval_tree_iter_first(&p->bo_interval_tree, - start_addr, last_addr); - if (!it_node) { - pr_err("0x%llx-0x%llx does not relate to an existing buffer\n", - start_addr, last_addr); - return NULL; - } - - if (interval_tree_iter_next(it_node, start_addr, last_addr)) { - pr_err("0x%llx-0x%llx spans more than a single BO\n", - start_addr, last_addr); - return NULL; - } - - buf_obj = container_of(it_node, struct kfd_bo, it); - - return buf_obj; -} - -/* Remove specific handle from process local memory idr - * Assumes that the process lock is held. - */ -void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, - int handle) -{ - struct kfd_bo *buf_obj; - struct kfd_process *p; - - p = pdd->process; - - if (handle < 0) - return; - - buf_obj = kfd_process_device_find_bo(pdd, handle); - - if (buf_obj->kfd_ipc_obj) - ipc_obj_put(&buf_obj->kfd_ipc_obj); - - idr_remove(&pdd->alloc_idr, handle); - - interval_tree_remove(&buf_obj->it, &p->bo_interval_tree); - - kfree(buf_obj); -} - -/* This increments the process->ref counter. */ +/* This returns with process->mutex locked. 
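The removed kfd_process_device_create_obj_handle() above hands out small integer handles from an IDR (MIN_IDR_ID of 1, MAX_IDR_ID of 0 meaning no upper bound) and also indexes the BO in an interval tree. A compact sketch of just the IDR side, with stand-in names:

#include <linux/gfp.h>
#include <linux/idr.h>

static DEFINE_IDR(ex_handles);

/* Store an object and return its handle (>= 1), or a negative errno. */
static int ex_store(void *object)
{
	int handle;

	idr_preload(GFP_KERNEL);
	handle = idr_alloc(&ex_handles, object, 1, 0, GFP_NOWAIT);
	idr_preload_end();

	return handle;
}

static void *ex_lookup_handle(int handle)
{
	return handle < 0 ? NULL : idr_find(&ex_handles, handle);
}

static void ex_remove_handle(int handle)
{
	idr_remove(&ex_handles, handle);
}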
*/ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) { - struct kfd_process *p, *ret_p = NULL; + struct kfd_process *p; unsigned int temp; int idx = srcu_read_lock(&kfd_processes_srcu); hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { if (p->pasid == pasid) { - kref_get(&p->ref); - ret_p = p; + mutex_lock(&p->mutex); break; } } srcu_read_unlock(&kfd_processes_srcu, idx); - return ret_p; -} - -void kfd_suspend_all_processes(void) -{ - struct kfd_process *p; - unsigned int temp; - int idx = srcu_read_lock(&kfd_processes_srcu); - - hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { - if (cancel_delayed_work_sync(&p->eviction_work.dwork)) - dma_fence_put(p->eviction_work.quiesce_fence); - cancel_delayed_work_sync(&p->restore_work); - - if (quiesce_process_mm(p)) - pr_err("Failed to suspend process %d\n", p->pasid); - dma_fence_signal(p->ef); - dma_fence_put(p->ef); - p->ef = NULL; - } - srcu_read_unlock(&kfd_processes_srcu, idx); -} - -int kfd_resume_all_processes(void) -{ - struct kfd_process *p; - unsigned int temp; - int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu); - - hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { - if (!schedule_delayed_work(&p->restore_work, 0)) { - pr_err("Restore process %d failed during resume\n", - p->pasid); - ret = -EFAULT; - } - } - srcu_read_unlock(&kfd_processes_srcu, idx); - return ret; -} - -/* This increments the process->ref counter. */ -struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm) -{ - struct kfd_process *p; - - int idx = srcu_read_lock(&kfd_processes_srcu); - - p = find_process_by_mm(mm); - if (p) - kref_get(&p->ref); - - srcu_read_unlock(&kfd_processes_srcu, idx); - return p; } - -int kfd_reserved_mem_mmap(struct kfd_process *process, - struct vm_area_struct *vma) -{ - unsigned long pfn, i; - int ret = 0; - struct kfd_dev *dev = kfd_device_by_id(vma->vm_pgoff); - struct kfd_process_device *temp, *pdd = NULL; - struct qcm_process_device *qpd = NULL; - - if (!dev) - return -EINVAL; - if (((vma->vm_end - vma->vm_start) != dev->cwsr_size) || - (vma->vm_start & (PAGE_SIZE - 1)) || - (vma->vm_end & (PAGE_SIZE - 1))) { - pr_err("KFD only support page aligned memory map and correct size.\n"); - return -EINVAL; - } - - pr_debug("kfd reserved mem mmap been called.\n"); - - list_for_each_entry_safe(pdd, temp, &process->per_device_data, - per_device_list) { - if (dev == pdd->dev) { - qpd = &pdd->qpd; - break; - } - } - if (!qpd) - return -EINVAL; - - qpd->cwsr_pages = alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, - get_order(dev->cwsr_size)); - if (!qpd->cwsr_pages) { - pr_err("amdkfd: error alloc CWSR isa memory per process.\n"); - return -ENOMEM; - } - qpd->cwsr_kaddr = kmap(qpd->cwsr_pages); - - vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND - | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP; - for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); ++i) { - pfn = page_to_pfn(&qpd->cwsr_pages[i]); - /* mapping the page to user process */ - ret = remap_pfn_range(vma, vma->vm_start + (i << PAGE_SHIFT), - pfn, PAGE_SIZE, vma->vm_page_prot); - if (ret) - break; - } - return ret; -} - -#if defined(CONFIG_DEBUG_FS) - -int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data) -{ - struct kfd_process *p; - unsigned int temp; - int r = 0; - - int idx = srcu_read_lock(&kfd_processes_srcu); - - hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { - seq_printf(m, "Process %d PASID %d:\n", - p->lead_thread->tgid, p->pasid); - - mutex_lock(&p->mutex); - r = 
pqm_debugfs_mqds(m, &p->pqm); - mutex_unlock(&p->mutex); - - if (r != 0) - break; - } - - srcu_read_unlock(&kfd_processes_srcu, idx); - - return r; -} - -#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index a87fcab..46f497e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -32,9 +32,12 @@ static inline struct process_queue_node *get_queue_by_qid( { struct process_queue_node *pqn; + BUG_ON(!pqm); + list_for_each_entry(pqn, &pqm->queues, process_queue_list) { - if ((pqn->q && pqn->q->properties.queue_id == qid) || - (pqn->kq && pqn->kq->queue->properties.queue_id == qid)) + if (pqn->q && pqn->q->properties.queue_id == qid) + return pqn; + if (pqn->kq && pqn->kq->queue->properties.queue_id == qid) return pqn; } @@ -46,13 +49,17 @@ static int find_available_queue_slot(struct process_queue_manager *pqm, { unsigned long found; + BUG_ON(!pqm || !qid); + + pr_debug("kfd: in %s\n", __func__); + found = find_first_zero_bit(pqm->queue_slot_bitmap, KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); - pr_debug("The new slot id %lu\n", found); + pr_debug("kfd: the new slot id %lu\n", found); if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) { - pr_info("Cannot open more queues for process with pasid %d\n", + pr_info("amdkfd: Can not open more queues for process with pasid %d\n", pqm->process->pasid); return -ENOMEM; } @@ -63,33 +70,15 @@ static int find_available_queue_slot(struct process_queue_manager *pqm, return 0; } -void kfd_process_dequeue_from_device(struct kfd_process_device *pdd) -{ - struct kfd_dev *dev = pdd->dev; - int retval; - - if (pdd->already_dequeued) - return; - - retval = dev->dqm->ops.process_termination(dev->dqm, &pdd->qpd); - pdd->already_dequeued = true; -} - -void kfd_process_dequeue_from_all_devices(struct kfd_process *p) -{ - struct kfd_process_device *pdd; - - list_for_each_entry(pdd, &p->per_device_data, per_device_list) - kfd_process_dequeue_from_device(pdd); -} - int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p) { + BUG_ON(!pqm); + INIT_LIST_HEAD(&pqm->queues); pqm->queue_slot_bitmap = kzalloc(DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, BITS_PER_BYTE), GFP_KERNEL); - if (!pqm->queue_slot_bitmap) + if (pqm->queue_slot_bitmap == NULL) return -ENOMEM; pqm->process = p; @@ -98,14 +87,25 @@ int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p) void pqm_uninit(struct process_queue_manager *pqm) { + int retval; struct process_queue_node *pqn, *next; + BUG_ON(!pqm); + + pr_debug("In func %s\n", __func__); + list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) { - uninit_queue(pqn->q); - list_del(&pqn->process_queue_list); - kfree(pqn); + retval = pqm_destroy_queue( + pqm, + (pqn->q != NULL) ? 
+ pqn->q->properties.queue_id : + pqn->kq->queue->properties.queue_id); + + if (retval != 0) { + pr_err("kfd: failed to destroy queue\n"); + return; + } } - kfree(pqm->queue_slot_bitmap); pqm->queue_slot_bitmap = NULL; } @@ -117,39 +117,54 @@ static int create_cp_queue(struct process_queue_manager *pqm, { int retval; + retval = 0; + /* Doorbell initialized in user space*/ q_properties->doorbell_ptr = NULL; + q_properties->doorbell_off = + kfd_queue_id_to_doorbell(dev, pqm->process, qid); + /* let DQM handle it*/ q_properties->vmid = 0; q_properties->queue_id = qid; retval = init_queue(q, q_properties); if (retval != 0) - return retval; + goto err_init_queue; (*q)->device = dev; (*q)->process = pqm->process; - pr_debug("PQM After init queue"); + pr_debug("kfd: PQM After init queue"); return retval; + +err_init_queue: + return retval; } int pqm_create_queue(struct process_queue_manager *pqm, struct kfd_dev *dev, struct file *f, struct queue_properties *properties, + unsigned int flags, + enum kfd_queue_type type, unsigned int *qid) { int retval; struct kfd_process_device *pdd; + struct queue_properties q_properties; struct queue *q; struct process_queue_node *pqn; struct kernel_queue *kq; - enum kfd_queue_type type = properties->type; - unsigned int max_queues = 127; /* HWS limit */ + int num_queues = 0; + struct queue *cur; + + BUG_ON(!pqm || !dev || !properties || !qid); + memset(&q_properties, 0, sizeof(struct queue_properties)); + memcpy(&q_properties, properties, sizeof(struct queue_properties)); q = NULL; kq = NULL; @@ -165,21 +180,24 @@ int pqm_create_queue(struct process_queue_manager *pqm, * If we are just about to create DIQ, the is_debug flag is not set yet * Hence we also check the type as well */ - if ((pdd->qpd.is_debug) || (type == KFD_QUEUE_TYPE_DIQ)) - max_queues = dev->device_info->max_no_of_hqd/2; - - if (pdd->qpd.queue_count >= max_queues) - return -ENOSPC; + if ((pdd->qpd.is_debug) || + (type == KFD_QUEUE_TYPE_DIQ)) { + list_for_each_entry(cur, &pdd->qpd.queues_list, list) + num_queues++; + if (num_queues >= dev->device_info->max_no_of_hqd/2) + return (-ENOSPC); + } retval = find_available_queue_slot(pqm, qid); if (retval != 0) return retval; - if (list_empty(&pdd->qpd.queues_list) && - list_empty(&pdd->qpd.priv_queue_list)) + if (list_empty(&pqm->queues)) { + pdd->qpd.pqm = pqm; dev->dqm->ops.register_process(dev->dqm, &pdd->qpd); + } - pqn = kzalloc(sizeof(*pqn), GFP_KERNEL); + pqn = kzalloc(sizeof(struct process_queue_node), GFP_KERNEL); if (!pqn) { retval = -ENOMEM; goto err_allocate_pqn; @@ -187,35 +205,18 @@ int pqm_create_queue(struct process_queue_manager *pqm, switch (type) { case KFD_QUEUE_TYPE_SDMA: - if (dev->dqm->sdma_queue_count >= CIK_SDMA_QUEUES) { - pr_err("Over-subscription is not allowed for SDMA\n"); - retval = -EPERM; - goto err_create_queue; - } - - retval = create_cp_queue(pqm, dev, &q, properties, f, *qid); - if (retval != 0) - goto err_create_queue; - pqn->q = q; - pqn->kq = NULL; - retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd, - &q->properties.vmid); - pr_debug("DQM returned %d for create_queue\n", retval); - print_queue(q); - break; case KFD_QUEUE_TYPE_COMPUTE: /* check if there is over subscription */ - if ((dev->dqm->sched_policy == - KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) && - ((dev->dqm->processes_count >= dev->vm_info.vmid_num_kfd) || + if ((sched_policy == KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) && + ((dev->dqm->processes_count >= VMID_PER_DEVICE) || (dev->dqm->queue_count >= get_queues_num(dev->dqm)))) { - 
pr_err("Over-subscription is not allowed in radeon_kfd.sched_policy == 1\n"); + pr_err("kfd: over-subscription is not allowed in radeon_kfd.sched_policy == 1\n"); retval = -EPERM; goto err_create_queue; } - retval = create_cp_queue(pqm, dev, &q, properties, f, *qid); + retval = create_cp_queue(pqm, dev, &q, &q_properties, f, *qid); if (retval != 0) goto err_create_queue; pqn->q = q; @@ -227,7 +228,7 @@ int pqm_create_queue(struct process_queue_manager *pqm, break; case KFD_QUEUE_TYPE_DIQ: kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_DIQ); - if (!kq) { + if (kq == NULL) { retval = -ENOMEM; goto err_create_queue; } @@ -238,31 +239,23 @@ int pqm_create_queue(struct process_queue_manager *pqm, kq, &pdd->qpd); break; default: - WARN(1, "Invalid queue type %d", type); - retval = -EINVAL; + BUG(); + break; } if (retval != 0) { - pr_err("DQM create queue failed\n"); + pr_debug("Error dqm create queue\n"); goto err_create_queue; } - if (q) - /* Return the doorbell offset within the doorbell page - * to the caller so it can be passed up to user mode - * (in bytes). - */ - properties->doorbell_off = - (q->properties.doorbell_off * sizeof(uint32_t)) & - (kfd_doorbell_process_slice(dev) - 1); - - pr_debug("PQM After DQM create queue\n"); + pr_debug("kfd: PQM After DQM create queue\n"); list_add(&pqn->process_queue_list, &pqm->queues); if (q) { - pr_debug("PQM done creating queue\n"); - print_queue_properties(&q->properties); + *properties = q->properties; + pr_debug("kfd: PQM done creating queue\n"); + print_queue_properties(properties); } return retval; @@ -272,8 +265,7 @@ int pqm_create_queue(struct process_queue_manager *pqm, err_allocate_pqn: /* check if queues list is empty unregister process from device */ clear_bit(*qid, pqm->queue_slot_bitmap); - if (list_empty(&pdd->qpd.queues_list) && - list_empty(&pdd->qpd.priv_queue_list)) + if (list_empty(&pqm->queues)) dev->dqm->ops.unregister_process(dev->dqm, &pdd->qpd); return retval; } @@ -288,11 +280,14 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) dqm = NULL; + BUG_ON(!pqm); retval = 0; + pr_debug("kfd: In Func %s\n", __func__); + pqn = get_queue_by_qid(pqm, qid); - if (!pqn) { - pr_err("Queue id does not match any known queue\n"); + if (pqn == NULL) { + pr_err("kfd: queue id does not match any known queue\n"); return -EINVAL; } @@ -301,8 +296,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) dev = pqn->kq->dev; if (pqn->q) dev = pqn->q->device; - if (WARN_ON(!dev)) - return -ENODEV; + BUG_ON(!dev); pdd = kfd_get_process_device_data(dev, pqm->process); if (!pdd) { @@ -319,9 +313,10 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) if (pqn->q) { dqm = pqn->q->device->dqm; - kfree(pqn->q->properties.cu_mask); - pqn->q->properties.cu_mask = NULL; retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); + if (retval != 0) + return retval; + uninit_queue(pqn->q); } @@ -329,8 +324,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) kfree(pqn); clear_bit(qid, pqm->queue_slot_bitmap); - if (list_empty(&pdd->qpd.queues_list) && - list_empty(&pdd->qpd.priv_queue_list)) + if (list_empty(&pqm->queues)) dqm->ops.unregister_process(dqm, &pdd->qpd); return retval; @@ -342,9 +336,12 @@ int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, int retval; struct process_queue_node *pqn; + BUG_ON(!pqm); + pqn = get_queue_by_qid(pqm, qid); if (!pqn) { - pr_debug("No queue %d exists for update operation\n", qid); + pr_debug("amdkfd: No 
queue %d exists for update operation\n", + qid); return -EFAULT; } @@ -361,40 +358,14 @@ int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, return 0; } -int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, - struct queue_properties *p) -{ - int retval; - struct process_queue_node *pqn; - - pqn = get_queue_by_qid(pqm, qid); - if (!pqn) { - pr_debug("No queue %d exists for update operation\n", qid); - return -EFAULT; - } - - /* Free the old CU mask memory if it is already allocated, then - * allocate memory for the new CU mask. - */ - kfree(pqn->q->properties.cu_mask); - - pqn->q->properties.cu_mask_count = p->cu_mask_count; - pqn->q->properties.cu_mask = p->cu_mask; - - retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm, - pqn->q); - if (retval != 0) - return retval; - - return 0; -} - struct kernel_queue *pqm_get_kernel_queue( struct process_queue_manager *pqm, unsigned int qid) { struct process_queue_node *pqn; + BUG_ON(!pqm); + pqn = get_queue_by_qid(pqm, qid); if (pqn && pqn->kq) return pqn->kq; @@ -402,89 +373,4 @@ struct kernel_queue *pqm_get_kernel_queue( return NULL; } -int pqm_get_wave_state(struct process_queue_manager *pqm, - unsigned int qid, - void __user *ctl_stack, - u32 *ctl_stack_used_size, - u32 *save_area_used_size) -{ - struct process_queue_node *pqn; - - pqn = get_queue_by_qid(pqm, qid); - if (!pqn) { - pr_debug("amdkfd: No queue %d exists for operation\n", - qid); - return -EFAULT; - } - - return pqn->q->device->dqm->ops.get_wave_state(pqn->q->device->dqm, - pqn->q, - ctl_stack, - ctl_stack_used_size, - save_area_used_size); -} - -#if defined(CONFIG_DEBUG_FS) - -int pqm_debugfs_mqds(struct seq_file *m, void *data) -{ - struct process_queue_manager *pqm = data; - struct process_queue_node *pqn; - struct queue *q; - enum KFD_MQD_TYPE mqd_type; - struct mqd_manager *mqd_manager; - int r = 0; - - list_for_each_entry(pqn, &pqm->queues, process_queue_list) { - if (pqn->q) { - q = pqn->q; - switch (q->properties.type) { - case KFD_QUEUE_TYPE_SDMA: - seq_printf(m, " SDMA queue on device %x\n", - q->device->id); - mqd_type = KFD_MQD_TYPE_SDMA; - break; - case KFD_QUEUE_TYPE_COMPUTE: - seq_printf(m, " Compute queue on device %x\n", - q->device->id); - mqd_type = KFD_MQD_TYPE_CP; - break; - default: - seq_printf(m, - " Bad user queue type %d on device %x\n", - q->properties.type, q->device->id); - continue; - } - mqd_manager = q->device->dqm->ops.get_mqd_manager( - q->device->dqm, mqd_type); - } else if (pqn->kq) { - q = pqn->kq->queue; - mqd_manager = pqn->kq->mqd; - switch (q->properties.type) { - case KFD_QUEUE_TYPE_DIQ: - seq_printf(m, " DIQ on device %x\n", - pqn->kq->dev->id); - mqd_type = KFD_MQD_TYPE_HIQ; - break; - default: - seq_printf(m, - " Bad kernel queue type %d on device %x\n", - q->properties.type, - pqn->kq->dev->id); - continue; - } - } else { - seq_printf(m, - " Weird: Queue node with neither kernel nor user queue\n"); - continue; - } - - r = mqd_manager->debugfs_show_mqd(m, q->mqd); - if (r != 0) - break; - } - - return r; -} -#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index a5315d4..0ab1970 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -65,15 +65,17 @@ void print_queue(struct queue *q) int init_queue(struct queue **q, const struct queue_properties *properties) { - struct queue *tmp_q; + struct queue *tmp; - tmp_q = kzalloc(sizeof(*tmp_q), GFP_KERNEL); - if (!tmp_q) + BUG_ON(!q); + + tmp = 
kzalloc(sizeof(struct queue), GFP_KERNEL); + if (!tmp) return -ENOMEM; - memcpy(&tmp_q->properties, properties, sizeof(*properties)); + memcpy(&tmp->properties, properties, sizeof(struct queue_properties)); - *q = tmp_q; + *q = tmp; return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c b/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c deleted file mode 100644 index 2f5cdb9..0000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c +++ /dev/null @@ -1,294 +0,0 @@ -/* - * Copyright 2015 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include -#include -#include -#include -#include -#include "kfd_priv.h" - - -struct rdma_cb { - struct list_head node; - struct amd_p2p_info amd_p2p_data; - void (*free_callback)(void *client_priv); - void *client_priv; -}; - -/** - * This function makes the pages underlying a range of GPU virtual memory - * accessible for DMA operations from another PCIe device - * - * \param address - The start address in the Unified Virtual Address - * space in the specified process - * \param length - The length of requested mapping - * \param pid - Pointer to structure pid to which address belongs. - * Could be NULL for current process address space. - * \param p2p_data - On return: Pointer to structure describing - * underlying pages/locations - * \param free_callback - Pointer to callback which will be called when access - * to such memory must be stopped immediately: Memory - * was freed, GECC events, etc. - * Client should immediately stop any transfer - * operations and returned as soon as possible. - * After return all resources associated with address - * will be release and no access will be allowed. 
- * \param client_priv - Pointer to be passed as parameter on - * 'free_callback; - * - * \return 0 if operation was successful - */ -static int get_pages(uint64_t address, uint64_t length, struct pid *pid, - struct amd_p2p_info **amd_p2p_data, - void (*free_callback)(void *client_priv), - void *client_priv) -{ - struct kfd_bo *buf_obj; - struct kgd_mem *mem; - struct sg_table *sg_table_tmp; - struct kfd_dev *dev; - uint64_t last = address + length - 1; - uint64_t offset; - struct kfd_process *p; - struct rdma_cb *rdma_cb_data; - int ret = 0; - - p = kfd_lookup_process_by_pid(pid); - if (!p) { - pr_err("Could not find the process\n"); - return -EINVAL; - } - mutex_lock(&p->mutex); - - buf_obj = kfd_process_find_bo_from_interval(p, address, last); - if (!buf_obj) { - pr_err("Cannot find a kfd_bo for the range\n"); - ret = -EINVAL; - goto out; - } - - rdma_cb_data = kmalloc(sizeof(*rdma_cb_data), GFP_KERNEL); - if (!rdma_cb_data) { - *amd_p2p_data = NULL; - ret = -ENOMEM; - goto out; - } - - mem = buf_obj->mem; - dev = buf_obj->dev; - offset = address - buf_obj->it.start; - - ret = dev->kfd2kgd->pin_get_sg_table_bo(dev->kgd, mem, - offset, length, &sg_table_tmp); - - if (ret) { - pr_err("pin_get_sg_table_bo failed.\n"); - *amd_p2p_data = NULL; - goto free_mem; - } - - rdma_cb_data->amd_p2p_data.va = address; - rdma_cb_data->amd_p2p_data.size = length; - rdma_cb_data->amd_p2p_data.pid = pid; - rdma_cb_data->amd_p2p_data.priv = buf_obj; - rdma_cb_data->amd_p2p_data.pages = sg_table_tmp; - - rdma_cb_data->free_callback = free_callback; - rdma_cb_data->client_priv = client_priv; - - list_add(&rdma_cb_data->node, &buf_obj->cb_data_head); - - *amd_p2p_data = &rdma_cb_data->amd_p2p_data; - - goto out; - -free_mem: - kfree(rdma_cb_data); -out: - mutex_unlock(&p->mutex); - kfd_unref_process(p); - - return ret; -} - -static int put_pages_helper(struct amd_p2p_info *p2p_data) -{ - struct kfd_bo *buf_obj; - struct kfd_dev *dev; - struct sg_table *sg_table_tmp; - struct rdma_cb *rdma_cb_data; - - if (!p2p_data) { - pr_err("amd_p2p_info pointer is invalid.\n"); - return -EINVAL; - } - - rdma_cb_data = container_of(p2p_data, struct rdma_cb, amd_p2p_data); - - buf_obj = p2p_data->priv; - dev = buf_obj->dev; - sg_table_tmp = p2p_data->pages; - - list_del(&rdma_cb_data->node); - kfree(rdma_cb_data); - - dev->kfd2kgd->unpin_put_sg_table_bo(buf_obj->mem, sg_table_tmp); - - - return 0; -} - -void run_rdma_free_callback(struct kfd_bo *buf_obj) -{ - struct rdma_cb *tmp, *rdma_cb_data; - - list_for_each_entry_safe(rdma_cb_data, tmp, - &buf_obj->cb_data_head, node) { - if (rdma_cb_data->free_callback) - rdma_cb_data->free_callback( - rdma_cb_data->client_priv); - - put_pages_helper(&rdma_cb_data->amd_p2p_data); - } -} - -/** - * - * This function release resources previously allocated by get_pages() call. - * - * \param p_p2p_data - A pointer to pointer to amd_p2p_info entries - * allocated by get_pages() call. - * - * \return 0 if operation was successful - */ -static int put_pages(struct amd_p2p_info **p_p2p_data) -{ - struct kfd_process *p = NULL; - int ret = 0; - - if (!(*p_p2p_data)) { - pr_err("amd_p2p_info pointer is invalid.\n"); - return -EINVAL; - } - - p = kfd_lookup_process_by_pid((*p_p2p_data)->pid); - if (!p) { - pr_err("Could not find the process\n"); - return -EINVAL; - } - - ret = put_pages_helper(*p_p2p_data); - - if (!ret) - *p_p2p_data = NULL; - - kfd_unref_process(p); - - return ret; -} - -/** - * Check if given address belongs to GPU address space. 
- * - * \param address - Address to check - * \param pid - Process to which given address belongs. - * Could be NULL if current one. - * - * \return 0 - This is not GPU address managed by AMD driver - * 1 - This is GPU address managed by AMD driver - */ -static int is_gpu_address(uint64_t address, struct pid *pid) -{ - struct kfd_bo *buf_obj; - struct kfd_process *p; - - p = kfd_lookup_process_by_pid(pid); - if (!p) { - pr_debug("Could not find the process\n"); - return 0; - } - - buf_obj = kfd_process_find_bo_from_interval(p, address, address); - - kfd_unref_process(p); - if (!buf_obj) - return 0; - - return 1; -} - -/** - * Return the single page size to be used when building scatter/gather table - * for given range. - * - * \param address - Address - * \param length - Range length - * \param pid - Process id structure. Could be NULL if current one. - * \param page_size - On return: Page size - * - * \return 0 if operation was successful - */ -static int get_page_size(uint64_t address, uint64_t length, struct pid *pid, - unsigned long *page_size) -{ - /* - * As local memory is always consecutive, we can assume the local - * memory page size to be arbitrary. - * Currently we assume the local memory page size to be the same - * as system memory, which is 4KB. - */ - *page_size = PAGE_SIZE; - - return 0; -} - - -/** - * Singleton object: rdma interface function pointers - */ -static const struct amd_rdma_interface rdma_ops = { - .get_pages = get_pages, - .put_pages = put_pages, - .is_gpu_address = is_gpu_address, - .get_page_size = get_page_size, -}; - -/** - * amdkfd_query_rdma_interface - Return interface (function pointers table) for - * rdma interface - * - * - * \param interace - OUT: Pointer to interface - * - * \return 0 if operation was successful. 
- */ -int amdkfd_query_rdma_interface(const struct amd_rdma_interface **ops) -{ - *ops = &rdma_ops; - - return 0; -} -EXPORT_SYMBOL(amdkfd_query_rdma_interface); - - - diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index d08e3de..1e50647 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -28,32 +28,27 @@ #include #include #include -#include -#include #include "kfd_priv.h" #include "kfd_crat.h" #include "kfd_topology.h" -#include "kfd_device_queue_manager.h" -/* topology_device_list - Master list of all topology devices */ static struct list_head topology_device_list; +static int topology_crat_parsed; static struct kfd_system_properties sys_props; static DECLARE_RWSEM(topology_lock); -static atomic_t topology_crat_proximity_domain; -struct kfd_topology_device *kfd_topology_device_by_proximity_domain( - uint32_t proximity_domain) +struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) { struct kfd_topology_device *top_dev; - struct kfd_topology_device *device = NULL; + struct kfd_dev *device = NULL; down_read(&topology_lock); list_for_each_entry(top_dev, &topology_device_list, list) - if (top_dev->proximity_domain == proximity_domain) { - device = top_dev; + if (top_dev->gpu_id == gpu_id) { + device = top_dev->gpu; break; } @@ -62,7 +57,7 @@ struct kfd_topology_device *kfd_topology_device_by_proximity_domain( return device; } -struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) +struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) { struct kfd_topology_device *top_dev; struct kfd_dev *device = NULL; @@ -70,7 +65,7 @@ struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) down_read(&topology_lock); list_for_each_entry(top_dev, &topology_device_list, list) - if (top_dev->gpu_id == gpu_id) { + if (top_dev->gpu->pdev == pdev) { device = top_dev->gpu; break; } @@ -80,49 +75,300 @@ struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) return device; } -struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) +static int kfd_topology_get_crat_acpi(void *crat_image, size_t *size) { - struct kfd_topology_device *top_dev; - struct kfd_dev *device = NULL; + struct acpi_table_header *crat_table; + acpi_status status; - down_read(&topology_lock); + if (!size) + return -EINVAL; - list_for_each_entry(top_dev, &topology_device_list, list) - if (top_dev->gpu && top_dev->gpu->pdev == pdev) { - device = top_dev->gpu; + /* + * Fetch the CRAT table from ACPI + */ + status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); + if (status == AE_NOT_FOUND) { + pr_warn("CRAT table not found\n"); + return -ENODATA; + } else if (ACPI_FAILURE(status)) { + const char *err = acpi_format_exception(status); + + pr_err("CRAT table error: %s\n", err); + return -EINVAL; + } + + if (*size >= crat_table->length && crat_image != NULL) + memcpy(crat_image, crat_table, crat_table->length); + + *size = crat_table->length; + + return 0; +} + +static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, + struct crat_subtype_computeunit *cu) +{ + BUG_ON(!dev); + BUG_ON(!cu); + + dev->node_props.cpu_cores_count = cu->num_cpu_cores; + dev->node_props.cpu_core_id_base = cu->processor_id_low; + if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) + dev->node_props.capability |= HSA_CAP_ATS_PRESENT; + + pr_info("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, + cu->processor_id_low); +} + +static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, + struct crat_subtype_computeunit *cu) +{ + BUG_ON(!dev); 
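The amd_rdma_interface removed above is consumed by peer PCIe drivers that want to DMA straight into GPU memory. A rough sketch of how such a consumer might use it; the amd_rdma.h header path, the peer_* names and the callback body are illustrative assumptions, and only amdkfd_query_rdma_interface(), is_gpu_address(), get_pages() and put_pages() come from the deleted kfd_rdma.c (passing a NULL pid for the current process follows the removed kernel-doc).

/* Hypothetical peer-driver fragment, not part of this patch. */
#include <linux/module.h>
#include <linux/errno.h>
#include "amd_rdma.h"		/* assumed location of struct amd_rdma_interface */

static const struct amd_rdma_interface *rdma_ops;
static struct amd_p2p_info *p2p_info;

/* Invoked by amdkfd when access must stop (BO freed, etc.). */
static void peer_invalidate_cb(void *client_priv)
{
	/* Stop all in-flight DMA that targets the pinned range. */
	pr_info("peer: GPU range invalidated, priv=%p\n", client_priv);
}

static int peer_pin_gpu_range(uint64_t gpu_va, uint64_t size)
{
	int r;

	r = amdkfd_query_rdma_interface(&rdma_ops);
	if (r)
		return r;

	/* NULL pid means "current process" per the removed kernel-doc. */
	if (!rdma_ops->is_gpu_address(gpu_va, NULL))
		return -EINVAL;

	/* Pin the range; p2p_info->pages is an sg_table for DMA mapping. */
	r = rdma_ops->get_pages(gpu_va, size, NULL, &p2p_info,
				peer_invalidate_cb, NULL);
	return r;
}

static void peer_unpin_gpu_range(void)
{
	if (p2p_info)
		rdma_ops->put_pages(&p2p_info);	/* clears p2p_info on success */
}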
+ BUG_ON(!cu); + + dev->node_props.simd_id_base = cu->processor_id_low; + dev->node_props.simd_count = cu->num_simd_cores; + dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; + dev->node_props.max_waves_per_simd = cu->max_waves_simd; + dev->node_props.wave_front_size = cu->wave_front_size; + dev->node_props.mem_banks_count = cu->num_banks; + dev->node_props.array_count = cu->num_arrays; + dev->node_props.cu_per_simd_array = cu->num_cu_per_array; + dev->node_props.simd_per_cu = cu->num_simd_per_cu; + dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; + if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) + dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; + pr_info("CU GPU: simds=%d id_base=%d\n", cu->num_simd_cores, + cu->processor_id_low); +} + +/* kfd_parse_subtype_cu is called when the topology mutex is already acquired */ +static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu) +{ + struct kfd_topology_device *dev; + int i = 0; + + BUG_ON(!cu); + + pr_info("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", + cu->proximity_domain, cu->hsa_capability); + list_for_each_entry(dev, &topology_device_list, list) { + if (cu->proximity_domain == i) { + if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) + kfd_populated_cu_info_cpu(dev, cu); + + if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) + kfd_populated_cu_info_gpu(dev, cu); break; } + i++; + } - up_read(&topology_lock); + return 0; +} - return device; +/* + * kfd_parse_subtype_mem is called when the topology mutex is + * already acquired + */ +static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem) +{ + struct kfd_mem_properties *props; + struct kfd_topology_device *dev; + int i = 0; + + BUG_ON(!mem); + + pr_info("Found memory entry in CRAT table with proximity_domain=%d\n", + mem->promixity_domain); + list_for_each_entry(dev, &topology_device_list, list) { + if (mem->promixity_domain == i) { + props = kfd_alloc_struct(props); + if (props == NULL) + return -ENOMEM; + + if (dev->node_props.cpu_cores_count == 0) + props->heap_type = HSA_MEM_HEAP_TYPE_FB_PRIVATE; + else + props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; + + if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) + props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; + if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) + props->flags |= HSA_MEM_FLAGS_NON_VOLATILE; + + props->size_in_bytes = + ((uint64_t)mem->length_high << 32) + + mem->length_low; + props->width = mem->width; + + dev->mem_bank_count++; + list_add_tail(&props->list, &dev->mem_props); + + break; + } + i++; + } + + return 0; } -struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd) +/* + * kfd_parse_subtype_cache is called when the topology mutex + * is already acquired + */ +static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache) { - struct kfd_topology_device *top_dev; - struct kfd_dev *device = NULL; + struct kfd_cache_properties *props; + struct kfd_topology_device *dev; + uint32_t id; - down_read(&topology_lock); + BUG_ON(!cache); + + id = cache->processor_id_low; + + pr_info("Found cache entry in CRAT table with processor_id=%d\n", id); + list_for_each_entry(dev, &topology_device_list, list) + if (id == dev->node_props.cpu_core_id_base || + id == dev->node_props.simd_id_base) { + props = kfd_alloc_struct(props); + if (props == NULL) + return -ENOMEM; + + props->processor_id_low = id; + props->cache_level = cache->cache_level; + props->cache_size = cache->cache_size; + props->cacheline_size = cache->cache_line_size; + props->cachelines_per_tag = cache->lines_per_tag; + 
props->cache_assoc = cache->associativity; + props->cache_latency = cache->cache_latency; + + if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) + props->cache_type |= HSA_CACHE_TYPE_DATA; + if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) + props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; + if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) + props->cache_type |= HSA_CACHE_TYPE_CPU; + if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) + props->cache_type |= HSA_CACHE_TYPE_HSACU; + + dev->cache_count++; + dev->node_props.caches_count++; + list_add_tail(&props->list, &dev->cache_props); - list_for_each_entry(top_dev, &topology_device_list, list) - if (top_dev->gpu && top_dev->gpu->kgd == kgd) { - device = top_dev->gpu; break; } - up_read(&topology_lock); + return 0; +} - return device; +/* + * kfd_parse_subtype_iolink is called when the topology mutex + * is already acquired + */ +static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink) +{ + struct kfd_iolink_properties *props; + struct kfd_topology_device *dev; + uint32_t i = 0; + uint32_t id_from; + uint32_t id_to; + + BUG_ON(!iolink); + + id_from = iolink->proximity_domain_from; + id_to = iolink->proximity_domain_to; + + pr_info("Found IO link entry in CRAT table with id_from=%d\n", id_from); + list_for_each_entry(dev, &topology_device_list, list) { + if (id_from == i) { + props = kfd_alloc_struct(props); + if (props == NULL) + return -ENOMEM; + + props->node_from = id_from; + props->node_to = id_to; + props->ver_maj = iolink->version_major; + props->ver_min = iolink->version_minor; + + /* + * weight factor (derived from CDIR), currently always 1 + */ + props->weight = 1; + + props->min_latency = iolink->minimum_latency; + props->max_latency = iolink->maximum_latency; + props->min_bandwidth = iolink->minimum_bandwidth_mbs; + props->max_bandwidth = iolink->maximum_bandwidth_mbs; + props->rec_transfer_size = + iolink->recommended_transfer_size; + + dev->io_link_count++; + dev->node_props.io_links_count++; + list_add_tail(&props->list, &dev->io_link_props); + + break; + } + i++; + } + + return 0; +} + +static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr) +{ + struct crat_subtype_computeunit *cu; + struct crat_subtype_memory *mem; + struct crat_subtype_cache *cache; + struct crat_subtype_iolink *iolink; + int ret = 0; + + BUG_ON(!sub_type_hdr); + + switch (sub_type_hdr->type) { + case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: + cu = (struct crat_subtype_computeunit *)sub_type_hdr; + ret = kfd_parse_subtype_cu(cu); + break; + case CRAT_SUBTYPE_MEMORY_AFFINITY: + mem = (struct crat_subtype_memory *)sub_type_hdr; + ret = kfd_parse_subtype_mem(mem); + break; + case CRAT_SUBTYPE_CACHE_AFFINITY: + cache = (struct crat_subtype_cache *)sub_type_hdr; + ret = kfd_parse_subtype_cache(cache); + break; + case CRAT_SUBTYPE_TLB_AFFINITY: + /* + * For now, nothing to do here + */ + pr_info("Found TLB entry in CRAT table (not processing)\n"); + break; + case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: + /* + * For now, nothing to do here + */ + pr_info("Found CCOMPUTE entry in CRAT table (not processing)\n"); + break; + case CRAT_SUBTYPE_IOLINK_AFFINITY: + iolink = (struct crat_subtype_iolink *)sub_type_hdr; + ret = kfd_parse_subtype_iolink(iolink); + break; + default: + pr_warn("Unknown subtype (%d) in CRAT\n", + sub_type_hdr->type); + } + + return ret; } -/* Called with write topology_lock acquired */ static void kfd_release_topology_device(struct kfd_topology_device *dev) { struct kfd_mem_properties *mem; struct kfd_cache_properties *cache; struct 
kfd_iolink_properties *iolink; - struct kfd_perf_properties *perf; + + BUG_ON(!dev); list_del(&dev->list); @@ -147,40 +393,30 @@ static void kfd_release_topology_device(struct kfd_topology_device *dev) kfree(iolink); } - while (dev->perf_props.next != &dev->perf_props) { - perf = container_of(dev->perf_props.next, - struct kfd_perf_properties, list); - list_del(&perf->list); - kfree(perf); - } - kfree(dev); + + sys_props.num_devices--; } -void kfd_release_topology_device_list(struct list_head *device_list) +static void kfd_release_live_view(void) { struct kfd_topology_device *dev; - while (!list_empty(device_list)) { - dev = list_first_entry(device_list, - struct kfd_topology_device, list); + while (topology_device_list.next != &topology_device_list) { + dev = container_of(topology_device_list.next, + struct kfd_topology_device, list); kfd_release_topology_device(dev); - } } -static void kfd_release_live_view(void) -{ - kfd_release_topology_device_list(&topology_device_list); memset(&sys_props, 0, sizeof(sys_props)); } -struct kfd_topology_device *kfd_create_topology_device( - struct list_head *device_list) +static struct kfd_topology_device *kfd_create_topology_device(void) { struct kfd_topology_device *dev; dev = kfd_alloc_struct(dev); - if (!dev) { + if (dev == NULL) { pr_err("No memory to allocate a topology device"); return NULL; } @@ -188,13 +424,66 @@ struct kfd_topology_device *kfd_create_topology_device( INIT_LIST_HEAD(&dev->mem_props); INIT_LIST_HEAD(&dev->cache_props); INIT_LIST_HEAD(&dev->io_link_props); - INIT_LIST_HEAD(&dev->perf_props); - list_add_tail(&dev->list, device_list); + list_add_tail(&dev->list, &topology_device_list); + sys_props.num_devices++; return dev; } +static int kfd_parse_crat_table(void *crat_image) +{ + struct kfd_topology_device *top_dev; + struct crat_subtype_generic *sub_type_hdr; + uint16_t node_id; + int ret; + struct crat_header *crat_table = (struct crat_header *)crat_image; + uint16_t num_nodes; + uint32_t image_len; + + if (!crat_image) + return -EINVAL; + + num_nodes = crat_table->num_domains; + image_len = crat_table->length; + + pr_info("Parsing CRAT table with %d nodes\n", num_nodes); + + for (node_id = 0; node_id < num_nodes; node_id++) { + top_dev = kfd_create_topology_device(); + if (!top_dev) { + kfd_release_live_view(); + return -ENOMEM; + } + } + + sys_props.platform_id = + (*((uint64_t *)crat_table->oem_id)) & CRAT_OEMID_64BIT_MASK; + sys_props.platform_oem = *((uint64_t *)crat_table->oem_table_id); + sys_props.platform_rev = crat_table->revision; + + sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); + while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) < + ((char *)crat_image) + image_len) { + if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) { + ret = kfd_parse_subtype(sub_type_hdr); + if (ret != 0) { + kfd_release_live_view(); + return ret; + } + } + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + } + + sys_props.generation_count++; + topology_crat_parsed = 1; + + return 0; +} + + #define sysfs_show_gen_prop(buffer, fmt, ...) 
\ snprintf(buffer, PAGE_SIZE, "%s"fmt, buffer, __VA_ARGS__) #define sysfs_show_32bit_prop(buffer, name, value) \ @@ -203,8 +492,6 @@ struct kfd_topology_device *kfd_create_topology_device( sysfs_show_gen_prop(buffer, "%s %llu\n", name, value) #define sysfs_show_32bit_val(buffer, value) \ sysfs_show_gen_prop(buffer, "%u\n", value) -#define sysfs_show_64bit_val(buffer, value) \ - sysfs_show_gen_prop(buffer, "%llu\n", value) #define sysfs_show_str_val(buffer, value) \ sysfs_show_gen_prop(buffer, "%s\n", value) @@ -232,17 +519,11 @@ static ssize_t sysprops_show(struct kobject *kobj, struct attribute *attr, return ret; } -static void kfd_topology_kobj_release(struct kobject *kobj) -{ - kfree(kobj); -} - static const struct sysfs_ops sysprops_ops = { .show = sysprops_show, }; static struct kobj_type sysprops_type = { - .release = kfd_topology_kobj_release, .sysfs_ops = &sysprops_ops, }; @@ -278,7 +559,6 @@ static const struct sysfs_ops iolink_ops = { }; static struct kobj_type iolink_type = { - .release = kfd_topology_kobj_release, .sysfs_ops = &iolink_ops, }; @@ -287,23 +567,11 @@ static ssize_t mem_show(struct kobject *kobj, struct attribute *attr, { ssize_t ret; struct kfd_mem_properties *mem; - uint64_t used_mem; /* Making sure that the buffer is an empty string */ buffer[0] = 0; - if (strcmp(attr->name, "used_memory") == 0) { - mem = container_of(attr, struct kfd_mem_properties, - attr_used); - if (mem->gpu) { - used_mem = mem->gpu->kfd2kgd->get_vram_usage(mem->gpu->kgd); - return sysfs_show_64bit_val(buffer, used_mem); - } - /* TODO: Report APU/CPU-allocated memory; For now return 0 */ - return 0; - } - - mem = container_of(attr, struct kfd_mem_properties, attr_props); + mem = container_of(attr, struct kfd_mem_properties, attr); sysfs_show_32bit_prop(buffer, "heap_type", mem->heap_type); sysfs_show_64bit_prop(buffer, "size_in_bytes", mem->size_in_bytes); sysfs_show_32bit_prop(buffer, "flags", mem->flags); @@ -318,7 +586,6 @@ static const struct sysfs_ops mem_ops = { }; static struct kobj_type mem_type = { - .release = kfd_topology_kobj_release, .sysfs_ops = &mem_ops, }; @@ -326,7 +593,7 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, char *buffer) { ssize_t ret; - uint32_t i, j; + uint32_t i; struct kfd_cache_properties *cache; /* Making sure that the buffer is an empty string */ @@ -344,18 +611,12 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, sysfs_show_32bit_prop(buffer, "latency", cache->cache_latency); sysfs_show_32bit_prop(buffer, "type", cache->cache_type); snprintf(buffer, PAGE_SIZE, "%ssibling_map ", buffer); - for (i = 0; i < CRAT_SIBLINGMAP_SIZE; i++) - for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++) { - /* Check each bit */ - if (cache->sibling_map[i] & (1 << j)) - ret = snprintf(buffer, PAGE_SIZE, - "%s%d%s", buffer, 1, ","); - else - ret = snprintf(buffer, PAGE_SIZE, - "%s%d%s", buffer, 0, ","); - } - /* Replace the last "," with end of line */ - *(buffer + strlen(buffer) - 1) = 0xA; + for (i = 0; i < KFD_TOPOLOGY_CPU_SIBLINGS; i++) + ret = snprintf(buffer, PAGE_SIZE, "%s%d%s", + buffer, cache->sibling_map[i], + (i == KFD_TOPOLOGY_CPU_SIBLINGS-1) ? 
+ "\n" : ","); + return ret; } @@ -364,43 +625,9 @@ static const struct sysfs_ops cache_ops = { }; static struct kobj_type cache_type = { - .release = kfd_topology_kobj_release, .sysfs_ops = &cache_ops, }; -/****** Sysfs of Performance Counters ******/ - -struct kfd_perf_attr { - struct kobj_attribute attr; - uint32_t data; -}; - -static ssize_t perf_show(struct kobject *kobj, struct kobj_attribute *attrs, - char *buf) -{ - struct kfd_perf_attr *attr; - - buf[0] = 0; - attr = container_of(attrs, struct kfd_perf_attr, attr); - if (!attr->data) /* invalid data for PMC */ - return 0; - else - return sysfs_show_32bit_val(buf, attr->data); -} - -#define KFD_PERF_DESC(_name, _data) \ -{ \ - .attr = __ATTR(_name, 0444, perf_show, NULL), \ - .data = _data, \ -} - -static struct kfd_perf_attr perf_attr_iommu[] = { - KFD_PERF_DESC(max_concurrent, 0), - KFD_PERF_DESC(num_counters, 0), - KFD_PERF_DESC(counter_ids, 0), -}; -/****************************************/ - static ssize_t node_show(struct kobject *kobj, struct attribute *attr, char *buffer) { @@ -408,7 +635,6 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, char public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; uint32_t i; uint32_t log_max_watch_addr; - struct kfd_local_mem_info local_mem_info; /* Making sure that the buffer is an empty string */ buffer[0] = 0; @@ -438,8 +664,18 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, dev->node_props.cpu_cores_count); sysfs_show_32bit_prop(buffer, "simd_count", dev->node_props.simd_count); - sysfs_show_32bit_prop(buffer, "mem_banks_count", - dev->node_props.mem_banks_count); + + if (dev->mem_bank_count < dev->node_props.mem_banks_count) { + pr_info_once("kfd: mem_banks_count truncated from %d to %d\n", + dev->node_props.mem_banks_count, + dev->mem_bank_count); + sysfs_show_32bit_prop(buffer, "mem_banks_count", + dev->mem_bank_count); + } else { + sysfs_show_32bit_prop(buffer, "mem_banks_count", + dev->node_props.mem_banks_count); + } + sysfs_show_32bit_prop(buffer, "caches_count", dev->node_props.caches_count); sysfs_show_32bit_prop(buffer, "io_links_count", @@ -487,28 +723,17 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, HSA_CAP_WATCH_POINTS_TOTALBITS_MASK); } - if (dev->gpu->device_info->asic_family == CHIP_TONGA) - dev->node_props.capability |= - HSA_CAP_AQL_QUEUE_DOUBLE_MAP; - sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute", - dev->node_props.max_engine_clk_fcompute); + dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz( + dev->gpu->kgd)); - /* - * If the ASIC is CZ, set local memory size to 0 to disable - * local memory support - */ - if (dev->gpu->device_info->asic_family != CHIP_CARRIZO) { - dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd, - &local_mem_info); - sysfs_show_64bit_prop(buffer, "local_mem_size", - local_mem_info.local_mem_size_private + - local_mem_info.local_mem_size_public); - } else - sysfs_show_64bit_prop(buffer, "local_mem_size", 0ULL); + sysfs_show_64bit_prop(buffer, "local_mem_size", + (unsigned long long int) 0); sysfs_show_32bit_prop(buffer, "fw_version", - dev->gpu->mec_fw_version); + dev->gpu->kfd2kgd->get_fw_version( + dev->gpu->kgd, + KGD_ENGINE_MEC1)); sysfs_show_32bit_prop(buffer, "capability", dev->node_props.capability); } @@ -522,7 +747,6 @@ static const struct sysfs_ops node_ops = { }; static struct kobj_type node_type = { - .release = kfd_topology_kobj_release, .sysfs_ops = &node_ops, }; @@ -538,7 +762,8 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) 
struct kfd_iolink_properties *iolink; struct kfd_cache_properties *cache; struct kfd_mem_properties *mem; - struct kfd_perf_properties *perf; + + BUG_ON(!dev); if (dev->kobj_iolink) { list_for_each_entry(iolink, &dev->io_link_props, list) @@ -567,12 +792,7 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) if (dev->kobj_mem) { list_for_each_entry(mem, &dev->mem_props, list) if (mem->kobj) { - /* TODO: Remove when CPU/APU supported */ - if (dev->node_props.cpu_cores_count == 0) - sysfs_remove_file(mem->kobj, - &mem->attr_used); - kfd_remove_sysfs_file(mem->kobj, - &mem->attr_props); + kfd_remove_sysfs_file(mem->kobj, &mem->attr); mem->kobj = NULL; } kobject_del(dev->kobj_mem); @@ -580,16 +800,6 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) dev->kobj_mem = NULL; } - if (dev->kobj_perf) { - list_for_each_entry(perf, &dev->perf_props, list) { - kfree(perf->attr_group); - perf->attr_group = NULL; - } - kobject_del(dev->kobj_perf); - kobject_put(dev->kobj_perf); - dev->kobj_perf = NULL; - } - if (dev->kobj_node) { sysfs_remove_file(dev->kobj_node, &dev->attr_gpuid); sysfs_remove_file(dev->kobj_node, &dev->attr_name); @@ -606,18 +816,15 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, struct kfd_iolink_properties *iolink; struct kfd_cache_properties *cache; struct kfd_mem_properties *mem; - struct kfd_perf_properties *perf; - uint32_t num_attrs; - struct attribute **attrs; int ret; uint32_t i; - if (WARN_ON(dev->kobj_node)) - return -EEXIST; + BUG_ON(!dev); /* * Creating the sysfs folders */ + BUG_ON(dev->kobj_node); dev->kobj_node = kfd_alloc_struct(dev->kobj_node); if (!dev->kobj_node) return -ENOMEM; @@ -639,10 +846,6 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, if (!dev->kobj_iolink) return -ENOMEM; - dev->kobj_perf = kobject_create_and_add("perf", dev->kobj_node); - if (!dev->kobj_perf) - return -ENOMEM; - /* * Creating sysfs files for node properties */ @@ -675,23 +878,12 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, if (ret < 0) return ret; - mem->attr_props.name = "properties"; - mem->attr_props.mode = KFD_SYSFS_FILE_MODE; - sysfs_attr_init(&mem->attr_props); - ret = sysfs_create_file(mem->kobj, &mem->attr_props); + mem->attr.name = "properties"; + mem->attr.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&mem->attr); + ret = sysfs_create_file(mem->kobj, &mem->attr); if (ret < 0) return ret; - - /* TODO: Support APU/CPU memory usage */ - if (dev->node_props.cpu_cores_count == 0) { - mem->attr_used.name = "used_memory"; - mem->attr_used.mode = KFD_SYSFS_FILE_MODE; - sysfs_attr_init(&mem->attr_used); - ret = sysfs_create_file(mem->kobj, &mem->attr_used); - if (ret < 0) - return ret; - } - i++; } @@ -731,38 +923,11 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, if (ret < 0) return ret; i++; - } - - /* All hardware blocks have the same number of attributes. */ - num_attrs = sizeof(perf_attr_iommu)/sizeof(struct kfd_perf_attr); - list_for_each_entry(perf, &dev->perf_props, list) { - perf->attr_group = kzalloc(sizeof(struct kfd_perf_attr) - * num_attrs + sizeof(struct attribute_group), - GFP_KERNEL); - if (!perf->attr_group) - return -ENOMEM; - - attrs = (struct attribute **)(perf->attr_group + 1); - if (!strcmp(perf->block_name, "iommu")) { - /* Information of IOMMU's num_counters and counter_ids is shown - * under /sys/bus/event_source/devices/amd_iommu. We don't - * duplicate here. 
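The sysfs_show_*_prop() macros above build each node's "properties" file by re-printing the buffer with one more "name value" line appended, and node_show() emits the per-node counters through them. A small user-space approximation of the resulting text, using an explicit offset instead of the kernel macro's self-referencing snprintf; the field names and values here are only illustrative.

#include <stdio.h>
#include <string.h>

#define BUF_SIZE 4096			/* stands in for PAGE_SIZE */

/* Stand-in for sysfs_show_32bit_prop(): append one "name value" line. */
static void show_32bit_prop(char *buf, const char *name, unsigned int value)
{
	size_t len = strlen(buf);

	snprintf(buf + len, BUF_SIZE - len, "%s %u\n", name, value);
}

int main(void)
{
	char buf[BUF_SIZE] = "";

	/* A few of the fields emitted by node_show() above. */
	show_32bit_prop(buf, "cpu_cores_count", 0);
	show_32bit_prop(buf, "simd_count", 256);
	show_32bit_prop(buf, "mem_banks_count", 1);
	show_32bit_prop(buf, "caches_count", 2);
	show_32bit_prop(buf, "io_links_count", 1);

	fputs(buf, stdout);	/* roughly what userspace reads from nodes/N/properties */
	return 0;
}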
- */ - perf_attr_iommu[0].data = perf->max_concurrent; - for (i = 0; i < num_attrs; i++) - attrs[i] = &perf_attr_iommu[i].attr.attr; - } - perf->attr_group->name = perf->block_name; - perf->attr_group->attrs = attrs; - ret = sysfs_create_group(dev->kobj_perf, perf->attr_group); - if (ret < 0) - return ret; - } +} return 0; } -/* Called with write topology lock acquired */ static int kfd_build_sysfs_node_tree(void) { struct kfd_topology_device *dev; @@ -779,7 +944,6 @@ static int kfd_build_sysfs_node_tree(void) return 0; } -/* Called with write topology lock acquired */ static void kfd_remove_sysfs_node_tree(void) { struct kfd_topology_device *dev; @@ -793,7 +957,7 @@ static int kfd_topology_update_sysfs(void) int ret; pr_info("Creating topology SYSFS entries\n"); - if (!sys_props.kobj_topology) { + if (sys_props.kobj_topology == NULL) { sys_props.kobj_topology = kfd_alloc_struct(sys_props.kobj_topology); if (!sys_props.kobj_topology) @@ -851,251 +1015,75 @@ static void kfd_topology_release_sysfs(void) } } -/* Called with write topology_lock acquired */ -static void kfd_topology_update_device_list(struct list_head *temp_list, - struct list_head *master_list) -{ - while (!list_empty(temp_list)) { - list_move_tail(temp_list->next, master_list); - sys_props.num_devices++; - } -} - -static void kfd_debug_print_topology(void) -{ - struct kfd_topology_device *dev; - - down_read(&topology_lock); - - dev = list_last_entry(&topology_device_list, - struct kfd_topology_device, list); - if (dev) { - if (dev->node_props.cpu_cores_count && - dev->node_props.simd_count) { - pr_info("Topology: Add APU node [0x%0x:0x%0x]\n", - dev->node_props.device_id, - dev->node_props.vendor_id); - } else if (dev->node_props.cpu_cores_count) - pr_info("Topology: Add CPU node\n"); - else if (dev->node_props.simd_count) - pr_info("Topology: Add dGPU node [0x%0x:0x%0x]\n", - dev->node_props.device_id, - dev->node_props.vendor_id); - } - up_read(&topology_lock); -} - -/* Helper function for intializing platform_xx members of kfd_system_properties - */ -static void kfd_update_system_properties(void) -{ - struct kfd_topology_device *dev; - - down_read(&topology_lock); - dev = list_last_entry(&topology_device_list, - struct kfd_topology_device, list); - if (dev) { - sys_props.platform_id = - (*((uint64_t *)dev->oem_id)) & CRAT_OEMID_64BIT_MASK; - sys_props.platform_oem = *((uint64_t *)dev->oem_table_id); - sys_props.platform_rev = dev->oem_revision; - } - up_read(&topology_lock); -} - -static void find_system_memory(const struct dmi_header *dm, - void *private) -{ - struct kfd_mem_properties *mem; - u16 mem_width, mem_clock; - struct kfd_topology_device *kdev = - (struct kfd_topology_device *)private; - const u8 *dmi_data = (const u8 *)(dm + 1); - - if (dm->type == DMI_ENTRY_MEM_DEVICE && dm->length >= 0x15) { - mem_width = (u16)(*(const u16 *)(dmi_data + 0x6)); - mem_clock = (u16)(*(const u16 *)(dmi_data + 0x11)); - list_for_each_entry(mem, &kdev->mem_props, list) { - if (mem_width != 0xFFFF && mem_width != 0) - mem->width = mem_width; - if (mem_clock != 0) - mem->mem_clk_max = mem_clock; - } - } -} - -/* - * Performance counters information is not part of CRAT but we would like to - * put them in the sysfs under topology directory for Thunk to get the data. - * This function is called before updating the sysfs. 
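kfd_parse_crat_table() earlier in this file walks the CRAT image as a sequence of variable-length subtype records, advancing by each record's own length field and handing enabled records to kfd_parse_subtype(). A stand-alone sketch of that walk; the subtype_hdr layout, the flag value and the sample image are simplified stand-ins for the real crat_subtype_generic records.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Simplified stand-in for struct crat_subtype_generic. */
struct subtype_hdr {
	uint8_t type;
	uint8_t length;		/* total record length, header included */
	uint8_t flags;
	uint8_t reserved;
};

#define SUBTYPE_FLAGS_ENABLED 0x01

static void parse_one(const struct subtype_hdr *hdr)
{
	printf("record: type=%u length=%u\n", hdr->type, hdr->length);
}

static void parse_table(const uint8_t *image, size_t image_len)
{
	const uint8_t *p = image;

	/* Advance by each record's own length field, skipping records
	 * that do not have the ENABLED flag set. */
	while (p + sizeof(struct subtype_hdr) <= image + image_len) {
		const struct subtype_hdr *hdr = (const struct subtype_hdr *)p;

		if (hdr->length < sizeof(struct subtype_hdr))
			break;			/* malformed record, stop */
		if (hdr->flags & SUBTYPE_FLAGS_ENABLED)
			parse_one(hdr);
		p += hdr->length;
	}
}

int main(void)
{
	/* Two fake records: an enabled 8-byte one and a disabled 4-byte one. */
	static const uint8_t image[] = {
		1, 8, 0x01, 0,  0, 0, 0, 0,	/* type 1, enabled  */
		2, 4, 0x00, 0,			/* type 2, disabled */
	};

	parse_table(image, sizeof(image));
	return 0;
}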
- */ -static int kfd_add_perf_to_topology(struct kfd_topology_device *kdev) -{ -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - struct kfd_perf_properties *props; - - if (amd_iommu_pc_supported()) { - props = kfd_alloc_struct(props); - if (!props) - return -ENOMEM; - strcpy(props->block_name, "iommu"); - props->max_concurrent = amd_iommu_pc_get_max_banks(0) * - amd_iommu_pc_get_max_counters(0); /* assume one iommu */ - list_add_tail(&props->list, &kdev->perf_props); - } -#endif - - return 0; -} - -/* kfd_add_non_crat_information - Add information that is not currently - * defined in CRAT but is necessary for KFD topology - * @dev - topology device to which addition info is added - */ -static void kfd_add_non_crat_information(struct kfd_topology_device *kdev) -{ - /* Check if CPU only node. */ - if (!kdev->gpu) { - /* Add system memory information */ - dmi_walk(find_system_memory, kdev); - } - /* TODO: For GPU node, rearrange code from kfd_topology_add_device */ -} - -#ifdef CONFIG_ACPI -/* kfd_is_acpi_crat_invalid - CRAT from ACPI is valid only for AMD APU devices. - * Ignore CRAT for all other devices. AMD APU is identified if both CPU - * and GPU cores are present. - * @device_list - topology device list created by parsing ACPI CRAT table. - * @return - TRUE if invalid, FALSE is valid. - */ -static bool kfd_is_acpi_crat_invalid(struct list_head *device_list) -{ - struct kfd_topology_device *dev; - - list_for_each_entry(dev, device_list, list) { - if (dev->node_props.cpu_cores_count && - dev->node_props.simd_count) - return false; - } - pr_info("Ignoring ACPI CRAT on non-APU system\n"); - return true; -} -#endif - int kfd_topology_init(void) { void *crat_image = NULL; size_t image_size = 0; int ret; - struct list_head temp_topology_device_list; - int cpu_only_node = 0; - struct kfd_topology_device *kdev; - int proximity_domain; - - /* topology_device_list - Master list of all topology devices - * temp_topology_device_list - temporary list created while parsing CRAT - * or VCRAT. Once parsing is complete the contents of list is moved to - * topology_device_list - */ - /* Initialize the head for the both the lists */ + /* + * Initialize the head for the topology device list + */ INIT_LIST_HEAD(&topology_device_list); - INIT_LIST_HEAD(&temp_topology_device_list); init_rwsem(&topology_lock); + topology_crat_parsed = 0; memset(&sys_props, 0, sizeof(sys_props)); - /* Proximity domains in ACPI CRAT tables start counting at - * 0. The same should be true for virtual CRAT tables created - * at this stage. GPUs added later in kfd_topology_add_device - * use a counter. - */ - proximity_domain = 0; - /* - * Get the CRAT image from the ACPI. If ACPI doesn't have one - * or if ACPI CRAT is invalid create a virtual CRAT. - * NOTE: The current implementation expects all AMD APUs to have - * CRAT. 
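kfd_topology_get_crat_acpi() earlier in this file reports the CRAT length through *size and only copies the table when the caller already supplied a large enough buffer, so kfd_topology_init() just below calls it twice: once with no buffer to learn the length, then again after allocating a buffer of that length. A user-space sketch of the same size-probe-then-fetch idiom, with fake_table standing in for the ACPI-provided CRAT.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in for the table acpi_get_table() would hand back. */
static const uint8_t fake_table[] = { 'C', 'R', 'A', 'T', 0x20, 0, 0, 0 };

/* Report the size; copy only when the caller's buffer is big enough. */
static int get_crat(void *image, size_t *size)
{
	size_t table_len = sizeof(fake_table);

	if (!size)
		return -1;
	if (image && *size >= table_len)
		memcpy(image, fake_table, table_len);
	*size = table_len;
	return 0;
}

int main(void)
{
	size_t size = 0;
	void *image;

	if (get_crat(NULL, &size) != 0 || size == 0)	/* 1st call: probe size */
		return 1;

	image = malloc(size);
	if (!image)
		return 1;

	if (get_crat(image, &size) == 0)		/* 2nd call: fetch data */
		printf("fetched %zu-byte table\n", size);

	free(image);
	return 0;
}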
If no CRAT is available, it is assumed to be a CPU + * Get the CRAT image from the ACPI */ -#ifdef CONFIG_ACPI - ret = kfd_create_crat_image_acpi(&crat_image, &image_size); - if (ret == 0) { - ret = kfd_parse_crat_table(crat_image, - &temp_topology_device_list, - proximity_domain); - if (ret || - kfd_is_acpi_crat_invalid(&temp_topology_device_list)) { - - kfd_release_topology_device_list( - &temp_topology_device_list); - kfd_destroy_crat_image(crat_image); - crat_image = NULL; - } - } -#endif - if (!crat_image) { - ret = kfd_create_crat_image_virtual(&crat_image, &image_size, - COMPUTE_UNIT_CPU, NULL, - proximity_domain); - cpu_only_node = 1; - if (ret) { - pr_err("Error creating VCRAT table for CPU\n"); - return ret; - } - - ret = kfd_parse_crat_table(crat_image, - &temp_topology_device_list, - proximity_domain); - if (ret) { - pr_err("Error parsing VCRAT table for CPU\n"); + ret = kfd_topology_get_crat_acpi(crat_image, &image_size); + if (ret == 0 && image_size > 0) { + pr_info("Found CRAT image with size=%zd\n", image_size); + crat_image = kmalloc(image_size, GFP_KERNEL); + if (!crat_image) { + ret = -ENOMEM; + pr_err("No memory for allocating CRAT image\n"); goto err; } - } - - kdev = list_first_entry(&temp_topology_device_list, - struct kfd_topology_device, list); - kfd_add_perf_to_topology(kdev); - - down_write(&topology_lock); - kfd_topology_update_device_list(&temp_topology_device_list, - &topology_device_list); - atomic_set(&topology_crat_proximity_domain, sys_props.num_devices-1); - ret = kfd_topology_update_sysfs(); - up_write(&topology_lock); - - if (ret == 0) { - sys_props.generation_count++; - kfd_update_system_properties(); - kfd_debug_print_topology(); - pr_info("Finished initializing topology\n"); - } else - pr_err("Failed to update topology in sysfs ret=%d\n", ret); - - /* For nodes with GPU, this information gets added - * when GPU is detected (kfd_topology_add_device). - */ - if (cpu_only_node) { - /* Add additional information to CPU only node created above */ - down_write(&topology_lock); - kdev = list_first_entry(&topology_device_list, - struct kfd_topology_device, list); - up_write(&topology_lock); - kfd_add_non_crat_information(kdev); + ret = kfd_topology_get_crat_acpi(crat_image, &image_size); + + if (ret == 0) { + down_write(&topology_lock); + ret = kfd_parse_crat_table(crat_image); + if (ret == 0) + ret = kfd_topology_update_sysfs(); + up_write(&topology_lock); + } else { + pr_err("Couldn't get CRAT table size from ACPI\n"); + } + kfree(crat_image); + } else if (ret == -ENODATA) { + ret = 0; + } else { + pr_err("Couldn't get CRAT table size from ACPI\n"); } err: - kfd_destroy_crat_image(crat_image); + pr_info("Finished initializing topology ret=%d\n", ret); return ret; } void kfd_topology_shutdown(void) { - down_write(&topology_lock); kfd_topology_release_sysfs(); kfd_release_live_view(); - up_write(&topology_lock); +} + +static void kfd_debug_print_topology(void) +{ + struct kfd_topology_device *dev; + uint32_t i = 0; + + pr_info("DEBUG PRINT OF TOPOLOGY:"); + list_for_each_entry(dev, &topology_device_list, list) { + pr_info("Node: %d\n", i); + pr_info("\tGPU assigned: %s\n", (dev->gpu ? 
"yes" : "no")); + pr_info("\tCPU count: %d\n", dev->node_props.cpu_cores_count); + pr_info("\tSIMD count: %d", dev->node_props.simd_count); + i++; + } } static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) @@ -1104,15 +1092,11 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) uint32_t buf[7]; uint64_t local_mem_size; int i; - struct kfd_local_mem_info local_mem_info; if (!gpu) return 0; - gpu->kfd2kgd->get_local_mem_info(gpu->kgd, &local_mem_info); - - local_mem_size = local_mem_info.local_mem_size_private + - local_mem_info.local_mem_size_public; + local_mem_size = gpu->kfd2kgd->get_vmem_size(gpu->kgd); buf[0] = gpu->pdev->devfn; buf[1] = gpu->pdev->subsystem_vendor; @@ -1127,32 +1111,20 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) return hashout; } -/* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If - * the GPU device is not already present in the topology device - * list then return NULL. This means a new topology device has to - * be created for this GPU. - * TODO: Rather than assiging @gpu to first topology device withtout - * gpu attached, it will better to have more stringent check. - */ + static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu) { struct kfd_topology_device *dev; struct kfd_topology_device *out_dev = NULL; - struct kfd_mem_properties *mem; - down_write(&topology_lock); + BUG_ON(!gpu); + list_for_each_entry(dev, &topology_device_list, list) - if (!dev->gpu && (dev->node_props.simd_count > 0)) { + if (dev->gpu == NULL && dev->node_props.simd_count > 0) { dev->gpu = gpu; out_dev = dev; - - /* Assign mem->gpu */ - list_for_each_entry(mem, &dev->mem_props, list) - mem->gpu = dev->gpu; - break; } - up_write(&topology_lock); return out_dev; } @@ -1165,202 +1137,88 @@ static void kfd_notify_gpu_change(uint32_t gpu_id, int arrival) */ } -/* kfd_fill_mem_clk_max_info - Since CRAT doesn't have memory clock info, - * patch this after CRAT parsing. - */ -static void kfd_fill_mem_clk_max_info(struct kfd_topology_device *dev) -{ - struct kfd_mem_properties *mem; - struct kfd_local_mem_info local_mem_info; - - if (!dev) - return; - - /* Currently, amdgpu driver (amdgpu_mc) deals only with GPUs with - * single bank of VRAM local memory. 
- * for dGPUs - VCRAT reports only one bank of Local Memory - * for APUs - If CRAT from ACPI reports more than one bank, then - * all the banks will report the same mem_clk_max information - */ - dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd, - &local_mem_info); - - list_for_each_entry(mem, &dev->mem_props, list) - mem->mem_clk_max = local_mem_info.mem_clk_max; -} - -static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev) -{ - struct kfd_iolink_properties *link; - - if (!dev || !dev->gpu) - return; - - /* GPU only creates direck links so apply flags setting to all */ - if (dev->gpu->device_info->asic_family == CHIP_HAWAII) - list_for_each_entry(link, &dev->io_link_props, list) - link->flags = CRAT_IOLINK_FLAGS_ENABLED | - CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT | - CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT; -} - int kfd_topology_add_device(struct kfd_dev *gpu) { uint32_t gpu_id; struct kfd_topology_device *dev; - struct kfd_cu_info cu_info; - int res = 0; - struct list_head temp_topology_device_list; - void *crat_image = NULL; - size_t image_size = 0; - int proximity_domain; + int res; - INIT_LIST_HEAD(&temp_topology_device_list); + BUG_ON(!gpu); gpu_id = kfd_generate_gpu_id(gpu); - pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id); - - proximity_domain = atomic_inc_return(& - topology_crat_proximity_domain); + pr_debug("kfd: Adding new GPU (ID: 0x%x) to topology\n", gpu_id); - /* Check to see if this gpu device exists in the topology_device_list. - * If so, assign the gpu to that device, - * else create a Virtual CRAT for this gpu device and then parse that - * CRAT to create a new topology device. Once created assign the gpu to - * that topology device + down_write(&topology_lock); + /* + * Try to assign the GPU to existing topology device (generated from + * CRAT table */ dev = kfd_assign_gpu(gpu); if (!dev) { - res = kfd_create_crat_image_virtual(&crat_image, &image_size, - COMPUTE_UNIT_GPU, - gpu, proximity_domain); - if (res) { - pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n", - gpu_id); - return res; - } - res = kfd_parse_crat_table(crat_image, - &temp_topology_device_list, proximity_domain); - if (res) { - pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n", - gpu_id); + pr_info("GPU was not found in the current topology. Extending.\n"); + kfd_debug_print_topology(); + dev = kfd_create_topology_device(); + if (!dev) { + res = -ENOMEM; goto err; } + dev->gpu = gpu; - down_write(&topology_lock); - kfd_topology_update_device_list(&temp_topology_device_list, - &topology_device_list); + /* + * TODO: Make a call to retrieve topology information from the + * GPU vBIOS + */ - /* Update the SYSFS tree, since we added another topology - * device + /* + * Update the SYSFS tree, since we added another topology device */ - res = kfd_topology_update_sysfs(); - up_write(&topology_lock); - - if (res == 0) - sys_props.generation_count++; - else - pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. 
res=%d\n", - gpu_id, res); - dev = kfd_assign_gpu(gpu); - if (!dev) { - pr_err("Could not assign GPU\n"); - res = -ENODEV; - goto err; - } + if (kfd_topology_update_sysfs() < 0) + kfd_topology_release_sysfs(); + } dev->gpu_id = gpu_id; gpu->id = gpu_id; - - /* TODO: Move the following lines to function - * kfd_add_non_crat_information - */ - - /* Fill-in additional information that is not available in CRAT but - * needed for the topology - */ - - dev->gpu->kfd2kgd->get_cu_info(dev->gpu->kgd, &cu_info); - dev->node_props.simd_arrays_per_engine = - cu_info.num_shader_arrays_per_engine; - dev->node_props.vendor_id = gpu->pdev->vendor; dev->node_props.device_id = gpu->pdev->device; - dev->node_props.location_id = PCI_DEVID(gpu->pdev->bus->number, - gpu->pdev->devfn); - dev->node_props.max_engine_clk_fcompute = - dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz(dev->gpu->kgd); - dev->node_props.max_engine_clk_ccompute = - cpufreq_quick_get_max(0) / 1000; - - kfd_fill_mem_clk_max_info(dev); - kfd_fill_iolink_non_crat_info(dev); - - switch (dev->gpu->device_info->asic_family) { - case CHIP_KAVERI: - case CHIP_HAWAII: - case CHIP_TONGA: - dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_PRE_1_0 << - HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & - HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); - break; - case CHIP_CARRIZO: - case CHIP_FIJI: - case CHIP_POLARIS10: - case CHIP_POLARIS11: - pr_debug("Adding doorbell packet type capability\n"); - dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_1_0 << - HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & - HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); - break; - case CHIP_VEGA10: - case CHIP_RAVEN: - dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 << - HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & - HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); - break; - default: - BUG(); - } - - /* Fix errors in CZ CRAT. - * simd_count: Carrizo CRAT reports wrong simd_count, probably because - * it doesn't consider masked out CUs - * max_waves_per_simd: Carrizo reports wrong max_waves_per_simd. - * capability flag: Carrizo CRAT doesn't report IOMMU flags. 
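The switch removed above packs the ASIC's doorbell type into bits 12-13 of node_props.capability; the HSA_CAP_DOORBELL_TYPE_* values appear in the kfd_topology.h hunk later in this patch. A stand-alone sketch of the encode/decode, with the Carrizo/Fiji case used as the example value.

#include <stdint.h>
#include <stdio.h>

/* Constants as defined in the kfd_topology.h hunk removed below. */
#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT	12
#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK	0x00003000
#define HSA_CAP_DOORBELL_TYPE_PRE_1_0		0x0
#define HSA_CAP_DOORBELL_TYPE_1_0		0x1
#define HSA_CAP_DOORBELL_TYPE_2_0		0x2

static uint32_t set_doorbell_type(uint32_t capability, uint32_t type)
{
	capability &= ~HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK;
	capability |= (type << HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
		      HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK;
	return capability;
}

static uint32_t get_doorbell_type(uint32_t capability)
{
	return (capability & HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK) >>
	       HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT;
}

int main(void)
{
	uint32_t cap = 0;

	cap = set_doorbell_type(cap, HSA_CAP_DOORBELL_TYPE_1_0);
	printf("capability=0x%08x doorbell_type=%u\n", cap, get_doorbell_type(cap));
	return 0;
}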
+ dev->node_props.location_id = (gpu->pdev->bus->number << 24) + + (gpu->pdev->devfn & 0xffffff); + /* + * TODO: Retrieve max engine clock values from KGD */ + if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) { - dev->node_props.simd_count = - cu_info.simd_per_cu * cu_info.cu_active_number; - dev->node_props.max_waves_per_simd = 10; - dev->node_props.capability |= HSA_CAP_ATS_PRESENT; + dev->node_props.capability |= HSA_CAP_DOORBELL_PACKET_TYPE; + pr_info("amdkfd: adding doorbell packet type capability\n"); } - kfd_debug_print_topology(); + res = 0; - if (!res) - kfd_notify_gpu_change(gpu_id, 1); err: - kfd_destroy_crat_image(crat_image); + up_write(&topology_lock); + + if (res == 0) + kfd_notify_gpu_change(gpu_id, 1); + return res; } int kfd_topology_remove_device(struct kfd_dev *gpu) { - struct kfd_topology_device *dev, *tmp; + struct kfd_topology_device *dev; uint32_t gpu_id; int res = -ENODEV; + BUG_ON(!gpu); + down_write(&topology_lock); - list_for_each_entry_safe(dev, tmp, &topology_device_list, list) + list_for_each_entry(dev, &topology_device_list, list) if (dev->gpu == gpu) { gpu_id = dev->gpu_id; kfd_remove_sysfs_node_entry(dev); kfd_release_topology_device(dev); - sys_props.num_devices--; res = 0; if (kfd_topology_update_sysfs() < 0) kfd_topology_release_sysfs(); @@ -1375,26 +1233,22 @@ int kfd_topology_remove_device(struct kfd_dev *gpu) return res; } -/* kfd_topology_enum_kfd_devices - Enumerate through all devices in KFD - * topology. If GPU device is found @idx, then valid kfd_dev pointer is - * returned through @kdev - * Return - 0: On success (@kdev will be NULL for non GPU nodes) - * -1: If end of list +/* + * When idx is out of bounds, the function will return NULL */ -int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev) +struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx) { struct kfd_topology_device *top_dev; + struct kfd_dev *device = NULL; uint8_t device_idx = 0; - *kdev = NULL; down_read(&topology_lock); list_for_each_entry(top_dev, &topology_device_list, list) { if (device_idx == idx) { - *kdev = top_dev->gpu; - up_read(&topology_lock); - return 0; + device = top_dev->gpu; + break; } device_idx++; @@ -1402,89 +1256,6 @@ int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev) up_read(&topology_lock); - return -1; - -} - -static int kfd_cpumask_to_apic_id(const struct cpumask *cpumask) -{ - int first_cpu_of_numa_node; - - if (!cpumask || (cpumask == cpu_none_mask)) - return -1; - first_cpu_of_numa_node = cpumask_first(cpumask); - if (first_cpu_of_numa_node >= nr_cpu_ids) - return -1; -#ifdef CONFIG_X86_64 - return cpu_data(first_cpu_of_numa_node).apicid; -#else - return first_cpu_of_numa_node; -#endif -} - -/* kfd_numa_node_to_apic_id - Returns the APIC ID of the first logical processor - * of the given NUMA node (numa_node_id) - * Return -1 on failure - */ -int kfd_numa_node_to_apic_id(int numa_node_id) -{ - if (numa_node_id == -1) { - pr_warn("Invalid NUMA Node. 
Use online CPU mask\n"); - return kfd_cpumask_to_apic_id(cpu_online_mask); - } - return kfd_cpumask_to_apic_id(cpumask_of_node(numa_node_id)); -} - -#if defined(CONFIG_DEBUG_FS) - -int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data) -{ - struct kfd_topology_device *dev; - unsigned int i = 0; - int r = 0; - - down_read(&topology_lock); - - list_for_each_entry(dev, &topology_device_list, list) { - if (!dev->gpu) { - i++; - continue; - } - - seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id); - r = device_queue_manager_debugfs_hqds(m, dev->gpu->dqm); - if (r != 0) - break; - } - - up_read(&topology_lock); - - return r; -} - -int kfd_debugfs_rls_by_device(struct seq_file *m, void *data) -{ - struct kfd_topology_device *dev; - unsigned int i = 0; - int r = 0; - - down_read(&topology_lock); - - list_for_each_entry(dev, &topology_device_list, list) { - if (!dev->gpu) { - i++; - continue; - } - - seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id); - r = pm_debugfs_runlist(m, &dev->gpu->dqm->packets); - if (r != 0) - break; - } - - up_read(&topology_lock); + return device; - return r; } - -#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h index f22d420..c3ddb9b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h @@ -39,17 +39,8 @@ #define HSA_CAP_WATCH_POINTS_SUPPORTED 0x00000080 #define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00 #define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8 -#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK 0x00003000 -#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT 12 -#define HSA_CAP_RESERVED 0xffffc000 - -#define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0 -#define HSA_CAP_DOORBELL_TYPE_1_0 0x1 -#define HSA_CAP_DOORBELL_TYPE_2_0 0x2 -#define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00 -#define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8 +#define HSA_CAP_RESERVED 0xfffff000 #define HSA_CAP_DOORBELL_PACKET_TYPE 0x00001000 -#define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000 struct kfd_node_properties { uint32_t cpu_cores_count; @@ -97,11 +88,11 @@ struct kfd_mem_properties { uint32_t width; uint32_t mem_clk_max; struct kobject *kobj; - struct kfd_dev *gpu; - struct attribute attr_props; - struct attribute attr_used; + struct attribute attr; }; +#define KFD_TOPOLOGY_CPU_SIBLINGS 256 + #define HSA_CACHE_TYPE_DATA 0x00000001 #define HSA_CACHE_TYPE_INSTRUCTION 0x00000002 #define HSA_CACHE_TYPE_CPU 0x00000004 @@ -118,7 +109,7 @@ struct kfd_cache_properties { uint32_t cache_assoc; uint32_t cache_latency; uint32_t cache_type; - uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE]; + uint8_t sibling_map[KFD_TOPOLOGY_CPU_SIBLINGS]; struct kobject *kobj; struct attribute attr; }; @@ -141,36 +132,24 @@ struct kfd_iolink_properties { struct attribute attr; }; -struct kfd_perf_properties { - struct list_head list; - char block_name[16]; - uint32_t max_concurrent; - struct attribute_group *attr_group; -}; - struct kfd_topology_device { struct list_head list; uint32_t gpu_id; - uint32_t proximity_domain; struct kfd_node_properties node_props; + uint32_t mem_bank_count; struct list_head mem_props; uint32_t cache_count; struct list_head cache_props; uint32_t io_link_count; struct list_head io_link_props; - struct list_head perf_props; struct kfd_dev *gpu; struct kobject *kobj_node; struct kobject *kobj_mem; struct kobject *kobj_cache; struct kobject *kobj_iolink; - struct kobject *kobj_perf; struct attribute attr_gpuid; struct attribute attr_name; struct attribute attr_props; - uint8_t 
oem_id[CRAT_OEMID_LENGTH]; - uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH]; - uint32_t oem_revision; }; struct kfd_system_properties { @@ -185,14 +164,6 @@ struct kfd_system_properties { struct attribute attr_props; }; -struct kfd_topology_device *kfd_create_topology_device( - struct list_head *device_list); -void kfd_release_topology_device_list(struct list_head *device_list); -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) -extern bool amd_iommu_pc_supported(void); -extern u8 amd_iommu_pc_get_max_banks(u16 devid); -extern u8 amd_iommu_pc_get_max_counters(u16 devid); -#endif #endif /* __KFD_TOPOLOGY_H__ */ diff --git a/drivers/gpu/drm/amd/amdkfd/soc15_int.h b/drivers/gpu/drm/amd/amdkfd/soc15_int.h deleted file mode 100644 index e00d03d..0000000 --- a/drivers/gpu/drm/amd/amdkfd/soc15_int.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright 2016 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -#ifndef HSA_SOC15_INT_H_INCLUDED -#define HSA_SOC15_INT_H_INCLUDED -/* - * vega10+ IH clients - */ -enum soc15_ih_client_id { - SOC15_IH_CLIENTID_IH = 0x00, - SOC15_IH_CLIENTID_ACP = 0x01, - SOC15_IH_CLIENTID_ATHUB = 0x02, - SOC15_IH_CLIENTID_BIF = 0x03, - SOC15_IH_CLIENTID_DCE = 0x04, - SOC15_IH_CLIENTID_ISP = 0x05, - SOC15_IH_CLIENTID_PCIE0 = 0x06, - SOC15_IH_CLIENTID_RLC = 0x07, - SOC15_IH_CLIENTID_SDMA0 = 0x08, - SOC15_IH_CLIENTID_SDMA1 = 0x09, - SOC15_IH_CLIENTID_SE0SH = 0x0a, - SOC15_IH_CLIENTID_SE1SH = 0x0b, - SOC15_IH_CLIENTID_SE2SH = 0x0c, - SOC15_IH_CLIENTID_SE3SH = 0x0d, - SOC15_IH_CLIENTID_SYSHUB = 0x0e, - SOC15_IH_CLIENTID_THM = 0x0f, - SOC15_IH_CLIENTID_UVD = 0x10, - SOC15_IH_CLIENTID_VCE0 = 0x11, - SOC15_IH_CLIENTID_VMC = 0x12, - SOC15_IH_CLIENTID_XDMA = 0x13, - SOC15_IH_CLIENTID_GRBM_CP = 0x14, - SOC15_IH_CLIENTID_ATS = 0x15, - SOC15_IH_CLIENTID_ROM_SMUIO = 0x16, - SOC15_IH_CLIENTID_DF = 0x17, - SOC15_IH_CLIENTID_VCE1 = 0x18, - SOC15_IH_CLIENTID_PWR = 0x19, - SOC15_IH_CLIENTID_UTCL2 = 0x1b, - SOC15_IH_CLIENTID_EA = 0x1c, - SOC15_IH_CLIENTID_UTCL2LOG = 0x1d, - SOC15_IH_CLIENTID_MP0 = 0x1e, - SOC15_IH_CLIENTID_MP1 = 0x1f, - - SOC15_IH_CLIENTID_MAX -}; - - -#define SOC15_INTSRC_CP_END_OF_PIPE 181 -#define SOC15_INTSRC_CP_BAD_OPCODE 183 -#define SOC15_INTSRC_SQ_INTERRUPT_MSG 239 -#define SOC15_INTSRC_VMC_FAULT 0 -#define SOC15_INTSRC_SDMA_TRAP 224 - - -#define SOC15_CLIENT_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) & 0xff) -#define SOC15_SOURCE_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 8 & 0xff) -#define SOC15_RING_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 16 & 0xff) -#define SOC15_VMID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 24 & 0xf) -#define SOC15_VMID_TYPE_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 31 & 0x1) -#define SOC15_PASID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[3]) & 0xffff) -#define SOC15_CONTEXT_ID0_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[4])) -#define SOC15_CONTEXT_ID1_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[5])) -#define SOC15_CONTEXT_ID2_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[6])) -#define SOC15_CONTEXT_ID3_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[7])) - -#endif - diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h old mode 100755 new mode 100644 index b6cf2d5..36f3766 --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h @@ -30,7 +30,6 @@ #include #include -#include struct pci_dev; @@ -41,46 +40,6 @@ struct kfd_dev; struct kgd_dev; struct kgd_mem; -struct kfd_process_device; -struct amdgpu_bo; - -enum kfd_preempt_type { - KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN = 0, - KFD_PREEMPT_TYPE_WAVEFRONT_RESET, -}; - -struct kfd_vm_fault_info { - uint64_t page_addr; - uint32_t vmid; - uint32_t mc_id; - uint32_t status; - bool prot_valid; - bool prot_read; - bool prot_write; - bool prot_exec; -}; - -struct kfd_cu_info { - uint32_t num_shader_engines; - uint32_t num_shader_arrays_per_engine; - uint32_t num_cu_per_sh; - uint32_t cu_active_number; - uint32_t cu_ao_mask; - uint32_t simd_per_cu; - uint32_t max_waves_per_simd; - uint32_t wave_front_size; - uint32_t max_scratch_slots_per_cu; - uint32_t lds_size; - uint32_t cu_bitmap[4][4]; -}; - -/* For getting GPU local memory information from KGD */ -struct kfd_local_mem_info { - uint64_t local_mem_size_private; - uint64_t local_mem_size_public; - uint32_t vram_width; - uint32_t mem_clk_max; -}; enum kgd_memory_pool { KGD_POOL_SYSTEM_CACHEABLE = 1, @@ -113,21 +72,6 @@ struct 
kgd2kfd_shared_resources { /* Bit n == 1 means Queue n is available for KFD */ DECLARE_BITMAP(queue_bitmap, KGD_MAX_QUEUES); - /* Doorbell assignments (SOC15 and later chips only). Only - * specific doorbells are routed to each SDMA engine. Others - * are routed to IH and VCN. They are not usable by the CP. - * - * Any doorbell number D that satisfies the following condition - * is reserved: (D & reserved_doorbell_mask) == reserved_doorbell_val - * - * KFD currently uses 1024 (= 0x3ff) doorbells per process. If - * doorbells 0x0f0-0x0f7 and 0x2f-0x2f7 are reserved, that means - * mask would be set to 0x1f8 and val set to 0x0f0. - */ - unsigned int sdma_doorbell[2][2]; - unsigned int reserved_doorbell_mask; - unsigned int reserved_doorbell_val; - /* Base address of doorbell aperture. */ phys_addr_t doorbell_physical_address; @@ -136,41 +80,8 @@ struct kgd2kfd_shared_resources { /* Number of bytes at start of aperture reserved for KGD. */ size_t doorbell_start_offset; - - /* GPUVM address space size in bytes */ - uint64_t gpuvm_size; }; -struct tile_config { - uint32_t *tile_config_ptr; - uint32_t *macro_tile_config_ptr; - uint32_t num_tile_configs; - uint32_t num_macro_tile_configs; - - uint32_t gb_addr_config; - uint32_t num_banks; - uint32_t num_ranks; -}; - -/* - * Allocation flag domains currently only VRAM and GTT domain supported - */ -#define ALLOC_MEM_FLAGS_VRAM (1 << 0) -#define ALLOC_MEM_FLAGS_GTT (1 << 1) -#define ALLOC_MEM_FLAGS_USERPTR (1 << 2) -#define ALLOC_MEM_FLAGS_DOORBELL (1 << 3) - -/* - * Allocation flags attributes/access options. - */ -#define ALLOC_MEM_FLAGS_NONPAGED (1 << 31) -#define ALLOC_MEM_FLAGS_READONLY (1 << 30) -#define ALLOC_MEM_FLAGS_PUBLIC (1 << 29) -#define ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28) -#define ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27) -#define ALLOC_MEM_FLAGS_EXECUTE_ACCESS (1 << 26) -#define ALLOC_MEM_FLAGS_COHERENT (1 << 25) - /** * struct kfd2kgd_calls * @@ -179,7 +90,7 @@ struct tile_config { * * @free_gtt_mem: Frees a buffer that was allocated on the gart aperture * - * @get_local_mem_info: Retrieves information about GPU local memory + * @get_vmem_size: Retrieves (physical) size of VRAM * * @get_gpu_clock_counter: Retrieves GPU clock counter * @@ -201,12 +112,6 @@ struct tile_config { * @hqd_sdma_load: Loads the SDMA mqd structure to a H/W SDMA hqd slot. * used only for no HWS mode. * - * @hqd_dump: Dumps CPC HQD registers to an array of address-value pairs. - * Array is allocated with kmalloc, needs to be freed with kfree by caller. - * - * @hqd_sdma_dump: Dumps SDMA HQD registers to an array of address-value pairs. - * Array is allocated with kmalloc, needs to be freed with kfree by caller. - * * @hqd_is_occupies: Checks if a hqd slot is occupied. * * @hqd_destroy: Destructs and preempts the queue assigned to that hqd slot. @@ -216,34 +121,8 @@ struct tile_config { * @hqd_sdma_destroy: Destructs and preempts the SDMA queue assigned to that * SDMA hqd slot. 
* - * @map_memory_to_gpu: Allocates and pins BO, PD and all related PTs - * - * @unmap_memory_to_gpu: Releases and unpins BO, PD and all related PTs - * * @get_fw_version: Returns FW versions from the header * - * @set_num_of_requests: Sets number of Peripheral Page Request (PPR) sent to - * IOMMU when address translation failed - * - * @get_cu_info: Retrieves activated cu info - * - * @get_dmabuf_info: Returns information about a dmabuf if it was - * created by the GPU driver - * - * @import_dmabuf: Imports a DMA buffer, creating a new kgd_mem object - * Supports only DMA buffers created by GPU driver on the same GPU - * - * @export_dmabuf: Emports a KFD BO for sharing with other process - * - * @submit_ib: Submits an IB to the engine specified by inserting the IB to - * the corresonded ring (ring type). - * - * @restore_process_bos: Restore all BOs that belongs to the process - * - * @copy_mem_to_mem: Copies size bytes from source BO to destination BO - * - * @get_vram_usage: Returns current VRAM usage - * * This structure contains function pointers to services that the kgd driver * provides to amdkfd driver. * @@ -255,23 +134,11 @@ struct kfd2kgd_calls { void (*free_gtt_mem)(struct kgd_dev *kgd, void *mem_obj); - void(*get_local_mem_info)(struct kgd_dev *kgd, - struct kfd_local_mem_info *mem_info); + uint64_t (*get_vmem_size)(struct kgd_dev *kgd); uint64_t (*get_gpu_clock_counter)(struct kgd_dev *kgd); uint32_t (*get_max_engine_clock_in_mhz)(struct kgd_dev *kgd); - int (*create_process_vm)(struct kgd_dev *kgd, void **vm, - void **process_info, struct dma_fence **ef); - void (*destroy_process_vm)(struct kgd_dev *kgd, void *vm); - - int (*create_process_gpumem)(struct kgd_dev *kgd, uint64_t va, size_t size, void *vm, struct kgd_mem **mem); - void (*destroy_process_gpumem)(struct kgd_dev *kgd, struct kgd_mem *mem); - - uint32_t (*get_process_page_dir)(void *vm); - - int (*open_graphic_handle)(struct kgd_dev *kgd, uint64_t va, void *vm, int fd, uint32_t handle, struct kgd_mem **mem); - /* Register access functions */ void (*program_sh_mem_settings)(struct kgd_dev *kgd, uint32_t vmid, uint32_t sh_mem_config, uint32_t sh_mem_ape1_base, @@ -284,28 +151,16 @@ struct kfd2kgd_calls { uint32_t hpd_size, uint64_t hpd_gpu_addr); int (*init_interrupts)(struct kgd_dev *kgd, uint32_t pipe_id); - int (*hqd_load)(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - uint32_t queue_id, uint32_t __user *wptr, - uint32_t wptr_shift, uint32_t wptr_mask, - struct mm_struct *mm); - - int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd, - uint32_t __user *wptr, struct mm_struct *mm); - - int (*hqd_dump)(struct kgd_dev *kgd, - uint32_t pipe_id, uint32_t queue_id, - uint32_t (**dump)[2], uint32_t *n_regs); + uint32_t queue_id, uint32_t __user *wptr); - int (*hqd_sdma_dump)(struct kgd_dev *kgd, - uint32_t engine_id, uint32_t queue_id, - uint32_t (**dump)[2], uint32_t *n_regs); + int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd); bool (*hqd_is_occupied)(struct kgd_dev *kgd, uint64_t queue_address, uint32_t pipe_id, uint32_t queue_id); - int (*hqd_destroy)(struct kgd_dev *kgd, void *mqd, uint32_t reset_type, + int (*hqd_destroy)(struct kgd_dev *kgd, uint32_t reset_type, unsigned int timeout, uint32_t pipe_id, uint32_t queue_id); @@ -313,7 +168,7 @@ struct kfd2kgd_calls { int (*hqd_sdma_destroy)(struct kgd_dev *kgd, void *mqd, unsigned int timeout); - + int (*address_watch_disable)(struct kgd_dev *kgd); int (*address_watch_execute)(struct kgd_dev *kgd, unsigned int watch_point_id, @@ -332,72 +187,11 @@ struct 
kfd2kgd_calls { uint16_t (*get_atc_vmid_pasid_mapping_pasid)( struct kgd_dev *kgd, uint8_t vmid); - uint32_t (*read_vmid_from_vmfault_reg)(struct kgd_dev *kgd); void (*write_vmid_invalidate_request)(struct kgd_dev *kgd, uint8_t vmid); - int (*invalidate_tlbs)(struct kgd_dev *kgd, uint16_t pasid); - - int (*sync_memory)(struct kgd_dev *kgd, struct kgd_mem *mem, bool intr); - - int (*alloc_memory_of_gpu)(struct kgd_dev *kgd, uint64_t va, - uint64_t size, void *vm, - struct kgd_mem **mem, uint64_t *offset, - uint32_t flags); - int (*free_memory_of_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem, - void *vm); - int (*map_memory_to_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem, - void *vm); - int (*unmap_memory_to_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem, - void *vm); - uint16_t (*get_fw_version)(struct kgd_dev *kgd, enum kgd_engine_type type); - - void (*set_num_of_requests)(struct kgd_dev *kgd, - uint8_t num_of_requests); - int (*alloc_memory_of_scratch)(struct kgd_dev *kgd, - uint64_t va, uint32_t vmid); - int (*write_config_static_mem)(struct kgd_dev *kgd, bool swizzle_enable, - uint8_t element_size, uint8_t index_stride, uint8_t mtype); - void (*get_cu_info)(struct kgd_dev *kgd, - struct kfd_cu_info *cu_info); - int (*mmap_bo)(struct kgd_dev *kgd, struct vm_area_struct *vma); - int (*map_gtt_bo_to_kernel)(struct kgd_dev *kgd, - struct kgd_mem *mem, void **kptr); - void (*set_vm_context_page_table_base)(struct kgd_dev *kgd, uint32_t vmid, - uint32_t page_table_base); - - int (*pin_get_sg_table_bo)(struct kgd_dev *kgd, - struct kgd_mem *mem, uint64_t offset, - uint64_t size, struct sg_table **ret_sg); - void (*unpin_put_sg_table_bo)(struct kgd_mem *mem, - struct sg_table *sg); - - int (*get_dmabuf_info)(struct kgd_dev *kgd, int dma_buf_fd, - struct kgd_dev **dma_buf_kgd, uint64_t *bo_size, - void *metadata_buffer, size_t buffer_size, - uint32_t *metadata_size, uint32_t *flags); - int (*import_dmabuf)(struct kgd_dev *kgd, struct dma_buf *dmabuf, - uint64_t va, void *vm, struct kgd_mem **mem, - uint64_t *size, uint64_t *mmap_offset); - int (*export_dmabuf)(struct kgd_dev *kgd, void *vm, struct kgd_mem *mem, - struct dma_buf **dmabuf); - - int (*get_vm_fault_info)(struct kgd_dev *kgd, - struct kfd_vm_fault_info *info); - int (*submit_ib)(struct kgd_dev *kgd, enum kgd_engine_type engine, - uint32_t vmid, uint64_t gpu_addr, - uint32_t *ib_cmd, uint32_t ib_len); - int (*get_tile_config)(struct kgd_dev *kgd, - struct tile_config *config); - - int (*restore_process_bos)(void *process_info, struct dma_fence **ef); - int (*copy_mem_to_mem)(struct kgd_dev *kgd, struct kgd_mem *src_mem, - uint64_t src_offset, struct kgd_mem *dst_mem, - uint64_t dest_offset, uint64_t size, - struct dma_fence **f, uint64_t *actual_size); - uint64_t (*get_vram_usage)(struct kgd_dev *kgd); }; /** @@ -416,13 +210,6 @@ struct kfd2kgd_calls { * * @resume: Notifies amdkfd about a resume action done to a kgd device * - * @quiesce_mm: Quiesce all user queue access to specified MM address space - * - * @resume_mm: Resume user queue access to specified MM address space - * - * @schedule_evict_and_restore_process: Schedules work queue that will prepare - * for safe eviction of KFD BOs that belong to the specified process. - * * This structure contains function callback pointers so the kgd driver * will notify to the amdkfd about certain status changes. 
* @@ -437,13 +224,9 @@ struct kgd2kfd_calls { void (*interrupt)(struct kfd_dev *kfd, const void *ih_ring_entry); void (*suspend)(struct kfd_dev *kfd); int (*resume)(struct kfd_dev *kfd); - int (*quiesce_mm)(struct kfd_dev *kfd, struct mm_struct *mm); - int (*resume_mm)(struct kfd_dev *kfd, struct mm_struct *mm); - int (*schedule_evict_and_restore_process)(struct mm_struct *mm, - struct dma_fence *fence); }; int kgd2kfd_init(unsigned interface_version, const struct kgd2kfd_calls **g2f); -#endif /* KGD_KFD_INTERFACE_H_INCLUDED */ +#endif /* KGD_KFD_INTERFACE_H_INCLUDED */ diff --git a/drivers/gpu/drm/drm_pci.c b/drivers/gpu/drm/drm_pci.c index 1235c98..7e5a1fe 100644 --- a/drivers/gpu/drm/drm_pci.c +++ b/drivers/gpu/drm/drm_pci.c @@ -149,6 +149,7 @@ int drm_pci_set_busid(struct drm_device *dev, struct drm_master *master) master->unique_len = strlen(master->unique); return 0; } +EXPORT_SYMBOL(drm_pci_set_busid); static int drm_pci_irq_by_busid(struct drm_device *dev, struct drm_irq_busid *p) { diff --git a/drivers/gpu/drm/radeon/radeon_kfd.c b/drivers/gpu/drm/radeon/radeon_kfd.c index c7d2e7a..a2ab6dc 100755 --- a/drivers/gpu/drm/radeon/radeon_kfd.c +++ b/drivers/gpu/drm/radeon/radeon_kfd.c @@ -75,15 +75,12 @@ static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, uint32_t hpd_size, uint64_t hpd_gpu_addr); static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - uint32_t queue_id, uint32_t __user *wptr, - uint32_t wptr_shift, uint32_t wptr_mask, - struct mm_struct *mm); -static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, - uint32_t __user *wptr, struct mm_struct *mm); + uint32_t queue_id, uint32_t __user *wptr); +static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd); static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, uint32_t pipe_id, uint32_t queue_id); -static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t reset_type, +static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, unsigned int timeout, uint32_t pipe_id, uint32_t queue_id); static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); @@ -110,6 +107,7 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); static const struct kfd2kgd_calls kfd2kgd = { .init_gtt_mem_allocation = alloc_gtt_mem, .free_gtt_mem = free_gtt_mem, + .get_vmem_size = get_vmem_size, .get_gpu_clock_counter = get_gpu_clock_counter, .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, .program_sh_mem_settings = kgd_program_sh_mem_settings, @@ -484,9 +482,7 @@ static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) } static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - uint32_t queue_id, uint32_t __user *wptr, - uint32_t wptr_shift, uint32_t wptr_mask, - struct mm_struct *mm) + uint32_t queue_id, uint32_t __user *wptr) { uint32_t wptr_shadow, is_wptr_shadow_valid; struct cik_mqd *m; @@ -562,8 +558,7 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, return 0; } -static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, - uint32_t __user *wptr, struct mm_struct *mm) +static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) { struct cik_sdma_rlc_registers *m; uint32_t sdma_base_addr; @@ -641,7 +636,7 @@ static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) return false; } -static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t reset_type, +static int 
kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, unsigned int timeout, uint32_t pipe_id, uint32_t queue_id) { diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h index 3053049..f08273c 100644 --- a/include/drm/drm_drv.h +++ b/include/drm/drm_drv.h @@ -174,6 +174,8 @@ struct drm_driver { * to finalize the device and then freeing the struct themselves. */ void (*release) (struct drm_device *); + + int (*set_busid)(struct drm_device *dev, struct drm_master *master); /** * @get_vblank_counter: diff --git a/include/drm/drm_pci.h b/include/drm/drm_pci.h index 6745990..4d5daa8 100644 --- a/include/drm/drm_pci.h +++ b/include/drm/drm_pci.h @@ -49,6 +49,7 @@ void drm_legacy_pci_exit(struct drm_driver *driver, struct pci_driver *pdriver); int drm_get_pci_dev(struct pci_dev *pdev, const struct pci_device_id *ent, struct drm_driver *driver); +int drm_pci_set_busid(struct drm_device *dev, struct drm_master *master); #else static inline int drm_get_pci_dev(struct pci_dev *pdev, const struct pci_device_id *ent, @@ -56,6 +57,12 @@ static inline int drm_get_pci_dev(struct pci_dev *pdev, { return -ENOSYS; } + +static inline int drm_pci_set_busid(struct drm_device *dev, + struct drm_master *master) +{ + return -ENOSYS; +} #endif #define DRM_PCIE_SPEED_25 1 -- 2.7.4
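
For reference, after this revert kfd_topology_enum_kfd_devices() hands back the
struct kfd_dev pointer directly (NULL for CPU-only nodes and once idx runs past
the end of the topology list) instead of reporting the device through an output
parameter and returning 0 / -1. A minimal caller-side sketch under that
convention, assuming the usual kfd_priv.h declarations are in scope; the helper
name and the scan bound below are illustrative only and do not come from this
patch:

/* Count GPU nodes visible to KFD via the reverted enumeration interface. */
#define EXAMPLE_MAX_TOPOLOGY_NODES 32	/* illustrative bound only */

static unsigned int example_count_kfd_gpus(void)
{
	unsigned int found = 0;
	uint8_t idx;

	for (idx = 0; idx < EXAMPLE_MAX_TOPOLOGY_NODES; idx++) {
		struct kfd_dev *kdev = kfd_topology_enum_kfd_devices(idx);

		/* NULL can mean a CPU-only node as well as end of list. */
		if (kdev)
			found++;
	}

	return found;
}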
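
Similarly, the drm_drv.h / drm_pci.h / drm_pci.c hunks restore the per-driver
set_busid hook and re-export drm_pci_set_busid() so a PCI driver can plug the
generic helper back in. A sketch of how a PCI DRM driver would wire it up,
assuming CONFIG_PCI; the driver name and feature flags are illustrative, not
taken from this patch:

#include <drm/drm_drv.h>
#include <drm/drm_pci.h>

static struct drm_driver example_pci_drm_driver = {
	.driver_features = DRIVER_GEM,
	/* Use the generic PCI bus-id helper re-exported by this patch. */
	.set_busid	 = drm_pci_set_busid,
	/* ... remaining fops and callbacks elided ... */
};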