From 37d3acaf273e6505bf399d1a4fefa7a32b967671 Mon Sep 17 00:00:00 2001
From: Sanjay R Mehta
Date: Wed, 16 May 2018 15:41:36 +0530
Subject: [PATCH 3339/4131] revert to old stack

Signed-off-by: Sanjay R Mehta
---
 drivers/gpu/drm/amd/amdkfd/Kconfig                 |    3 +-
 drivers/gpu/drm/amd/amdkfd/Makefile                |   21 +-
 drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c   |   74 +-
 drivers/gpu/drm/amd/amdkfd/cik_int.h               |   24 +-
 drivers/gpu/drm/amd/amdkfd/cik_regs.h              |    3 +-
 .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm  | 1384 -------------------
 .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm  | 1419 --------------------
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c           | 1188 +---------------
 drivers/gpu/drm/amd/amdkfd/kfd_crat.c              | 1339 ------------------
 drivers/gpu/drm/amd/amdkfd/kfd_crat.h              |   42 +-
 drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c            |  133 +-
 drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h            |   32 -
 drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c            |    3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c           |   75 --
 drivers/gpu/drm/amd/amdkfd/kfd_device.c            |  734 +---------
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 1003 +++-----------
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h  |   59 +-
 .../drm/amd/amdkfd/kfd_device_queue_manager_cik.c  |   70 +-
 .../drm/amd/amdkfd/kfd_device_queue_manager_v9.c   |   83 --
 .../drm/amd/amdkfd/kfd_device_queue_manager_vi.c   |  112 +-
 drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c          |  109 +-
 drivers/gpu/drm/amd/amdkfd/kfd_events.c            |  688 +++++-----
 drivers/gpu/drm/amd/amdkfd/kfd_events.h            |   18 +-
 drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c       |  119 +-
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c    |  135 --
 drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c         |   88 +-
 drivers/gpu/drm/amd/amdkfd/kfd_ipc.c               |  271 ----
 drivers/gpu/drm/amd/amdkfd/kfd_ipc.h               |   51 -
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c      |  105 +-
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h      |   17 +-
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c  |  128 --
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c   |  377 ------
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c   |  361 -----
 drivers/gpu/drm/amd/amdkfd/kfd_module.c            |   66 +-
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c       |   55 -
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h       |   15 +-
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c   |  205 +--
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c    |  524 --------
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c    |  294 +---
 drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c    |  389 ++++--
 drivers/gpu/drm/amd/amdkfd/kfd_pasid.c             |   90 +-
 drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c        |  513 -------
 drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h    |  583 --------
 drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h   |   97 ++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h              |  537 ++------
 drivers/gpu/drm/amd/amdkfd/kfd_process.c           |  914 ++-----------
 .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c |  232 +---
 drivers/gpu/drm/amd/amdkfd/kfd_rdma.c              |  294 ----
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c          | 1142 ++++++----------
 drivers/gpu/drm/amd/amdkfd/kfd_topology.h          |   42 +-
 drivers/gpu/drm/amd/amdkfd/soc15_int.h             |   84 --
 51 files changed, 2132 insertions(+), 14212 deletions(-)
 delete mode 100644 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm
 delete mode 100644 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
 delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_crat.c
 delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
 delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
 delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
 delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_ipc.c
delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_ipc.h delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_rdma.c delete mode 100644 drivers/gpu/drm/amd/amdkfd/soc15_int.h diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig index 95be0dd..e13c67c 100644 --- a/drivers/gpu/drm/amd/amdkfd/Kconfig +++ b/drivers/gpu/drm/amd/amdkfd/Kconfig @@ -4,7 +4,6 @@ config HSA_AMD tristate "HSA kernel driver for AMD GPU devices" - depends on (DRM_RADEON || DRM_AMDGPU) && (X86_64 || PPC64 || ARM64) - select DRM_AMDGPU_USERPTR + depends on (DRM_RADEON || DRM_AMDGPU) && AMD_IOMMU_V2 && X86_64 help Enable this if you want to use HSA features on AMD GPU devices. diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index fde693c..b400d56 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -2,25 +2,18 @@ # Makefile for Heterogenous System Architecture support for AMD GPU devices # -FULL_AMD_PATH=$(src)/.. - -ccflags-y := -I$(FULL_AMD_PATH)/include/ \ - -I$(FULL_AMD_PATH)/include/asic_reg +ccflags-y := -Idrivers/gpu/drm/amd/include/ \ + -Idrivers/gpu/drm/amd/include/asic_reg amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \ kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \ kfd_process.o kfd_queue.o kfd_mqd_manager.o \ kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \ - kfd_mqd_manager_v9.o \ kfd_kernel_queue.o kfd_kernel_queue_cik.o \ - kfd_kernel_queue_vi.o kfd_kernel_queue_v9.o \ - kfd_packet_manager.o kfd_process_queue_manager.o \ - kfd_device_queue_manager.o kfd_device_queue_manager_cik.o \ - kfd_device_queue_manager_vi.o kfd_device_queue_manager_v9.o \ - kfd_interrupt.o kfd_events.o cik_event_interrupt.o kfd_int_process_v9.o \ - kfd_dbgdev.o kfd_dbgmgr.o kfd_flat_memory.o kfd_crat.o kfd_rdma.o \ - kfd_peerdirect.o kfd_ipc.o - -amdkfd-$(CONFIG_DEBUG_FS) += kfd_debugfs.o + kfd_kernel_queue_vi.o kfd_packet_manager.o \ + kfd_process_queue_manager.o kfd_device_queue_manager.o \ + kfd_device_queue_manager_cik.o kfd_device_queue_manager_vi.o \ + kfd_interrupt.o kfd_events.o cik_event_interrupt.o \ + kfd_dbgdev.o kfd_dbgmgr.o obj-$(CONFIG_HSA_AMD) += amdkfd.o diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 751c004..211fc48 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -24,90 +24,40 @@ #include "kfd_events.h" #include "cik_int.h" -static bool is_cpc_vm_fault(struct kfd_dev *dev, - const uint32_t *ih_ring_entry) -{ - const struct cik_ih_ring_entry *ihre = - (const struct cik_ih_ring_entry *)ih_ring_entry; - - if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || - ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) && - ihre->vmid >= dev->vm_info.first_vmid_kfd && - ihre->vmid <= dev->vm_info.last_vmid_kfd) - return true; - return false; -} - static bool cik_event_interrupt_isr(struct kfd_dev *dev, - const uint32_t *ih_ring_entry, - uint32_t *patched_ihre, - bool *patched_flag) + const uint32_t *ih_ring_entry) { + unsigned int pasid; const struct cik_ih_ring_entry *ihre = (const struct cik_ih_ring_entry *)ih_ring_entry; - const struct kfd2kgd_calls *f2g = dev->kfd2kgd; - struct cik_ih_ring_entry 
*tmp_ihre = - (struct cik_ih_ring_entry *) patched_ihre; - /* This workaround is due to HW/FW limitation on Hawaii that - * VMID and PASID are not written into ih_ring_entry - */ - if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || - ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) && - dev->device_info->asic_family == CHIP_HAWAII) { - *patched_flag = true; - *tmp_ihre = *ihre; + pasid = (ihre->ring_id & 0xffff0000) >> 16; - tmp_ihre->vmid = f2g->read_vmid_from_vmfault_reg(dev->kgd); - tmp_ihre->pasid = f2g->get_atc_vmid_pasid_mapping_pasid( - dev->kgd, tmp_ihre->vmid); - return (tmp_ihre->pasid != 0) && - tmp_ihre->vmid >= dev->vm_info.first_vmid_kfd && - tmp_ihre->vmid <= dev->vm_info.last_vmid_kfd; - } /* Do not process in ISR, just request it to be forwarded to WQ. */ - return (ihre->pasid != 0) && + return (pasid != 0) && (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || - ihre->source_id == CIK_INTSRC_SDMA_TRAP || ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG || - ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE || - is_cpc_vm_fault(dev, ih_ring_entry)); + ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE); } static void cik_event_interrupt_wq(struct kfd_dev *dev, const uint32_t *ih_ring_entry) { + unsigned int pasid; const struct cik_ih_ring_entry *ihre = (const struct cik_ih_ring_entry *)ih_ring_entry; - uint32_t context_id = ihre->data & 0xfffffff; - if (ihre->pasid == 0) + pasid = (ihre->ring_id & 0xffff0000) >> 16; + + if (pasid == 0) return; if (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE) - kfd_signal_event_interrupt(ihre->pasid, context_id, 28); - else if (ihre->source_id == CIK_INTSRC_SDMA_TRAP) - kfd_signal_event_interrupt(ihre->pasid, context_id, 28); + kfd_signal_event_interrupt(pasid, 0, 0); else if (ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG) - kfd_signal_event_interrupt(ihre->pasid, context_id & 0xff, 8); + kfd_signal_event_interrupt(pasid, ihre->data & 0xFF, 8); else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE) - kfd_signal_hw_exception_event(ihre->pasid); - else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || - ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { - struct kfd_vm_fault_info info; - - kfd_process_vm_fault(dev->dqm, ihre->pasid); - - memset(&info, 0, sizeof(info)); - dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info); - if (!info.page_addr && !info.status) - return; - - if (info.vmid == ihre->vmid) - kfd_signal_vm_fault_event(dev, ihre->pasid, &info); - else - kfd_signal_vm_fault_event(dev, ihre->pasid, NULL); - } + kfd_signal_hw_exception_event(pasid); } const struct kfd_event_interrupt_class event_interrupt_class_cik = { diff --git a/drivers/gpu/drm/amd/amdkfd/cik_int.h b/drivers/gpu/drm/amd/amdkfd/cik_int.h index ff8255d..79a16d2 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_int.h +++ b/drivers/gpu/drm/amd/amdkfd/cik_int.h @@ -26,32 +26,16 @@ #include struct cik_ih_ring_entry { - uint32_t source_id:8; - uint32_t reserved1:8; - uint32_t reserved2:16; - - uint32_t data:28; - uint32_t reserved3:4; - - /* pipeid, meid and unused3 are officially called RINGID, - * but for our purposes, they always decode into pipe and ME. 
- */ - uint32_t pipeid:2; - uint32_t meid:2; - uint32_t reserved4:4; - uint32_t vmid:8; - uint32_t pasid:16; - - uint32_t reserved5; + uint32_t source_id; + uint32_t data; + uint32_t ring_id; + uint32_t reserved; }; #define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6 #define CIK_INTSRC_CP_END_OF_PIPE 0xB5 #define CIK_INTSRC_CP_BAD_OPCODE 0xB7 #define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF -#define CIK_INTSRC_GFX_PAGE_INV_FAULT 0x92 -#define CIK_INTSRC_GFX_MEM_PROT_FAULT 0x93 -#define CIK_INTSRC_SDMA_TRAP 0xE0 #endif diff --git a/drivers/gpu/drm/amd/amdkfd/cik_regs.h b/drivers/gpu/drm/amd/amdkfd/cik_regs.h index 37ce6dd..48769d1 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_regs.h +++ b/drivers/gpu/drm/amd/amdkfd/cik_regs.h @@ -33,8 +33,7 @@ #define APE1_MTYPE(x) ((x) << 7) /* valid for both DEFAULT_MTYPE and APE1_MTYPE */ -#define MTYPE_CACHED_NV 0 -#define MTYPE_CACHED 1 +#define MTYPE_CACHED 0 #define MTYPE_NONCACHED 3 #define DEFAULT_CP_HQD_PERSISTENT_STATE (0x33U << 8) diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm deleted file mode 100644 index 751cc2e..0000000 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm +++ /dev/null @@ -1,1384 +0,0 @@ -/* - * Copyright 2015-2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#if 0 -HW (VI) source code for CWSR trap handler -#Version 18 + multiple trap handler - -// this performance-optimal version was originally from Seven Xu at SRDC - -// Revison #18 --... -/* Rev History -** #1. Branch from gc dv. //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) -** #4. SR Memory Layout: -** 1. VGPR-SGPR-HWREG-{LDS} -** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern.. -** #5. Update: 1. Accurate g8sr_ts_save_d timestamp -** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation) -** #7. Update: 1. don't barrier if noLDS -** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version -** 2. Fix SQ issue by s_sleep 2 -** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last -** 2. optimize s_buffer save by burst 16sgprs... -** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs. -** #11. Update 1. 
Add 2 more timestamp for debug version -** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance -** #13. Integ 1. Always use MUBUF for PV trap shader... -** #14. Update 1. s_buffer_store soft clause... -** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot. -** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree -** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part] -** 2. PERF - Save LDS before save VGPR to cover LDS save long latency... -** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32 -** 2. FUNC - Handle non-CWSR traps -*/ - -var G8SR_WDMEM_HWREG_OFFSET = 0 -var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes - -// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore. - -var G8SR_DEBUG_TIMESTAMP = 0 -var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset -var s_g8sr_ts_save_s = s[34:35] // save start -var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi -var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ -var s_g8sr_ts_save_d = s[40:41] // save end -var s_g8sr_ts_restore_s = s[42:43] // restore start -var s_g8sr_ts_restore_d = s[44:45] // restore end - -var G8SR_VGPR_SR_IN_DWX4 = 0 -var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes -var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 - - -/*************************************************************************/ -/* control on how to run the shader */ -/*************************************************************************/ -//any hack that needs to be made to run this code in EMU (either becasue various EMU code are not ready or no compute save & restore in EMU run) -var EMU_RUN_HACK = 0 -var EMU_RUN_HACK_RESTORE_NORMAL = 0 -var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 -var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 -var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK -var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK -var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK -var SAVE_LDS = 1 -var WG_BASE_ADDR_LO = 0x9000a000 -var WG_BASE_ADDR_HI = 0x0 -var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem -var CTX_SAVE_CONTROL = 0x0 -var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL -var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either becasue various RTL code are not ready or no compute save & restore in RTL run) -var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write -var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //becasue TC EMU curently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes -var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing - -/**************************************************************************/ -/* variables */ -/**************************************************************************/ -var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 -var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 -var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 - -var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 -var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 -var 
SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 -var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 -var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 -var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits - -var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 -var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask -var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 -var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 -var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 -var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF -var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 -var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 -var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 -var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 -var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 - -var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME -var SQ_WAVE_IB_STS_RCNT_SIZE = 4 //FIXME -var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME -var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1 //FIXME -var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME - -var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 -var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 - - -/* Save */ -var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes -var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE - -var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit -var S_SAVE_SPI_INIT_ATC_SHIFT = 27 -var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype -var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 -var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG -var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 - -var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used -var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME -var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME -var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME - -var s_save_spi_init_lo = exec_lo -var s_save_spi_init_hi = exec_hi - - //tba_lo and tba_hi need to be saved/restored -var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3¡¯h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} -var s_save_pc_hi = ttmp1 -var s_save_exec_lo = ttmp2 -var s_save_exec_hi = ttmp3 -var s_save_status = ttmp4 -var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine -var s_save_xnack_mask_lo = ttmp6 -var s_save_xnack_mask_hi = ttmp7 -var s_save_buf_rsrc0 = ttmp8 -var s_save_buf_rsrc1 = ttmp9 -var s_save_buf_rsrc2 = ttmp10 -var s_save_buf_rsrc3 = ttmp11 - -var s_save_mem_offset = tma_lo -var s_save_alloc_size = s_save_trapsts //conflict -var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time) -var s_save_m0 = tma_hi - -/* Restore */ -var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE -var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC - -var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit -var S_RESTORE_SPI_INIT_ATC_SHIFT = 27 -var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype -var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28 -var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG -var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 - -var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT -var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK -var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT -var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK - -var 
s_restore_spi_init_lo = exec_lo -var s_restore_spi_init_hi = exec_hi - -var s_restore_mem_offset = ttmp2 -var s_restore_alloc_size = ttmp3 -var s_restore_tmp = ttmp6 //tba_lo/hi need to be restored -var s_restore_mem_offset_save = s_restore_tmp //no conflict - -var s_restore_m0 = s_restore_alloc_size //no conflict - -var s_restore_mode = ttmp7 - -var s_restore_pc_lo = ttmp0 -var s_restore_pc_hi = ttmp1 -var s_restore_exec_lo = tma_lo //no conflict -var s_restore_exec_hi = tma_hi //no conflict -var s_restore_status = ttmp4 -var s_restore_trapsts = ttmp5 -var s_restore_xnack_mask_lo = xnack_mask_lo -var s_restore_xnack_mask_hi = xnack_mask_hi -var s_restore_buf_rsrc0 = ttmp8 -var s_restore_buf_rsrc1 = ttmp9 -var s_restore_buf_rsrc2 = ttmp10 -var s_restore_buf_rsrc3 = ttmp11 - -/**************************************************************************/ -/* trap handler entry points */ -/**************************************************************************/ -/* Shader Main*/ - -shader main - asic(VI) - type(CS) - - - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore - //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC - s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC - s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. - s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE - //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE - s_branch L_SKIP_RESTORE //NOT restore, SAVE actually - else - s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save - end - -L_JUMP_TO_RESTORE: - s_branch L_RESTORE //restore - -L_SKIP_RESTORE: - - s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC - s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) - s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save - s_cbranch_scc1 L_SAVE //this is the operation for save - - // ********* Handle non-CWSR traps ******************* -if (!EMU_RUN_HACK) - /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */ - s_load_dwordx4 [ttmp8,ttmp9,ttmp10, ttmp11], [tma_lo,tma_hi], 0 - s_waitcnt lgkmcnt(0) - s_or_b32 ttmp7, ttmp8, ttmp9 - s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set - s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) - s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler - -L_NO_NEXT_TRAP: - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) - s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception - s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. 
- s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 - s_addc_u32 ttmp1, ttmp1, 0 -L_EXCP_CASE: - s_and_b32 ttmp1, ttmp1, 0xFFFF - s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) - s_rfe_b64 [ttmp0, ttmp1] -end - // ********* End handling of non-CWSR traps ******************* - -/**************************************************************************/ -/* save routine */ -/**************************************************************************/ - -L_SAVE: - -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_save_s - s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? -end - - //check whether there is mem_viol - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) - s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK - s_cbranch_scc0 L_NO_PC_REWIND - - //if so, need rewind PC assuming GDS operation gets NACKed - s_mov_b32 s_save_tmp, 0 //clear mem_viol bit - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit - s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] - s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8 - s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc - -L_NO_PC_REWIND: - s_mov_b32 s_save_tmp, 0 //clear saveCtx bit - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit - - s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK - s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //save XNACK must before any memory operation - s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT - s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT - s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp - s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY - s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT - s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp - s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS - s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG - - s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp - - /* inform SPI the readiness and wait for SPI's go signal */ - s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI - s_mov_b32 s_save_exec_hi, exec_hi - s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive - -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_sq_save_msg - s_waitcnt lgkmcnt(0) -end - - if (EMU_RUN_HACK) - - else - s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC - end - - L_SLEEP: - s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 - - if (EMU_RUN_HACK) - - else - s_cbranch_execz L_SLEEP - end - -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_spi_wrexec - s_waitcnt lgkmcnt(0) -end - - /* setup Resource Contants */ - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) - //calculate wd_addr using absolute thread id - v_readlane_b32 s_save_tmp, v9, 0 - s_lshr_b32 s_save_tmp, s_save_tmp, 6 - s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE - s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO - s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI - s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL - else - end - if ((EMU_RUN_HACK) && 
(EMU_RUN_HACK_SAVE_SINGLE_WAVE)) - s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO - s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI - s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL - else - end - - - s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo - s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE - s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited - s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC - s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK - s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position - s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC - s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK - s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position - s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE - - //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?) - s_mov_b32 s_save_m0, m0 //save M0 - - /* global mem offset */ - s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 - - - - - /* save HW registers */ - ////////////////////////////// - - L_SAVE_HWREG: - // HWREG SR memory offset : size(VGPR)+size(SGPR) - get_vgpr_size_bytes(s_save_mem_offset) - get_sgpr_size_bytes(s_save_tmp) - s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp - - - s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
- else - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - - write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0 - - if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) - s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 - s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over - s_mov_b32 tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO - s_mov_b32 tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI - end - - write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC - write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) - write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC - write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) - write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS - - //s_save_trapsts conflicts with s_save_alloc_size - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) - write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS - - write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO - write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI - - //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 - s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE - write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) - write_hwreg_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset) //TBA_LO - write_hwreg_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset) //TBA_HI - - - - /* the first wave in the threadgroup */ - // save fist_wave bits in tba_hi unused bit.26 - s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract fisrt wave bit - //s_or_b32 tba_hi, s_save_tmp, tba_hi // save first wave bit to tba_hi.bits[26] - s_mov_b32 s_save_exec_hi, 0x0 - s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26] - - - /* save SGPRs */ - // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... - ////////////////////////////// - - // SGPR SR memory offset : size(VGPR) - get_vgpr_size_bytes(s_save_mem_offset) - // TODO, change RSRC word to rearrange memory layout for SGPRS - - s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size - s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 - s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) - - if (SGPR_SAVE_USE_SQC) - s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes - else - s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) - end - - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
- else - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - - // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 - //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0 - s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0 - s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset - s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0 - - s_mov_b32 m0, 0x0 //SGPR initial index value =0 - L_SAVE_SGPR_LOOP: - // SGPR is allocated in 16 SGPR granularity - s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] - s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] - s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] - s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] - s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] - s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] - s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] - s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] - - write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4 - s_add_u32 m0, m0, 16 //next sgpr index - s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete? - // restore s_save_buf_rsrc0,1 - //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo - s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo - - - - - /* save first 4 VGPR, then LDS save could use */ - // each wave will alloc 4 vgprs at least... - ///////////////////////////////////////////////////////////////////////////////////// - - s_mov_b32 s_save_mem_offset, 0 - s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on - s_mov_b32 exec_hi, 0xFFFFFFFF - - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - - // VGPR Allocated in 4-GPR granularity - -if G8SR_VGPR_SR_IN_DWX4 - // the const stride for DWx4 is 4*4 bytes - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes - - buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes -else - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 -end - - - - /* save LDS */ - ////////////////////////////// - - L_SAVE_LDS: - - // Change EXEC to all threads... - s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on - s_mov_b32 exec_hi, 0xFFFFFFFF - - s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size - s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? - s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE - - s_barrier //LDS is used? 
wait for other waves in the same TG - //s_and_b32 s_save_tmp, tba_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here - s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here - s_cbranch_scc0 L_SAVE_LDS_DONE - - // first wave do LDS save; - - s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw - s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes - s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes - - // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) - // - get_vgpr_size_bytes(s_save_mem_offset) - get_sgpr_size_bytes(s_save_tmp) - s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp - s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() - - - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - s_mov_b32 m0, 0x0 //lds_offset initial value = 0 - - -var LDS_DMA_ENABLE = 0 -var UNROLL = 0 -if UNROLL==0 && LDS_DMA_ENABLE==1 - s_mov_b32 s3, 256*2 - s_nop 0 - s_nop 0 - s_nop 0 - L_SAVE_LDS_LOOP: - //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.??? - if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW - end - - s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes - s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes - s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete? - -elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL , has ichace miss - // store from higest LDS address to lowest - s_mov_b32 s3, 256*2 - s_sub_u32 m0, s_save_alloc_size, s3 - s_add_u32 s_save_mem_offset, s_save_mem_offset, m0 - s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128 trunks... - s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from higheset addr to lowest - s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction - s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2is the below 2 inst...//s_addc and s_setpc - s_nop 0 - s_nop 0 - s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes - s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved - s_add_u32 s0, s0,s_save_alloc_size - s_addc_u32 s1, s1, 0 - s_setpc_b64 s[0:1] - - - for var i =0; i< 128; i++ - // be careful to make here a 64Byte aligned address, which could improve performance... - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW - - if i!=127 - s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e. 
pack more LDS_DMA inst to one Cacheline - s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3 - end - end - -else // BUFFER_STORE - v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 - v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid - v_mul_i32_i24 v2, v3, 8 // tid*8 - v_mov_b32 v3, 256*2 - s_mov_b32 m0, 0x10000 - s_mov_b32 s0, s_save_buf_rsrc3 - s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid - s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT - -L_SAVE_LDS_LOOP_VECTOR: - ds_read_b64 v[0:1], v2 //x =LDS[a], byte address - s_waitcnt lgkmcnt(0) - buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1 -// s_waitcnt vmcnt(0) - v_add_u32 v2, vcc[0:1], v2, v3 - v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size - s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR - - // restore rsrc3 - s_mov_b32 s_save_buf_rsrc3, s0 - -end - -L_SAVE_LDS_DONE: - - - /* save VGPRs - set the Rest VGPRs */ - ////////////////////////////////////////////////////////////////////////////////////// - L_SAVE_VGPR: - // VGPR SR memory offset: 0 - // TODO rearrange the RSRC words to use swizzle for VGPR save... - - s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs - s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on - s_mov_b32 exec_hi, 0xFFFFFFFF - - s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size - s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 - s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible - s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - - // VGPR Allocated in 4-GPR granularity - -if G8SR_VGPR_SR_IN_DWX4 - // the const stride for DWx4 is 4*4 bytes - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes - - s_mov_b32 m0, 4 // skip first 4 VGPRs - s_cmp_lt_u32 m0, s_save_alloc_size - s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs - - s_set_gpr_idx_on m0, 0x1 // This will change M0 - s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 -L_SAVE_VGPR_LOOP: - v_mov_b32 v0, v0 // v0 = v[0+m0] - v_mov_b32 v1, v1 - v_mov_b32 v2, v2 - v_mov_b32 v3, v3 - - - buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - s_add_u32 m0, m0, 4 - s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 - s_cmp_lt_u32 m0, s_save_alloc_size - s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? 
- s_set_gpr_idx_off -L_SAVE_VGPR_LOOP_END: - - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes -else - // VGPR store using dw burst - s_mov_b32 m0, 0x4 //VGPR initial index value =0 - s_cmp_lt_u32 m0, s_save_alloc_size - s_cbranch_scc0 L_SAVE_VGPR_END - - - s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 - s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later - - L_SAVE_VGPR_LOOP: - v_mov_b32 v0, v0 //v0 = v[0+m0] - v_mov_b32 v1, v1 //v0 = v[0+m0] - v_mov_b32 v2, v2 //v0 = v[0+m0] - v_mov_b32 v3, v3 //v0 = v[0+m0] - - if(USE_MTBUF_INSTEAD_OF_MUBUF) - tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 - else - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 - end - - s_add_u32 m0, m0, 4 //next vgpr index - s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes - s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? - s_set_gpr_idx_off -end - -L_SAVE_VGPR_END: - - - - - - - /* S_PGM_END_SAVED */ //FIXME graphics ONLY - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) - s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] - s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 - s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over - s_rfe_b64 s_save_pc_lo //Return to the main shader program - else - end - -// Save Done timestamp -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_save_d - // SGPR SR memory offset : size(VGPR) - get_vgpr_size_bytes(s_save_mem_offset) - s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET - s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? - // Need reset rsrc2?? - s_mov_b32 m0, s_save_mem_offset - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1 -end - - - s_branch L_END_PGM - - - -/**************************************************************************/ -/* restore routine */ -/**************************************************************************/ - -L_RESTORE: - /* Setup Resource Contants */ - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) - //calculate wd_addr using absolute thread id - v_readlane_b32 s_restore_tmp, v9, 0 - s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 - s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE - s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO - s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI - s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL - else - end - -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_restore_s - s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? - // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case... - s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0] - s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored.. 
-end - - - - s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo - s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi - s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE - s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) - s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC - s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK - s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position - s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC - s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK - s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position - s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE - - /* global mem offset */ -// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0 - - /* the first wave in the threadgroup */ - s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK - s_cbranch_scc0 L_RESTORE_VGPR - - /* restore LDS */ - ////////////////////////////// - L_RESTORE_LDS: - - s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead - s_mov_b32 exec_hi, 0xFFFFFFFF - - s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size - s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? - s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR - s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw - s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes - s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes - - // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) - // - get_vgpr_size_bytes(s_restore_mem_offset) - get_sgpr_size_bytes(s_restore_tmp) - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? - - - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else - s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - s_mov_b32 m0, 0x0 //lds_offset initial value = 0 - - L_RESTORE_LDS_LOOP: - if (SAVE_LDS) - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW - end - s_add_u32 m0, m0, 256*2 // 128 DW - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW - s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? 
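
[Editor's sketch, not part of the patch: the save pass above and the restore pass below both walk the same per-wave context area — VGPRs at offset 0, then SGPRs, then the 128-byte HWREG block, with LDS last — using the get_vgpr_size_bytes/get_sgpr_size_bytes/get_hwreg_size_bytes helpers defined at the end of the shader. A minimal C rendering of that offset arithmetic follows; the struct and function names are hypothetical.]

/*
 * Illustrative only: how the deleted gfx8 trap handler lays out one
 * wave's context save area. Mirrors get_vgpr_size_bytes(),
 * get_sgpr_size_bytes() and get_hwreg_size_bytes() in the shader.
 */
#include <stdint.h>

#define WAVE_SIZE	64	/* threads per wave on VI */
#define HWREG_SIZE	128	/* fixed HWREG block, in bytes */

struct cwsr_layout {
	uint32_t vgpr_bytes;	/* VGPRs saved first, at offset 0 */
	uint32_t sgpr_bytes;	/* SGPRs follow the VGPRs */
	uint32_t hwreg_offset;	/* HWREG: size(VGPR) + size(SGPR) */
	uint32_t lds_offset;	/* LDS: size(VGPR)+size(SGPR)+size(HWREG) */
};

static void cwsr_layout_init(struct cwsr_layout *l,
			     uint32_t vgpr_size, uint32_t sgpr_size)
{
	/*
	 * HW_REG_GPR_ALLOC encodings, as decoded in the handler:
	 * VGPRs = (vgpr_size + 1) * 4, each 4 bytes wide per thread;
	 * SGPRs = (sgpr_size + 1) * 16, each 4 bytes.
	 */
	l->vgpr_bytes   = (vgpr_size + 1) * 4 * WAVE_SIZE * 4;
	l->sgpr_bytes   = (sgpr_size + 1) * 16 * 4;
	l->hwreg_offset = l->vgpr_bytes + l->sgpr_bytes;
	l->lds_offset   = l->hwreg_offset + HWREG_SIZE;
}

[The ordering matters: the handler saves the SGPRs and the first four VGPRs early precisely so they can be reused as scratch registers during the LDS pass, as the comments in the save routine above note.]
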
- - - /* restore VGPRs */ - ////////////////////////////// - L_RESTORE_VGPR: - // VGPR SR memory offset : 0 - s_mov_b32 s_restore_mem_offset, 0x0 - s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead - s_mov_b32 exec_hi, 0xFFFFFFFF - - s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size - s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 - s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) - s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else - s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - -if G8SR_VGPR_SR_IN_DWX4 - get_vgpr_size_bytes(s_restore_mem_offset) - s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 - - // the const stride for DWx4 is 4*4 bytes - s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes - - s_mov_b32 m0, s_restore_alloc_size - s_set_gpr_idx_on m0, 0x8 // Note.. This will change m0 - -L_RESTORE_VGPR_LOOP: - buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 - s_waitcnt vmcnt(0) - s_sub_u32 m0, m0, 4 - v_mov_b32 v0, v0 // v[0+m0] = v0 - v_mov_b32 v1, v1 - v_mov_b32 v2, v2 - v_mov_b32 v3, v3 - s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 - s_cmp_eq_u32 m0, 0x8000 - s_cbranch_scc0 L_RESTORE_VGPR_LOOP - s_set_gpr_idx_off - - s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes - -else - // VGPR load using dw burst - s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 - s_mov_b32 m0, 4 //VGPR initial index value = 1 - s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 - s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later - - L_RESTORE_VGPR_LOOP: - if(USE_MTBUF_INSTEAD_OF_MUBUF) - tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 - else - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 - end - s_waitcnt vmcnt(0) //ensure data ready - v_mov_b32 v0, v0 //v[0+m0] = v0 - v_mov_b32 v1, v1 - v_mov_b32 v2, v2 - v_mov_b32 v3, v3 - s_add_u32 m0, m0, 4 //next vgpr index - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes - s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? 
- s_set_gpr_idx_off - /* VGPR restore on v0 */ - if(USE_MTBUF_INSTEAD_OF_MUBUF) - tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 - else - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 - end - -end - - /* restore SGPRs */ - ////////////////////////////// - - // SGPR SR memory offset : size(VGPR) - get_vgpr_size_bytes(s_restore_mem_offset) - get_sgpr_size_bytes(s_restore_tmp) - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp - s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group - // TODO, change RSRC word to rearrange memory layout for SGPRS - - s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size - s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 - s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) - - if (SGPR_SAVE_USE_SQC) - s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes - else - s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) - end - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else - s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - /* If 112 SGPRs ar allocated, 4 sgprs are not used TBA(108,109),TMA(110,111), - However, we are safe to restore these 4 SGPRs anyway, since TBA,TMA will later be restored by HWREG - */ - s_mov_b32 m0, s_restore_alloc_size - - L_RESTORE_SGPR_LOOP: - read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made - s_waitcnt lgkmcnt(0) //ensure data ready - - s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] - - s_movreld_b64 s0, s0 //s[0+m0] = s0 - s_movreld_b64 s2, s2 - s_movreld_b64 s4, s4 - s_movreld_b64 s6, s6 - s_movreld_b64 s8, s8 - s_movreld_b64 s10, s10 - s_movreld_b64 s12, s12 - s_movreld_b64 s14, s14 - - s_cmp_eq_u32 m0, 0 //scc = (m0 < s_restore_alloc_size) ? 1 : 0 - s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete? - - /* restore HW registers */ - ////////////////////////////// - L_RESTORE_HWREG: - - -if G8SR_DEBUG_TIMESTAMP - s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo - s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi -end - - // HWREG SR memory offset : size(VGPR)+size(SGPR) - get_vgpr_size_bytes(s_restore_mem_offset) - get_sgpr_size_bytes(s_restore_tmp) - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp - - - s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
- else - s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 - read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC - read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) - read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC - read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) - read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS - read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS - read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO - read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI - read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE - read_hwreg_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_LO - read_hwreg_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_HI - - s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS - - s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS - - //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) - s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) - s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over - end - if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) - s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal - s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over - end - - s_mov_b32 m0, s_restore_m0 - s_mov_b32 exec_lo, s_restore_exec_lo - s_mov_b32 exec_hi, s_restore_exec_hi - - s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 - s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts - s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 - //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore - s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode - //reuse s_restore_m0 as a temp register - s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK - s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT - s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT - s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero - s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 - s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK - s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT - s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT - s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 - s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK - s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT - s_setreg_b32 hwreg(HW_REG_IB_STS), 
s_restore_tmp - - s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 - s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 - s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu - - s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time - -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_restore_d - s_waitcnt lgkmcnt(0) -end - -// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution - s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc - - -/**************************************************************************/ -/* the END */ -/**************************************************************************/ -L_END_PGM: - s_endpgm - -end - - -/**************************************************************************/ -/* the helper functions */ -/**************************************************************************/ - -//Only for save hwreg to mem -function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) - s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on - s_mov_b32 m0, s_mem_offset - s_buffer_store_dword s, s_rsrc, m0 glc:1 - s_add_u32 s_mem_offset, s_mem_offset, 4 - s_mov_b32 m0, exec_lo -end - - -// HWREG are saved before SGPRs, so all HWREG could be use. -function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) - - s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 - s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 - s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 - s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 - s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 - s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc -end - - -function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) - s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 - s_add_u32 s_mem_offset, s_mem_offset, 4 -end - -function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) - s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 - s_sub_u32 s_mem_offset, s_mem_offset, 4*16 -end - - - -function get_lds_size_bytes(s_lds_size_byte) - // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW - s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size - s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW -end - -function get_vgpr_size_bytes(s_vgpr_size_byte) - s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size - s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 - s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible -end - -function get_sgpr_size_bytes(s_sgpr_size_byte) - s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size - s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1 - s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value) -end - -function get_hwreg_size_bytes - return 128 //HWREG size 128 bytes -end - - -#endif - -static const uint32_t cwsr_trap_gfx8_hex[] = { - 0xbf820001, 0xbf820123, - 0xb8f4f802, 0x89748674, - 0xb8f5f803, 0x8675ff75, - 0x00000400, 0xbf850011, - 0xc00a1e37, 
0x00000000, - 0xbf8c007f, 0x87777978, - 0xbf840002, 0xb974f802, - 0xbe801d78, 0xb8f5f803, - 0x8675ff75, 0x000001ff, - 0xbf850002, 0x80708470, - 0x82718071, 0x8671ff71, - 0x0000ffff, 0xb974f802, - 0xbe801f70, 0xb8f5f803, - 0x8675ff75, 0x00000100, - 0xbf840006, 0xbefa0080, - 0xb97a0203, 0x8671ff71, - 0x0000ffff, 0x80f08870, - 0x82f18071, 0xbefa0080, - 0xb97a0283, 0xbef60068, - 0xbef70069, 0xb8fa1c07, - 0x8e7a9c7a, 0x87717a71, - 0xb8fa03c7, 0x8e7a9b7a, - 0x87717a71, 0xb8faf807, - 0x867aff7a, 0x00007fff, - 0xb97af807, 0xbef2007e, - 0xbef3007f, 0xbefe0180, - 0xbf900004, 0xbf8e0002, - 0xbf88fffe, 0xbef8007e, - 0x8679ff7f, 0x0000ffff, - 0x8779ff79, 0x00040000, - 0xbefa0080, 0xbefb00ff, - 0x00807fac, 0x867aff7f, - 0x08000000, 0x8f7a837a, - 0x877b7a7b, 0x867aff7f, - 0x70000000, 0x8f7a817a, - 0x877b7a7b, 0xbeef007c, - 0xbeee0080, 0xb8ee2a05, - 0x806e816e, 0x8e6e8a6e, - 0xb8fa1605, 0x807a817a, - 0x8e7a867a, 0x806e7a6e, - 0xbefa0084, 0xbefa00ff, - 0x01000000, 0xbefe007c, - 0xbefc006e, 0xc0611bfc, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611c3c, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611c7c, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611cbc, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611cfc, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611d3c, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xb8f5f803, - 0xbefe007c, 0xbefc006e, - 0xc0611d7c, 0x0000007c, - 0x806e846e, 0xbefc007e, - 0xbefe007c, 0xbefc006e, - 0xc0611dbc, 0x0000007c, - 0x806e846e, 0xbefc007e, - 0xbefe007c, 0xbefc006e, - 0xc0611dfc, 0x0000007c, - 0x806e846e, 0xbefc007e, - 0xb8eff801, 0xbefe007c, - 0xbefc006e, 0xc0611bfc, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611b3c, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611b7c, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0x867aff7f, - 0x04000000, 0xbef30080, - 0x8773737a, 0xb8ee2a05, - 0x806e816e, 0x8e6e8a6e, - 0xb8f51605, 0x80758175, - 0x8e758475, 0x8e7a8275, - 0xbefa00ff, 0x01000000, - 0xbef60178, 0x80786e78, - 0x82798079, 0xbefc0080, - 0xbe802b00, 0xbe822b02, - 0xbe842b04, 0xbe862b06, - 0xbe882b08, 0xbe8a2b0a, - 0xbe8c2b0c, 0xbe8e2b0e, - 0xc06b003c, 0x00000000, - 0xc06b013c, 0x00000010, - 0xc06b023c, 0x00000020, - 0xc06b033c, 0x00000030, - 0x8078c078, 0x82798079, - 0x807c907c, 0xbf0a757c, - 0xbf85ffeb, 0xbef80176, - 0xbeee0080, 0xbefe00c1, - 0xbeff00c1, 0xbefa00ff, - 0x01000000, 0xe0724000, - 0x6e1e0000, 0xe0724100, - 0x6e1e0100, 0xe0724200, - 0x6e1e0200, 0xe0724300, - 0x6e1e0300, 0xbefe00c1, - 0xbeff00c1, 0xb8f54306, - 0x8675c175, 0xbf84002c, - 0xbf8a0000, 0x867aff73, - 0x04000000, 0xbf840028, - 0x8e758675, 0x8e758275, - 0xbefa0075, 0xb8ee2a05, - 0x806e816e, 0x8e6e8a6e, - 0xb8fa1605, 0x807a817a, - 0x8e7a867a, 0x806e7a6e, - 0x806eff6e, 0x00000080, - 0xbefa00ff, 0x01000000, - 0xbefc0080, 0xd28c0002, - 0x000100c1, 0xd28d0003, - 0x000204c1, 0xd1060002, - 0x00011103, 0x7e0602ff, - 0x00000200, 0xbefc00ff, - 0x00010000, 0xbe80007b, - 0x867bff7b, 0xff7fffff, - 0x877bff7b, 0x00058000, - 0xd8ec0000, 0x00000002, - 0xbf8c007f, 0xe0765000, - 0x6e1e0002, 0x32040702, - 0xd0c9006a, 0x0000eb02, - 0xbf87fff7, 0xbefb0000, - 0xbeee00ff, 0x00000400, - 0xbefe00c1, 0xbeff00c1, - 0xb8f52a05, 0x80758175, - 0x8e758275, 0x8e7a8875, - 0xbefa00ff, 0x01000000, - 0xbefc0084, 0xbf0a757c, - 0xbf840015, 0xbf11017c, - 0x8075ff75, 0x00001000, - 0x7e000300, 0x7e020301, - 0x7e040302, 0x7e060303, - 0xe0724000, 0x6e1e0000, - 0xe0724100, 0x6e1e0100, - 
0xe0724200, 0x6e1e0200, - 0xe0724300, 0x6e1e0300, - 0x807c847c, 0x806eff6e, - 0x00000400, 0xbf0a757c, - 0xbf85ffef, 0xbf9c0000, - 0xbf8200ca, 0xbef8007e, - 0x8679ff7f, 0x0000ffff, - 0x8779ff79, 0x00040000, - 0xbefa0080, 0xbefb00ff, - 0x00807fac, 0x8676ff7f, - 0x08000000, 0x8f768376, - 0x877b767b, 0x8676ff7f, - 0x70000000, 0x8f768176, - 0x877b767b, 0x8676ff7f, - 0x04000000, 0xbf84001e, - 0xbefe00c1, 0xbeff00c1, - 0xb8f34306, 0x8673c173, - 0xbf840019, 0x8e738673, - 0x8e738273, 0xbefa0073, - 0xb8f22a05, 0x80728172, - 0x8e728a72, 0xb8f61605, - 0x80768176, 0x8e768676, - 0x80727672, 0x8072ff72, - 0x00000080, 0xbefa00ff, - 0x01000000, 0xbefc0080, - 0xe0510000, 0x721e0000, - 0xe0510100, 0x721e0000, - 0x807cff7c, 0x00000200, - 0x8072ff72, 0x00000200, - 0xbf0a737c, 0xbf85fff6, - 0xbef20080, 0xbefe00c1, - 0xbeff00c1, 0xb8f32a05, - 0x80738173, 0x8e738273, - 0x8e7a8873, 0xbefa00ff, - 0x01000000, 0xbef60072, - 0x8072ff72, 0x00000400, - 0xbefc0084, 0xbf11087c, - 0x8073ff73, 0x00008000, - 0xe0524000, 0x721e0000, - 0xe0524100, 0x721e0100, - 0xe0524200, 0x721e0200, - 0xe0524300, 0x721e0300, - 0xbf8c0f70, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0x807c847c, - 0x8072ff72, 0x00000400, - 0xbf0a737c, 0xbf85ffee, - 0xbf9c0000, 0xe0524000, - 0x761e0000, 0xe0524100, - 0x761e0100, 0xe0524200, - 0x761e0200, 0xe0524300, - 0x761e0300, 0xb8f22a05, - 0x80728172, 0x8e728a72, - 0xb8f61605, 0x80768176, - 0x8e768676, 0x80727672, - 0x80f2c072, 0xb8f31605, - 0x80738173, 0x8e738473, - 0x8e7a8273, 0xbefa00ff, - 0x01000000, 0xbefc0073, - 0xc031003c, 0x00000072, - 0x80f2c072, 0xbf8c007f, - 0x80fc907c, 0xbe802d00, - 0xbe822d02, 0xbe842d04, - 0xbe862d06, 0xbe882d08, - 0xbe8a2d0a, 0xbe8c2d0c, - 0xbe8e2d0e, 0xbf06807c, - 0xbf84fff1, 0xb8f22a05, - 0x80728172, 0x8e728a72, - 0xb8f61605, 0x80768176, - 0x8e768676, 0x80727672, - 0xbefa0084, 0xbefa00ff, - 0x01000000, 0xc0211cfc, - 0x00000072, 0x80728472, - 0xc0211c3c, 0x00000072, - 0x80728472, 0xc0211c7c, - 0x00000072, 0x80728472, - 0xc0211bbc, 0x00000072, - 0x80728472, 0xc0211bfc, - 0x00000072, 0x80728472, - 0xc0211d3c, 0x00000072, - 0x80728472, 0xc0211d7c, - 0x00000072, 0x80728472, - 0xc0211a3c, 0x00000072, - 0x80728472, 0xc0211a7c, - 0x00000072, 0x80728472, - 0xc0211dfc, 0x00000072, - 0x80728472, 0xc0211b3c, - 0x00000072, 0x80728472, - 0xc0211b7c, 0x00000072, - 0x80728472, 0xbf8c007f, - 0x8671ff71, 0x0000ffff, - 0xbefc0073, 0xbefe006e, - 0xbeff006f, 0x867375ff, - 0x000003ff, 0xb9734803, - 0x867375ff, 0xfffff800, - 0x8f738b73, 0xb973a2c3, - 0xb977f801, 0x8673ff71, - 0xf0000000, 0x8f739c73, - 0x8e739073, 0xbef60080, - 0x87767376, 0x8673ff71, - 0x08000000, 0x8f739b73, - 0x8e738f73, 0x87767376, - 0x8673ff74, 0x00800000, - 0x8f739773, 0xb976f807, - 0x86fe7e7e, 0x86ea6a6a, - 0xb974f802, 0xbf8a0000, - 0x95807370, 0xbf810000, -}; - diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm deleted file mode 100644 index f9e819b..0000000 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +++ /dev/null @@ -1,1419 +0,0 @@ -/* - * Copyright 2016 Advanced Micro Devices, Inc. 
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#if 0
-HW (GFX9) source code for CWSR trap handler
-#Version 18 + multiple trap handler
-
-// this performance-optimal version was originally from Seven Xu at SRDC
-
-// Revision #18 --...
-/* Rev History
-** #1. Branch from gc dv. //gfxip/gfx9/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged), #57-58(merged, skipped - already fixed by PV)
-** #4. SR Memory Layout:
-** 1. VGPR-SGPR-HWREG-{LDS} (size arithmetic sketched in C after this block)
-** 2. tba_hi.bits.26 - reconfigured as the first-wave-in-TG bit, to defer the LDS save for a threadgroup (performance concern)
-** #5. Update: 1. Accurate g8sr_ts_save_d timestamp
-** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer? (no need, already matches the swizzle pattern; more investigation)
-** #7. Update: 1. don't barrier if no LDS
-** #8. Branch: 1. Branch to ver#0, which is very similar to the gc dv version
-** 2. Fix SQ issue by s_sleep 2
-** #9. Update: 1. Fix SCC restore failure; restore wave_status last
-** 2. optimize s_buffer save by bursting 16 SGPRs...
-** #10. Update 1. Optimize SGPR restore by bursting 16 SGPRs.
-** #11. Update 1. Add 2 more timestamps for the debug version
-** #12. Update 1. Add VGPR s/r using DWx4; improves some cases and hurts others
-** #13. Integ 1. Always use MUBUF for the PV trap shader...
-** #14. Update 1. s_buffer_store soft clause...
-** #15. Update 1. PERF - scalar write with glc:0/mtype0 to allow L2 combining; large perf improvement.
-** #16. Update 1. PERF - UNROLL LDS_DMA saves ~2500 cycles in the IP tree
-** #17. Update 1. FUNC - LDS_DMA has issues with ATC; replace with ds_read/buffer_store for the save part [TODO: restore part]
-** 2. PERF - Save LDS before saving VGPRs to cover the long LDS save latency...
-** #18. Update 1. FUNC - Implicitly restore STATUS.VCCZ, which is not writable by s_setreg_b32
-** 2. FUNC - Handle non-CWSR traps
-*/
-
-var G8SR_WDMEM_HWREG_OFFSET = 0
-var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes
-
-// Keep these definitions the same as in the app shader. These two timestamps are part of the app shader; they must come before any save and after any restore.
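For orientation, the VGPR-SGPR-HWREG-{LDS} layout from rev #4 above drives every offset computed in this shader. The following is a minimal C sketch of that size arithmetic, mirroring the get_*_size_bytes helpers defined near the end of this file; the standalone function names are illustrative only, not part of the driver:

    #include <stdint.h>

    /*
     * Per-wave save-area sizes, following the granularities noted in the
     * get_*_size_bytes helpers: the encoded GPR sizes are biased by one.
     */
    static uint32_t vgpr_bytes(uint32_t vgpr_size)  /* (vgpr_size+1) * 4 regs * 64 lanes * 4 B */
    {
        return (vgpr_size + 1) << (2 + 8);
    }

    static uint32_t sgpr_bytes(uint32_t sgpr_size)  /* (sgpr_size+1) * 16 regs * 4 B */
    {
        return (sgpr_size + 1) << 6;
    }

    static uint32_t lds_bytes(uint32_t lds_size)    /* lds_size * 64 DW * 4 B */
    {
        return lds_size << 8;
    }

    /* Offsets in layout order: VGPR, then SGPR, then a fixed 128-byte HWREG block, then LDS. */
    static uint32_t sgpr_offset(uint32_t v)              { return vgpr_bytes(v); }
    static uint32_t hwreg_offset(uint32_t v, uint32_t s) { return vgpr_bytes(v) + sgpr_bytes(s); }
    static uint32_t lds_offset(uint32_t v, uint32_t s)   { return hwreg_offset(v, s) + 128; }

This matches the offset computations at L_SAVE_HWREG and L_SAVE_LDS below: the HWREG block sits at size(VGPR) + size(SGPR), and the LDS region follows the 128-byte HWREG block.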
-
-var G8SR_DEBUG_TIMESTAMP = 0
-var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset
-var s_g8sr_ts_save_s = s[34:35] // save start
-var s_g8sr_ts_sq_save_msg = s[36:37] // the save shader sends the SAVEWAVE msg to SPI
-var s_g8sr_ts_spi_wrexec = s[38:39] // SPI writes the SR address to SQ
-var s_g8sr_ts_save_d = s[40:41] // save end
-var s_g8sr_ts_restore_s = s[42:43] // restore start
-var s_g8sr_ts_restore_d = s[44:45] // restore end
-
-var G8SR_VGPR_SR_IN_DWX4 = 0
-var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4 bytes
-var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4
-
-
-/*************************************************************************/
-/* control on how to run the shader */
-/*************************************************************************/
-//any hack needed to run this code in EMU (either because various EMU code is not ready, or because there is no compute save & restore in an EMU run)
-var EMU_RUN_HACK = 0
-var EMU_RUN_HACK_RESTORE_NORMAL = 0
-var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0
-var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0
-var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
-var SAVE_LDS = 1
-var WG_BASE_ADDR_LO = 0x9000a000
-var WG_BASE_ADDR_HI = 0x0
-var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem
-var CTX_SAVE_CONTROL = 0x0
-var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL
-var SIM_RUN_HACK = 0 //any hack needed to run this code in SIM (either because various RTL code is not ready, or because there is no compute save & restore in an RTL run)
-var SGPR_SAVE_USE_SQC = 1 //use the SQC D$ to do the write
-var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of it // overloads the DFMT field to carry 4 more bits of stride for MUBUF opcodes
-var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing
-var ACK_SQC_STORE = 1 //workaround for a suspected SQC store bug causing incorrect stores under concurrency
-
-/**************************************************************************/
-/* variables */
-/**************************************************************************/
-var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
-var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
-var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
-var SQ_WAVE_STATUS_HALT_MASK = 0x2000
-
-var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
-var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
-var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
-var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
-var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
-var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while the SQ programming guide has 3 bits
-
-var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
-var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask
-var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
-var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
-var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
-var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
-var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
-var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
-var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
-var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
-var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
-var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800
-
-var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME
-var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME
-var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME
-
-var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
-var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27
-
-
-/* Save */
-var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes
-var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
-
-var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
-var S_SAVE_SPI_INIT_ATC_SHIFT = 27
-var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
-var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
-var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
-var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26
-
-var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
-var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME
-var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME
-var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME
-
-var s_save_spi_init_lo = exec_lo
-var s_save_spi_init_hi = exec_hi
-
-var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0, pc_rewind[3:0], HT[0], trapID[7:0], PC[47:0]}
-var s_save_pc_hi = ttmp1
-var s_save_exec_lo = ttmp2
-var s_save_exec_hi = ttmp3
-var s_save_status = ttmp4
-var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine
-var s_save_xnack_mask_lo = ttmp6
-var s_save_xnack_mask_hi = ttmp7
-var s_save_buf_rsrc0 = ttmp8
-var s_save_buf_rsrc1 = ttmp9
-var s_save_buf_rsrc2 = ttmp10
-var s_save_buf_rsrc3 = ttmp11
-
-var s_save_mem_offset = ttmp14
-var s_save_alloc_size = s_save_trapsts //conflict
-var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: must not do a mem access using s_save_tmp at the same time)
-var s_save_m0 = ttmp15
-
-/* Restore */
-var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
-var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC
-
-var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
-var S_RESTORE_SPI_INIT_ATC_SHIFT = 27
-var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
-var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28
-var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
-var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
-
-var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT
-var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK
-var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
-var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK
-
-var s_restore_spi_init_lo = exec_lo
-var s_restore_spi_init_hi = exec_hi
-
-var s_restore_mem_offset = ttmp12
-var s_restore_alloc_size = ttmp3
-var s_restore_tmp = ttmp6
-var s_restore_mem_offset_save = s_restore_tmp //no conflict
-
-var s_restore_m0 = s_restore_alloc_size //no conflict
-
-var s_restore_mode = ttmp7
-
-var s_restore_pc_lo = ttmp0
-var s_restore_pc_hi = ttmp1
-var s_restore_exec_lo = ttmp14
-var s_restore_exec_hi = ttmp15
-var s_restore_status = ttmp4
-var s_restore_trapsts = ttmp5
-var s_restore_xnack_mask_lo = xnack_mask_lo
-var s_restore_xnack_mask_hi = xnack_mask_hi
-var s_restore_buf_rsrc0 = ttmp8
-var s_restore_buf_rsrc1 = ttmp9
-var s_restore_buf_rsrc2 = ttmp10
-var s_restore_buf_rsrc3 = ttmp11
-
-/**************************************************************************/
-/* trap handler entry points */
-/**************************************************************************/
-/* Shader Main */
-
-shader main
- asic(GFX9)
- type(CS)
-
-
- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore
- //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
- s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC
- s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
- s_cbranch_scc0 L_JUMP_TO_RESTORE //no need to recover STATUS here since we are going to RESTORE
- //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE
- s_branch L_SKIP_RESTORE //NOT restore, SAVE actually
- else
- s_branch L_SKIP_RESTORE //NOT restore; might be a regular trap or save
- end
-
-L_JUMP_TO_RESTORE:
- s_branch L_RESTORE //restore
-
-L_SKIP_RESTORE:
-
- s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
- s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save
- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
- s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save
- s_cbranch_scc1 L_SAVE //this is the operation for save
-
- // ********* Handle non-CWSR traps *******************
-if (!EMU_RUN_HACK)
- // Illegal instruction is a non-maskable exception which blocks context save.
- // Halt the wavefront and return from the trap.
- s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK
- s_cbranch_scc1 L_HALT_WAVE
-
- // If STATUS.MEM_VIOL is asserted then we cannot fetch from the TMA.
- // Instead, halt the wavefront and return from the trap.
- s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
- s_cbranch_scc0 L_NO_MEM_VIOL
-
-L_HALT_WAVE:
- // If STATUS.HALT is set then this fault must come from SQC instruction fetch.
- // We cannot prevent further faults, so just terminate the wavefront.
- s_and_b32 ttmp8, s_save_status, SQ_WAVE_STATUS_HALT_MASK
- s_cbranch_scc0 L_NOT_ALREADY_HALTED
- s_endpgm
-L_NOT_ALREADY_HALTED:
- s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK
- s_branch L_EXCP_CASE
-
-L_NO_MEM_VIOL:
- /* read tba and tma for the next-level trap handler; ttmp4 is used as s_save_status */
- s_getreg_b32 ttmp14, hwreg(HW_REG_SQ_SHADER_TMA_LO)
- s_getreg_b32 ttmp15, hwreg(HW_REG_SQ_SHADER_TMA_HI)
- s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8
- s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [ttmp14, ttmp15], 0
- s_waitcnt lgkmcnt(0)
- s_or_b32 ttmp7, ttmp8, ttmp9
- s_cbranch_scc0 L_NO_NEXT_TRAP //the next-level trap handler has not been set
- s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status (SCC)
- s_setpc_b64 [ttmp8, ttmp9] //jump to the next-level trap handler
-
-L_NO_NEXT_TRAP:
- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
- s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception
- s_cbranch_scc1 L_EXCP_CASE // Exception: jump back to the shader program directly.
- s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case: add 4 to ttmp0
- s_addc_u32 ttmp1, ttmp1, 0
-L_EXCP_CASE:
- s_and_b32 ttmp1, ttmp1, 0xFFFF
- s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status (SCC)
- s_rfe_b64 [ttmp0, ttmp1]
-end
- // ********* End handling of non-CWSR traps *******************
-
-/**************************************************************************/
-/* save routine */
-/**************************************************************************/
-
-L_SAVE:
-
-if G8SR_DEBUG_TIMESTAMP
- s_memrealtime s_g8sr_ts_save_s
- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
-end
-
- //check whether there is a mem_viol
- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
- s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
- s_cbranch_scc0 L_NO_PC_REWIND
-
- //if so, we need to rewind the PC, assuming the GDS operation got NACKed
- s_mov_b32 s_save_tmp, 0 //clear mem_viol bit
- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit
- s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
- s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8
- s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc
-
-L_NO_PC_REWIND:
- s_mov_b32 s_save_tmp, 0 //clear saveCtx bit
- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit
-
- s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK
- s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //XNACK must be saved before any memory operation
- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT
- s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
- s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY
- s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
- s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS
- s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
-
- s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp
-
- /* inform SPI of readiness and wait for SPI's go signal */
- s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
- s_mov_b32 s_save_exec_hi, exec_hi
- s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive
-
-if G8SR_DEBUG_TIMESTAMP
- s_memrealtime s_g8sr_ts_sq_save_msg
- s_waitcnt lgkmcnt(0)
-end
-
- if (EMU_RUN_HACK)
-
- else
- s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
- end
-
- L_SLEEP:
- s_sleep 0x2 //sleeping for 1 (64 clk) is not enough for 8 waves per SIMD and will hang SQ: the 7th/8th wave cannot get arbitration to execute an instruction while the other waves are stuck in the sleep loop waiting for wrexec != 0
-
- if (EMU_RUN_HACK)
-
- else
- s_cbranch_execz L_SLEEP
- end
-
-if G8SR_DEBUG_TIMESTAMP
- s_memrealtime s_g8sr_ts_spi_wrexec
- s_waitcnt lgkmcnt(0)
-end
-
- /* setup Resource Constants */
- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
- //calculate wd_addr using the absolute thread id
- v_readlane_b32 s_save_tmp, v9, 0
- s_lshr_b32 s_save_tmp, s_save_tmp, 6
- s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
- s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
- s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
- s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
- else
- end
- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
- s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
- s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
- s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
- else
- end
-
-
- s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
- s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
- s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes), although not necessarily initialized
- s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
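Read alongside the S_SAVE_* masks defined earlier, the descriptor setup just performed plus the ATC/MTYPE fold-in that follows is equivalent to this C sketch (an illustrative paraphrase only, not driver code; the function name is hypothetical):

    #include <stdint.h>

    #define S_SAVE_BUF_RSRC_WORD1_STRIDE  0x00040000
    #define S_SAVE_BUF_RSRC_WORD3_MISC    0x00807FAC
    #define S_SAVE_SPI_INIT_ATC_MASK      0x08000000  /* bit[27] */
    #define S_SAVE_SPI_INIT_ATC_SHIFT     27
    #define S_SAVE_SPI_INIT_MTYPE_MASK    0x70000000  /* bits[30:28] */
    #define S_SAVE_SPI_INIT_MTYPE_SHIFT   28
    #define SQ_BUF_RSRC_WORD1_ATC_SHIFT   24
    #define SQ_BUF_RSRC_WORD3_MTYPE_SHIFT 27

    /* Build the 4-dword MUBUF resource from the SPI-provided init words. */
    static void build_save_rsrc(uint32_t spi_init_lo, uint32_t spi_init_hi, uint32_t rsrc[4])
    {
        rsrc[0] = spi_init_lo;                          /* base_addr_lo */
        rsrc[1] = (spi_init_hi & 0x0000FFFF) |          /* base_addr_hi */
                  S_SAVE_BUF_RSRC_WORD1_STRIDE;         /* 4-byte stride */
        rsrc[2] = 0;                                    /* NUM_RECORDS, set per transfer */
        rsrc[3] = S_SAVE_BUF_RSRC_WORD3_MISC;
        /* shift the SPI_INIT ATC/MTYPE fields down into their RSRC bit positions */
        rsrc[3] |= (spi_init_hi & S_SAVE_SPI_INIT_ATC_MASK) >>
                   (S_SAVE_SPI_INIT_ATC_SHIFT - SQ_BUF_RSRC_WORD1_ATC_SHIFT);
        rsrc[3] |= (spi_init_hi & S_SAVE_SPI_INIT_MTYPE_MASK) >>
                   (S_SAVE_SPI_INIT_MTYPE_SHIFT - SQ_BUF_RSRC_WORD3_MTYPE_SHIFT);
    }

(As in the shader below, both adjusted fields are OR'd into RSRC word 3.)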
- s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
- s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
- s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC
- s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
- s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
- s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE
-
- //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?)
- s_mov_b32 s_save_m0, m0 //save M0
-
- /* global mem offset */
- s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0
-
-
-
-
- /* save HW registers */
- //////////////////////////////
-
- L_SAVE_HWREG:
- // HWREG SR memory offset : size(VGPR)+size(SGPR)
- get_vgpr_size_bytes(s_save_mem_offset)
- get_sgpr_size_bytes(s_save_tmp)
- s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
-
-
- s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
- if (SWIZZLE_EN)
- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
- else
- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
- end
-
-
- write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0
-
- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
- s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
- s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
- end
-
- write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC
- write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
- write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC
- write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
- write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS
-
- //s_save_trapsts conflicts with s_save_alloc_size
- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
- write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS
-
- write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO
- write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI
-
- //using s_save_tmp would introduce a conflict here between s_save_tmp and s_save_buf_rsrc2
- s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE
- write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
-
-
-
- /* the first wave in the threadgroup */
- s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract first wave bit
- s_mov_b32 s_save_exec_hi, 0x0
- s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save the first wave bit in s_save_exec_hi.bits[26]
-
-
- /* save SGPRs */
- // Save SGPRs before the LDS save, so that s0 to s4 can be used during the LDS save...
- ////////////////////////////// - - // SGPR SR memory offset : size(VGPR) - get_vgpr_size_bytes(s_save_mem_offset) - // TODO, change RSRC word to rearrange memory layout for SGPRS - - s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size - s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 - s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) - - if (SGPR_SAVE_USE_SQC) - s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes - else - s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) - end - - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - - // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 - //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0 - s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0 - s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset - s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0 - - s_mov_b32 m0, 0x0 //SGPR initial index value =0 - s_nop 0x0 //Manually inserted wait states - L_SAVE_SGPR_LOOP: - // SGPR is allocated in 16 SGPR granularity - s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] - s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] - s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] - s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] - s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] - s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] - s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] - s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] - - write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4 - s_add_u32 m0, m0, 16 //next sgpr index - s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete? - // restore s_save_buf_rsrc0,1 - //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo - s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo - - - - - /* save first 4 VGPR, then LDS save could use */ - // each wave will alloc 4 vgprs at least... - ///////////////////////////////////////////////////////////////////////////////////// - - s_mov_b32 s_save_mem_offset, 0 - s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on - s_mov_b32 exec_hi, 0xFFFFFFFF - - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
- else - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - - // VGPR Allocated in 4-GPR granularity - -if G8SR_VGPR_SR_IN_DWX4 - // the const stride for DWx4 is 4*4 bytes - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes - - buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes -else - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 -end - - - - /* save LDS */ - ////////////////////////////// - - L_SAVE_LDS: - - // Change EXEC to all threads... - s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on - s_mov_b32 exec_hi, 0xFFFFFFFF - - s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size - s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? - s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE - - s_barrier //LDS is used? wait for other waves in the same TG - s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here - s_cbranch_scc0 L_SAVE_LDS_DONE - - // first wave do LDS save; - - s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw - s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes - s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes - - // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) - // - get_vgpr_size_bytes(s_save_mem_offset) - get_sgpr_size_bytes(s_save_tmp) - s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp - s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() - - - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - s_mov_b32 m0, 0x0 //lds_offset initial value = 0 - - -var LDS_DMA_ENABLE = 0 -var UNROLL = 0 -if UNROLL==0 && LDS_DMA_ENABLE==1 - s_mov_b32 s3, 256*2 - s_nop 0 - s_nop 0 - s_nop 0 - L_SAVE_LDS_LOOP: - //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.??? - if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW - end - - s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes - s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes - s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete? 
-
-elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROLL, has icache misses
- // store from the highest LDS address to the lowest
- s_mov_b32 s3, 256*2
- s_sub_u32 m0, s_save_alloc_size, s3
- s_add_u32 s_save_mem_offset, s_save_mem_offset, m0
- s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128-DW chunks...
- s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from the highest addr to the lowest
- s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment; each LDS save block costs 6*4 bytes of instructions
- s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 // covers the insts below: s_addc and s_setpc
- s_nop 0
- s_nop 0
- s_nop 0 //pad 3 DW to align the LDS_DMA with 64 bytes
- s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] is already saved
- s_add_u32 s0, s0, s_save_alloc_size
- s_addc_u32 s1, s1, 0
- s_setpc_b64 s[0:1]
-
-
- for var i = 0; i < 128; i++
- // be careful to make this a 64-byte-aligned address, which could improve performance...
- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW
- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
-
- if i!=127
- s_sub_u32 m0, m0, s3 // use an SGPR to shrink the 2-DW inst to a 1-DW inst to improve performance, i.e. pack more LDS_DMA insts into one cacheline
- s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3
- end
- end
-
-else // BUFFER_STORE
- v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0
- v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid
- v_mul_i32_i24 v2, v3, 8 // tid*8
- v_mov_b32 v3, 256*2
- s_mov_b32 m0, 0x10000
- s_mov_b32 s0, s_save_buf_rsrc3
- s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid
- s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT
-
-L_SAVE_LDS_LOOP_VECTOR:
- ds_read_b64 v[0:1], v2 //x = LDS[a], byte address
- s_waitcnt lgkmcnt(0)
- buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1
-// s_waitcnt vmcnt(0)
-// v_add_u32 v2, vcc[0:1], v2, v3
- v_add_u32 v2, v2, v3
- v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
- s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR
-
- // restore rsrc3
- s_mov_b32 s_save_buf_rsrc3, s0
-
-end
-
-L_SAVE_LDS_DONE:
-
-
- /* save VGPRs - save the rest of the VGPRs */
- //////////////////////////////////////////////////////////////////////////////////////
- L_SAVE_VGPR:
- // VGPR SR memory offset: 0
- // TODO rearrange the RSRC words to use swizzle for VGPR save...
-
- s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest of the VGPRs
- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
- s_mov_b32 exec_hi, 0xFFFFFFFF
-
- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vgpr_size
- s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible
- s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
- if (SWIZZLE_EN)
- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
- else - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - - // VGPR Allocated in 4-GPR granularity - -if G8SR_VGPR_SR_IN_DWX4 - // the const stride for DWx4 is 4*4 bytes - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes - - s_mov_b32 m0, 4 // skip first 4 VGPRs - s_cmp_lt_u32 m0, s_save_alloc_size - s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs - - s_set_gpr_idx_on m0, 0x1 // This will change M0 - s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 -L_SAVE_VGPR_LOOP: - v_mov_b32 v0, v0 // v0 = v[0+m0] - v_mov_b32 v1, v1 - v_mov_b32 v2, v2 - v_mov_b32 v3, v3 - - - buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - s_add_u32 m0, m0, 4 - s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 - s_cmp_lt_u32 m0, s_save_alloc_size - s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? - s_set_gpr_idx_off -L_SAVE_VGPR_LOOP_END: - - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes -else - // VGPR store using dw burst - s_mov_b32 m0, 0x4 //VGPR initial index value =0 - s_cmp_lt_u32 m0, s_save_alloc_size - s_cbranch_scc0 L_SAVE_VGPR_END - - - s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 - s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later - - L_SAVE_VGPR_LOOP: - v_mov_b32 v0, v0 //v0 = v[0+m0] - v_mov_b32 v1, v1 //v0 = v[0+m0] - v_mov_b32 v2, v2 //v0 = v[0+m0] - v_mov_b32 v3, v3 //v0 = v[0+m0] - - if(USE_MTBUF_INSTEAD_OF_MUBUF) - tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 - else - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 - end - - s_add_u32 m0, m0, 4 //next vgpr index - s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes - s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? - s_set_gpr_idx_off -end - -L_SAVE_VGPR_END: - - - - - - - /* S_PGM_END_SAVED */ //FIXME graphics ONLY - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) - s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] - s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 - s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over - s_rfe_b64 s_save_pc_lo //Return to the main shader program - else - end - -// Save Done timestamp -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_save_d - // SGPR SR memory offset : size(VGPR) - get_vgpr_size_bytes(s_save_mem_offset) - s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET - s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? - // Need reset rsrc2?? 
s_mov_b32 m0, s_save_mem_offset
- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
- s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1
-end
-
-
- s_branch L_END_PGM
-
-
-
-/**************************************************************************/
-/* restore routine */
-/**************************************************************************/
-
-L_RESTORE:
- /* Setup Resource Constants */
- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
- //calculate wd_addr using the absolute thread id
- v_readlane_b32 s_restore_tmp, v9, 0
- s_lshr_b32 s_restore_tmp, s_restore_tmp, 6
- s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
- s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
- s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
- s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
- else
- end
-
-if G8SR_DEBUG_TIMESTAMP
- s_memrealtime s_g8sr_ts_restore_s
- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
- // tma_lo/hi are SGPRs 110/111, which are not used in the 112-SGPR-allocated case...
- s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0]
- s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //back up the timestamp to ttmp0/1, since exec will eventually be restored..
-end
-
-
-
- s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
- s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
- s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
- s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
- s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
- s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
- s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC
- s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
- s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
- s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE
-
- /* global mem offset */
-// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0
-
- /* the first wave in the threadgroup */
- s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
- s_cbranch_scc0 L_RESTORE_VGPR
-
- /* restore LDS */
- //////////////////////////////
- L_RESTORE_LDS:
-
- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although this can be moved ahead
- s_mov_b32 exec_hi, 0xFFFFFFFF
-
- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
- s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
- s_cbranch_scc0 L_RESTORE_VGPR //no lds used?
jump to L_RESTORE_VGPR - s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw - s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes - s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes - - // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) - // - get_vgpr_size_bytes(s_restore_mem_offset) - get_sgpr_size_bytes(s_restore_tmp) - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? - - - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else - s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - s_mov_b32 m0, 0x0 //lds_offset initial value = 0 - - L_RESTORE_LDS_LOOP: - if (SAVE_LDS) - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW - end - s_add_u32 m0, m0, 256*2 // 128 DW - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW - s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? - - - /* restore VGPRs */ - ////////////////////////////// - L_RESTORE_VGPR: - // VGPR SR memory offset : 0 - s_mov_b32 s_restore_mem_offset, 0x0 - s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead - s_mov_b32 exec_hi, 0xFFFFFFFF - - s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size - s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 - s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) - s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else - s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - -if G8SR_VGPR_SR_IN_DWX4 - get_vgpr_size_bytes(s_restore_mem_offset) - s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 - - // the const stride for DWx4 is 4*4 bytes - s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes - - s_mov_b32 m0, s_restore_alloc_size - s_set_gpr_idx_on m0, 0x8 // Note.. 
This will change m0 - -L_RESTORE_VGPR_LOOP: - buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 - s_waitcnt vmcnt(0) - s_sub_u32 m0, m0, 4 - v_mov_b32 v0, v0 // v[0+m0] = v0 - v_mov_b32 v1, v1 - v_mov_b32 v2, v2 - v_mov_b32 v3, v3 - s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 - s_cmp_eq_u32 m0, 0x8000 - s_cbranch_scc0 L_RESTORE_VGPR_LOOP - s_set_gpr_idx_off - - s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes - -else - // VGPR load using dw burst - s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 - s_mov_b32 m0, 4 //VGPR initial index value = 1 - s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 - s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later - - L_RESTORE_VGPR_LOOP: - if(USE_MTBUF_INSTEAD_OF_MUBUF) - tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 - else - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 - end - s_waitcnt vmcnt(0) //ensure data ready - v_mov_b32 v0, v0 //v[0+m0] = v0 - v_mov_b32 v1, v1 - v_mov_b32 v2, v2 - v_mov_b32 v3, v3 - s_add_u32 m0, m0, 4 //next vgpr index - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes - s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? 
- s_set_gpr_idx_off - /* VGPR restore on v0 */ - if(USE_MTBUF_INSTEAD_OF_MUBUF) - tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 - else - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 - end - -end - - /* restore SGPRs */ - ////////////////////////////// - - // SGPR SR memory offset : size(VGPR) - get_vgpr_size_bytes(s_restore_mem_offset) - get_sgpr_size_bytes(s_restore_tmp) - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp - s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group - // TODO, change RSRC word to rearrange memory layout for SGPRS - - s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size - s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 - s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) - - if (SGPR_SAVE_USE_SQC) - s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes - else - s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) - end - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else - s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - s_mov_b32 m0, s_restore_alloc_size - - L_RESTORE_SGPR_LOOP: - read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made - s_waitcnt lgkmcnt(0) //ensure data ready - - s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] - s_nop 0 // hazard SALU M0=> S_MOVREL - - s_movreld_b64 s0, s0 //s[0+m0] = s0 - s_movreld_b64 s2, s2 - s_movreld_b64 s4, s4 - s_movreld_b64 s6, s6 - s_movreld_b64 s8, s8 - s_movreld_b64 s10, s10 - s_movreld_b64 s12, s12 - s_movreld_b64 s14, s14 - - s_cmp_eq_u32 m0, 0 //scc = (m0 < s_restore_alloc_size) ? 1 : 0 - s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete? - - /* restore HW registers */ - ////////////////////////////// - L_RESTORE_HWREG: - - -if G8SR_DEBUG_TIMESTAMP - s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo - s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi -end - - // HWREG SR memory offset : size(VGPR)+size(SGPR) - get_vgpr_size_bytes(s_restore_mem_offset) - get_sgpr_size_bytes(s_restore_tmp) - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp - - - s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
- else - s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 - read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC - read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) - read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC - read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) - read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS - read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS - read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO - read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI - read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE - - s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS - - s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS - - //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) - s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) - s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over - end - if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) - s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal - s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over - end - - s_mov_b32 m0, s_restore_m0 - s_mov_b32 exec_lo, s_restore_exec_lo - s_mov_b32 exec_hi, s_restore_exec_hi - - s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 - s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts - s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 - //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore - s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode - //reuse s_restore_m0 as a temp register - s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK - s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT - s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT - s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero - s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 - s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK - s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT - s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT - s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 - s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK - s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT - s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp - - s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 - s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by 
s_setreg_b32 - s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu - - s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time - -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_restore_d - s_waitcnt lgkmcnt(0) -end - -// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution - s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc - - -/**************************************************************************/ -/* the END */ -/**************************************************************************/ -L_END_PGM: - s_endpgm - -end - - -/**************************************************************************/ -/* the helper functions */ -/**************************************************************************/ - -//Only for save hwreg to mem -function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) - s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on - s_mov_b32 m0, s_mem_offset - s_buffer_store_dword s, s_rsrc, m0 glc:1 -if ACK_SQC_STORE - s_waitcnt lgkmcnt(0) -end - s_add_u32 s_mem_offset, s_mem_offset, 4 - s_mov_b32 m0, exec_lo -end - - -// HWREG are saved before SGPRs, so all HWREG could be use. -function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) - - s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 -if ACK_SQC_STORE - s_waitcnt lgkmcnt(0) -end - s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 -if ACK_SQC_STORE - s_waitcnt lgkmcnt(0) -end - s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 -if ACK_SQC_STORE - s_waitcnt lgkmcnt(0) -end - s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 -if ACK_SQC_STORE - s_waitcnt lgkmcnt(0) -end - s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 - s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc -end - - -function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) - s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 - s_add_u32 s_mem_offset, s_mem_offset, 4 -end - -function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) - s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 - s_sub_u32 s_mem_offset, s_mem_offset, 4*16 -end - - - -function get_lds_size_bytes(s_lds_size_byte) - // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW - s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size - s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW -end - -function get_vgpr_size_bytes(s_vgpr_size_byte) - s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size - s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 - s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible -end - -function get_sgpr_size_bytes(s_sgpr_size_byte) - s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size - s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1 - s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value) -end - -function get_hwreg_size_bytes - return 128 //HWREG size 128 bytes -end - - - -#endif - -static const uint32_t cwsr_trap_gfx9_hex[] = { - 0xbf820001, 0xbf820136, - 0xb8f0f802, 0x89708670, - 
0xb8f1f803, 0x8674ff71, - 0x00000400, 0xbf850021, - 0x8674ff71, 0x00000800, - 0xbf850003, 0x8674ff71, - 0x00000100, 0xbf840007, - 0x8674ff70, 0x00002000, - 0xbf840001, 0xbf810000, - 0x8770ff70, 0x00002000, - 0xbf820010, 0xb8faf812, - 0xb8fbf813, 0x8efa887a, - 0xc00a1d3d, 0x00000000, - 0xbf8cc07f, 0x87737574, - 0xbf840002, 0xb970f802, - 0xbe801d74, 0xb8f1f803, - 0x8671ff71, 0x000001ff, - 0xbf850002, 0x806c846c, - 0x826d806d, 0x866dff6d, - 0x0000ffff, 0xb970f802, - 0xbe801f6c, 0xb8f1f803, - 0x8671ff71, 0x00000100, - 0xbf840006, 0xbef60080, - 0xb9760203, 0x866dff6d, - 0x0000ffff, 0x80ec886c, - 0x82ed806d, 0xbef60080, - 0xb9760283, 0xbef20068, - 0xbef30069, 0xb8f62407, - 0x8e769c76, 0x876d766d, - 0xb8f603c7, 0x8e769b76, - 0x876d766d, 0xb8f6f807, - 0x8676ff76, 0x00007fff, - 0xb976f807, 0xbeee007e, - 0xbeef007f, 0xbefe0180, - 0xbf900004, 0xbf8e0002, - 0xbf88fffe, 0xbef4007e, - 0x8675ff7f, 0x0000ffff, - 0x8775ff75, 0x00040000, - 0xbef60080, 0xbef700ff, - 0x00807fac, 0x8676ff7f, - 0x08000000, 0x8f768376, - 0x87777677, 0x8676ff7f, - 0x70000000, 0x8f768176, - 0x87777677, 0xbefb007c, - 0xbefa0080, 0xb8fa2a05, - 0x807a817a, 0x8e7a8a7a, - 0xb8f61605, 0x80768176, - 0x8e768676, 0x807a767a, - 0xbef60084, 0xbef600ff, - 0x01000000, 0xbefe007c, - 0xbefc007a, 0xc0611efa, - 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0xbefe007c, 0xbefc007a, - 0xc0611b3a, 0x0000007c, - 0xbf8cc07f, 0x807a847a, - 0xbefc007e, 0xbefe007c, - 0xbefc007a, 0xc0611b7a, - 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0xbefe007c, 0xbefc007a, - 0xc0611bba, 0x0000007c, - 0xbf8cc07f, 0x807a847a, - 0xbefc007e, 0xbefe007c, - 0xbefc007a, 0xc0611bfa, - 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0xbefe007c, 0xbefc007a, - 0xc0611c3a, 0x0000007c, - 0xbf8cc07f, 0x807a847a, - 0xbefc007e, 0xb8f1f803, - 0xbefe007c, 0xbefc007a, - 0xc0611c7a, 0x0000007c, - 0xbf8cc07f, 0x807a847a, - 0xbefc007e, 0xbefe007c, - 0xbefc007a, 0xc0611cba, - 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0xbefe007c, 0xbefc007a, - 0xc0611cfa, 0x0000007c, - 0xbf8cc07f, 0x807a847a, - 0xbefc007e, 0xb8fbf801, - 0xbefe007c, 0xbefc007a, - 0xc0611efa, 0x0000007c, - 0xbf8cc07f, 0x807a847a, - 0xbefc007e, 0x8676ff7f, - 0x04000000, 0xbeef0080, - 0x876f6f76, 0xb8fa2a05, - 0x807a817a, 0x8e7a8a7a, - 0xb8f11605, 0x80718171, - 0x8e718471, 0x8e768271, - 0xbef600ff, 0x01000000, - 0xbef20174, 0x80747a74, - 0x82758075, 0xbefc0080, - 0xbf800000, 0xbe802b00, - 0xbe822b02, 0xbe842b04, - 0xbe862b06, 0xbe882b08, - 0xbe8a2b0a, 0xbe8c2b0c, - 0xbe8e2b0e, 0xc06b003a, - 0x00000000, 0xbf8cc07f, - 0xc06b013a, 0x00000010, - 0xbf8cc07f, 0xc06b023a, - 0x00000020, 0xbf8cc07f, - 0xc06b033a, 0x00000030, - 0xbf8cc07f, 0x8074c074, - 0x82758075, 0x807c907c, - 0xbf0a717c, 0xbf85ffe7, - 0xbef40172, 0xbefa0080, - 0xbefe00c1, 0xbeff00c1, - 0xbef600ff, 0x01000000, - 0xe0724000, 0x7a1d0000, - 0xe0724100, 0x7a1d0100, - 0xe0724200, 0x7a1d0200, - 0xe0724300, 0x7a1d0300, - 0xbefe00c1, 0xbeff00c1, - 0xb8f14306, 0x8671c171, - 0xbf84002c, 0xbf8a0000, - 0x8676ff6f, 0x04000000, - 0xbf840028, 0x8e718671, - 0x8e718271, 0xbef60071, - 0xb8fa2a05, 0x807a817a, - 0x8e7a8a7a, 0xb8f61605, - 0x80768176, 0x8e768676, - 0x807a767a, 0x807aff7a, - 0x00000080, 0xbef600ff, - 0x01000000, 0xbefc0080, - 0xd28c0002, 0x000100c1, - 0xd28d0003, 0x000204c1, - 0xd1060002, 0x00011103, - 0x7e0602ff, 0x00000200, - 0xbefc00ff, 0x00010000, - 0xbe800077, 0x8677ff77, - 0xff7fffff, 0x8777ff77, - 0x00058000, 0xd8ec0000, - 0x00000002, 0xbf8cc07f, - 0xe0765000, 0x7a1d0002, - 0x68040702, 0xd0c9006a, - 0x0000e302, 0xbf87fff7, - 0xbef70000, 
0xbefa00ff, - 0x00000400, 0xbefe00c1, - 0xbeff00c1, 0xb8f12a05, - 0x80718171, 0x8e718271, - 0x8e768871, 0xbef600ff, - 0x01000000, 0xbefc0084, - 0xbf0a717c, 0xbf840015, - 0xbf11017c, 0x8071ff71, - 0x00001000, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0xe0724000, - 0x7a1d0000, 0xe0724100, - 0x7a1d0100, 0xe0724200, - 0x7a1d0200, 0xe0724300, - 0x7a1d0300, 0x807c847c, - 0x807aff7a, 0x00000400, - 0xbf0a717c, 0xbf85ffef, - 0xbf9c0000, 0xbf8200c5, - 0xbef4007e, 0x8675ff7f, - 0x0000ffff, 0x8775ff75, - 0x00040000, 0xbef60080, - 0xbef700ff, 0x00807fac, - 0x8672ff7f, 0x08000000, - 0x8f728372, 0x87777277, - 0x8672ff7f, 0x70000000, - 0x8f728172, 0x87777277, - 0x8672ff7f, 0x04000000, - 0xbf84001e, 0xbefe00c1, - 0xbeff00c1, 0xb8ef4306, - 0x866fc16f, 0xbf840019, - 0x8e6f866f, 0x8e6f826f, - 0xbef6006f, 0xb8f82a05, - 0x80788178, 0x8e788a78, - 0xb8f21605, 0x80728172, - 0x8e728672, 0x80787278, - 0x8078ff78, 0x00000080, - 0xbef600ff, 0x01000000, - 0xbefc0080, 0xe0510000, - 0x781d0000, 0xe0510100, - 0x781d0000, 0x807cff7c, - 0x00000200, 0x8078ff78, - 0x00000200, 0xbf0a6f7c, - 0xbf85fff6, 0xbef80080, - 0xbefe00c1, 0xbeff00c1, - 0xb8ef2a05, 0x806f816f, - 0x8e6f826f, 0x8e76886f, - 0xbef600ff, 0x01000000, - 0xbef20078, 0x8078ff78, - 0x00000400, 0xbefc0084, - 0xbf11087c, 0x806fff6f, - 0x00008000, 0xe0524000, - 0x781d0000, 0xe0524100, - 0x781d0100, 0xe0524200, - 0x781d0200, 0xe0524300, - 0x781d0300, 0xbf8c0f70, - 0x7e000300, 0x7e020301, - 0x7e040302, 0x7e060303, - 0x807c847c, 0x8078ff78, - 0x00000400, 0xbf0a6f7c, - 0xbf85ffee, 0xbf9c0000, - 0xe0524000, 0x721d0000, - 0xe0524100, 0x721d0100, - 0xe0524200, 0x721d0200, - 0xe0524300, 0x721d0300, - 0xb8f82a05, 0x80788178, - 0x8e788a78, 0xb8f21605, - 0x80728172, 0x8e728672, - 0x80787278, 0x80f8c078, - 0xb8ef1605, 0x806f816f, - 0x8e6f846f, 0x8e76826f, - 0xbef600ff, 0x01000000, - 0xbefc006f, 0xc031003a, - 0x00000078, 0x80f8c078, - 0xbf8cc07f, 0x80fc907c, - 0xbf800000, 0xbe802d00, - 0xbe822d02, 0xbe842d04, - 0xbe862d06, 0xbe882d08, - 0xbe8a2d0a, 0xbe8c2d0c, - 0xbe8e2d0e, 0xbf06807c, - 0xbf84fff0, 0xb8f82a05, - 0x80788178, 0x8e788a78, - 0xb8f21605, 0x80728172, - 0x8e728672, 0x80787278, - 0xbef60084, 0xbef600ff, - 0x01000000, 0xc0211bfa, - 0x00000078, 0x80788478, - 0xc0211b3a, 0x00000078, - 0x80788478, 0xc0211b7a, - 0x00000078, 0x80788478, - 0xc0211eba, 0x00000078, - 0x80788478, 0xc0211efa, - 0x00000078, 0x80788478, - 0xc0211c3a, 0x00000078, - 0x80788478, 0xc0211c7a, - 0x00000078, 0x80788478, - 0xc0211a3a, 0x00000078, - 0x80788478, 0xc0211a7a, - 0x00000078, 0x80788478, - 0xc0211cfa, 0x00000078, - 0x80788478, 0xbf8cc07f, - 0x866dff6d, 0x0000ffff, - 0xbefc006f, 0xbefe007a, - 0xbeff007b, 0x866f71ff, - 0x000003ff, 0xb96f4803, - 0x866f71ff, 0xfffff800, - 0x8f6f8b6f, 0xb96fa2c3, - 0xb973f801, 0x866fff6d, - 0xf0000000, 0x8f6f9c6f, - 0x8e6f906f, 0xbef20080, - 0x87726f72, 0x866fff6d, - 0x08000000, 0x8f6f9b6f, - 0x8e6f8f6f, 0x87726f72, - 0x866fff70, 0x00800000, - 0x8f6f976f, 0xb972f807, - 0x86fe7e7e, 0x86ea6a6a, - 0xb970f802, 0xbf8a0000, - 0x95806f6c, 0xbf810000, -}; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 933af56..660b3fb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -34,17 +33,13 @@ #include #include #include -#include - #include "kfd_priv.h" #include "kfd_device_queue_manager.h" #include "kfd_dbgmgr.h" -#include "kfd_ipc.h" static long kfd_ioctl(struct file *, unsigned int, 
unsigned long); static int kfd_open(struct inode *, struct file *); static int kfd_mmap(struct file *, struct vm_area_struct *); -static bool kfd_dev_is_large_bar(struct kfd_dev *dev); static const char kfd_dev_name[] = "kfd"; @@ -60,14 +55,6 @@ static int kfd_char_dev_major = -1; static struct class *kfd_class; struct device *kfd_device; -static char *kfd_devnode(struct device *dev, umode_t *mode) -{ - if (mode && dev->devt == MKDEV(kfd_char_dev_major, 0)) - *mode = 0666; - - return NULL; -} - int kfd_chardev_init(void) { int err = 0; @@ -82,8 +69,6 @@ int kfd_chardev_init(void) if (IS_ERR(kfd_class)) goto err_class_create; - kfd_class->devnode = kfd_devnode; - kfd_device = device_create(kfd_class, NULL, MKDEV(kfd_char_dev_major, 0), NULL, kfd_dev_name); @@ -132,7 +117,7 @@ static int kfd_open(struct inode *inode, struct file *filep) return -EPERM; } - process = kfd_create_process(filep); + process = kfd_create_process(current); if (IS_ERR(process)) return PTR_ERR(process); @@ -221,7 +206,6 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, q_properties->ctx_save_restore_area_address = args->ctx_save_restore_address; q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size; - q_properties->ctl_stack_size = args->ctl_stack_size; if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE || args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) q_properties->type = KFD_QUEUE_TYPE_COMPUTE; @@ -298,7 +282,8 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, p->pasid, dev->id); - err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id); + err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, + 0, q_properties.type, &queue_id); if (err != 0) goto err_create_queue; @@ -306,16 +291,8 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, /* Return gpu_id as doorbell offset for mmap usage */ - args->doorbell_offset = KFD_MMAP_TYPE_DOORBELL; - args->doorbell_offset |= KFD_MMAP_GPU_ID(args->gpu_id); + args->doorbell_offset = (KFD_MMAP_DOORBELL_MASK | args->gpu_id); args->doorbell_offset <<= PAGE_SHIFT; - if (KFD_IS_SOC15(dev->device_info->asic_family)) - /* On SOC15 ASICs, doorbell allocation must be - * per-device, and independent from the per-process - * queue_id. Return the doorbell offset within the - * doorbell aperture to user mode. - */ - args->doorbell_offset |= q_properties.doorbell_off; mutex_unlock(&p->mutex); @@ -403,58 +380,6 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, return retval; } -static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p, - void *data) -{ - int retval; - const int max_num_cus = 1024; - struct kfd_ioctl_set_cu_mask_args *args = data; - struct queue_properties properties; - uint32_t __user *cu_mask_ptr = (uint32_t __user *)args->cu_mask_ptr; - size_t cu_mask_size = sizeof(uint32_t) * (args->num_cu_mask / 32); - - if ((args->num_cu_mask % 32) != 0) { - pr_debug("num_cu_mask 0x%x must be a multiple of 32", - args->num_cu_mask); - return -EINVAL; - } - - properties.cu_mask_count = args->num_cu_mask; - if (properties.cu_mask_count == 0) { - pr_debug("CU mask cannot be 0"); - return -EINVAL; - } - - /* To prevent an unreasonably large CU mask size, set an arbitrary - * limit of max_num_cus bits. We can then just drop any CU mask bits - * past max_num_cus bits and just use the first max_num_cus bits. 
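For context on the CU-mask ioctl removed below: a minimal user-space sketch of how it was meant to be driven, assuming a ROCm-era uapi header that defines AMDKFD_IOC_SET_CU_MASK and struct kfd_ioctl_set_cu_mask_args (the field names match their uses in this function; the header itself is not part of this patch):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>	/* assumption: ROCm uapi header with AMDKFD_IOC_SET_CU_MASK */

static int set_cu_mask(int kfd_fd, uint32_t queue_id,
		       const uint32_t *mask, uint32_t nbits)
{
	struct kfd_ioctl_set_cu_mask_args args = {0};

	/* Mirror the kernel-side checks above: nonzero, multiple of 32 */
	if (nbits == 0 || (nbits % 32) != 0)
		return -1;
	/* Bits past 1024 (max_num_cus) are silently dropped by the kernel */
	args.queue_id = queue_id;
	args.num_cu_mask = nbits;
	args.cu_mask_ptr = (uint64_t)(uintptr_t)mask;
	return ioctl(kfd_fd, AMDKFD_IOC_SET_CU_MASK, &args);
}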
- */ - if (properties.cu_mask_count > max_num_cus) { - pr_debug("CU mask cannot be greater than 1024 bits"); - properties.cu_mask_count = max_num_cus; - cu_mask_size = sizeof(uint32_t) * (max_num_cus/32); - } - - properties.cu_mask = kzalloc(cu_mask_size, GFP_KERNEL); - if (!properties.cu_mask) - return -ENOMEM; - - retval = copy_from_user(properties.cu_mask, cu_mask_ptr, cu_mask_size); - if (retval) { - pr_debug("Could not copy CU mask from userspace"); - kfree(properties.cu_mask); - return -EFAULT; - } - - mutex_lock(&p->mutex); - - retval = pqm_set_cu_mask(&p->pqm, args->queue_id, &properties); - - mutex_unlock(&p->mutex); - - return retval; -} - static int kfd_ioctl_set_memory_policy(struct file *filep, struct kfd_process *p, void *data) { @@ -507,38 +432,6 @@ static int kfd_ioctl_set_memory_policy(struct file *filep, return err; } -static int kfd_ioctl_set_trap_handler(struct file *filep, - struct kfd_process *p, void *data) -{ - struct kfd_ioctl_set_trap_handler_args *args = data; - struct kfd_dev *dev; - int err = 0; - struct kfd_process_device *pdd; - - dev = kfd_device_by_id(args->gpu_id); - if (!dev) - return -EINVAL; - - mutex_lock(&p->mutex); - - pdd = kfd_bind_process_to_device(dev, p); - if (IS_ERR(pdd)) { - err = -ESRCH; - goto out; - } - - if (dev->dqm->ops.set_trap_handler(dev->dqm, - &pdd->qpd, - args->tba_addr, - args->tma_addr)) - err = -EINVAL; - -out: - mutex_unlock(&p->mutex); - - return err; -} - static int kfd_ioctl_dbg_register(struct file *filep, struct kfd_process *p, void *data) { @@ -553,8 +446,13 @@ static int kfd_ioctl_dbg_register(struct file *filep, if (!dev) return -EINVAL; - mutex_lock(&p->mutex); + if (dev->device_info->asic_family == CHIP_CARRIZO) { + pr_debug("kfd_ioctl_dbg_register not supported on CZ\n"); + return -EINVAL; + } + mutex_lock(kfd_get_dbgmgr_mutex()); + mutex_lock(&p->mutex); /* * make sure that we have pdd, if this is the first queue created for @@ -582,8 +480,8 @@ static int kfd_ioctl_dbg_register(struct file *filep, } out: - mutex_unlock(kfd_get_dbgmgr_mutex()); mutex_unlock(&p->mutex); + mutex_unlock(&p->mutex); + mutex_unlock(kfd_get_dbgmgr_mutex()); return status; } @@ -596,9 +494,14 @@ static int kfd_ioctl_dbg_unregister(struct file *filep, long status; dev = kfd_device_by_id(args->gpu_id); - if (!dev || !dev->dbgmgr) + if (!dev) return -EINVAL; + if (dev->device_info->asic_family == CHIP_CARRIZO) { + pr_debug("kfd_ioctl_dbg_unregister not supported on CZ\n"); + return -EINVAL; + } + mutex_lock(kfd_get_dbgmgr_mutex()); status = kfd_dbgmgr_unregister(dev->dbgmgr, p); @@ -639,6 +542,11 @@ static int kfd_ioctl_dbg_address_watch(struct file *filep, if (!dev) return -EINVAL; + if (dev->device_info->asic_family == CHIP_CARRIZO) { + pr_debug("kfd_ioctl_dbg_address_watch not supported on CZ\n"); + return -EINVAL; + } + cmd_from_user = (void __user *) args->content_ptr; /* Validate arguments */ @@ -742,6 +650,11 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep, if (!dev) return -EINVAL; + if (dev->device_info->asic_family == CHIP_CARRIZO) { + pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); + return -EINVAL; + } + /* input size must match the computed "compact" size */ if (args->buf_size_in_bytes != computed_buff_size) { pr_debug("size mismatch, computed : actual %u : %u\n", @@ -800,37 +713,22 @@ static int kfd_ioctl_get_clock_counters(struct file *filep, { struct kfd_ioctl_get_clock_counters_args *args = data; struct kfd_dev *dev; -#if (defined OS_NAME_RHEL) && (OS_VERSION_MAJOR == 6) \ - || (defined OS_NAME_RHEL_7_2) - struct timespec time;
-#else struct timespec64 time; -#endif dev = kfd_device_by_id(args->gpu_id); - if (dev) - /* Reading GPU clock counter from KGD */ - args->gpu_clock_counter = - dev->kfd2kgd->get_gpu_clock_counter(dev->kgd); - else - /* Node without GPU resource */ - args->gpu_clock_counter = 0; + if (dev == NULL) + return -EINVAL; + + /* Reading GPU clock counter from KGD */ + args->gpu_clock_counter = + dev->kfd2kgd->get_gpu_clock_counter(dev->kgd); /* No access to rdtsc. Using raw monotonic time */ -#if (defined OS_NAME_RHEL) && (OS_VERSION_MAJOR == 6) \ - || (defined OS_NAME_RHEL_7_2) - getrawmonotonic(&time); - args->cpu_clock_counter = (uint64_t)timespec_to_ns(&time); - - get_monotonic_boottime(&time); - args->system_clock_counter = (uint64_t)timespec_to_ns(&time); -#else getrawmonotonic64(&time); args->cpu_clock_counter = (uint64_t)timespec64_to_ns(&time); get_monotonic_boottime64(&time); args->system_clock_counter = (uint64_t)timespec64_to_ns(&time); -#endif /* Since the counter is in nano-seconds we use 1GHz frequency */ args->system_clock_freq = 1000000000; @@ -895,152 +793,19 @@ static int kfd_ioctl_get_process_apertures(struct file *filp, return 0; } -static int kfd_ioctl_get_process_apertures_new(struct file *filp, - struct kfd_process *p, void *data) -{ - struct kfd_ioctl_get_process_apertures_new_args *args = data; - struct kfd_process_device_apertures *pa; - struct kfd_process_device *pdd; - uint32_t nodes = 0; - int ret; - - dev_dbg(kfd_device, "get apertures for PASID %d", p->pasid); - - if (args->num_of_nodes == 0) { - /* Return number of nodes, so that user space can allocate - * sufficient memory - */ - mutex_lock(&p->mutex); - - if (!kfd_has_process_device_data(p)) - goto out_upwrite; - - /* Run over all pdd of the process */ - pdd = kfd_get_first_process_device_data(p); - do { - args->num_of_nodes++; - pdd = kfd_get_next_process_device_data(p, pdd); - } while (pdd); - - goto out_upwrite; - } - - /* Fill in process-aperture information for all available - * nodes, but not more than args->num_of_nodes as that is - * the amount of memory allocated by user - */ - pa = kzalloc((sizeof(struct kfd_process_device_apertures) * - args->num_of_nodes), GFP_KERNEL); - if (!pa) - return -ENOMEM; - - mutex_lock(&p->mutex); - - if (!kfd_has_process_device_data(p)) { - args->num_of_nodes = 0; - kfree(pa); - goto out_upwrite; - } - - /* Run over all pdd of the process */ - pdd = kfd_get_first_process_device_data(p); - do { - pa[nodes].gpu_id = pdd->dev->id; - pa[nodes].lds_base = pdd->lds_base; - pa[nodes].lds_limit = pdd->lds_limit; - pa[nodes].gpuvm_base = pdd->gpuvm_base; - pa[nodes].gpuvm_limit = pdd->gpuvm_limit; - pa[nodes].scratch_base = pdd->scratch_base; - pa[nodes].scratch_limit = pdd->scratch_limit; - - dev_dbg(kfd_device, - "gpu id %u\n", pdd->dev->id); - dev_dbg(kfd_device, - "lds_base %llX\n", pdd->lds_base); - dev_dbg(kfd_device, - "lds_limit %llX\n", pdd->lds_limit); - dev_dbg(kfd_device, - "gpuvm_base %llX\n", pdd->gpuvm_base); - dev_dbg(kfd_device, - "gpuvm_limit %llX\n", pdd->gpuvm_limit); - dev_dbg(kfd_device, - "scratch_base %llX\n", pdd->scratch_base); - dev_dbg(kfd_device, - "scratch_limit %llX\n", pdd->scratch_limit); - nodes++; - - pdd = kfd_get_next_process_device_data(p, pdd); - } while (pdd && (nodes < args->num_of_nodes)); - mutex_unlock(&p->mutex); - - args->num_of_nodes = nodes; - ret = copy_to_user( - (void __user *)args->kfd_process_device_apertures_ptr, - pa, - (nodes * sizeof(struct kfd_process_device_apertures))); - kfree(pa); - return ret ?
-EFAULT : 0; - -out_upwrite: - mutex_unlock(&p->mutex); - return 0; -} - static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p, void *data) { struct kfd_ioctl_create_event_args *args = data; - struct kfd_dev *kfd; - struct kfd_process_device *pdd; - int err = -EINVAL; - void *mem, *kern_addr = NULL; - - pr_debug("Event page offset 0x%llx\n", args->event_page_offset); - - if (args->event_page_offset) { - kfd = kfd_device_by_id(GET_GPU_ID(args->event_page_offset)); - if (!kfd) { - pr_err("Getting device by id failed in %s\n", __func__); - return -EFAULT; - } - if (!kfd->device_info->is_need_iommu_device) { - mutex_lock(&p->mutex); - pdd = kfd_bind_process_to_device(kfd, p); - if (IS_ERR(pdd)) { - err = PTR_ERR(pdd); - goto out_upwrite; - } - mem = kfd_process_device_translate_handle(pdd, - GET_IDR_HANDLE(args->event_page_offset)); - if (!mem) { - pr_err("Can't find BO, offset is 0x%llx\n", - args->event_page_offset); - err = -EFAULT; - goto out_upwrite; - } - mutex_unlock(&p->mutex); - - /* Map dGPU gtt BO to kernel */ - kfd->kfd2kgd->map_gtt_bo_to_kernel(kfd->kgd, - mem, &kern_addr); - } - } + int err; - err = kfd_event_create(filp, p, - args->event_type, - args->auto_reset != 0, - args->node_id, - &args->event_id, - &args->event_trigger_data, - &args->event_page_offset, - &args->event_slot_index, - kern_addr); + err = kfd_event_create(filp, p, args->event_type, + args->auto_reset != 0, args->node_id, + &args->event_id, &args->event_trigger_data, + &args->event_page_offset, + &args->event_slot_index); return err; - -out_upwrite: - mutex_unlock(&p->mutex); - return err; } static int kfd_ioctl_destroy_event(struct file *filp, struct kfd_process *p, @@ -1071,26 +836,26 @@ static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, void *data) { struct kfd_ioctl_wait_events_args *args = data; + enum kfd_event_wait_result wait_result; int err; err = kfd_wait_on_events(p, args->num_events, (void __user *)args->events_ptr, (args->wait_for_all != 0), - args->timeout, &args->wait_result); + args->timeout, &wait_result); + + args->wait_result = wait_result; return err; } -static int kfd_ioctl_alloc_scratch_memory(struct file *filep, +static int kfd_ioctl_set_scratch_backing_va(struct file *filep, struct kfd_process *p, void *data) { - struct kfd_ioctl_alloc_memory_of_scratch_args *args = data; + struct kfd_ioctl_set_scratch_backing_va_args *args = data; struct kfd_process_device *pdd; struct kfd_dev *dev; long err; - if (args->size == 0) - return -EINVAL; - dev = kfd_device_by_id(args->gpu_id); if (!dev) return -EINVAL; @@ -1107,521 +872,17 @@ static int kfd_ioctl_alloc_scratch_memory(struct file *filep, mutex_unlock(&p->mutex); - if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS && - pdd->qpd.vmid != 0) { - err = dev->kfd2kgd->alloc_memory_of_scratch( + if (sched_policy == KFD_SCHED_POLICY_NO_HWS && pdd->qpd.vmid != 0) + dev->kfd2kgd->set_scratch_backing_va( dev->kgd, args->va_addr, pdd->qpd.vmid); - if (err != 0) - goto alloc_memory_of_scratch_failed; - } return 0; bind_process_to_device_fail: mutex_unlock(&p->mutex); -alloc_memory_of_scratch_failed: - return -EFAULT; -} - -bool kfd_dev_is_large_bar(struct kfd_dev *dev) -{ - struct kfd_local_mem_info mem_info; - - if (debug_largebar) { - pr_debug("Simulate large-bar allocation on non large-bar machine\n"); - return true; - } - - if (dev->device_info->is_need_iommu_device) - return false; - - dev->kfd2kgd->get_local_mem_info(dev->kgd, &mem_info); - if (mem_info.local_mem_size_private == 0 && - 
mem_info.local_mem_size_public > 0) - return true; - return false; -} - -static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, - struct kfd_process *p, void *data) -{ - struct kfd_ioctl_alloc_memory_of_gpu_args *args = data; - struct kfd_process_device *pdd; - void *mem; - struct kfd_dev *dev; - int idr_handle; - long err; - uint64_t offset = args->mmap_offset; - uint32_t flags = args->flags; - struct vm_area_struct *vma; - - if (args->size == 0) - return -EINVAL; - - dev = kfd_device_by_id(args->gpu_id); - if (!dev) - return -EINVAL; - - if ((flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) && - (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) && - !kfd_dev_is_large_bar(dev)) { - pr_err("Alloc host visible vram on small bar is not allowed\n"); - return -EINVAL; - } - - if (flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) { - /* Check if the userptr corresponds to another (or third-party) - * device local memory. If so treat it as a doorbell. User - * space will be oblivious of this and will use this doorbell - * BO as a regular userptr BO - */ - vma = find_vma(current->mm, args->mmap_offset); - if (vma && (vma->vm_flags & VM_IO)) { - unsigned long pfn; - - follow_pfn(vma, args->mmap_offset, &pfn); - flags |= KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL; - flags &= ~KFD_IOC_ALLOC_MEM_FLAGS_USERPTR; - offset = (pfn << PAGE_SHIFT); - } - } else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) { - if (args->size != kfd_doorbell_process_slice(dev)) - return -EINVAL; - offset = kfd_get_process_doorbells(dev, p); - } - - mutex_lock(&p->mutex); - - pdd = kfd_bind_process_to_device(dev, p); - if (IS_ERR(pdd)) { - err = PTR_ERR(pdd); - goto err_unlock; - } - - err = dev->kfd2kgd->alloc_memory_of_gpu( - dev->kgd, args->va_addr, args->size, - pdd->vm, (struct kgd_mem **) &mem, &offset, - flags); - - if (err) - goto err_unlock; - - idr_handle = kfd_process_device_create_obj_handle(pdd, mem, - args->va_addr, args->size, NULL); - if (idr_handle < 0) { - err = -EFAULT; - goto err_free; - } - - mutex_unlock(&p->mutex); - - args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); - args->mmap_offset = offset; - - return 0; - -err_free: - dev->kfd2kgd->free_memory_of_gpu(dev->kgd, - (struct kgd_mem *) mem, - pdd->vm); -err_unlock: - mutex_unlock(&p->mutex); return err; } -static int kfd_ioctl_free_memory_of_gpu(struct file *filep, - struct kfd_process *p, void *data) -{ - struct kfd_ioctl_free_memory_of_gpu_args *args = data; - struct kfd_process_device *pdd; - struct kfd_bo *buf_obj; - struct kfd_dev *dev; - int ret; - - dev = kfd_device_by_id(GET_GPU_ID(args->handle)); - if (!dev) - return -EINVAL; - - mutex_lock(&p->mutex); - - pdd = kfd_get_process_device_data(dev, p); - if (!pdd) { - pr_err("Process device data doesn't exist\n"); - ret = -EINVAL; - goto err_unlock; - } - - buf_obj = kfd_process_device_find_bo(pdd, - GET_IDR_HANDLE(args->handle)); - if (!buf_obj) { - ret = -EINVAL; - goto err_unlock; - } - run_rdma_free_callback(buf_obj); - - ret = dev->kfd2kgd->free_memory_of_gpu(dev->kgd, buf_obj->mem, - pdd->vm); - - /* If freeing the buffer failed, leave the handle in place for - * clean-up during process tear-down.
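The alloc path above packs (gpu_id, idr_handle) into args->handle with MAKE_HANDLE(), and the free path unpacks it with GET_GPU_ID()/GET_IDR_HANDLE(). Those macros live in kfd_priv.h, which this patch also reverts; a standalone sketch of the packing as it is used here, assuming the usual layout with gpu_id in the upper 32 bits:

#include <stdint.h>

/* Assumed layout: bits 63..32 = gpu_id, bits 31..0 = IDR handle */
static inline uint64_t kfd_make_handle(uint32_t gpu_id, uint32_t idr_handle)
{
	return ((uint64_t)gpu_id << 32) | idr_handle;
}

static inline uint32_t kfd_handle_gpu_id(uint64_t handle)
{
	return (uint32_t)(handle >> 32);
}

static inline uint32_t kfd_handle_idr(uint64_t handle)
{
	return (uint32_t)(handle & 0xffffffffu);
}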
- */ - if (ret == 0) - kfd_process_device_remove_obj_handle( - pdd, GET_IDR_HANDLE(args->handle)); - -err_unlock: - mutex_unlock(&p->mutex); - return ret; -} - -static int kfd_ioctl_map_memory_to_gpu(struct file *filep, - struct kfd_process *p, void *data) -{ - struct kfd_ioctl_map_memory_to_gpu_args *args = data; - struct kfd_process_device *pdd, *peer_pdd; - void *mem; - struct kfd_dev *dev, *peer; - long err = 0; - int i, num_dev = 0; - uint32_t *devices_arr = NULL; - - dev = kfd_device_by_id(GET_GPU_ID(args->handle)); - if (!dev) - return -EINVAL; - - if (args->device_ids_array_size == 0) { - pr_debug("Device ID array size is 0\n"); - return -EINVAL; - } - - if (args->device_ids_array_size % sizeof(uint32_t)) { - pr_debug("Node IDs array size %u\n", - args->device_ids_array_size); - return -EFAULT; - } - - devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL); - if (!devices_arr) - return -ENOMEM; - - err = copy_from_user(devices_arr, - (void __user *)args->device_ids_array_ptr, - args->device_ids_array_size); - if (err != 0) { - err = -EFAULT; - goto copy_from_user_failed; - } - - mutex_lock(&p->mutex); - - pdd = kfd_bind_process_to_device(dev, p); - if (IS_ERR(pdd)) { - err = PTR_ERR(pdd); - goto bind_process_to_device_failed; - } - - mem = kfd_process_device_translate_handle(pdd, - GET_IDR_HANDLE(args->handle)); - if (!mem) { - err = -ENOMEM; - goto get_mem_obj_from_handle_failed; - } - - num_dev = args->device_ids_array_size / sizeof(uint32_t); - for (i = 0 ; i < num_dev; i++) { - peer = kfd_device_by_id(devices_arr[i]); - if (!peer) { - pr_debug("Getting device by id failed for 0x%x\n", - devices_arr[i]); - err = -EFAULT; - goto get_mem_obj_from_handle_failed; - } - - peer_pdd = kfd_bind_process_to_device(peer, p); - if (!peer_pdd) { - err = -EFAULT; - goto get_mem_obj_from_handle_failed; - } - err = peer->kfd2kgd->map_memory_to_gpu( - peer->kgd, (struct kgd_mem *)mem, peer_pdd->vm); - if (err != 0) { - pr_err("Failed to map to gpu %d, num_dev=%d\n", - i, num_dev); - goto map_memory_to_gpu_failed; - } - } - - mutex_unlock(&p->mutex); - - err = dev->kfd2kgd->sync_memory(dev->kgd, (struct kgd_mem *) mem, true); - if (err) { - pr_debug("Sync memory failed, wait interrupted by user signal\n"); - goto sync_memory_failed; - } - - /* Flush TLBs after waiting for the page table updates to complete */ - for (i = 0; i < num_dev; i++) { - peer = kfd_device_by_id(devices_arr[i]); - if (WARN_ON_ONCE(!peer)) - continue; - kfd_flush_tlb(peer, p); - } - - kfree(devices_arr); - - return err; - -bind_process_to_device_failed: -get_mem_obj_from_handle_failed: -map_memory_to_gpu_failed: - mutex_unlock(&p->mutex); -copy_from_user_failed: -sync_memory_failed: - kfree(devices_arr); - - return err; -} - -int kfd_unmap_memory_from_gpu(void *mem, struct kfd_process_device *pdd) -{ - int err; - struct kfd_dev *dev = pdd->dev; - - err = dev->kfd2kgd->unmap_memory_to_gpu( - dev->kgd, (struct kgd_mem *) mem, pdd->vm); - - if (err != 0) - return err; - - kfd_flush_tlb(dev, pdd->process); - - return 0; -} - -static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, - struct kfd_process *p, void *data) -{ - struct kfd_ioctl_unmap_memory_from_gpu_args *args = data; - struct kfd_process_device *pdd, *peer_pdd; - void *mem; - struct kfd_dev *dev, *peer; - long err = 0; - uint32_t *devices_arr = NULL, num_dev, i; - - dev = kfd_device_by_id(GET_GPU_ID(args->handle)); - if (!dev) - return -EINVAL; - - if (args->device_ids_array_size == 0) { - pr_debug("Device ID array size is 0\n"); - return -EINVAL; - } 
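Both the map and unmap ioctls in this region take the same user-provided array of GPU IDs, whose byte size must be a nonzero multiple of sizeof(uint32_t). A hypothetical user-space sketch of building that array; the struct layout is an assumption modeled on the field names used above, and the real request number (e.g. AMDKFD_IOC_MAP_MEMORY_TO_GPU) would come from the removed uapi header:

#include <stdint.h>
#include <sys/ioctl.h>

/* Layout assumed from the field names used in the kernel code above */
struct kfd_map_args_sketch {
	uint64_t handle;
	uint64_t device_ids_array_ptr;
	uint32_t device_ids_array_size;	/* bytes, multiple of sizeof(uint32_t) */
	uint32_t pad;
};

static int map_bo_to_gpus(int kfd_fd, unsigned long map_request,
			  uint64_t handle, const uint32_t *gpu_ids, uint32_t n)
{
	struct kfd_map_args_sketch args = {0};

	if (n == 0)
		return -1;	/* kernel rejects an empty device ID array */
	args.handle = handle;
	args.device_ids_array_ptr = (uint64_t)(uintptr_t)gpu_ids;
	args.device_ids_array_size = n * sizeof(uint32_t);
	return ioctl(kfd_fd, map_request, &args);
}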
- - if (args->device_ids_array_size % sizeof(uint32_t)) { - pr_debug("Node IDs array size %u\n", - args->device_ids_array_size); - return -EFAULT; - } - - devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL); - if (!devices_arr) - return -ENOMEM; - - err = copy_from_user(devices_arr, - (void __user *)args->device_ids_array_ptr, - args->device_ids_array_size); - if (err != 0) { - err = -EFAULT; - goto copy_from_user_failed; - } - - mutex_lock(&p->mutex); - - pdd = kfd_get_process_device_data(dev, p); - if (!pdd) { - pr_debug("Process device data doesn't exist\n"); - err = -ENODEV; - goto bind_process_to_device_failed; - } - - mem = kfd_process_device_translate_handle(pdd, - GET_IDR_HANDLE(args->handle)); - if (!mem) { - err = -ENOMEM; - goto get_mem_obj_from_handle_failed; - } - - num_dev = args->device_ids_array_size / sizeof(uint32_t); - for (i = 0 ; i < num_dev; i++) { - peer = kfd_device_by_id(devices_arr[i]); - if (!peer) { - err = -EFAULT; - goto get_mem_obj_from_handle_failed; - } - - peer_pdd = kfd_get_process_device_data(peer, p); - if (!peer_pdd) { - err = -EFAULT; - goto get_mem_obj_from_handle_failed; - } - kfd_unmap_memory_from_gpu(mem, peer_pdd); - } - kfree(devices_arr); - - mutex_unlock(&p->mutex); - - return 0; - -bind_process_to_device_failed: -get_mem_obj_from_handle_failed: - mutex_unlock(&p->mutex); -copy_from_user_failed: - kfree(devices_arr); - return err; -} - -static int kfd_ioctl_set_process_dgpu_aperture(struct file *filep, - struct kfd_process *p, void *data) -{ - struct kfd_ioctl_set_process_dgpu_aperture_args *args = data; - struct kfd_dev *dev; - struct kfd_process_device *pdd; - long err; - - dev = kfd_device_by_id(args->gpu_id); - if (!dev) - return -EINVAL; - - mutex_lock(&p->mutex); - - pdd = kfd_bind_process_to_device(dev, p); - if (IS_ERR(pdd)) { - err = PTR_ERR(pdd); - goto exit; - } - - err = kfd_set_process_dgpu_aperture(pdd, args->dgpu_base, - args->dgpu_limit); - -exit: - mutex_unlock(&p->mutex); - return err; -} - -static int kfd_ioctl_get_dmabuf_info(struct file *filep, - struct kfd_process *p, void *data) -{ - struct kfd_ioctl_get_dmabuf_info_args *args = data; - struct kfd_dev *dev = NULL; - struct kgd_dev *dma_buf_kgd; - void *metadata_buffer = NULL; - uint32_t flags; - unsigned int i; - int r; - - /* Find a KFD GPU device that supports the get_dmabuf_info query */ - for (i = 0; kfd_topology_enum_kfd_devices(i, &dev) == 0; i++) - if (dev && dev->kfd2kgd->get_dmabuf_info) - break; - if (!dev) - return -EINVAL; - - if (args->metadata_ptr) { - metadata_buffer = kzalloc(args->metadata_size, GFP_KERNEL); - if (!metadata_buffer) - return -ENOMEM; - } - - /* Get dmabuf info from KGD */ - r = dev->kfd2kgd->get_dmabuf_info(dev->kgd, args->dmabuf_fd, - &dma_buf_kgd, &args->size, - metadata_buffer, args->metadata_size, - &args->metadata_size, &flags); - if (r) - goto exit; - - /* Reverse-lookup gpu_id from kgd pointer */ - dev = kfd_device_by_kgd(dma_buf_kgd); - if (!dev) { - r = -EINVAL; - goto exit; - } - args->gpu_id = dev->id; - args->flags = flags; - - /* Copy metadata buffer to user mode */ - if (metadata_buffer) { - r = copy_to_user((void __user *)args->metadata_ptr, - metadata_buffer, args->metadata_size); - if (r != 0) - r = -EFAULT; - } - -exit: - kfree(metadata_buffer); - - return r; -} - -static int kfd_ioctl_import_dmabuf(struct file *filep, - struct kfd_process *p, void *data) -{ - struct kfd_ioctl_import_dmabuf_args *args = data; - struct kfd_dev *dev; - int r; - - dev = kfd_device_by_id(args->gpu_id); - if (!dev) - return 
-EINVAL; - - r = kfd_ipc_import_dmabuf(dev, p, args->gpu_id, args->dmabuf_fd, - args->va_addr, &args->handle, NULL); - if (r) - pr_err("Failed to import dmabuf\n"); - - return r; -} - -static int kfd_ioctl_ipc_export_handle(struct file *filep, - struct kfd_process *p, - void *data) -{ - struct kfd_ioctl_ipc_export_handle_args *args = data; - struct kfd_dev *dev; - int r; - - dev = kfd_device_by_id(args->gpu_id); - if (!dev) - return -EINVAL; - - r = kfd_ipc_export_as_handle(dev, p, args->handle, args->share_handle); - if (r) - pr_err("Failed to export IPC handle\n"); - - return r; -} - -static int kfd_ioctl_ipc_import_handle(struct file *filep, - struct kfd_process *p, - void *data) -{ - struct kfd_ioctl_ipc_import_handle_args *args = data; - struct kfd_dev *dev = NULL; - int r; - - dev = kfd_device_by_id(args->gpu_id); - if (!dev) - return -EINVAL; - - r = kfd_ipc_import_handle(dev, p, args->gpu_id, args->share_handle, - args->va_addr, &args->handle, - &args->mmap_offset); - if (r) - pr_err("Failed to import IPC handle\n"); - - return r; -} - static int kfd_ioctl_get_tile_config(struct file *filep, struct kfd_process *p, void *data) { @@ -1664,283 +925,6 @@ static int kfd_ioctl_get_tile_config(struct file *filep, return 0; } -#ifndef PTRACE_MODE_ATTACH_REALCREDS -#define PTRACE_MODE_ATTACH_REALCREDS PTRACE_MODE_ATTACH -#endif - -static int kfd_ioctl_cross_memory_copy(struct file *filep, - struct kfd_process *local_p, void *data) -{ - struct kfd_ioctl_cross_memory_copy_args *args = data; - struct kfd_memory_range *src_array, *dst_array; - struct kfd_bo *src_bo, *dst_bo; - struct kfd_process *remote_p, *src_p, *dst_p; - struct task_struct *remote_task; - struct mm_struct *remote_mm; - struct pid *remote_pid; - struct dma_fence *fence = NULL, *lfence = NULL; - uint64_t dst_va_addr; - uint64_t copied, total_copied = 0; - uint64_t src_offset, dst_offset, dst_va_addr_end; - const char *cma_op; - int i, j = 0, err = 0; - - /* Check parameters */ - if (args->src_mem_range_array == 0 || args->dst_mem_range_array == 0 || - args->src_mem_array_size == 0 || args->dst_mem_array_size == 0) - return -EINVAL; - args->bytes_copied = 0; - - /* Allocate space for source and destination arrays */ - src_array = kmalloc_array((args->src_mem_array_size + - args->dst_mem_array_size), - sizeof(struct kfd_memory_range), - GFP_KERNEL); - if (!src_array) - return -ENOMEM; - dst_array = &src_array[args->src_mem_array_size]; - - if (copy_from_user(src_array, (void __user *)args->src_mem_range_array, - args->src_mem_array_size * - sizeof(struct kfd_memory_range))) { - err = -EFAULT; - goto copy_from_user_fail; - } - if (copy_from_user(dst_array, (void __user *)args->dst_mem_range_array, - args->dst_mem_array_size * - sizeof(struct kfd_memory_range))) { - err = -EFAULT; - goto copy_from_user_fail; - } - - /* Get remote process */ - remote_pid = find_get_pid(args->pid); - if (!remote_pid) { - pr_err("Cross mem copy failed. Invalid PID %d\n", args->pid); - err = -ESRCH; - goto copy_from_user_fail; - } - - remote_task = get_pid_task(remote_pid, PIDTYPE_PID); - if (!remote_pid) { - pr_err("Cross mem copy failed. Invalid PID or task died %d\n", - args->pid); - err = -ESRCH; - goto get_pid_task_fail; - } - - /* Check access permission */ - remote_mm = mm_access(remote_task, PTRACE_MODE_ATTACH_REALCREDS); - if (!remote_mm || IS_ERR(remote_mm)) { - err = IS_ERR(remote_mm) ? PTR_ERR(remote_mm) : -ESRCH; - if (err == -EACCES) { - pr_err("Cross mem copy failed. 
Permission error\n"); - err = -EPERM; - } else - pr_err("Cross mem copy failed. Invalid task %d\n", - err); - goto mm_access_fail; - } - - remote_p = kfd_get_process(remote_task); - if (!remote_p) { - pr_err("Cross mem copy failed. Invalid kfd process %d\n", - args->pid); - err = -EINVAL; - goto kfd_process_fail; - } - - if (KFD_IS_CROSS_MEMORY_WRITE(args->flags)) { - src_p = local_p; - dst_p = remote_p; - cma_op = "WRITE"; - pr_debug("CMA WRITE: local -> remote\n"); - } else { - src_p = remote_p; - dst_p = local_p; - cma_op = "READ"; - pr_debug("CMA READ: remote -> local\n"); - } - - - /* For each source kfd_range: - * - Find the BO. Each range has to be within the same BO. - * - Copy this range to single or multiple destination BOs. - * - dst_va_addr - will point to next va address into which data will - * be copied. - * - dst_bo & src_bo - the current destination and source BOs - * - src_offset & dst_offset - offset into the respective BOs from - * data will be sourced or copied - */ - dst_va_addr = dst_array[0].va_addr; - dst_va_addr_end = dst_va_addr + dst_array[0].size - 1; - mutex_lock(&dst_p->mutex); - dst_bo = kfd_process_find_bo_from_interval(dst_p, - dst_va_addr, - dst_va_addr_end); - mutex_unlock(&dst_p->mutex); - if (!dst_bo || dst_va_addr_end > dst_bo->it.last) { - pr_err("CMA %s failed. Invalid dst range\n", cma_op); - err = -EFAULT; - goto kfd_process_fail; - } - dst_offset = dst_va_addr - dst_bo->it.start; - - for (i = 0; i < args->src_mem_array_size; i++) { - uint64_t src_va_addr_end = src_array[i].va_addr + - src_array[i].size - 1; - uint64_t src_size_to_copy = src_array[i].size; - - mutex_lock(&src_p->mutex); - src_bo = kfd_process_find_bo_from_interval(src_p, - src_array[i].va_addr, - src_va_addr_end); - mutex_unlock(&src_p->mutex); - if (!src_bo || src_va_addr_end > src_bo->it.last) { - pr_err("CMA %s failed. Invalid src range\n", cma_op); - err = -EFAULT; - break; - } - - src_offset = src_array[i].va_addr - src_bo->it.start; - - /* Copy src_bo to one or multiple dst_bo(s) based on size and - * and current copy location. - */ - while (j < args->dst_mem_array_size) { - uint64_t copy_size; - int64_t space_left; - - /* Find the current copy_size. This will be smaller of - * the following - * - space left in the current dest memory range - * - data left to copy from source range - */ - space_left = (dst_array[j].va_addr + dst_array[j].size) - - dst_va_addr; - copy_size = (src_size_to_copy < space_left) ? - src_size_to_copy : space_left; - - /* Check both BOs belong to same device */ - if (src_bo->dev->kgd != dst_bo->dev->kgd) { - pr_err("CMA %s fail. Not same dev\n", cma_op); - err = -EINVAL; - break; - } - - /* Store prev fence. Release it when a later fence is - * created - */ - lfence = fence; - fence = NULL; - - err = dst_bo->dev->kfd2kgd->copy_mem_to_mem( - src_bo->dev->kgd, - src_bo->mem, src_offset, - dst_bo->mem, dst_offset, - copy_size, - &fence, &copied); - - if (err) { - pr_err("GPU CMA %s failed\n", cma_op); - err = -EFAULT; - break; - } - - /* Later fence available. Release old fence */ - if (fence && lfence) { - dma_fence_put(lfence); - lfence = NULL; - } - - total_copied += copied; - src_size_to_copy -= copied; - space_left -= copied; - dst_va_addr += copied; - dst_offset += copied; - src_offset += copied; - if (dst_va_addr > dst_bo->it.last + 1) { - pr_err("CMA %s fail. 
Mem overflow\n", cma_op); - err = -EFAULT; - break; - } - - /* If the cur dest range is full move to next one */ - if (space_left <= 0) { - if (++j >= args->dst_mem_array_size) - break; - - dst_va_addr = dst_array[j].va_addr; - dst_va_addr_end = dst_va_addr + - dst_array[j].size - 1; - dst_bo = kfd_process_find_bo_from_interval( - dst_p, - dst_va_addr, - dst_va_addr_end); - if (!dst_bo || - dst_va_addr_end > dst_bo->it.last) { - pr_err("CMA %s failed. Invalid dst range\n", - cma_op); - err = -EFAULT; - break; - } - dst_offset = dst_va_addr - dst_bo->it.start; - } - - /* If the cur src range is done, move to next one */ - if (src_size_to_copy <= 0) - break; - } - if (err) - break; - } - - /* Wait for the last fence irrespective of error condition */ - if (fence) { - if (dma_fence_wait_timeout(fence, false, msecs_to_jiffies(1000)) - < 0) - pr_err("CMA %s failed. BO timed out\n", cma_op); - dma_fence_put(fence); - } else if (lfence) { - pr_debug("GPU copy fail. But wait for prev DMA to finish\n"); - dma_fence_wait_timeout(lfence, true, msecs_to_jiffies(1000)); - dma_fence_put(lfence); - } - -kfd_process_fail: - mmput(remote_mm); -mm_access_fail: - put_task_struct(remote_task); -get_pid_task_fail: - put_pid(remote_pid); -copy_from_user_fail: - kfree(src_array); - - /* An error could happen after partial copy. In that case this will - * reflect partial amount of bytes copied - */ - args->bytes_copied = total_copied; - return err; -} - -static int kfd_ioctl_get_queue_wave_state(struct file *filep, - struct kfd_process *p, void *data) -{ - struct kfd_ioctl_get_queue_wave_state_args *args = data; - int r; - - mutex_lock(&p->mutex); - - r = pqm_get_wave_state(&p->pqm, args->queue_id, - (void __user *)args->ctl_stack_address, - &args->ctl_stack_used_size, - &args->save_area_used_size); - - mutex_unlock(&p->mutex); - - return r; -} - #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \ .cmd_drv = 0, .name = #ioctl} @@ -1995,54 +979,11 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_WAVE_CONTROL, kfd_ioctl_dbg_wave_control, 0), - AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, - kfd_ioctl_alloc_memory_of_gpu, 0), - - AMDKFD_IOCTL_DEF(AMDKFD_IOC_FREE_MEMORY_OF_GPU, - kfd_ioctl_free_memory_of_gpu, 0), - - AMDKFD_IOCTL_DEF(AMDKFD_IOC_MAP_MEMORY_TO_GPU, - kfd_ioctl_map_memory_to_gpu, 0), - - AMDKFD_IOCTL_DEF(AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, - kfd_ioctl_unmap_memory_from_gpu, 0), - - AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH, - kfd_ioctl_alloc_scratch_memory, 0), - - AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_CU_MASK, - kfd_ioctl_set_cu_mask, 0), - - AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE, - kfd_ioctl_set_process_dgpu_aperture, 0), - - AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_TRAP_HANDLER, - kfd_ioctl_set_trap_handler, 0), - - AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_PROCESS_APERTURES_NEW, - kfd_ioctl_get_process_apertures_new, 0), - - AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_DMABUF_INFO, - kfd_ioctl_get_dmabuf_info, 0), - - AMDKFD_IOCTL_DEF(AMDKFD_IOC_IMPORT_DMABUF, - kfd_ioctl_import_dmabuf, 0), + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_SCRATCH_BACKING_VA, + kfd_ioctl_set_scratch_backing_va, 0), AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_TILE_CONFIG, - kfd_ioctl_get_tile_config, 0), - - AMDKFD_IOCTL_DEF(AMDKFD_IOC_IPC_IMPORT_HANDLE, - kfd_ioctl_ipc_import_handle, 0), - - AMDKFD_IOCTL_DEF(AMDKFD_IOC_IPC_EXPORT_HANDLE, - kfd_ioctl_ipc_export_handle, 0), - - AMDKFD_IOCTL_DEF(AMDKFD_IOC_CROSS_MEMORY_COPY, - 
kfd_ioctl_cross_memory_copy, 0), - - AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_QUEUE_WAVE_STATE, - kfd_ioctl_get_queue_wave_state, 0) - + kfd_ioctl_get_tile_config, 0) }; #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) @@ -2138,34 +1079,19 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) { struct kfd_process *process; - struct kfd_dev *kfd; - unsigned long vm_pgoff; - unsigned long long mmap_type; process = kfd_get_process(current); if (IS_ERR(process)) return PTR_ERR(process); - vm_pgoff = vma->vm_pgoff; - vma->vm_pgoff = KFD_MMAP_OFFSET_VALUE_GET(vma->vm_pgoff); - mmap_type = vm_pgoff & KFD_MMAP_TYPE_MASK; - - switch (mmap_type) { - case KFD_MMAP_TYPE_DOORBELL: - kfd = kfd_device_by_id(KFD_MMAP_GPU_ID_GET(vm_pgoff)); - if (!kfd) - return -EFAULT; - return kfd_doorbell_mmap(kfd, process, vma); - - case KFD_MMAP_TYPE_EVENTS: + if ((vma->vm_pgoff & KFD_MMAP_DOORBELL_MASK) == + KFD_MMAP_DOORBELL_MASK) { + vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_DOORBELL_MASK; + return kfd_doorbell_mmap(process, vma); + } else if ((vma->vm_pgoff & KFD_MMAP_EVENTS_MASK) == + KFD_MMAP_EVENTS_MASK) { + vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_EVENTS_MASK; return kfd_event_mmap(process, vma); - - case KFD_MMAP_TYPE_RESERVED_MEM: - return kfd_reserved_mem_mmap(process, vma); - - default: - pr_err("Unsupported kfd mmap type %llx\n", mmap_type); - break; } return -EFAULT; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c deleted file mode 100644 index 71525cf..0000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ /dev/null @@ -1,1339 +0,0 @@ -#include -#include -#include -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) -#include -#endif -#include -#include "kfd_crat.h" -#include "kfd_priv.h" -#include "kfd_topology.h" - -/* GPU Processor ID base for dGPUs for which VCRAT needs to be created. - * GPU processor ID are expressed with Bit[31]=1. - * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs - * used in the CRAT. 
- */ -static uint32_t gpu_processor_id_low = 0x80001000; - -/* Return the next available gpu_processor_id and increment it for next GPU - * @total_cu_count - Total CUs present in the GPU including ones - * masked off - */ -static inline unsigned int get_and_inc_gpu_processor_id( - unsigned int total_cu_count) -{ - int current_id = gpu_processor_id_low; - - gpu_processor_id_low += total_cu_count; - return current_id; -} - -/* Static table to describe GPU Cache information */ -struct kfd_gpu_cache_info { - uint32_t cache_size; - uint32_t cache_level; - uint32_t flags; - /* Indicates how many Compute Units share this cache - * Value = 1 indicates the cache is not shared - */ - uint32_t num_cu_shared; -}; - -static struct kfd_gpu_cache_info kaveri_cache_info[] = { - { - /* TCP L1 Cache per CU */ - .cache_size = 16, - .cache_level = 1, - .flags = (CRAT_CACHE_FLAGS_ENABLED | - CRAT_CACHE_FLAGS_DATA_CACHE | - CRAT_CACHE_FLAGS_SIMD_CACHE), - .num_cu_shared = 1, - - }, - { - /* Scalar L1 Instruction Cache (in SQC module) per bank */ - .cache_size = 16, - .cache_level = 1, - .flags = (CRAT_CACHE_FLAGS_ENABLED | - CRAT_CACHE_FLAGS_INST_CACHE | - CRAT_CACHE_FLAGS_SIMD_CACHE), - .num_cu_shared = 2, - }, - { - /* Scalar L1 Data Cache (in SQC module) per bank */ - .cache_size = 8, - .cache_level = 1, - .flags = (CRAT_CACHE_FLAGS_ENABLED | - CRAT_CACHE_FLAGS_DATA_CACHE | - CRAT_CACHE_FLAGS_SIMD_CACHE), - .num_cu_shared = 2, - }, - - /* TODO: Add L2 Cache information */ -}; - - -static struct kfd_gpu_cache_info carrizo_cache_info[] = { - { - /* TCP L1 Cache per CU */ - .cache_size = 16, - .cache_level = 1, - .flags = (CRAT_CACHE_FLAGS_ENABLED | - CRAT_CACHE_FLAGS_DATA_CACHE | - CRAT_CACHE_FLAGS_SIMD_CACHE), - .num_cu_shared = 1, - }, - { - /* Scalar L1 Instruction Cache (in SQC module) per bank */ - .cache_size = 8, - .cache_level = 1, - .flags = (CRAT_CACHE_FLAGS_ENABLED | - CRAT_CACHE_FLAGS_INST_CACHE | - CRAT_CACHE_FLAGS_SIMD_CACHE), - .num_cu_shared = 4, - }, - { - /* Scalar L1 Data Cache (in SQC module) per bank. */ - .cache_size = 4, - .cache_level = 1, - .flags = (CRAT_CACHE_FLAGS_ENABLED | - CRAT_CACHE_FLAGS_DATA_CACHE | - CRAT_CACHE_FLAGS_SIMD_CACHE), - .num_cu_shared = 4, - }, - - /* TODO: Add L2 Cache information */ -}; - -/* NOTE: In future if more information is added to struct kfd_gpu_cache_info - * the following ASICs may need a separate table. 
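The TODOs in these tables leave L2 undescribed. Purely as a shape illustration of what addressing them would look like, an L2 entry reuses the same struct and flag bits; the size and sharing values below are placeholders, not real ASIC data:

/* Illustrative only: placeholder numbers, not taken from any ASIC */
static struct kfd_gpu_cache_info example_l2_cache_info[] = {
	{
		/* Unified L2, shared by all CUs behind one slice */
		.cache_size = 512,	/* KiB, placeholder */
		.cache_level = 2,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 16,	/* placeholder CU count */
	},
};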
- */ -#define hawaii_cache_info kaveri_cache_info -#define tonga_cache_info carrizo_cache_info -#define fiji_cache_info carrizo_cache_info -#define polaris10_cache_info carrizo_cache_info -#define polaris11_cache_info carrizo_cache_info -/* TODO - check & update Vega10 cache details */ -#define vega10_cache_info carrizo_cache_info -#define raven_cache_info carrizo_cache_info - -static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, - struct crat_subtype_computeunit *cu) -{ - dev->node_props.cpu_cores_count = cu->num_cpu_cores; - dev->node_props.cpu_core_id_base = cu->processor_id_low; -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) - dev->node_props.capability |= HSA_CAP_ATS_PRESENT; -#endif - - pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, - cu->processor_id_low); -} - -static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, - struct crat_subtype_computeunit *cu) -{ - dev->node_props.simd_id_base = cu->processor_id_low; - dev->node_props.simd_count = cu->num_simd_cores; - dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; - dev->node_props.max_waves_per_simd = cu->max_waves_simd; - dev->node_props.wave_front_size = cu->wave_front_size; - dev->node_props.array_count = cu->array_count; - dev->node_props.cu_per_simd_array = cu->num_cu_per_array; - dev->node_props.simd_per_cu = cu->num_simd_per_cu; - dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; - if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) - dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; - pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low); -} - -/* kfd_parse_subtype_cu - parse compute unit subtypes and attach it to correct - * topology device present in the device_list - */ -static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu, - struct list_head *device_list) -{ - struct kfd_topology_device *dev; - - pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", - cu->proximity_domain, cu->hsa_capability); - list_for_each_entry(dev, device_list, list) { - if (cu->proximity_domain == dev->proximity_domain) { - if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) - kfd_populated_cu_info_cpu(dev, cu); - - if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) - kfd_populated_cu_info_gpu(dev, cu); - break; - } - } - - return 0; -} - -static struct kfd_mem_properties * -find_subtype_mem(uint32_t heap_type, uint32_t flags, uint32_t width, - struct kfd_topology_device *dev) -{ - struct kfd_mem_properties *props; - - list_for_each_entry(props, &dev->mem_props, list) { - if (props->heap_type == heap_type - && props->flags == flags - && props->width == width) - return props; - } - - return NULL; -} -/* kfd_parse_subtype_mem - parse memory subtypes and attach it to correct - * topology device present in the device_list - */ -static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem, - struct list_head *device_list) -{ - struct kfd_mem_properties *props; - struct kfd_topology_device *dev; - uint32_t heap_type; - uint64_t size_in_bytes; - uint32_t flags = 0; - uint32_t width; - - pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n", - mem->proximity_domain); - list_for_each_entry(dev, device_list, list) { - if (mem->proximity_domain == dev->proximity_domain) { - /* We're on GPU node */ - if (dev->node_props.cpu_cores_count == 0) { - /* APU */ - if (mem->visibility_type == 0) - heap_type = - HSA_MEM_HEAP_TYPE_FB_PRIVATE; - /* dGPU */ - else - heap_type = 
mem->visibility_type; - } else - heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; - - if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) - flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; - if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) - flags |= HSA_MEM_FLAGS_NON_VOLATILE; - - size_in_bytes = - ((uint64_t)mem->length_high << 32) + - mem->length_low; - width = mem->width; - - /* Multiple banks of the same type are aggregated into - * one. User mode doesn't care about multiple physical - * memory segments. It's managed as a single virtual - * heap for user mode. - */ - props = find_subtype_mem(heap_type, flags, width, dev); - if (props) { - props->size_in_bytes += size_in_bytes; - break; - } - - props = kfd_alloc_struct(props); - if (!props) - return -ENOMEM; - - props->heap_type = heap_type; - props->flags = flags; - props->size_in_bytes = size_in_bytes; - props->width = width; - - dev->node_props.mem_banks_count++; - list_add_tail(&props->list, &dev->mem_props); - - break; - } - } - - return 0; -} - -/* kfd_parse_subtype_cache - parse cache subtypes and attach it to correct - * topology device present in the device_list - */ -static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache, - struct list_head *device_list) -{ - struct kfd_cache_properties *props; - struct kfd_topology_device *dev; - uint32_t id; - uint32_t total_num_of_cu; - - id = cache->processor_id_low; - - list_for_each_entry(dev, device_list, list) { - total_num_of_cu = (dev->node_props.array_count * - dev->node_props.cu_per_simd_array); - - /* Cache information in CRAT doesn't have proximity_domain - * information as it is associated with a CPU core or GPU - * Compute Unit. So map the cache using CPU core Id or SIMD - * (GPU) ID. - * TODO: This works because currently we can safely assume that - * Compute Units are parsed before caches are parsed.
In - * future, remove this dependency - */ - if ((id >= dev->node_props.cpu_core_id_base && - id <= dev->node_props.cpu_core_id_base + - dev->node_props.cpu_cores_count) || - (id >= dev->node_props.simd_id_base && - id < dev->node_props.simd_id_base + - total_num_of_cu)) { - props = kfd_alloc_struct(props); - if (!props) - return -ENOMEM; - - props->processor_id_low = id; - props->cache_level = cache->cache_level; - props->cache_size = cache->cache_size; - props->cacheline_size = cache->cache_line_size; - props->cachelines_per_tag = cache->lines_per_tag; - props->cache_assoc = cache->associativity; - props->cache_latency = cache->cache_latency; - memcpy(props->sibling_map, cache->sibling_map, - sizeof(props->sibling_map)); - - if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) - props->cache_type |= HSA_CACHE_TYPE_DATA; - if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) - props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; - if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) - props->cache_type |= HSA_CACHE_TYPE_CPU; - if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) - props->cache_type |= HSA_CACHE_TYPE_HSACU; - - dev->cache_count++; - dev->node_props.caches_count++; - list_add_tail(&props->list, &dev->cache_props); - - break; - } - } - - return 0; -} - -/* kfd_parse_subtype_iolink - parse iolink subtypes and attach it to correct - * topology device present in the device_list - */ -static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink, - struct list_head *device_list) -{ - struct kfd_iolink_properties *props = NULL, *props2; - struct kfd_topology_device *dev, *cpu_dev; - uint32_t id_from; - uint32_t id_to; - - id_from = iolink->proximity_domain_from; - id_to = iolink->proximity_domain_to; - - pr_debug("Found IO link entry in CRAT table with id_from=%d\n", - id_from); - list_for_each_entry(dev, device_list, list) { - if (id_from == dev->proximity_domain) { - props = kfd_alloc_struct(props); - if (!props) - return -ENOMEM; - - props->node_from = id_from; - props->node_to = id_to; - props->ver_maj = iolink->version_major; - props->ver_min = iolink->version_minor; - props->iolink_type = iolink->io_interface_type; - - if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) - props->weight = 20; - else - props->weight = node_distance(id_from, id_to); - - props->min_latency = iolink->minimum_latency; - props->max_latency = iolink->maximum_latency; - props->min_bandwidth = iolink->minimum_bandwidth_mbs; - props->max_bandwidth = iolink->maximum_bandwidth_mbs; - props->rec_transfer_size = - iolink->recommended_transfer_size; - - dev->io_link_count++; - dev->node_props.io_links_count++; - list_add_tail(&props->list, &dev->io_link_props); - break; - } - } - - /* CPU topology is created before GPUs are detected, so CPU->GPU - * links are not built at that time. If a PCIe type is discovered, it - * means a GPU is detected and we are adding GPU->CPU to the topology. - * At this time, also add the corresponded CPU->GPU link. 
- */ - if (props && props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) { - cpu_dev = kfd_topology_device_by_proximity_domain(id_to); - if (!cpu_dev) - return -ENODEV; - /* same everything but the other direction */ - props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL); - props2->node_from = id_to; - props2->node_to = id_from; - props2->kobj = NULL; - cpu_dev->io_link_count++; - cpu_dev->node_props.io_links_count++; - list_add_tail(&props2->list, &cpu_dev->io_link_props); - } - - return 0; -} - -/* kfd_parse_subtype - parse subtypes and attach it to correct topology device - * present in the device_list - * @sub_type_hdr - subtype section of crat_image - * @device_list - list of topology devices present in this crat_image - */ -static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr, - struct list_head *device_list) -{ - struct crat_subtype_computeunit *cu; - struct crat_subtype_memory *mem; - struct crat_subtype_cache *cache; - struct crat_subtype_iolink *iolink; - int ret = 0; - - switch (sub_type_hdr->type) { - case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: - cu = (struct crat_subtype_computeunit *)sub_type_hdr; - ret = kfd_parse_subtype_cu(cu, device_list); - break; - case CRAT_SUBTYPE_MEMORY_AFFINITY: - mem = (struct crat_subtype_memory *)sub_type_hdr; - ret = kfd_parse_subtype_mem(mem, device_list); - break; - case CRAT_SUBTYPE_CACHE_AFFINITY: - cache = (struct crat_subtype_cache *)sub_type_hdr; - ret = kfd_parse_subtype_cache(cache, device_list); - break; - case CRAT_SUBTYPE_TLB_AFFINITY: - /* For now, nothing to do here */ - pr_debug("Found TLB entry in CRAT table (not processing)\n"); - break; - case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: - /* For now, nothing to do here */ - pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n"); - break; - case CRAT_SUBTYPE_IOLINK_AFFINITY: - iolink = (struct crat_subtype_iolink *)sub_type_hdr; - ret = kfd_parse_subtype_iolink(iolink, device_list); - break; - default: - pr_warn("Unknown subtype %d in CRAT\n", - sub_type_hdr->type); - } - - return ret; -} - -/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT - * create a kfd_topology_device and add in to device_list. 
- * Also parse CRAT subtypes and attach them to the appropriate
- * kfd_topology_device
- * @crat_image - input image containing CRAT
- * @device_list - [OUT] list of kfd_topology_device generated after
- *        parsing crat_image
- * @proximity_domain - Proximity domain of the first device in the table
- *
- * Return - 0 if successful else -ve value
- */
-int kfd_parse_crat_table(void *crat_image,
-        struct list_head *device_list,
-        uint32_t proximity_domain)
-{
-    struct kfd_topology_device *top_dev = NULL;
-    struct crat_subtype_generic *sub_type_hdr;
-    uint16_t node_id;
-    int ret = 0;
-    struct crat_header *crat_table = (struct crat_header *)crat_image;
-    uint16_t num_nodes;
-    uint32_t image_len;
-    uint32_t last_header_type, last_header_length;
-
-    if (!crat_image)
-        return -EINVAL;
-
-    if (!list_empty(device_list)) {
-        pr_warn("Error device list should be empty\n");
-        return -EINVAL;
-    }
-
-    num_nodes = crat_table->num_domains;
-    image_len = crat_table->length;
-
-    pr_info("Parsing CRAT table with %d nodes\n", num_nodes);
-
-    for (node_id = 0; node_id < num_nodes; node_id++) {
-        top_dev = kfd_create_topology_device(device_list);
-        if (!top_dev)
-            break;
-        top_dev->proximity_domain = proximity_domain++;
-    }
-
-    if (!top_dev) {
-        ret = -ENOMEM;
-        goto err;
-    }
-
-    memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH);
-    memcpy(top_dev->oem_table_id, crat_table->oem_table_id,
-            CRAT_OEMTABLEID_LENGTH);
-    top_dev->oem_revision = crat_table->oem_revision;
-
-    last_header_type = last_header_length = 0;
-    sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);
-    while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) <
-            ((char *)crat_image) + image_len) {
-        pr_debug("Parsing CRAT subtype header %p enabled: %s type: 0x%x length %d\n",
-            sub_type_hdr,
-            (sub_type_hdr->flags &
-                CRAT_SUBTYPE_FLAGS_ENABLED)
-                ? "true" : "false",
-            sub_type_hdr->type,
-            sub_type_hdr->length);
-
-        if (sub_type_hdr->length == 0) {
-            pr_err("Parsing wrong CRAT's subtype header last header type: %d last header len %d\n",
-                last_header_type, last_header_length);
-            pr_err("Current header type %d length %d\n",
-                sub_type_hdr->type, sub_type_hdr->length);
-            break;
-        }
-
-        if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) {
-            ret = kfd_parse_subtype(sub_type_hdr, device_list);
-            if (ret != 0)
-                break;
-        }
-
-        last_header_type = sub_type_hdr->type;
-        last_header_length = sub_type_hdr->length;
-        sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
-                sub_type_hdr->length);
-    }
-
-err:
-    if (ret)
-        kfd_release_topology_device_list(device_list);
-
-    return ret;
-}
-
-/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
-static int fill_in_pcache(struct crat_subtype_cache *pcache,
-        struct kfd_gpu_cache_info *pcache_info,
-        struct kfd_cu_info *cu_info,
-        int mem_available,
-        int cu_bitmask,
-        int cache_type, unsigned int cu_processor_id,
-        int cu_block)
-{
-    unsigned int cu_sibling_map_mask;
-    int first_active_cu;
-
-    /* First check if enough memory is available */
-    if (sizeof(struct crat_subtype_cache) > mem_available)
-        return -ENOMEM;
-
-    cu_sibling_map_mask = cu_bitmask;
-    cu_sibling_map_mask >>= cu_block;
-    cu_sibling_map_mask &=
-        ((1 << pcache_info[cache_type].num_cu_shared) - 1);
-    first_active_cu = ffs(cu_sibling_map_mask);
-
-    /* CU could be inactive. In case of shared cache find the first active
-     * CU, and in case of non-shared cache check if the CU is inactive. If
-     * inactive, skip it
-     */
-    if (first_active_cu) {
-        memset(pcache, 0, sizeof(struct crat_subtype_cache));
-        pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY;
-        pcache->length = sizeof(struct crat_subtype_cache);
-        pcache->flags = pcache_info[cache_type].flags;
-        pcache->processor_id_low = cu_processor_id
-                + (first_active_cu - 1);
-        pcache->cache_level = pcache_info[cache_type].cache_level;
-        pcache->cache_size = pcache_info[cache_type].cache_size;
-
-        /* Sibling map is w.r.t processor_id_low, so shift out
-         * inactive CU
-         */
-        cu_sibling_map_mask =
-            cu_sibling_map_mask >> (first_active_cu - 1);
-
-        pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF);
-        pcache->sibling_map[1] =
-            (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
-        pcache->sibling_map[2] =
-            (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
-        pcache->sibling_map[3] =
-            (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
-        return 0;
-    }
-    return 1;
-}
-
-/* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info
- * tables
- *
- * @kdev - [IN] GPU device
- * @gpu_processor_id - [IN] GPU processor ID to which these caches
- *            associate
- * @available_size - [IN] Amount of memory available in pcache
- * @cu_info - [IN] Compute Unit info obtained from KGD
- * @pcache - [OUT] memory into which cache data is to be filled in.
- * @size_filled - [OUT] amount of data used up in pcache.
- * @num_of_entries - [OUT] number of caches added
- */
-static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
-        int gpu_processor_id,
-        int available_size,
-        struct kfd_cu_info *cu_info,
-        struct crat_subtype_cache *pcache,
-        int *size_filled,
-        int *num_of_entries)
-{
-    struct kfd_gpu_cache_info *pcache_info;
-    int num_of_cache_types = 0;
-    int i, j, k;
-    int ct = 0;
-    int mem_available = available_size;
-    unsigned int cu_processor_id;
-    int ret;
-
-    switch (kdev->device_info->asic_family) {
-    case CHIP_KAVERI:
-        pcache_info = kaveri_cache_info;
-        num_of_cache_types = ARRAY_SIZE(kaveri_cache_info);
-        break;
-    case CHIP_HAWAII:
-        pcache_info = hawaii_cache_info;
-        num_of_cache_types = ARRAY_SIZE(hawaii_cache_info);
-        break;
-    case CHIP_CARRIZO:
-        pcache_info = carrizo_cache_info;
-        num_of_cache_types = ARRAY_SIZE(carrizo_cache_info);
-        break;
-    case CHIP_TONGA:
-        pcache_info = tonga_cache_info;
-        num_of_cache_types = ARRAY_SIZE(tonga_cache_info);
-        break;
-    case CHIP_FIJI:
-        pcache_info = fiji_cache_info;
-        num_of_cache_types = ARRAY_SIZE(fiji_cache_info);
-        break;
-    case CHIP_POLARIS10:
-        pcache_info = polaris10_cache_info;
-        num_of_cache_types = ARRAY_SIZE(polaris10_cache_info);
-        break;
-    case CHIP_POLARIS11:
-        pcache_info = polaris11_cache_info;
-        num_of_cache_types = ARRAY_SIZE(polaris11_cache_info);
-        break;
-    case CHIP_VEGA10:
-        pcache_info = vega10_cache_info;
-        num_of_cache_types = ARRAY_SIZE(vega10_cache_info);
-        break;
-    case CHIP_RAVEN:
-        pcache_info = raven_cache_info;
-        num_of_cache_types = ARRAY_SIZE(raven_cache_info);
-        break;
-    default:
-        return -EINVAL;
-    }
-
-    *size_filled = 0;
-    *num_of_entries = 0;
-
-    /* For each type of cache listed in the kfd_gpu_cache_info table,
-     * go through all available Compute Units.
-     */
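The masking in fill_in_pcache() is the subtle part: the CU bitmap is shifted down to the current CU block, masked to the num_cu_shared CUs that share one cache instance, and then re-based on the first active CU so the sibling map is relative to processor_id_low. A small standalone sketch of that arithmetic, using the POSIX ffs() in place of the kernel's:

#include <stdint.h>
#include <stdio.h>
#include <strings.h>

/* Extract the sibling mask for the cache at CU block 'cu_block', where
 * 'num_cu_shared' CUs share one cache instance. Returns 0 when every CU
 * in the block is inactive, i.e. when the cache entry should be skipped.
 */
static uint32_t cache_sibling_mask(uint32_t cu_bitmap, int cu_block,
                                   int num_cu_shared)
{
    uint32_t mask = (cu_bitmap >> cu_block) &
                    ((1u << num_cu_shared) - 1);
    int first_active = ffs((int)mask);    /* 1-based; 0 if no bit set */

    if (!first_active)
        return 0;
    /* shift out the inactive low-order CUs, as fill_in_pcache() does */
    return mask >> (first_active - 1);
}

int main(void)
{
    /* CUs 2 and 3 active in a 4-CU block that shares one cache */
    printf("0x%x\n", cache_sibling_mask(0xc, 0, 4));    /* prints 0x3 */
    return 0;
}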
-    /* The [i,j,k] loop iterates over shader engines (i), shader
-     * arrays (j) and CU blocks (k):
-     * if kfd_gpu_cache_info.num_cu_shared == 1,
-     * it visits every available CU individually;
-     * if kfd_gpu_cache_info.num_cu_shared != 1,
-     * it considers only one CU from each shared group
-     */
-
-    for (ct = 0; ct < num_of_cache_types; ct++) {
-        cu_processor_id = gpu_processor_id;
-        for (i = 0; i < cu_info->num_shader_engines; i++) {
-            for (j = 0; j < cu_info->num_shader_arrays_per_engine;
-                j++) {
-                for (k = 0; k < cu_info->num_cu_per_sh;
-                    k += pcache_info[ct].num_cu_shared) {
-
-                    ret = fill_in_pcache(pcache,
-                        pcache_info,
-                        cu_info,
-                        mem_available,
-                        cu_info->cu_bitmap[i][j],
-                        ct,
-                        cu_processor_id,
-                        k);
-
-                    if (ret < 0)
-                        break;
-
-                    if (!ret) {
-                        pcache++;
-                        (*num_of_entries)++;
-                        mem_available -=
-                            sizeof(*pcache);
-                        (*size_filled) +=
-                            sizeof(*pcache);
-                    }
-
-                    /* Move to next CU block */
-                    cu_processor_id +=
-                        pcache_info[ct].num_cu_shared;
-                }
-            }
-        }
-    }
-
-    pr_debug("Added [%d] GPU cache entries\n", *num_of_entries);
-
-    return 0;
-}
-
-/*
- * kfd_create_crat_image_acpi - Allocates memory for CRAT image and
- * copies CRAT from ACPI (if available).
- * NOTE: Call kfd_destroy_crat_image to free CRAT image memory
- *
- * @crat_image: CRAT read from ACPI. If no CRAT in ACPI then
- *        crat_image will be NULL
- * @size: [OUT] size of crat_image
- *
- * Return 0 if successful else return -ve value
- */
-#ifdef CONFIG_ACPI
-int kfd_create_crat_image_acpi(void **crat_image, size_t *size)
-{
-    struct acpi_table_header *crat_table;
-    acpi_status status;
-    void *pcrat_image;
-
-    if (!crat_image)
-        return -EINVAL;
-
-    *crat_image = NULL;
-
-    /* Fetch the CRAT table from ACPI */
-    status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table);
-    if (status == AE_NOT_FOUND) {
-        pr_warn("CRAT table not found\n");
-        return -ENODATA;
-    } else if (ACPI_FAILURE(status)) {
-        const char *err = acpi_format_exception(status);
-
-        pr_err("CRAT table error: %s\n", err);
-        return -EINVAL;
-    }
-
-    if (ignore_crat) {
-        pr_info("CRAT table disabled by module option\n");
-        return -ENODATA;
-    }
-
-    pcrat_image = kmalloc(crat_table->length, GFP_KERNEL);
-    if (!pcrat_image) {
-        pr_err("No memory for allocating CRAT image\n");
-        return -ENOMEM;
-    }
-
-    memcpy(pcrat_image, crat_table, crat_table->length);
-
-    *crat_image = pcrat_image;
-    *size = crat_table->length;
-
-    return 0;
-}
-#endif
-
-/* Memory required to create Virtual CRAT.
- * Since there is no easy way to predict the amount of memory required, the
- * following amounts are allocated for CPU and GPU Virtual CRAT. This is
- * expected to cover all known conditions. But to be safe, an additional
- * check is put in the code to ensure we don't overwrite.
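The stride described above is what keeps shared caches from being double-counted: the innermost loop advances k by num_cu_shared, so a cache shared by N CUs yields one candidate entry per N-CU group. A stripped-down sketch of the iteration, with engine/array/CU counts invented for illustration:

#include <stdio.h>

int main(void)
{
    int num_shader_engines = 4, num_arrays_per_engine = 1;
    int num_cu_per_sh = 8, num_cu_shared = 4;    /* invented values */
    int entries = 0;

    for (int i = 0; i < num_shader_engines; i++)
        for (int j = 0; j < num_arrays_per_engine; j++)
            /* one candidate cache entry per shared CU group */
            for (int k = 0; k < num_cu_per_sh; k += num_cu_shared)
                entries++;

    /* 4 engines * 1 array * (8 / 4) groups = 8 candidates */
    printf("%d candidate cache entries\n", entries);
    return 0;
}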
- */ -#define VCRAT_SIZE_FOR_CPU (2 * PAGE_SIZE) -#define VCRAT_SIZE_FOR_GPU (3 * PAGE_SIZE) - -/* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node - * - * @numa_node_id: CPU NUMA node id - * @avail_size: Available size in the memory - * @sub_type_hdr: Memory into which compute info will be filled in - * - * Return 0 if successful else return -ve value - */ -static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size, - int proximity_domain, - struct crat_subtype_computeunit *sub_type_hdr) -{ - const struct cpumask *cpumask; - - *avail_size -= sizeof(struct crat_subtype_computeunit); - if (*avail_size < 0) - return -ENOMEM; - - memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); - - /* Fill in subtype header data */ - sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; - sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); - sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; - - cpumask = cpumask_of_node(numa_node_id); - - /* Fill in CU data */ - sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT; - sub_type_hdr->proximity_domain = proximity_domain; - sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id); - if (sub_type_hdr->processor_id_low == -1) - return -EINVAL; - - sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask); - - return 0; -} - -/* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node - * - * @numa_node_id: CPU NUMA node id - * @avail_size: Available size in the memory - * @sub_type_hdr: Memory into which compute info will be filled in - * - * Return 0 if successful else return -ve value - */ -static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size, - int proximity_domain, - struct crat_subtype_memory *sub_type_hdr) -{ - uint64_t mem_in_bytes = 0; - pg_data_t *pgdat; - int zone_type; - - *avail_size -= sizeof(struct crat_subtype_memory); - if (*avail_size < 0) - return -ENOMEM; - - memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); - - /* Fill in subtype header data */ - sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; - sub_type_hdr->length = sizeof(struct crat_subtype_memory); - sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; - - /* Fill in Memory Subunit data */ - - /* Unlike si_meminfo, si_meminfo_node is not exported. 
So - * the following lines are duplicated from si_meminfo_node - * function - */ - pgdat = NODE_DATA(numa_node_id); - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) - mem_in_bytes += pgdat->node_zones[zone_type].managed_pages; - mem_in_bytes <<= PAGE_SHIFT; - - sub_type_hdr->length_low = lower_32_bits(mem_in_bytes); - sub_type_hdr->length_high = upper_32_bits(mem_in_bytes); - sub_type_hdr->proximity_domain = proximity_domain; - - return 0; -} - -#ifdef CONFIG_X86_64 -static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size, - uint32_t *num_entries, - struct crat_subtype_iolink *sub_type_hdr) -{ - int nid; - struct cpuinfo_x86 *c = &cpu_data(0); - uint8_t link_type; - - if (c->x86_vendor == X86_VENDOR_AMD) - link_type = CRAT_IOLINK_TYPE_HYPERTRANSPORT; - else - link_type = CRAT_IOLINK_TYPE_QPI_1_1; - - *num_entries = 0; - - /* Create IO links from this node to other CPU nodes */ - for_each_online_node(nid) { - if (nid == numa_node_id) /* node itself */ - continue; - - *avail_size -= sizeof(struct crat_subtype_iolink); - if (*avail_size < 0) - return -ENOMEM; - - memset(sub_type_hdr, 0, sizeof(struct crat_subtype_iolink)); - - /* Fill in subtype header data */ - sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY; - sub_type_hdr->length = sizeof(struct crat_subtype_iolink); - sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; - - /* Fill in IO link data */ - sub_type_hdr->proximity_domain_from = numa_node_id; - sub_type_hdr->proximity_domain_to = nid; - sub_type_hdr->io_interface_type = link_type; - - (*num_entries)++; - sub_type_hdr++; - } - - return 0; -} -#endif - -/* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU - * - * @pcrat_image: Fill in VCRAT for CPU - * @size: [IN] allocated size of crat_image. - * [OUT] actual size of data filled in crat_image - */ -static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size) -{ - struct crat_header *crat_table = (struct crat_header *)pcrat_image; - struct crat_subtype_generic *sub_type_hdr; - int avail_size = *size; - int numa_node_id; - int ret = 0; -#ifdef CONFIG_ACPI - struct acpi_table_header *acpi_table; - acpi_status status; -#endif -#ifdef CONFIG_X86_64 - uint32_t entries = 0; -#endif - - if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_CPU) - return -EINVAL; - - /* Fill in CRAT Header. - * Modify length and total_entries as subunits are added. 
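The duplicated si_meminfo_node() logic simply totals the managed pages of every zone on the node and converts pages to bytes with PAGE_SHIFT before splitting the result into the subtype's 32-bit halves. A userspace sketch of the same arithmetic, with invented zone sizes and 4 KiB pages assumed:

#include <stdint.h>
#include <stdio.h>

#define SKETCH_PAGE_SHIFT 12    /* assume 4 KiB pages */

int main(void)
{
    /* invented managed page counts for the node's zones */
    uint64_t zone_managed_pages[] = { 0x1000, 0x40000, 0x100000 };
    uint64_t mem_in_bytes = 0;

    for (size_t i = 0;
         i < sizeof(zone_managed_pages) / sizeof(zone_managed_pages[0]);
         i++)
        mem_in_bytes += zone_managed_pages[i];
    mem_in_bytes <<= SKETCH_PAGE_SHIFT;    /* pages -> bytes */

    /* split into the CRAT subtype's length_low/length_high words;
     * prints low=0x41000000 high=0x00000001 for the values above
     */
    printf("low=0x%08x high=0x%08x\n",
           (uint32_t)(mem_in_bytes & 0xffffffffu),
           (uint32_t)(mem_in_bytes >> 32));
    return 0;
}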
- */ - avail_size -= sizeof(struct crat_header); - if (avail_size < 0) - return -ENOMEM; - - memset(crat_table, 0, sizeof(struct crat_header)); - memcpy(&crat_table->signature, CRAT_SIGNATURE, - sizeof(crat_table->signature)); - crat_table->length = sizeof(struct crat_header); - -#ifdef CONFIG_ACPI - status = acpi_get_table("DSDT", 0, &acpi_table); - if (status == AE_NOT_FOUND) - pr_warn("DSDT table not found for OEM information\n"); - else { - crat_table->oem_revision = acpi_table->revision; - memcpy(crat_table->oem_id, acpi_table->oem_id, - CRAT_OEMID_LENGTH); - memcpy(crat_table->oem_table_id, acpi_table->oem_table_id, - CRAT_OEMTABLEID_LENGTH); - } -#else - crat_table->oem_revision = 0; - memcpy(crat_table->oem_id, "INV", CRAT_OEMID_LENGTH); - memcpy(crat_table->oem_table_id, "UNAVAIL", CRAT_OEMTABLEID_LENGTH); -#endif - crat_table->total_entries = 0; - crat_table->num_domains = 0; - - sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); - - for_each_online_node(numa_node_id) { - if (kfd_numa_node_to_apic_id(numa_node_id) == -1) - continue; - - /* Fill in Subtype: Compute Unit */ - ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size, - crat_table->num_domains, - (struct crat_subtype_computeunit *)sub_type_hdr); - if (ret < 0) - return ret; - crat_table->length += sub_type_hdr->length; - crat_table->total_entries++; - - sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + - sub_type_hdr->length); - - /* Fill in Subtype: Memory */ - ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size, - crat_table->num_domains, - (struct crat_subtype_memory *)sub_type_hdr); - if (ret < 0) - return ret; - crat_table->length += sub_type_hdr->length; - crat_table->total_entries++; - - sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + - sub_type_hdr->length); - - /* Fill in Subtype: IO Link */ -#ifdef CONFIG_X86_64 - ret = kfd_fill_iolink_info_for_cpu(numa_node_id, &avail_size, - &entries, - (struct crat_subtype_iolink *)sub_type_hdr); - if (ret < 0) - return ret; - crat_table->length += (sub_type_hdr->length * entries); - crat_table->total_entries += entries; - - sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + - sub_type_hdr->length * entries); -#else - pr_info("IO link not available for non x86 platforms\n"); -#endif - - crat_table->num_domains++; - } - - /* TODO: Add cache Subtype for CPU. - * Currently, CPU cache information is available in function - * detect_cache_attributes(cpu) defined in the file - * ./arch/x86/kernel/cpu/intel_cacheinfo.c. This function is not - * exported and to get the same information the code needs to be - * duplicated. 
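The assembly loop above leans on one invariant of the CRAT image: every subtype is self-describing, so the writer bumps crat_table->length and advances the write cursor by sub_type_hdr->length after each subunit, and kfd_parse_crat_table() can walk the image back the same way. A minimal sketch of that append-and-advance pattern over a byte buffer; the record layout is illustrative, not the real CRAT layout:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative self-describing record, like crat_subtype_generic */
struct record {
    uint8_t type;
    uint8_t length;    /* total record size in bytes, header included */
};

/* Append one record header and return the next write position */
static uint8_t *append_record(uint8_t *cursor, uint8_t type, uint8_t length,
                              uint32_t *total_len, uint32_t *total_entries)
{
    struct record rec = { .type = type, .length = length };

    memcpy(cursor, &rec, sizeof(rec));
    *total_len += length;
    (*total_entries)++;
    return cursor + length;    /* advance by the record's own length */
}

int main(void)
{
    uint8_t image[64] = { 0 };
    uint8_t *cursor = image;
    uint32_t len = 0, entries = 0;

    cursor = append_record(cursor, 0 /* CU */, 8, &len, &entries);
    cursor = append_record(cursor, 1 /* memory */, 16, &len, &entries);
    (void)cursor;
    printf("%u entries, %u bytes\n", entries, len);    /* 2 entries, 24 bytes */
    return 0;
}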
-     */
-
-    *size = crat_table->length;
-    pr_info("Virtual CRAT table created for CPU\n");
-
-    return 0;
-}
-
-static int kfd_fill_gpu_memory_affinity(int *avail_size,
-        struct kfd_dev *kdev, uint8_t type, uint64_t size,
-        struct crat_subtype_memory *sub_type_hdr,
-        uint32_t proximity_domain,
-        const struct kfd_local_mem_info *local_mem_info)
-{
-    *avail_size -= sizeof(struct crat_subtype_memory);
-    if (*avail_size < 0)
-        return -ENOMEM;
-
-    memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory));
-    sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;
-    sub_type_hdr->length = sizeof(struct crat_subtype_memory);
-    sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
-
-    sub_type_hdr->proximity_domain = proximity_domain;
-
-    pr_debug("Fill gpu memory affinity - type 0x%x size 0x%llx\n",
-            type, size);
-
-    sub_type_hdr->length_low = lower_32_bits(size);
-    sub_type_hdr->length_high = upper_32_bits(size);
-
-    sub_type_hdr->width = local_mem_info->vram_width;
-    sub_type_hdr->visibility_type = type;
-
-    return 0;
-}
-
-/* kfd_fill_gpu_direct_io_link - Fill in direct io link from GPU
- * to its NUMA node
- * @avail_size: Available size in the memory
- * @kdev - [IN] GPU device
- * @sub_type_hdr: Memory into which io link info will be filled in
- * @proximity_domain - proximity domain of the GPU node
- *
- * Return 0 if successful else return -ve value
- */
-static int kfd_fill_gpu_direct_io_link(int *avail_size,
-        struct kfd_dev *kdev,
-        struct crat_subtype_iolink *sub_type_hdr,
-        uint32_t proximity_domain)
-{
-    *avail_size -= sizeof(struct crat_subtype_iolink);
-    if (*avail_size < 0)
-        return -ENOMEM;
-
-    memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));
-
-    /* Fill in subtype header data */
-    sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
-    sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
-    sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
-
-    /* Fill in IOLINK subtype.
-     * TODO: Fill-in other fields of iolink subtype
-     */
-    sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;
-    sub_type_hdr->proximity_domain_from = proximity_domain;
-#ifdef CONFIG_NUMA
-    if (kdev->pdev->dev.numa_node == NUMA_NO_NODE)
-        sub_type_hdr->proximity_domain_to = 0;
-    else
-        sub_type_hdr->proximity_domain_to = kdev->pdev->dev.numa_node;
-#else
-    sub_type_hdr->proximity_domain_to = 0;
-#endif
-    return 0;
-}
-
-/* kfd_create_vcrat_image_gpu - Create Virtual CRAT for GPU
- *
- * @pcrat_image: Fill in VCRAT for GPU
- * @size: [IN] allocated size of crat_image.
- *        [OUT] actual size of data filled in crat_image
- */
-static int kfd_create_vcrat_image_gpu(void *pcrat_image,
-        size_t *size, struct kfd_dev *kdev,
-        uint32_t proximity_domain)
-{
-    struct crat_header *crat_table = (struct crat_header *)pcrat_image;
-    struct crat_subtype_generic *sub_type_hdr;
-    struct crat_subtype_computeunit *cu;
-    struct kfd_cu_info cu_info;
-    int avail_size = *size;
-    uint32_t total_num_of_cu;
-    int num_of_cache_entries = 0;
-    int cache_mem_filled = 0;
-    int ret = 0;
-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
-    struct amd_iommu_device_info iommu_info;
-    const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP |
-            AMD_IOMMU_DEVICE_FLAG_PRI_SUP |
-            AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
-#endif
-    struct kfd_local_mem_info local_mem_info;
-
-    if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU)
-        return -EINVAL;
-
-    /* Fill the CRAT Header.
-     * Modify length and total_entries as subunits are added.
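One subtlety in kfd_fill_gpu_direct_io_link() above: a discrete GPU's PCI device may report no NUMA affinity, so NUMA_NO_NODE is mapped to proximity domain 0 instead of being copied through as -1. A tiny sketch of that fallback, with a stand-in constant for the kernel's NUMA_NO_NODE:

#include <stdint.h>
#include <stdio.h>

#define SKETCH_NUMA_NO_NODE (-1)    /* stand-in for NUMA_NO_NODE */

/* Map a possibly-unknown NUMA node to a CRAT proximity domain,
 * defaulting to domain 0 as the code above does.
 */
static uint32_t numa_node_to_domain(int numa_node)
{
    return numa_node == SKETCH_NUMA_NO_NODE ? 0 : (uint32_t)numa_node;
}

int main(void)
{
    printf("%u %u\n", numa_node_to_domain(SKETCH_NUMA_NO_NODE),
           numa_node_to_domain(1));    /* prints: 0 1 */
    return 0;
}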
- */ - avail_size -= sizeof(struct crat_header); - if (avail_size < 0) - return -ENOMEM; - - memset(crat_table, 0, sizeof(struct crat_header)); - - memcpy(&crat_table->signature, CRAT_SIGNATURE, - sizeof(crat_table->signature)); - /* Change length as we add more subtypes*/ - crat_table->length = sizeof(struct crat_header); - crat_table->num_domains = 1; - crat_table->total_entries = 0; - - /* Fill in Subtype: Compute Unit - * First fill in the sub type header and then sub type data - */ - avail_size -= sizeof(struct crat_subtype_computeunit); - if (avail_size < 0) - return -ENOMEM; - - sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1); - memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); - - sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; - sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); - sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; - - /* Fill CU subtype data */ - cu = (struct crat_subtype_computeunit *)sub_type_hdr; - cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT; - cu->proximity_domain = proximity_domain; - - kdev->kfd2kgd->get_cu_info(kdev->kgd, &cu_info); - cu->num_simd_per_cu = cu_info.simd_per_cu; - cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number; - cu->max_waves_simd = cu_info.max_waves_per_simd; - - cu->wave_front_size = cu_info.wave_front_size; - cu->array_count = cu_info.num_shader_arrays_per_engine * - cu_info.num_shader_engines; - total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh); - cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu); - cu->num_cu_per_array = cu_info.num_cu_per_sh; - cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu; - cu->num_banks = cu_info.num_shader_engines; - cu->lds_size_in_kb = cu_info.lds_size; - - cu->hsa_capability = 0; - - /* Check if this node supports IOMMU. During parsing this flag will - * translate to HSA_CAP_ATS_PRESENT - */ -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - iommu_info.flags = 0; - if (amd_iommu_device_info(kdev->pdev, &iommu_info) == 0) { - if ((iommu_info.flags & required_iommu_flags) == - required_iommu_flags) - cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT; - } -#endif - - crat_table->length += sub_type_hdr->length; - crat_table->total_entries++; - - /* Fill in Subtype: Memory. Only on systems with large BAR (no - * private FB), report memory as public. On other systems - * report the total FB size (public+private) as a single - * private heap. - */ - kdev->kfd2kgd->get_local_mem_info(kdev->kgd, &local_mem_info); - sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + - sub_type_hdr->length); - - if (debug_largebar) - local_mem_info.local_mem_size_private = 0; - - if (local_mem_info.local_mem_size_private == 0) - ret = kfd_fill_gpu_memory_affinity(&avail_size, - kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC, - local_mem_info.local_mem_size_public, - (struct crat_subtype_memory *)sub_type_hdr, - proximity_domain, - &local_mem_info); - else - ret = kfd_fill_gpu_memory_affinity(&avail_size, - kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE, - local_mem_info.local_mem_size_public + - local_mem_info.local_mem_size_private, - (struct crat_subtype_memory *)sub_type_hdr, - proximity_domain, - &local_mem_info); - if (ret < 0) - return ret; - - crat_table->length += sizeof(struct crat_subtype_memory); - crat_table->total_entries++; - - /* TODO: Fill in cache information. 
- * This information is NOT readily available in KGD
- */
-    sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
-            sub_type_hdr->length);
-    ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low,
-                avail_size,
-                &cu_info,
-                (struct crat_subtype_cache *)sub_type_hdr,
-                &cache_mem_filled,
-                &num_of_cache_entries);
-
-    if (ret < 0)
-        return ret;
-
-    crat_table->length += cache_mem_filled;
-    crat_table->total_entries += num_of_cache_entries;
-    avail_size -= cache_mem_filled;
-
-    /* Fill in Subtype: IO_LINKS
-     * Only direct links are added here, which is the link from the GPU
-     * to its NUMA node. Indirect links are added by userspace.
-     */
-    sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
-            cache_mem_filled);
-    ret = kfd_fill_gpu_direct_io_link(&avail_size, kdev,
-        (struct crat_subtype_iolink *)sub_type_hdr, proximity_domain);
-
-    if (ret < 0)
-        return ret;
-
-    crat_table->length += sub_type_hdr->length;
-    crat_table->total_entries++;
-
-    *size = crat_table->length;
-    pr_info("Virtual CRAT table created for GPU\n");
-
-    return ret;
-}
-
-/* kfd_create_crat_image_virtual - Allocates memory for CRAT image and
- * creates a Virtual CRAT (VCRAT) image
- *
- * NOTE: Call kfd_destroy_crat_image to free CRAT image memory
- *
- * @crat_image: VCRAT image created because ACPI does not have a
- *        CRAT for this device
- * @size: [OUT] size of virtual crat_image
- * @flags: COMPUTE_UNIT_CPU - Create VCRAT for CPU device
- *         COMPUTE_UNIT_GPU - Create VCRAT for GPU
- *         (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU
- *             -- this option is not currently implemented.
- *             The assumption is that all AMD APUs will have CRAT
- * @kdev: Valid kfd_device required if flags contain COMPUTE_UNIT_GPU
- *
- * Return 0 if successful else return -ve value
- */
-int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
-        int flags, struct kfd_dev *kdev, uint32_t proximity_domain)
-{
-    void *pcrat_image = NULL;
-    int ret = 0;
-
-    if (!crat_image)
-        return -EINVAL;
-
-    *crat_image = NULL;
-
-    /* Allocate one VCRAT_SIZE_FOR_CPU for CPU virtual CRAT image and
-     * VCRAT_SIZE_FOR_GPU for GPU virtual CRAT image. This should cover
-     * all the current conditions. A check is in place not to overwrite
-     * beyond the allocated size
-     */
-    switch (flags) {
-    case COMPUTE_UNIT_CPU:
-        pcrat_image = kmalloc(VCRAT_SIZE_FOR_CPU, GFP_KERNEL);
-        if (!pcrat_image)
-            return -ENOMEM;
-        *size = VCRAT_SIZE_FOR_CPU;
-        ret = kfd_create_vcrat_image_cpu(pcrat_image, size);
-        break;
-    case COMPUTE_UNIT_GPU:
-        if (!kdev)
-            return -EINVAL;
-        pcrat_image = kmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL);
-        if (!pcrat_image)
-            return -ENOMEM;
-        *size = VCRAT_SIZE_FOR_GPU;
-        ret = kfd_create_vcrat_image_gpu(pcrat_image, size,
-                kdev, proximity_domain);
-        break;
-    case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU):
-        /* TODO: */
-        ret = -EINVAL;
-        pr_err("VCRAT not implemented for APU\n");
-        break;
-    default:
-        ret = -EINVAL;
-    }
-
-    if (!ret)
-        *crat_image = pcrat_image;
-    else
-        kfree(pcrat_image);
-
-    return ret;
-}
-
-
-/* kfd_destroy_crat_image
- *
- * @crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..)
- * - */ -void kfd_destroy_crat_image(void *crat_image) -{ - kfree(crat_image); -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h index 00de41f..a374fa3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h @@ -24,7 +24,6 @@ #define KFD_CRAT_H_INCLUDED #include -#include "kfd_priv.h" #pragma pack(1) @@ -45,10 +44,6 @@ #define CRAT_OEMID_64BIT_MASK ((1ULL << (CRAT_OEMID_LENGTH * 8)) - 1) -/* Compute Unit flags */ -#define COMPUTE_UNIT_CPU (1 << 0) /* Create Virtual CRAT for CPU */ -#define COMPUTE_UNIT_GPU (1 << 1) /* Create Virtual CRAT for GPU */ - struct crat_header { uint32_t signature; uint32_t length; @@ -110,7 +105,7 @@ struct crat_subtype_computeunit { uint8_t wave_front_size; uint8_t num_banks; uint16_t micro_engine_id; - uint8_t array_count; + uint8_t num_arrays; uint8_t num_cu_per_array; uint8_t num_simd_per_cu; uint8_t max_slots_scatch_cu; @@ -132,14 +127,13 @@ struct crat_subtype_memory { uint8_t length; uint16_t reserved; uint32_t flags; - uint32_t proximity_domain; + uint32_t promixity_domain; uint32_t base_addr_low; uint32_t base_addr_high; uint32_t length_low; uint32_t length_high; uint32_t width; - uint8_t visibility_type; /* for virtual (dGPU) CRAT */ - uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH - 1]; + uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH]; }; /* @@ -228,12 +222,9 @@ struct crat_subtype_ccompute { /* * HSA IO Link Affinity structure and definitions */ -#define CRAT_IOLINK_FLAGS_ENABLED (1 << 0) -#define CRAT_IOLINK_FLAGS_NON_COHERENT (1 << 1) -#define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT (1 << 2) -#define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT (1 << 3) -#define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA (1 << 4) -#define CRAT_IOLINK_FLAGS_RESERVED_MASK 0xffffffe0 +#define CRAT_IOLINK_FLAGS_ENABLED 0x00000001 +#define CRAT_IOLINK_FLAGS_COHERENCY 0x00000002 +#define CRAT_IOLINK_FLAGS_RESERVED 0xfffffffc /* * IO interface types @@ -241,16 +232,8 @@ struct crat_subtype_ccompute { #define CRAT_IOLINK_TYPE_UNDEFINED 0 #define CRAT_IOLINK_TYPE_HYPERTRANSPORT 1 #define CRAT_IOLINK_TYPE_PCIEXPRESS 2 -#define CRAT_IOLINK_TYPE_AMBA 3 -#define CRAT_IOLINK_TYPE_MIPI 4 -#define CRAT_IOLINK_TYPE_QPI_1_1 5 -#define CRAT_IOLINK_TYPE_RESERVED1 6 -#define CRAT_IOLINK_TYPE_RESERVED2 7 -#define CRAT_IOLINK_TYPE_RAPID_IO 8 -#define CRAT_IOLINK_TYPE_INFINIBAND 9 -#define CRAT_IOLINK_TYPE_RESERVED3 10 -#define CRAT_IOLINK_TYPE_OTHER 11 -#define CRAT_IOLINK_TYPE_MAX 255 +#define CRAT_IOLINK_TYPE_OTHER 3 +#define CRAT_IOLINK_TYPE_MAX 255 #define CRAT_IOLINK_RESERVED_LENGTH 24 @@ -308,13 +291,4 @@ struct cdit_header { #pragma pack() -#ifdef CONFIG_ACPI -int kfd_create_crat_image_acpi(void **crat_image, size_t *size); -#endif -void kfd_destroy_crat_image(void *crat_image); -int kfd_parse_crat_table(void *crat_image, - struct list_head *device_list, - uint32_t proximity_domain); -int kfd_create_crat_image_virtual(void **crat_image, size_t *size, - int flags, struct kfd_dev *kdev, uint32_t proximity_domain); #endif /* KFD_CRAT_H_INCLUDED */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c index df9b346..0aa021a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c @@ -29,7 +29,7 @@ #include #include -#include "kfd_pm4_headers_vi.h" +#include "kfd_pm4_headers.h" #include "kfd_pm4_headers_diq.h" #include "kfd_kernel_queue.h" #include "kfd_priv.h" @@ -47,10 +47,9 @@ static void dbgdev_address_watch_disable_nodiq(struct kfd_dev 
*dev) static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, unsigned int pasid, uint64_t vmid0_address, - uint32_t *packet_buff, size_t size_in_bytes, - bool sync) + uint32_t *packet_buff, size_t size_in_bytes) { - struct pm4_mec_release_mem *rm_packet; + struct pm4__release_mem *rm_packet; struct pm4__indirect_buffer_pasid *ib_packet; struct kfd_mem_obj *mem_obj; size_t pq_packets_size_in_bytes; @@ -66,9 +65,8 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, kq = dbgdev->kq; - pq_packets_size_in_bytes = sizeof(struct pm4__indirect_buffer_pasid); - if (sync) - pq_packets_size_in_bytes += sizeof(struct pm4_mec_release_mem); + pq_packets_size_in_bytes = sizeof(struct pm4__release_mem) + + sizeof(struct pm4__indirect_buffer_pasid); /* * We acquire a buffer from DIQ @@ -97,15 +95,10 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, ib_packet->bitfields3.ib_base_hi = largep->u.high_part; ib_packet->control = (1 << 23) | (1 << 31) | - ((size_in_bytes / 4) & 0xfffff); + ((size_in_bytes / sizeof(uint32_t)) & 0xfffff); ib_packet->bitfields5.pasid = pasid; - if (!sync) { - kq->ops.submit_packet(kq); - return status; - } - /* * for now we use release mem for GPU-CPU synchronization * Consider WaitRegMem + WriteData as a better alternative @@ -114,7 +107,7 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, * (a) Sync with HW * (b) Sync var is written by CP to mem. */ - rm_packet = (struct pm4_mec_release_mem *) (ib_packet_buff + + rm_packet = (struct pm4__release_mem *) (ib_packet_buff + (sizeof(struct pm4__indirect_buffer_pasid) / sizeof(unsigned int))); @@ -133,7 +126,8 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, rm_packet->header.opcode = IT_RELEASE_MEM; rm_packet->header.type = PM4_TYPE_3; - rm_packet->header.count = sizeof(struct pm4_mec_release_mem) / 4 - 2; + rm_packet->header.count = sizeof(struct pm4__release_mem) / + sizeof(unsigned int) - 2; rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; rm_packet->bitfields2.event_index = @@ -190,9 +184,9 @@ static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev) struct kernel_queue *kq = NULL; int status; - properties.type = KFD_QUEUE_TYPE_DIQ; status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL, - &properties, &qid); + &properties, 0, KFD_QUEUE_TYPE_DIQ, + &qid); if (status) { pr_err("Failed to create DIQ\n"); @@ -238,8 +232,7 @@ static void dbgdev_address_watch_set_registers( union TCP_WATCH_ADDR_H_BITS *addrHi, union TCP_WATCH_ADDR_L_BITS *addrLo, union TCP_WATCH_CNTL_BITS *cntl, - unsigned int index, unsigned int vmid, - bool is_apu) + unsigned int index, unsigned int vmid) { union ULARGE_INTEGER addr; @@ -264,9 +257,9 @@ static void dbgdev_address_watch_set_registers( cntl->bitfields.mode = adw_info->watch_mode[index]; cntl->bitfields.vmid = (uint32_t) vmid; - /* for APU assume it is an ATC address */ - if (is_apu) - cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT; + /* for now assume it is an ATC address */ + cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT; + pr_debug("\t\t%20s %08x\n", "set reg mask :", cntl->bitfields.mask); pr_debug("\t\t%20s %08x\n", "set reg add high :", addrHi->bitfields.addr); @@ -308,8 +301,7 @@ static int dbgdev_address_watch_nodiq(struct kfd_dbgdev *dbgdev, for (i = 0; i < adw_info->num_watch_points; i++) { dbgdev_address_watch_set_registers(adw_info, &addrHi, &addrLo, - &cntl, i, pdd->qpd.vmid, - dbgdev->dev->device_info->is_need_iommu_device); + &cntl, i, pdd->qpd.vmid); pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); 
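The header.count expressions above follow the PM4 type-3 convention that COUNT holds the number of payload dwords minus one; for a whole packet of N dwords including the one-dword header, that works out to N - 2, which is what the sizeof(packet) / 4 - 2 forms compute. A standalone sketch of just that arithmetic:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* COUNT field for a PM4 type-3 packet of the given total size:
 * payload dwords minus one, i.e. total dwords minus two.
 */
static unsigned int pm4_count_field(size_t packet_size_bytes)
{
    return (unsigned int)(packet_size_bytes / sizeof(uint32_t)) - 2;
}

int main(void)
{
    /* e.g. an illustrative 7-dword release-mem style packet */
    printf("count = %u\n", pm4_count_field(7 * sizeof(uint32_t)));    /* 5 */
    return 0;
}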
pr_debug("\t\t%20s %08x\n", "register index :", i); @@ -348,9 +340,9 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, union TCP_WATCH_ADDR_H_BITS addrHi; union TCP_WATCH_ADDR_L_BITS addrLo; union TCP_WATCH_CNTL_BITS cntl; + struct kfd_mem_obj *mem_obj; unsigned int aw_reg_add_dword; uint32_t *packet_buff_uint; - uint64_t packet_buff_gpu_addr; unsigned int i; int status; size_t ib_size = sizeof(struct pm4__set_config_reg) * 4; @@ -372,13 +364,15 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, return -EINVAL; } - status = dbgdev->kq->ops.acquire_inline_ib(dbgdev->kq, - ib_size/sizeof(uint32_t), - &packet_buff_uint, &packet_buff_gpu_addr); + status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); + if (status) { - pr_err("Failed to allocate IB from DIQ ring\n"); + pr_err("Failed to allocate GART memory\n"); return status; } + + packet_buff_uint = mem_obj->cpu_ptr; + memset(packet_buff_uint, 0, ib_size); packets_vec = (struct pm4__set_config_reg *) (packet_buff_uint); @@ -397,9 +391,12 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, packets_vec[3].bitfields2.insert_vmid = 1; for (i = 0; i < adw_info->num_watch_points; i++) { - dbgdev_address_watch_set_registers(adw_info, &addrHi, &addrLo, - &cntl, i, vmid, - dbgdev->dev->device_info->is_need_iommu_device); + dbgdev_address_watch_set_registers(adw_info, + &addrHi, + &addrLo, + &cntl, + i, + vmid); pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); pr_debug("\t\t%20s %08x\n", "register index :", i); @@ -472,24 +469,24 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, status = dbgdev_diq_submit_ib( dbgdev, adw_info->process->pasid, - packet_buff_gpu_addr, + mem_obj->gpu_addr, packet_buff_uint, - ib_size, true); + ib_size); if (status) { pr_err("Failed to submit IB to DIQ\n"); - return status; + break; } } + kfd_gtt_sa_free(dbgdev->dev, mem_obj); return status; } static int dbgdev_wave_control_set_registers( struct dbg_wave_control_info *wac_info, union SQ_CMD_BITS *in_reg_sq_cmd, - union GRBM_GFX_INDEX_BITS *in_reg_gfx_index, - unsigned int asic_family) + union GRBM_GFX_INDEX_BITS *in_reg_gfx_index) { int status = 0; union SQ_CMD_BITS reg_sq_cmd; @@ -547,25 +544,11 @@ static int dbgdev_wave_control_set_registers( switch (wac_info->operand) { case HSA_DBG_WAVEOP_HALT: - if (asic_family == CHIP_KAVERI) { - reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT; - pr_debug("Halting KV\n"); - } else { - reg_sq_cmd.bits_sethalt.cmd = SQ_IND_CMD_NEW_SETHALT; - reg_sq_cmd.bits_sethalt.data = SQ_IND_CMD_DATA_HALT; - pr_debug("Halting CZ\n"); - } + reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT; break; case HSA_DBG_WAVEOP_RESUME: - if (asic_family == CHIP_KAVERI) { - reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME; - pr_debug("Resuming KV\n"); - } else { - reg_sq_cmd.bits_sethalt.cmd = SQ_IND_CMD_NEW_SETHALT; - reg_sq_cmd.bits_sethalt.data = SQ_IND_CMD_DATA_RESUME; - pr_debug("Resuming CZ\n"); - } + reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME; break; case HSA_DBG_WAVEOP_KILL: @@ -605,15 +588,15 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, int status; union SQ_CMD_BITS reg_sq_cmd; union GRBM_GFX_INDEX_BITS reg_gfx_index; + struct kfd_mem_obj *mem_obj; uint32_t *packet_buff_uint; - uint64_t packet_buff_gpu_addr; struct pm4__set_config_reg *packets_vec; size_t ib_size = sizeof(struct pm4__set_config_reg) * 3; reg_sq_cmd.u32All = 0; status = dbgdev_wave_control_set_registers(wac_info, ®_sq_cmd, - ®_gfx_index, dbgdev->dev->device_info->asic_family); + ®_gfx_index); if (status) 
{ pr_err("Failed to set wave control registers\n"); return status; @@ -652,13 +635,15 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); - status = dbgdev->kq->ops.acquire_inline_ib(dbgdev->kq, - ib_size / sizeof(uint32_t), - &packet_buff_uint, &packet_buff_gpu_addr); - if (status) { - pr_err("Failed to allocate IB from DIQ ring\n"); + status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); + + if (status != 0) { + pr_err("Failed to allocate GART memory\n"); return status; } + + packet_buff_uint = mem_obj->cpu_ptr; + memset(packet_buff_uint, 0, ib_size); packets_vec = (struct pm4__set_config_reg *) packet_buff_uint; @@ -666,7 +651,8 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, packets_vec[0].header.opcode = IT_SET_UCONFIG_REG; packets_vec[0].header.type = PM4_TYPE_3; packets_vec[0].bitfields2.reg_offset = - GRBM_GFX_INDEX / 4 - USERCONFIG_REG_BASE; + GRBM_GFX_INDEX / (sizeof(uint32_t)) - + USERCONFIG_REG_BASE; packets_vec[0].bitfields2.insert_vmid = 0; packets_vec[0].reg_data[0] = reg_gfx_index.u32All; @@ -674,7 +660,8 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, packets_vec[1].header.count = 1; packets_vec[1].header.opcode = IT_SET_CONFIG_REG; packets_vec[1].header.type = PM4_TYPE_3; - packets_vec[1].bitfields2.reg_offset = SQ_CMD / 4 - AMD_CONFIG_REG_BASE; + packets_vec[1].bitfields2.reg_offset = SQ_CMD / (sizeof(uint32_t)) - + AMD_CONFIG_REG_BASE; packets_vec[1].bitfields2.vmid_shift = SQ_CMD_VMID_OFFSET; packets_vec[1].bitfields2.insert_vmid = 1; @@ -690,7 +677,8 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, packets_vec[2].ordinal1 = packets_vec[0].ordinal1; packets_vec[2].bitfields2.reg_offset = - GRBM_GFX_INDEX / 4 - USERCONFIG_REG_BASE; + GRBM_GFX_INDEX / (sizeof(uint32_t)) - + USERCONFIG_REG_BASE; packets_vec[2].bitfields2.insert_vmid = 0; packets_vec[2].reg_data[0] = reg_gfx_index.u32All; @@ -698,13 +686,15 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, status = dbgdev_diq_submit_ib( dbgdev, wac_info->process->pasid, - packet_buff_gpu_addr, + mem_obj->gpu_addr, packet_buff_uint, - ib_size, false); + ib_size); if (status) pr_err("Failed to submit IB to DIQ\n"); + kfd_gtt_sa_free(dbgdev->dev, mem_obj); + return status; } @@ -726,7 +716,7 @@ static int dbgdev_wave_control_nodiq(struct kfd_dbgdev *dbgdev, return -EFAULT; } status = dbgdev_wave_control_set_registers(wac_info, ®_sq_cmd, - ®_gfx_index, dbgdev->dev->device_info->asic_family); + ®_gfx_index); if (status) { pr_err("Failed to set wave control registers\n"); return status; @@ -779,8 +769,13 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) union GRBM_GFX_INDEX_BITS reg_gfx_index; struct kfd_process_device *pdd; struct dbg_wave_control_info wac_info; - int first_vmid_to_scan = dev->vm_info.first_vmid_kfd; - int last_vmid_to_scan = dev->vm_info.last_vmid_kfd; + int temp; + int first_vmid_to_scan = 8; + int last_vmid_to_scan = 15; + + first_vmid_to_scan = ffs(dev->shared_resources.compute_vmid_bitmap) - 1; + temp = dev->shared_resources.compute_vmid_bitmap >> first_vmid_to_scan; + last_vmid_to_scan = first_vmid_to_scan + ffz(temp); reg_sq_cmd.u32All = 0; status = 0; @@ -818,7 +813,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) return -EFAULT; status = dbgdev_wave_control_set_registers(&wac_info, ®_sq_cmd, - ®_gfx_index, dev->device_info->asic_family); + ®_gfx_index); if (status != 0) return -EINVAL; diff 
--git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h index 583aaa9..03424c2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h @@ -60,24 +60,6 @@ enum { SH_REG_SIZE = SH_REG_END - SH_REG_BASE }; -/* SQ_CMD definitions */ - -enum { - SQ_IND_CMD_DATA_RESUME = 0, - SQ_IND_CMD_DATA_HALT = 1 -}; - -enum SQ_IND_CMD_NEW { - SQ_IND_CMD_NEW_NULL = 0x00000000, - SQ_IND_CMD_NEW_SETHALT = 0x00000001, - SQ_IND_CMD_NEW_SAVECTX = 0x00000002, - SQ_IND_CMD_NEW_KILL = 0x00000003, - SQ_IND_CMD_NEW_DEBUG = 0x00000004, - SQ_IND_CMD_NEW_TRAP = 0x00000005, - SQ_IND_CMD_NEW_SET_PRIO = 0x00000006 - -}; - enum SQ_IND_CMD_CMD { SQ_IND_CMD_CMD_NULL = 0x00000000, SQ_IND_CMD_CMD_HALT = 0x00000001, @@ -136,20 +118,6 @@ union SQ_CMD_BITS { uint32_t:1; uint32_t vm_id:4; } bitfields, bits; - struct { - uint32_t cmd:3; - uint32_t:1; - uint32_t mode:3; - uint32_t check_vmid:1; - uint32_t data:3; - uint32_t:5; - uint32_t wave_id:4; - uint32_t simd_id:2; - uint32_t:2; - uint32_t queue_id:3; - uint32_t:1; - uint32_t vm_id:4; - } bitfields_sethalt, bits_sethalt; uint32_t u32All; signed int i32All; float f32All; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c index 9d4af96..3da25f7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c @@ -33,7 +33,6 @@ #include "kfd_pm4_headers_diq.h" #include "kfd_dbgmgr.h" #include "kfd_dbgdev.h" -#include "kfd_device_queue_manager.h" static DEFINE_MUTEX(kfd_dbgmgr_mutex); @@ -84,7 +83,7 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) } /* get actual type of DBGDevice cpsch or not */ - if (pdev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) + if (sched_policy == KFD_SCHED_POLICY_NO_HWS) type = DBGDEV_TYPE_NODIQ; kfd_dbgdev_init(new_buff->dbgdev, pdev, type); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c deleted file mode 100644 index 232e28f..0000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright 2014 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -#include -#include "kfd_priv.h" - -static struct dentry *debugfs_root; - -static int kfd_debugfs_open(struct inode *inode, struct file *file) -{ - int (*show)(struct seq_file *, void *) = inode->i_private; - - return single_open(file, show, NULL); -} - -static const struct file_operations kfd_debugfs_fops = { - .owner = THIS_MODULE, - .open = kfd_debugfs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -void kfd_debugfs_init(void) -{ - struct dentry *ent; - - debugfs_root = debugfs_create_dir("kfd", NULL); - if (!debugfs_root || debugfs_root == ERR_PTR(-ENODEV)) { - pr_warn("Failed to create kfd debugfs dir\n"); - return; - } - - ent = debugfs_create_file("mqds", S_IFREG | 0444, debugfs_root, - kfd_debugfs_mqds_by_process, - &kfd_debugfs_fops); - if (!ent) - pr_warn("Failed to create mqds in kfd debugfs\n"); - - ent = debugfs_create_file("hqds", S_IFREG | 0444, debugfs_root, - kfd_debugfs_hqds_by_device, - &kfd_debugfs_fops); - if (!ent) - pr_warn("Failed to create hqds in kfd debugfs\n"); - - ent = debugfs_create_file("rls", S_IFREG | 0444, debugfs_root, - kfd_debugfs_rls_by_device, - &kfd_debugfs_fops); - if (!ent) - pr_warn("Failed to create rls in kfd debugfs\n"); -} - -void kfd_debugfs_fini(void) -{ - debugfs_remove_recursive(debugfs_root); -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index f701b4e..61fff25 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -20,206 +20,36 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) #include -#endif +#include #include #include #include "kfd_priv.h" #include "kfd_device_queue_manager.h" #include "kfd_pm4_headers_vi.h" -#include "cwsr_trap_handler_gfx8.asm" -#include "cwsr_trap_handler_gfx9.asm" #define MQD_SIZE_ALIGNED 768 -static atomic_t kfd_device_suspended = ATOMIC_INIT(0); -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) static const struct kfd_device_info kaveri_device_info = { .asic_family = CHIP_KAVERI, .max_pasid_bits = 16, /* max num of queues for KV.TODO should be a dynamic value */ .max_no_of_hqd = 24, - .doorbell_size = 4, .ih_ring_entry_size = 4 * sizeof(uint32_t), .event_interrupt_class = &event_interrupt_class_cik, .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = true, - .supports_cwsr = false, - .needs_pci_atomics = false, - .num_sdma_engines = 2, + .mqd_size_aligned = MQD_SIZE_ALIGNED }; -#endif -static const struct kfd_device_info hawaii_device_info = { - .asic_family = CHIP_HAWAII, - .max_pasid_bits = 16, - /* max num of queues for KV.TODO should be a dynamic value */ - .max_no_of_hqd = 24, - .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = false, - .supports_cwsr = false, - .needs_pci_atomics = false, - .num_sdma_engines = 2, -}; - -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) static const struct kfd_device_info carrizo_device_info = { .asic_family = CHIP_CARRIZO, .max_pasid_bits = 16, /* max num of queues for CZ.TODO should be a dynamic value */ .max_no_of_hqd = 24, - .doorbell_size = 4, .ih_ring_entry_size = 4 * sizeof(uint32_t), .event_interrupt_class = &event_interrupt_class_cik, .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - 
.is_need_iommu_device = true, - .supports_cwsr = true, - .needs_pci_atomics = false, - .num_sdma_engines = 2, -}; -#endif - -static const struct kfd_device_info tonga_device_info = { - .asic_family = CHIP_TONGA, - .max_pasid_bits = 16, - .max_no_of_hqd = 24, - .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = false, - .supports_cwsr = false, - .needs_pci_atomics = true, - .num_sdma_engines = 2, -}; - -static const struct kfd_device_info fiji_device_info = { - .asic_family = CHIP_FIJI, - .max_pasid_bits = 16, - .max_no_of_hqd = 24, - .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = false, - .supports_cwsr = true, - .needs_pci_atomics = true, - .num_sdma_engines = 2, -}; - -static const struct kfd_device_info fiji_vf_device_info = { - .asic_family = CHIP_FIJI, - .max_pasid_bits = 16, - .max_no_of_hqd = 24, - .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = false, - .supports_cwsr = true, - .needs_pci_atomics = false, - .num_sdma_engines = 2, -}; - - -static const struct kfd_device_info polaris10_device_info = { - .asic_family = CHIP_POLARIS10, - .max_pasid_bits = 16, - .max_no_of_hqd = 24, - .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = false, - .supports_cwsr = true, - .needs_pci_atomics = true, - .num_sdma_engines = 2, -}; - -static const struct kfd_device_info polaris10_vf_device_info = { - .asic_family = CHIP_POLARIS10, - .max_pasid_bits = 16, - .max_no_of_hqd = 24, - .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = false, - .supports_cwsr = true, - .needs_pci_atomics = false, - .num_sdma_engines = 2, -}; - -static const struct kfd_device_info polaris11_device_info = { - .asic_family = CHIP_POLARIS11, - .max_pasid_bits = 16, - .max_no_of_hqd = 24, - .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = false, - .supports_cwsr = true, - .needs_pci_atomics = true, - .num_sdma_engines = 2, -}; - -static const struct kfd_device_info vega10_device_info = { - .asic_family = CHIP_VEGA10, - .max_pasid_bits = 16, - .max_no_of_hqd = 24, - .doorbell_size = 8, - .ih_ring_entry_size = 8 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_v9, - .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = false, - .supports_cwsr = true, - .needs_pci_atomics = true, - .num_sdma_engines = 2, -}; - -static const struct kfd_device_info vega10_vf_device_info = { - .asic_family = CHIP_VEGA10, - .max_pasid_bits = 16, - .max_no_of_hqd = 24, - .doorbell_size = 8, - .ih_ring_entry_size = 8 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_v9, - .num_of_watch_points = 4, 
- .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = false, - .supports_cwsr = true, - .needs_pci_atomics = false, - .num_sdma_engines = 2, -}; - -static const struct kfd_device_info raven_device_info = { - .asic_family = CHIP_RAVEN, - .max_pasid_bits = 16, - .max_no_of_hqd = 24, - .doorbell_size = 8, - .ih_ring_entry_size = 8 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_v9, - .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .is_need_iommu_device = true, - .supports_cwsr = true, - .needs_pci_atomics = true, - .num_sdma_engines = 1, + .mqd_size_aligned = MQD_SIZE_ALIGNED }; struct kfd_deviceid { @@ -229,7 +59,6 @@ struct kfd_deviceid { /* Please keep this sorted by increasing device id. */ static const struct kfd_deviceid supported_devices[] = { -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) { 0x1304, &kaveri_device_info }, /* Kaveri */ { 0x1305, &kaveri_device_info }, /* Kaveri */ { 0x1306, &kaveri_device_info }, /* Kaveri */ @@ -252,76 +81,17 @@ static const struct kfd_deviceid supported_devices[] = { { 0x131B, &kaveri_device_info }, /* Kaveri */ { 0x131C, &kaveri_device_info }, /* Kaveri */ { 0x131D, &kaveri_device_info }, /* Kaveri */ -#endif - { 0x67A0, &hawaii_device_info }, /* Hawaii */ - { 0x67A1, &hawaii_device_info }, /* Hawaii */ - { 0x67A2, &hawaii_device_info }, /* Hawaii */ - { 0x67A8, &hawaii_device_info }, /* Hawaii */ - { 0x67A9, &hawaii_device_info }, /* Hawaii */ - { 0x67AA, &hawaii_device_info }, /* Hawaii */ - { 0x67B0, &hawaii_device_info }, /* Hawaii */ - { 0x67B1, &hawaii_device_info }, /* Hawaii */ - { 0x67B8, &hawaii_device_info }, /* Hawaii */ - { 0x67B9, &hawaii_device_info }, /* Hawaii */ - { 0x67BA, &hawaii_device_info }, /* Hawaii */ - { 0x67BE, &hawaii_device_info }, /* Hawaii */ -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) { 0x9870, &carrizo_device_info }, /* Carrizo */ { 0x9874, &carrizo_device_info }, /* Carrizo */ { 0x9875, &carrizo_device_info }, /* Carrizo */ { 0x9876, &carrizo_device_info }, /* Carrizo */ - { 0x9877, &carrizo_device_info }, /* Carrizo */ -#endif - { 0x6920, &tonga_device_info }, /* Tonga */ - { 0x6921, &tonga_device_info }, /* Tonga */ - { 0x6928, &tonga_device_info }, /* Tonga */ - { 0x6929, &tonga_device_info }, /* Tonga */ - { 0x692B, &tonga_device_info }, /* Tonga */ - { 0x6938, &tonga_device_info }, /* Tonga */ - { 0x6939, &tonga_device_info }, /* Tonga */ - { 0x7300, &fiji_device_info }, /* Fiji */ - { 0x730F, &fiji_vf_device_info }, /* Fiji vf*/ - { 0x67C0, &polaris10_device_info }, /* Polaris10 */ - { 0x67C1, &polaris10_device_info }, /* Polaris10 */ - { 0x67C2, &polaris10_device_info }, /* Polaris10 */ - { 0x67C4, &polaris10_device_info }, /* Polaris10 */ - { 0x67C7, &polaris10_device_info }, /* Polaris10 */ - { 0x67C8, &polaris10_device_info }, /* Polaris10 */ - { 0x67C9, &polaris10_device_info }, /* Polaris10 */ - { 0x67CA, &polaris10_device_info }, /* Polaris10 */ - { 0x67CC, &polaris10_device_info }, /* Polaris10 */ - { 0x67CF, &polaris10_device_info }, /* Polaris10 */ - { 0x67D0, &polaris10_vf_device_info }, /* Polaris10 vf*/ - { 0x67DF, &polaris10_device_info }, /* Polaris10 */ - { 0x67E0, &polaris11_device_info }, /* Polaris11 */ - { 0x67E1, &polaris11_device_info }, /* Polaris11 */ - { 0x67E3, &polaris11_device_info }, /* Polaris11 */ - { 0x67E7, &polaris11_device_info }, /* Polaris11 */ - { 0x67E8, &polaris11_device_info }, /* Polaris11 */ - { 0x67E9, &polaris11_device_info }, /* Polaris11 */ - { 0x67EB, 
&polaris11_device_info }, /* Polaris11 */ - { 0x67EF, &polaris11_device_info }, /* Polaris11 */ - { 0x67FF, &polaris11_device_info }, /* Polaris11 */ - { 0x6860, &vega10_device_info }, /* Vega10 */ - { 0x6861, &vega10_device_info }, /* Vega10 */ - { 0x6862, &vega10_device_info }, /* Vega10 */ - { 0x6863, &vega10_device_info }, /* Vega10 */ - { 0x6864, &vega10_device_info }, /* Vega10 */ - { 0x6867, &vega10_device_info }, /* Vega10 */ - { 0x6868, &vega10_device_info }, /* Vega10 */ - { 0x686C, &vega10_vf_device_info }, /* Vega10 vf*/ - { 0x687F, &vega10_device_info }, /* Vega10 */ -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - { 0x15DD, &raven_device_info } /* Raven */ -#endif + { 0x9877, &carrizo_device_info } /* Carrizo */ }; static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, unsigned int chunk_size); static void kfd_gtt_sa_fini(struct kfd_dev *kfd); -static int kfd_resume(struct kfd_dev *kfd); - static const struct kfd_device_info *lookup_device_info(unsigned short did) { size_t i; @@ -352,17 +122,6 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, return NULL; } - if (device_info->needs_pci_atomics) { - /* Allow BIF to recode atomics to PCIe 3.0 AtomicOps. - */ - if (pci_enable_atomic_ops_to_root(pdev) < 0) { - dev_info(kfd_device, - "skipped device %x:%x, PCI rejects atomics", - pdev->vendor, pdev->device); - return NULL; - } - } - kfd = kzalloc(sizeof(*kfd), GFP_KERNEL); if (!kfd) return NULL; @@ -380,7 +139,6 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, return kfd; } -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) static bool device_iommu_pasid_init(struct kfd_dev *kfd) { const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP | @@ -410,9 +168,23 @@ static bool device_iommu_pasid_init(struct kfd_dev *kfd) pasid_limit = min_t(unsigned int, (unsigned int)(1 << kfd->device_info->max_pasid_bits), iommu_info.max_pasids); + /* + * last pasid is used for kernel queues doorbells + * in the future the last pasid might be used for a kernel thread. 
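The PASID limit above ends up as the minimum of three caps: what the device's PASID width allows, what the IOMMU reports, and one below the doorbell process limit, since the last PASID is reserved for kernel queue doorbells as the comment notes. A simplified sketch of that clamping, with invented example values:

#include <stdio.h>

static unsigned int min_u(unsigned int a, unsigned int b)
{
    return a < b ? a : b;
}

int main(void)
{
    unsigned int max_pasid_bits = 16;             /* device cap */
    unsigned int iommu_max_pasids = 32768;        /* invented IOMMU cap */
    unsigned int doorbell_process_limit = 512;    /* invented */
    unsigned int pasid_limit = 1u << max_pasid_bits;

    pasid_limit = min_u(pasid_limit, iommu_max_pasids);
    /* the last PASID is reserved for kernel queue doorbells */
    pasid_limit = min_u(pasid_limit, doorbell_process_limit - 1);

    printf("pasid_limit = %u\n", pasid_limit);    /* 511 */
    return 0;
}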
+ */ + pasid_limit = min_t(unsigned int, + pasid_limit, + kfd->doorbell_process_limit - 1); + + err = amd_iommu_init_device(kfd->pdev, pasid_limit); + if (err < 0) { + dev_err(kfd_device, "error initializing iommu device\n"); + return false; + } if (!kfd_set_pasid_limit(pasid_limit)) { dev_err(kfd_device, "error setting pasid limit\n"); + amd_iommu_free_device(kfd->pdev); return false; } @@ -424,7 +196,7 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid) struct kfd_dev *dev = kfd_device_by_pci_dev(pdev); if (dev) - kfd_process_iommu_unbind_callback(dev, pasid); + kfd_unbind_process_from_device(dev, pasid); } /* @@ -451,69 +223,14 @@ static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid, return AMD_IOMMU_INV_PRI_RSP_INVALID; } -#endif /* CONFIG_AMD_IOMMU_V2 */ - -static int kfd_cwsr_init(struct kfd_dev *kfd) -{ - if (cwsr_enable && kfd->device_info->supports_cwsr) { - if (kfd->device_info->asic_family < CHIP_VEGA10) { - BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE); - kfd->cwsr_isa = cwsr_trap_gfx8_hex; - kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex); - } else { - BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > PAGE_SIZE); - kfd->cwsr_isa = cwsr_trap_gfx9_hex; - kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_hex); - } - - kfd->cwsr_enabled = true; - } - - return 0; -} - -static void kfd_ib_mem_init(struct kfd_dev *kdev) -{ - /* In certain cases we need to send IB from kernel using the GPU address - * space created by user applications. - * For example, on GFX v7, we need to flush TC associated to the VMID - * before tearing down the VMID. In order to do so, we need an address - * valid to the VMID to place the IB while this space was created on - * the user's side, not the kernel. - * Since kfd_set_process_dgpu_aperture reserves "cwsr_base + cwsr_size" - * but CWSR only uses pages above cwsr_base, we'll use one page memory - * under cwsr_base for IB submissions - */ - kdev->ib_size = PAGE_SIZE; -} bool kgd2kfd_device_init(struct kfd_dev *kfd, const struct kgd2kfd_shared_resources *gpu_resources) { unsigned int size; - kfd->mec_fw_version = kfd->kfd2kgd->get_fw_version(kfd->kgd, - KGD_ENGINE_MEC1); - kfd->shared_resources = *gpu_resources; - /* Usually first_vmid_kfd = 8, last_vmid_kfd = 15 */ - kfd->vm_info.first_vmid_kfd = ffs(gpu_resources->compute_vmid_bitmap)-1; - kfd->vm_info.last_vmid_kfd = fls(gpu_resources->compute_vmid_bitmap)-1; - kfd->vm_info.vmid_num_kfd = kfd->vm_info.last_vmid_kfd - - kfd->vm_info.first_vmid_kfd + 1; - - /* Verify module parameters regarding mapped process number*/ - if ((hws_max_conc_proc < 0) - || (hws_max_conc_proc > kfd->vm_info.vmid_num_kfd)) { - dev_err(kfd_device, - "hws_max_conc_proc %d must be between 0 and %d, use %d instead\n", - hws_max_conc_proc, kfd->vm_info.vmid_num_kfd, - kfd->vm_info.vmid_num_kfd); - kfd->max_proc_per_quantum = kfd->vm_info.vmid_num_kfd; - } else - kfd->max_proc_per_quantum = hws_max_conc_proc; - /* calculate max size of mqds needed for queues */ size = max_num_of_queues_per_device * kfd->device_info->mqd_size_aligned; @@ -563,31 +280,29 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, goto kfd_interrupt_error; } + if (!device_iommu_pasid_init(kfd)) { + dev_err(kfd_device, + "Error initializing iommuv2 for device %x:%x\n", + kfd->pdev->vendor, kfd->pdev->device); + goto device_iommu_pasid_error; + } + amd_iommu_set_invalidate_ctx_cb(kfd->pdev, + iommu_pasid_shutdown_callback); + amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb); + kfd->dqm = device_queue_manager_init(kfd); if 
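The device_iommu_pasid_init() hunk above restores the old PASID sizing logic: the effective limit is clamped three ways, by the ASIC's PASID bit width, by what the IOMMU reports, and by the doorbell page budget with the last page reserved for kernel queues. A minimal stand-alone C sketch of that clamping chain, with made-up constants standing in for the three sources (illustrative only, not the kernel code):

#include <stdio.h>

/* Hypothetical stand-ins for the three constraints combined above. */
#define MAX_PASID_BITS		16	/* ASIC capability */
#define IOMMU_MAX_PASIDS	32768	/* what the IOMMU reports */
#define DOORBELL_PROCESS_LIMIT	1024	/* doorbell pages available */

static unsigned int min_u(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned int pasid_limit = 1u << MAX_PASID_BITS;

	pasid_limit = min_u(pasid_limit, IOMMU_MAX_PASIDS);
	/* the last doorbell page is reserved, hence the "- 1" */
	pasid_limit = min_u(pasid_limit, DOORBELL_PROCESS_LIMIT - 1);

	printf("effective pasid limit: %u\n", pasid_limit);
	return 0;
}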
@@ -424,7 +196,7 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid)
 	struct kfd_dev *dev = kfd_device_by_pci_dev(pdev);
 
 	if (dev)
-		kfd_process_iommu_unbind_callback(dev, pasid);
+		kfd_unbind_process_from_device(dev, pasid);
 }
 
 /*
@@ -451,69 +223,14 @@ static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid,
 
 	return AMD_IOMMU_INV_PRI_RSP_INVALID;
 }
-#endif /* CONFIG_AMD_IOMMU_V2 */
-
-static int kfd_cwsr_init(struct kfd_dev *kfd)
-{
-	if (cwsr_enable && kfd->device_info->supports_cwsr) {
-		if (kfd->device_info->asic_family < CHIP_VEGA10) {
-			BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE);
-			kfd->cwsr_isa = cwsr_trap_gfx8_hex;
-			kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex);
-		} else {
-			BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > PAGE_SIZE);
-			kfd->cwsr_isa = cwsr_trap_gfx9_hex;
-			kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_hex);
-		}
-
-		kfd->cwsr_enabled = true;
-	}
-
-	return 0;
-}
-
-static void kfd_ib_mem_init(struct kfd_dev *kdev)
-{
-	/* In certain cases we need to send IB from kernel using the GPU address
-	 * space created by user applications.
-	 * For example, on GFX v7, we need to flush TC associated to the VMID
-	 * before tearing down the VMID. In order to do so, we need an address
-	 * valid to the VMID to place the IB while this space was created on
-	 * the user's side, not the kernel.
-	 * Since kfd_set_process_dgpu_aperture reserves "cwsr_base + cwsr_size"
-	 * but CWSR only uses pages above cwsr_base, we'll use one page memory
-	 * under cwsr_base for IB submissions
-	 */
-	kdev->ib_size = PAGE_SIZE;
-}
 
 bool kgd2kfd_device_init(struct kfd_dev *kfd,
 			 const struct kgd2kfd_shared_resources *gpu_resources)
 {
 	unsigned int size;
 
-	kfd->mec_fw_version = kfd->kfd2kgd->get_fw_version(kfd->kgd,
-			KGD_ENGINE_MEC1);
-
 	kfd->shared_resources = *gpu_resources;
 
-	/* Usually first_vmid_kfd = 8, last_vmid_kfd = 15 */
-	kfd->vm_info.first_vmid_kfd = ffs(gpu_resources->compute_vmid_bitmap)-1;
-	kfd->vm_info.last_vmid_kfd = fls(gpu_resources->compute_vmid_bitmap)-1;
-	kfd->vm_info.vmid_num_kfd = kfd->vm_info.last_vmid_kfd
-			- kfd->vm_info.first_vmid_kfd + 1;
-
-	/* Verify module parameters regarding mapped process number*/
-	if ((hws_max_conc_proc < 0)
-			|| (hws_max_conc_proc > kfd->vm_info.vmid_num_kfd)) {
-		dev_err(kfd_device,
-			"hws_max_conc_proc %d must be between 0 and %d, use %d instead\n",
-			hws_max_conc_proc, kfd->vm_info.vmid_num_kfd,
-			kfd->vm_info.vmid_num_kfd);
-		kfd->max_proc_per_quantum = kfd->vm_info.vmid_num_kfd;
-	} else
-		kfd->max_proc_per_quantum = hws_max_conc_proc;
-
 	/* calculate max size of mqds needed for queues */
 	size = max_num_of_queues_per_device *
 			kfd->device_info->mqd_size_aligned;
@@ -563,31 +280,29 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 		goto kfd_interrupt_error;
 	}
 
+	if (!device_iommu_pasid_init(kfd)) {
+		dev_err(kfd_device,
+			"Error initializing iommuv2 for device %x:%x\n",
+			kfd->pdev->vendor, kfd->pdev->device);
+		goto device_iommu_pasid_error;
+	}
+	amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
+			iommu_pasid_shutdown_callback);
+	amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
+
 	kfd->dqm = device_queue_manager_init(kfd);
 	if (!kfd->dqm) {
 		dev_err(kfd_device, "Error initializing queue manager\n");
 		goto device_queue_manager_error;
 	}
 
-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
-	if (kfd->device_info->is_need_iommu_device) {
-		if (!device_iommu_pasid_init(kfd)) {
-			dev_err(kfd_device, "Error initializing iommuv2\n");
-			goto device_iommu_pasid_error;
-		}
-	}
-#endif
-
-	if (kfd_cwsr_init(kfd)) {
-		dev_err(kfd_device, "Error initializing cwsr\n");
-		goto device_iommu_pasid_error;
+	if (kfd->dqm->ops.start(kfd->dqm)) {
+		dev_err(kfd_device,
+			"Error starting queue manager for device %x:%x\n",
+			kfd->pdev->vendor, kfd->pdev->device);
+		goto dqm_start_error;
 	}
 
-	kfd_ib_mem_init(kfd);
-
-	if (kfd_resume(kfd))
-		goto kfd_resume_error;
-
 	kfd->dbgmgr = NULL;
 
 	kfd->init_complete = true;
@@ -595,14 +310,15 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 		 kfd->pdev->device);
 
 	pr_debug("Starting kfd with the following scheduling policy %d\n",
-		kfd->dqm->sched_policy);
+		sched_policy);
 
 	goto out;
 
-kfd_resume_error:
-device_iommu_pasid_error:
+dqm_start_error:
 	device_queue_manager_uninit(kfd->dqm);
 device_queue_manager_error:
+	amd_iommu_free_device(kfd->pdev);
+device_iommu_pasid_error:
 	kfd_interrupt_exit(kfd);
 kfd_interrupt_error:
 	kfd_topology_remove_device(kfd);
@@ -622,8 +338,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 void kgd2kfd_device_exit(struct kfd_dev *kfd)
 {
 	if (kfd->init_complete) {
-		kgd2kfd_suspend(kfd);
 		device_queue_manager_uninit(kfd->dqm);
+		amd_iommu_free_device(kfd->pdev);
 		kfd_interrupt_exit(kfd);
 		kfd_topology_remove_device(kfd);
 		kfd_doorbell_fini(kfd);
@@ -634,385 +350,55 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
 	kfree(kfd);
 }
 
-int kgd2kfd_pre_reset(struct kfd_dev *kfd)
-{
-	return 0;
-}
-
-int kgd2kfd_post_reset(struct kfd_dev *kfd)
-{
-	return 0;
-}
-
 void kgd2kfd_suspend(struct kfd_dev *kfd)
 {
-	if (!kfd->init_complete)
-		return;
-
-	/* For first KFD device suspend all the KFD processes */
-	if (atomic_inc_return(&kfd_device_suspended) == 1)
-		kfd_suspend_all_processes();
-
-	kfd->dqm->ops.stop(kfd->dqm);
-
-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
-	if (!kfd->device_info->is_need_iommu_device)
-		return;
-
-	kfd_unbind_processes_from_device(kfd);
-
-	amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
-	amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
-	amd_iommu_free_device(kfd->pdev);
-#endif
+	if (kfd->init_complete) {
+		kfd->dqm->ops.stop(kfd->dqm);
+		amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
+		amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
+		amd_iommu_free_device(kfd->pdev);
+	}
 }
 
 int kgd2kfd_resume(struct kfd_dev *kfd)
 {
-	int ret;
-
-	if (!kfd->init_complete)
-		return 0;
-
-	ret = kfd_resume(kfd);
-	if (ret)
-		return ret;
-
-	if (atomic_dec_return(&kfd_device_suspended) == 0)
-		ret = kfd_resume_all_processes();
-	WARN(atomic_read(&kfd_device_suspended) < 0,
-	     "KFD suspend / resume ref. error\n");
-	return ret;
-}
-
-static int kfd_resume(struct kfd_dev *kfd)
-{
-	int err = 0;
+	unsigned int pasid_limit;
+	int err;
 
-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
-	if (kfd->device_info->is_need_iommu_device) {
-		unsigned int pasid_limit = kfd_get_pasid_limit();
+	pasid_limit = kfd_get_pasid_limit();
 
+	if (kfd->init_complete) {
 		err = amd_iommu_init_device(kfd->pdev, pasid_limit);
-		if (err) {
+		if (err < 0) {
 			dev_err(kfd_device, "failed to initialize iommu\n");
 			return -ENXIO;
 		}
 
 		amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
-				iommu_pasid_shutdown_callback);
-		amd_iommu_set_invalid_ppr_cb(kfd->pdev,
-				iommu_invalid_ppr_cb);
-
-		err = kfd_bind_processes_to_device(kfd);
-		if (err) {
-			dev_err(kfd_device,
-				"failed to bind process to device\n");
-			return -ENXIO;
-		}
+				iommu_pasid_shutdown_callback);
+		amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
+		kfd->dqm->ops.start(kfd->dqm);
 	}
-#endif
 
-	err = kfd->dqm->ops.start(kfd->dqm);
-	if (err) {
-		dev_err(kfd_device,
-			"Error starting queue manager for device %x:%x\n",
-			kfd->pdev->vendor, kfd->pdev->device);
-		goto dqm_start_error;
-	}
-
-	return err;
-
-dqm_start_error:
-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
-	if (kfd->device_info->is_need_iommu_device)
-		amd_iommu_free_device(kfd->pdev);
-#endif
-
-	return err;
+	return 0;
 }
 
 /* This is called directly from KGD at ISR. */
 void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
 {
-	uint32_t patched_ihre[DIV_ROUND_UP(
-				kfd->device_info->ih_ring_entry_size,
-				sizeof(uint32_t))];
-	bool is_patched = false;
-
 	if (!kfd->init_complete)
 		return;
 
 	spin_lock(&kfd->interrupt_lock);
 
-	if (kfd->interrupts_active && interrupt_is_wanted(kfd, ih_ring_entry,
-				patched_ihre, &is_patched)
-	    && enqueue_ih_ring_entry(kfd,
-				is_patched ? patched_ihre : ih_ring_entry))
-		queue_work(kfd->ih_wq, &kfd->interrupt_work);
+	if (kfd->interrupts_active
+	    && interrupt_is_wanted(kfd, ih_ring_entry)
+	    && enqueue_ih_ring_entry(kfd, ih_ring_entry))
+		schedule_work(&kfd->interrupt_work);
 
 	spin_unlock(&kfd->interrupt_lock);
 }
 
-/* quiesce_process_mm -
- * Quiesce all user queues that belongs to given process p
- */
-int quiesce_process_mm(struct kfd_process *p)
-{
-	struct kfd_process_device *pdd;
-	int r = 0;
-	unsigned int n_evicted = 0;
-
-	list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
-		r = process_evict_queues(pdd->dev->dqm, &pdd->qpd);
-		if (r != 0) {
-			pr_err("Failed to evict process queues\n");
-			goto fail;
-		}
-		n_evicted++;
-	}
-
-	return r;
-
-fail:
-	/* To keep state consistent, roll back partial eviction by
-	 * restoring queues
-	 */
-	list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
-		if (n_evicted == 0)
-			break;
-		if (process_restore_queues(pdd->dev->dqm, &pdd->qpd))
-			pr_err("Failed to restore queues\n");
-
-		n_evicted--;
-	}
-
-	return r;
-}
-
-/* resume_process_mm -
- * Resume all user queues that belongs to given process p. The caller must
- * ensure that process p context is valid.
- */
-static int resume_process_mm(struct kfd_process *p)
-{
-	struct kfd_process_device *pdd;
-	struct mm_struct *mm = (struct mm_struct *)p->mm;
-	int r, ret = 0;
-
-	list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
-		if (pdd->dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS)
-			down_read(&mm->mmap_sem);
-
-		r = process_restore_queues(pdd->dev->dqm, &pdd->qpd);
-		if (r != 0) {
-			pr_err("Failed to restore process queues\n");
-			if (ret == 0)
-				ret = r;
-		}
-
-		if (pdd->dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS)
-			up_read(&mm->mmap_sem);
-	}
-
-	return ret;
-}
-
-int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm)
-{
-	struct kfd_process *p;
-	struct kfd_process_device *pdd;
-	int r;
-
-	/* Because we are called from arbitrary context (workqueue) as opposed
-	 * to process context, kfd_process could attempt to exit while we are
-	 * running so the lookup function increments the process ref count.
-	 */
-	p = kfd_lookup_process_by_mm(mm);
-	if (!p)
-		return -ENODEV;
-
-	if (kfd) {
-		r = -ENODEV;
-		pdd = kfd_get_process_device_data(kfd, p);
-		if (pdd)
-			r = process_evict_queues(kfd->dqm, &pdd->qpd);
-	} else {
-		r = quiesce_process_mm(p);
-	}
-
-	kfd_unref_process(p);
-	return r;
-}
-
-int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm)
-{
-	struct kfd_process *p;
-	struct kfd_process_device *pdd;
-	int r;
-
-	/* Because we are called from arbitrary context (workqueue) as opposed
-	 * to process context, kfd_process could attempt to exit while we are
-	 * running so the lookup function increments the process ref count.
-	 */
-	p = kfd_lookup_process_by_mm(mm);
-	if (!p)
-		return -ENODEV;
-
-	if (kfd) {
-		r = -ENODEV;
-		pdd = kfd_get_process_device_data(kfd, p);
-		if (pdd)
-			r = process_restore_queues(kfd->dqm, &pdd->qpd);
-	} else {
-		r = resume_process_mm(p);
-	}
-
-	kfd_unref_process(p);
-	return r;
-}
-
-
-void kfd_restore_bo_worker(struct work_struct *work)
-{
-	struct delayed_work *dwork;
-	struct kfd_process *p;
-	struct kfd_process_device *pdd;
-	int ret = 0;
-
-	dwork = to_delayed_work(work);
-
-	/* Process termination destroys this worker thread. So during the
-	 * lifetime of this thread, kfd_process p will be valid
-	 */
-	p = container_of(dwork, struct kfd_process, restore_work);
-
-	/* Call restore_process_bos on the first KGD device. This function
-	 * takes care of restoring the whole process including other devices.
-	 * Restore can fail if enough memory is not available. If so,
-	 * reschedule again.
-	 */
-	pdd = list_first_entry(&p->per_device_data,
-			       struct kfd_process_device,
-			       per_device_list);
-
-	pr_info("Started restoring process of pasid %d\n", p->pasid);
-
-	/* Setting last_restore_timestamp before successful restoration.
-	 * Otherwise this would have to be set by KGD (restore_process_bos)
-	 * before KFD BOs are unreserved. If not, the process can be evicted
-	 * again before the timestamp is set.
-	 * If restore fails, the timestamp will be set again in the next
-	 * attempt. This would mean that the minimum GPU quanta would be
-	 * PROCESS_ACTIVE_TIME_MS - (time to execute the following two
-	 * functions)
-	 */
-
-	p->last_restore_timestamp = get_jiffies_64();
-	ret = pdd->dev->kfd2kgd->restore_process_bos(p->process_info, &p->ef);
-	if (ret) {
-		pr_info("Restore failed, try again after %d ms\n",
-			PROCESS_BACK_OFF_TIME_MS);
-		ret = schedule_delayed_work(&p->restore_work,
-				msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS));
-		WARN(!ret, "reschedule restore work failed\n");
-		return;
-	}
-
-	ret = resume_process_mm(p);
-	if (ret)
-		pr_err("Failed to resume user queues\n");
-
-	pr_info("Finished restoring process of pasid %d\n", p->pasid);
-}
-
-/** kgd2kfd_schedule_evict_and_restore_process - Schedules work queue that will
- * prepare for safe eviction of KFD BOs that belong to the specified
- * process.
- *
- * @mm: mm_struct that identifies the specified KFD process
- * @fence: eviction fence attached to KFD process BOs
- *
- */
-int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
-					       struct dma_fence *fence)
-{
-	struct kfd_process *p;
-	unsigned long active_time;
-	unsigned long delay_jiffies = msecs_to_jiffies(PROCESS_ACTIVE_TIME_MS);
-
-	if (!fence)
-		return -EINVAL;
-
-	if (dma_fence_is_signaled(fence))
-		return 0;
-
-	p = kfd_lookup_process_by_mm(mm);
-	if (!p)
-		return -ENODEV;
-
-	if (fence->seqno == p->last_eviction_seqno)
-		goto out;
-
-	p->last_eviction_seqno = fence->seqno;
-
-	/* Avoid KFD process starvation. Wait for at least
-	 * PROCESS_ACTIVE_TIME_MS before evicting the process again
-	 */
-	active_time = get_jiffies_64() - p->last_restore_timestamp;
-	if (delay_jiffies > active_time)
-		delay_jiffies -= active_time;
-	else
-		delay_jiffies = 0;
-
-	/* During process initialization eviction_work.dwork is initialized
-	 * to kfd_evict_bo_worker
-	 */
-	schedule_delayed_work(&p->eviction_work, delay_jiffies);
-out:
-	kfd_unref_process(p);
-	return 0;
-}
-
-void kfd_evict_bo_worker(struct work_struct *work)
-{
-	int ret;
-	struct kfd_process *p;
-	struct delayed_work *dwork;
-
-	dwork = to_delayed_work(work);
-
-	/* Process termination destroys this worker thread. So during the
-	 * lifetime of this thread, kfd_process p will be valid
-	 */
-	p = container_of(dwork, struct kfd_process, eviction_work);
-	WARN_ONCE(p->last_eviction_seqno != p->ef->seqno,
-		  "Eviction fence mismatch\n");
-
-	/* Narrow window of overlap between restore and evict work
-	 * item is possible. Once
-	 * amdgpu_amdkfd_gpuvm_restore_process_bos unreserves KFD BOs,
-	 * it is possible to evicted again. But restore has few more
-	 * steps of finish. So lets wait for any previous restore work
-	 * to complete
-	 */
-	flush_delayed_work(&p->restore_work);
-
-	pr_info("Started evicting process of pasid %d\n", p->pasid);
-	ret = quiesce_process_mm(p);
-	if (!ret) {
-		dma_fence_signal(p->ef);
-		dma_fence_put(p->ef);
-		p->ef = NULL;
-		schedule_delayed_work(&p->restore_work,
-				msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));
-
-		pr_info("Finished evicting process of pasid %d\n", p->pasid);
-	} else
-		pr_err("Failed to quiesce user queues. Cannot evict pasid %d\n",
-			p->pasid);
-}
-
 static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
 				unsigned int chunk_size)
 {
@@ -1076,8 +462,8 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size,
 	if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size)
 		return -ENOMEM;
 
-	*mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO);
-	if (!(*mem_obj))
+	*mem_obj = kmalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
+	if ((*mem_obj) == NULL)
 		return -ENOMEM;
 
 	pr_debug("Allocated mem_obj = %p for size = %d\n", *mem_obj, size);
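Among the functions removed from kfd_device.c above, quiesce_process_mm() shows a pattern worth calling out: it evicts queues device by device and, if any device fails, restores the ones already evicted so the process is never left half-quiesced. A small illustrative C program of that rollback loop, with stub evict/restore functions standing in for the KFD calls (names and the failure case are hypothetical):

#include <stdio.h>

#define NUM_DEVICES 4

/* Stand-in for per-device queue eviction; device 2 fails on purpose. */
static int evict_queues(int dev)
{
	return dev == 2 ? -1 : 0;
}

static void restore_queues(int dev)
{
	printf("restored device %d\n", dev);
}

/* Evict on every device, or roll back the partial work on failure. */
static int quiesce_all(void)
{
	int n_evicted = 0;

	for (int dev = 0; dev < NUM_DEVICES; dev++) {
		if (evict_queues(dev) != 0) {
			/* keep state consistent: undo what succeeded */
			for (int undo = 0; undo < n_evicted; undo++)
				restore_queues(undo);
			return -1;
		}
		n_evicted++;
	}
	return 0;
}

int main(void)
{
	printf("quiesce_all: %d\n", quiesce_all());
	return 0;
}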
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index a628a0d..53a66e8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -44,14 +44,9 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
 					struct queue *q,
 					struct qcm_process_device *qpd);
 
-static int execute_queues_cpsch(struct device_queue_manager *dqm,
-				enum kfd_unmap_queues_filter filter,
-				uint32_t filter_param);
-static int unmap_queues_cpsch(struct device_queue_manager *dqm,
-				enum kfd_unmap_queues_filter filter,
-				uint32_t filter_param);
-
-static int map_queues_cpsch(struct device_queue_manager *dqm);
+static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock);
+static int destroy_queues_cpsch(struct device_queue_manager *dqm,
+				bool preempt_static_queues, bool lock);
 
 static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
 					struct queue *q,
@@ -98,17 +93,6 @@ unsigned int get_pipes_per_mec(struct device_queue_manager *dqm)
 	return dqm->dev->shared_resources.num_pipe_per_mec;
 }
 
-static unsigned int get_num_sdma_engines(struct device_queue_manager *dqm)
-{
-	return dqm->dev->device_info->num_sdma_engines;
-}
-
-unsigned int get_num_sdma_queues(struct device_queue_manager *dqm)
-{
-	return dqm->dev->device_info->num_sdma_engines
-			* KFD_SDMA_QUEUES_PER_ENGINE;
-}
-
 void program_sh_mem_settings(struct device_queue_manager *dqm,
 					struct qcm_process_device *qpd)
 {
@@ -120,57 +104,6 @@ void program_sh_mem_settings(struct device_queue_manager *dqm,
 						qpd->sh_mem_bases);
 }
 
-static int allocate_doorbell(struct qcm_process_device *qpd, struct queue *q)
-{
-	struct kfd_dev *dev = qpd->dqm->dev;
-
-	if (!KFD_IS_SOC15(dev->device_info->asic_family)) {
-		/* On pre-SOC15 chips we need to use the queue ID to
-		 * preserve the user mode ABI.
-		 */
-		q->doorbell_id = q->properties.queue_id;
-	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
-		/* For SDMA queues on SOC15, use static doorbell
-		 * assignments based on the engine and queue.
-		 */
-		q->doorbell_id = dev->shared_resources.sdma_doorbell
-			[q->properties.sdma_engine_id]
-			[q->properties.sdma_queue_id];
-	} else {
-		/* For CP queues on SOC15 reserve a free doorbell ID */
-		unsigned int found;
-
-		found = find_first_zero_bit(qpd->doorbell_bitmap,
-				KFD_MAX_NUM_OF_QUEUES_PER_PROCESS);
-		if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) {
-			pr_debug("No doorbells available");
-			return -EBUSY;
-		}
-		set_bit(found, qpd->doorbell_bitmap);
-		q->doorbell_id = found;
-	}
-
-	q->properties.doorbell_off =
-		kfd_doorbell_id_to_offset(dev, q->process,
-					q->doorbell_id);
-
-	return 0;
-}
-
-static void deallocate_doorbell(struct qcm_process_device *qpd,
-				struct queue *q)
-{
-	unsigned int old;
-	struct kfd_dev *dev = qpd->dqm->dev;
-
-	if (!KFD_IS_SOC15(dev->device_info->asic_family) ||
-	    q->properties.type == KFD_QUEUE_TYPE_SDMA)
-		return;
-
-	old = test_and_clear_bit(q->doorbell_id, qpd->doorbell_bitmap);
-	WARN_ON(!old);
-}
-
 static int allocate_vmid(struct device_queue_manager *dqm,
 			struct qcm_process_device *qpd,
 			struct queue *q)
@@ -180,10 +113,11 @@ static int allocate_vmid(struct device_queue_manager *dqm,
 	if (dqm->vmid_bitmap == 0)
 		return -ENOMEM;
 
-	bit = ffs(dqm->vmid_bitmap) - 1;
-	dqm->vmid_bitmap &= ~(1 << bit);
+	bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap, CIK_VMID_NUM);
+	clear_bit(bit, (unsigned long *)&dqm->vmid_bitmap);
 
-	allocated_vmid = bit + dqm->dev->vm_info.first_vmid_kfd;
+	/* Kaveri kfd vmid's starts from vmid 8 */
+	allocated_vmid = bit + KFD_VMID_START_OFFSET;
 	pr_debug("vmid allocation %d\n", allocated_vmid);
 	qpd->vmid = allocated_vmid;
 	q->properties.vmid = allocated_vmid;
@@ -191,57 +125,27 @@ static int allocate_vmid(struct device_queue_manager *dqm,
 	set_pasid_vmid_mapping(dqm, q->process->pasid, q->properties.vmid);
 	program_sh_mem_settings(dqm, qpd);
 
-	/* qpd->page_table_base is set earlier when register_process()
-	 * is called, i.e. when the first queue is created.
-	 */
-	dqm->dev->kfd2kgd->set_vm_context_page_table_base(dqm->dev->kgd,
-			qpd->vmid,
-			qpd->page_table_base);
-	/*invalidate the VM context after pasid and vmid mapping is set up*/
-	kfd_flush_tlb(dqm->dev, qpd->pqm->process);
-
 	return 0;
 }
 
-static int flush_texture_cache_nocpsch(struct kfd_dev *kdev,
-				struct qcm_process_device *qpd)
-{
-	uint32_t len;
-
-	if (!qpd->ib_kaddr)
-		return -ENOMEM;
-
-	len = qpd->dqm->packets.pmf->release_mem(qpd->ib_base,
-				(uint32_t *)qpd->ib_kaddr);
-
-	return kdev->kfd2kgd->submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid,
-				qpd->ib_base, (uint32_t *)qpd->ib_kaddr, len);
-}
-
 static void deallocate_vmid(struct device_queue_manager *dqm,
 				struct qcm_process_device *qpd,
 				struct queue *q)
 {
-	int bit = qpd->vmid - dqm->dev->vm_info.first_vmid_kfd;
-
-	/* On GFX v7, CP doesn't flush TC at dequeue */
-	if (q->device->device_info->asic_family == CHIP_HAWAII)
-		if (flush_texture_cache_nocpsch(q->device, qpd))
-			pr_err("Failed to flush TC\n");
-
-	kfd_flush_tlb(dqm->dev, qpd->pqm->process);
+	int bit = qpd->vmid - KFD_VMID_START_OFFSET;
 
 	/* Release the vmid mapping */
 	set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
 
-	dqm->vmid_bitmap |= (1 << bit);
+	set_bit(bit, (unsigned long *)&dqm->vmid_bitmap);
 	qpd->vmid = 0;
 	q->properties.vmid = 0;
 }
 
 static int create_queue_nocpsch(struct device_queue_manager *dqm,
 				struct queue *q,
-				struct qcm_process_device *qpd)
+				struct qcm_process_device *qpd,
+				int *allocated_vmid)
 {
 	int retval;
 
@@ -261,18 +165,8 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 		if (retval)
 			goto out_unlock;
 	}
+	*allocated_vmid = qpd->vmid;
 	q->properties.vmid = qpd->vmid;
-	/*
-	 * Eviction state logic: we only mark active queues as evicted
-	 * to avoid the overhead of restoring inactive queues later
-	 */
-	if (qpd->evicted)
-		q->properties.is_evicted = (q->properties.queue_size > 0 &&
-					    q->properties.queue_percent > 0 &&
-					    q->properties.queue_address != 0);
-
-	q->properties.tba_addr = qpd->tba_addr;
-	q->properties.tma_addr = qpd->tma_addr;
 
 	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
 		retval = create_compute_queue_nocpsch(dqm, q, qpd);
@@ -282,14 +176,14 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 		retval = -EINVAL;
 
 	if (retval) {
-		if (list_empty(&qpd->queues_list))
+		if (list_empty(&qpd->queues_list)) {
 			deallocate_vmid(dqm, qpd, q);
-
+			*allocated_vmid = 0;
+		}
 		goto out_unlock;
 	}
 
 	list_add(&q->list, &qpd->queues_list);
-	qpd->queue_count++;
 	if (q->properties.is_active)
 		dqm->queue_count++;
 
@@ -324,8 +218,12 @@ static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q)
 			continue;
 
 		if (dqm->allocated_queues[pipe] != 0) {
-			bit = ffs(dqm->allocated_queues[pipe]) - 1;
-			dqm->allocated_queues[pipe] &= ~(1 << bit);
+			bit = find_first_bit(
+				(unsigned long *)&dqm->allocated_queues[pipe],
+				get_queues_per_pipe(dqm));
+
+			clear_bit(bit,
+				(unsigned long *)&dqm->allocated_queues[pipe]);
 			q->pipe = pipe;
 			q->queue = bit;
 			set = true;
@@ -346,7 +244,7 @@ static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q)
 static inline void deallocate_hqd(struct device_queue_manager *dqm,
 				struct queue *q)
 {
-	dqm->allocated_queues[q->pipe] |= (1 << q->queue);
+	set_bit(q->queue, (unsigned long *)&dqm->allocated_queues[q->pipe]);
 }
 
 static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
@@ -364,24 +262,17 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
 	if (retval)
 		return retval;
 
-	retval = allocate_doorbell(qpd, q);
-	if (retval)
-		goto out_deallocate_hqd;
-
 	retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
 				&q->gart_mqd_addr, &q->properties);
 	if (retval)
-		goto out_deallocate_doorbell;
+		goto out_deallocate_hqd;
 
 	pr_debug("Loading mqd to hqd on pipe %d, queue %d\n",
 			q->pipe, q->queue);
 
-	dqm->dev->kfd2kgd->alloc_memory_of_scratch(
+	dqm->dev->kfd2kgd->set_scratch_backing_va(
 			dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid);
 
-	if (!q->properties.is_active)
-		return 0;
-
 	retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue, &q->properties,
 			q->process->mm);
 	if (retval)
@@ -391,84 +282,71 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
 
 out_uninit_mqd:
 	mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
-out_deallocate_doorbell:
-	deallocate_doorbell(qpd, q);
out_deallocate_hqd:
 	deallocate_hqd(dqm, q);
 
 	return retval;
 }
 
-/* Access to DQM has to be locked before calling destroy_queue_nocpsch_locked
- * to avoid asynchronized access
- */
-static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
+static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
 				struct qcm_process_device *qpd,
 				struct queue *q)
 {
 	int retval;
 	struct mqd_manager *mqd;
 
-	mqd = dqm->ops.get_mqd_manager(dqm,
-		get_mqd_type_from_queue_type(q->properties.type));
-	if (!mqd)
-		return -ENOMEM;
+	retval = 0;
+
+	mutex_lock(&dqm->lock);
 
 	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) {
+		mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE);
+		if (mqd == NULL) {
+			retval = -ENOMEM;
+			goto out;
+		}
 		deallocate_hqd(dqm, q);
 	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
+		mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_SDMA);
+		if (mqd == NULL) {
+			retval = -ENOMEM;
+			goto out;
+		}
 		dqm->sdma_queue_count--;
 		deallocate_sdma_queue(dqm, q->sdma_id);
 	} else {
 		pr_debug("q->properties.type %d is invalid\n",
 				q->properties.type);
-		return -EINVAL;
+		retval = -EINVAL;
+		goto out;
 	}
-	dqm->total_queue_count--;
-
-	deallocate_doorbell(qpd, q);
 
 	retval = mqd->destroy_mqd(mqd, q->mqd,
 				KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
-				KFD_UNMAP_LATENCY_MS,
+				QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS,
 				q->pipe, q->queue);
-	if (retval == -ETIME)
-		qpd->reset_wavefronts = true;
+
+	if (retval)
+		goto out;
 
 	mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
 
 	list_del(&q->list);
-	if (list_empty(&qpd->queues_list)) {
-		if (qpd->reset_wavefronts) {
-			pr_warn("Resetting wave fronts (nocpsch) on dev %p\n",
-					dqm->dev);
-			/* dbgdev_wave_reset_wavefronts has to be called before
-			 * deallocate_vmid(), i.e. when vmid is still in use.
-			 */
-			dbgdev_wave_reset_wavefronts(dqm->dev,
-					qpd->pqm->process);
-			qpd->reset_wavefronts = false;
-		}
-
+	if (list_empty(&qpd->queues_list))
 		deallocate_vmid(dqm, qpd, q);
-	}
-	qpd->queue_count--;
 	if (q->properties.is_active)
 		dqm->queue_count--;
 
-	return retval;
-}
-
-static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
-				struct qcm_process_device *qpd,
-				struct queue *q)
-{
-	int retval;
+	/*
+	 * Unconditionally decrement this counter, regardless of the queue's
+	 * type
+	 */
+	dqm->total_queue_count--;
+	pr_debug("Total of %d queues are accountable so far\n",
+			dqm->total_queue_count);
 
-	mutex_lock(&dqm->lock);
-	retval = destroy_queue_nocpsch_locked(dqm, qpd, q);
+out:
 	mutex_unlock(&dqm->lock);
-
 	return retval;
 }
 
@@ -476,82 +354,39 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
 {
 	int retval;
 	struct mqd_manager *mqd;
-	struct kfd_process_device *pdd;
-	bool prev_active = false;
 
 	mutex_lock(&dqm->lock);
-
-	pdd = kfd_get_process_device_data(q->device, q->process);
-	if (!pdd) {
-		retval = -ENODEV;
-		goto out_unlock;
-	}
 	mqd = dqm->ops.get_mqd_manager(dqm,
 			get_mqd_type_from_queue_type(q->properties.type));
 	if (!mqd) {
 		retval = -ENOMEM;
 		goto out_unlock;
 	}
-	/*
-	 * Eviction state logic: we only mark active queues as evicted
-	 * to avoid the overhead of restoring inactive queues later
-	 */
-	if (pdd->qpd.evicted > 0)
-		q->properties.is_evicted = (q->properties.queue_size > 0 &&
-					    q->properties.queue_percent > 0 &&
-					    q->properties.queue_address != 0);
-
-	/* Save previous activity state for counters */
-	prev_active = q->properties.is_active;
-
-	/* Make sure the queue is unmapped before updating the MQD */
-	if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) {
-		retval = unmap_queues_cpsch(dqm,
-				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
-		if (retval) {
-			pr_err("unmap queue failed\n");
-			goto out_unlock;
-		}
-	} else if (prev_active &&
-		   (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
-		    q->properties.type == KFD_QUEUE_TYPE_SDMA)) {
-		retval = mqd->destroy_mqd(mqd, q->mqd,
-				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
-				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
-		if (retval) {
-			pr_err("destroy mqd failed\n");
-			goto out_unlock;
-		}
-	}
-
-	retval = mqd->update_mqd(mqd, q->mqd, &q->properties);
+	if (q->properties.is_active)
+		prev_active = true;
 
 	/*
-	 * check active state vs. the previous state and modify
-	 * counter accordingly. map_queues_cpsch uses the
-	 * dqm->queue_count to determine whether a new runlist must be
-	 * uploaded.
+	 *
+	 * check active state vs. the previous state
+	 * and modify counter accordingly
 	 */
-	if (q->properties.is_active && !prev_active)
+	retval = mqd->update_mqd(mqd, q->mqd, &q->properties);
+	if ((q->properties.is_active) && (!prev_active))
 		dqm->queue_count++;
 	else if (!q->properties.is_active && prev_active)
 		dqm->queue_count--;
 
-	if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS)
-		retval = map_queues_cpsch(dqm);
-	else if (q->properties.is_active &&
-		 (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
-		  q->properties.type == KFD_QUEUE_TYPE_SDMA))
-		retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue,
-				       &q->properties, q->process->mm);
+	if (sched_policy != KFD_SCHED_POLICY_NO_HWS)
+		retval = execute_queues_cpsch(dqm, false);
 
 out_unlock:
 	mutex_unlock(&dqm->lock);
 	return retval;
 }
 
-static struct mqd_manager *get_mqd_manager(
+static struct mqd_manager *get_mqd_manager_nocpsch(
 		struct device_queue_manager *dqm, enum KFD_MQD_TYPE type)
 {
 	struct mqd_manager *mqd;
@@ -572,140 +407,11 @@ static struct mqd_manager *get_mqd_manager(
 	return mqd;
 }
 
-int process_evict_queues(struct device_queue_manager *dqm,
-		struct qcm_process_device *qpd)
-{
-	struct queue *q, *next;
-	struct mqd_manager *mqd;
-	struct kfd_process_device *pdd;
-	int retval = 0;
-
-	mutex_lock(&dqm->lock);
-	if (qpd->evicted++ > 0) /* already evicted, do nothing */
-		goto out;
-
-	pdd = qpd_to_pdd(qpd);
-	pr_info_ratelimited("Evicting PASID %u queues\n",
-			pdd->process->pasid);
-
-	/* unactivate all active queues on the qpd */
-	list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
-		mqd = dqm->ops.get_mqd_manager(dqm,
-			get_mqd_type_from_queue_type(q->properties.type));
-		if (!mqd) { /* should not be here */
-			pr_err("Cannot evict queue, mqd is NULL\n");
-			retval = -ENOMEM;
-			goto out;
-		}
-		/* if the queue is not active anyway, it is not evicted */
-		if (q->properties.is_active) {
-			q->properties.is_evicted = true;
-			q->properties.is_active = false;
-		}
-
-		if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS &&
-		    q->properties.is_evicted &&
-		    (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
-		     q->properties.type == KFD_QUEUE_TYPE_SDMA))
-			retval = mqd->destroy_mqd(mqd, q->mqd,
-				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
-				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
-		if (q->properties.is_evicted)
-			dqm->queue_count--;
-	}
-	if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS)
-		retval = execute_queues_cpsch(dqm,
-				qpd->is_debug ?
-				KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
-				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
-
-out:
-	mutex_unlock(&dqm->lock);
-	return retval;
-
-}
-
-int process_restore_queues(struct device_queue_manager *dqm,
-		struct qcm_process_device *qpd)
-{
-	struct queue *q, *next;
-	struct mqd_manager *mqd;
-	int retval = 0;
-	struct kfd_process_device *pdd;
-	uint32_t pd_base;
-
-	pdd = qpd_to_pdd(qpd);
-	/* Retrieve PD base */
-	pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm);
-
-	mutex_lock(&dqm->lock);
-	if (qpd->evicted == 0) /* already restored, do nothing */
-		goto out_unlock;
-
-	if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */
-		qpd->evicted--;
-		goto out_unlock;
-	}
-
-	pr_info_ratelimited("Restoring PASID %u queues\n",
-			pdd->process->pasid);
-
-	/* Update PD Base in QPD */
-	qpd->page_table_base = pd_base;
-	pr_debug("Updated PD address to 0x%08x\n", pd_base);
-
-	if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS &&
-	    !list_empty(&qpd->queues_list)) {
-		dqm->dev->kfd2kgd->set_vm_context_page_table_base(
-				dqm->dev->kgd,
-				qpd->vmid,
-				qpd->page_table_base);
-
-		kfd_flush_tlb(dqm->dev, pdd->process);
-	}
-
-	/* activate all active queues on the qpd */
-	list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
-		mqd = dqm->ops.get_mqd_manager(dqm,
-			get_mqd_type_from_queue_type(q->properties.type));
-		if (!mqd) { /* should not be here */
-			pr_err("Cannot restore queue, mqd is NULL\n");
-			retval = -ENOMEM;
-			goto out_unlock;
-		}
-		if (q->properties.is_evicted) {
-			q->properties.is_evicted = false;
-			q->properties.is_active = true;
-
-			if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS &&
-			    (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
-			     q->properties.type == KFD_QUEUE_TYPE_SDMA))
-				retval = mqd->load_mqd(mqd, q->mqd, q->pipe,
-						q->queue, &q->properties,
-						q->process->mm);
-			dqm->queue_count++;
-		}
-	}
-	if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS)
-		retval = execute_queues_cpsch(dqm,
-				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
-
-	if (retval == 0)
-		qpd->evicted = 0;
-
-out_unlock:
-	mutex_unlock(&dqm->lock);
-
-	return retval;
-}
-
-static int register_process(struct device_queue_manager *dqm,
+static int register_process_nocpsch(struct device_queue_manager *dqm,
 					struct qcm_process_device *qpd)
 {
 	struct device_process_node *n;
 	int retval;
-	struct kfd_process_device *pdd;
-	uint32_t pd_base;
 
 	n = kzalloc(sizeof(*n), GFP_KERNEL);
 	if (!n)
@@ -713,18 +419,10 @@ static int register_process(struct device_queue_manager *dqm,
 
 	n->qpd = qpd;
 
-	pdd = qpd_to_pdd(qpd);
-	/* Retrieve PD base */
-	pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm);
-
 	mutex_lock(&dqm->lock);
 	list_add(&n->list, &dqm->queues);
 
-	/* Update PD Base in QPD */
-	qpd->page_table_base = pd_base;
-	pr_debug("Updated PD address to 0x%08x\n", pd_base);
-
-	retval = dqm->asic_ops.update_qpd(dqm, qpd);
+	retval = dqm->ops_asic_specific.register_process(dqm, qpd);
 
 	dqm->processes_count++;
 
@@ -733,7 +431,7 @@ static int register_process(struct device_queue_manager *dqm,
 	return retval;
 }
 
-static int unregister_process(struct device_queue_manager *dqm,
+static int unregister_process_nocpsch(struct device_queue_manager *dqm,
 					struct qcm_process_device *qpd)
 {
 	int retval;
@@ -809,13 +507,13 @@ static int initialize_nocpsch(struct device_queue_manager *dqm)
 				dqm->allocated_queues[pipe] |= 1 << queue;
 	}
 
-	dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1;
-	dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1;
+	dqm->vmid_bitmap = (1 << VMID_PER_DEVICE) - 1;
+	dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1;
 
 	return 0;
 }
 
-static void uninitialize(struct device_queue_manager *dqm)
+static void uninitialize_nocpsch(struct device_queue_manager *dqm)
 {
 	int i;
 
@@ -831,12 +529,11 @@ static void uninitialize(struct device_queue_manager *dqm)
 static int start_nocpsch(struct device_queue_manager *dqm)
 {
 	init_interrupts(dqm);
-	return pm_init(&dqm->packets, dqm, dqm->dev->mec_fw_version);
+	return 0;
 }
 
 static int stop_nocpsch(struct device_queue_manager *dqm)
 {
-	pm_uninit(&dqm->packets);
 	return 0;
 }
 
@@ -848,8 +545,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
 	if (dqm->sdma_bitmap == 0)
 		return -ENOMEM;
 
-	bit = ffs(dqm->sdma_bitmap) - 1;
-	dqm->sdma_bitmap &= ~(1 << bit);
+	bit = find_first_bit((unsigned long *)&dqm->sdma_bitmap,
+				CIK_SDMA_QUEUES);
+
+	clear_bit(bit, (unsigned long *)&dqm->sdma_bitmap);
 
 	*sdma_queue_id = bit;
 
 	return 0;
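The allocate_sdma_queue() hunk above goes back to the bitmap-allocator idiom: take the lowest set bit to allocate an ID, set the bit again to free it. The same idea in portable C with a plain unsigned mask instead of the kernel's find_first_bit()/clear_bit()/set_bit() helpers (an illustrative sketch only):

#include <stdio.h>

#define NUM_QUEUES 4	/* stand-in for CIK_SDMA_QUEUES */

/* Allocate the lowest set bit; returns -1 when the mask is exhausted. */
static int alloc_id(unsigned int *bitmap)
{
	for (int bit = 0; bit < NUM_QUEUES; bit++) {
		if (*bitmap & (1u << bit)) {
			*bitmap &= ~(1u << bit);	/* like clear_bit() */
			return bit;
		}
	}
	return -1;
}

static void free_id(unsigned int *bitmap, int bit)
{
	*bitmap |= 1u << bit;	/* like set_bit() */
}

int main(void)
{
	unsigned int bitmap = (1u << NUM_QUEUES) - 1;	/* all IDs free */
	int a = alloc_id(&bitmap), b = alloc_id(&bitmap);

	printf("got %d and %d, mask now 0x%x\n", a, b, bitmap);
	free_id(&bitmap, a);
	printf("freed %d, mask now 0x%x\n", a, bitmap);
	return 0;
}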
@@ -858,9 +557,9 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
 static void deallocate_sdma_queue(struct device_queue_manager *dqm,
 				unsigned int sdma_queue_id)
 {
-	if (sdma_queue_id >= get_num_sdma_queues(dqm))
+	if (sdma_queue_id >= CIK_SDMA_QUEUES)
 		return;
-	dqm->sdma_bitmap |= (1 << sdma_queue_id);
+	set_bit(sdma_queue_id, (unsigned long *)&dqm->sdma_bitmap);
 }
 
 static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
@@ -878,22 +577,18 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
 	if (retval)
 		return retval;
 
-	q->properties.sdma_queue_id = q->sdma_id / get_num_sdma_engines(dqm);
-	q->properties.sdma_engine_id = q->sdma_id % get_num_sdma_engines(dqm);
-
-	retval = allocate_doorbell(qpd, q);
-	if (retval)
-		goto out_deallocate_sdma_queue;
+	q->properties.sdma_queue_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE;
+	q->properties.sdma_engine_id = q->sdma_id / CIK_SDMA_ENGINE_NUM;
 
 	pr_debug("SDMA id is: %d\n", q->sdma_id);
 	pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id);
 	pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
 
-	dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
+	dqm->ops_asic_specific.init_sdma_vm(dqm, q, qpd);
 	retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
 				&q->gart_mqd_addr, &q->properties);
 	if (retval)
-		goto out_deallocate_doorbell;
+		goto out_deallocate_sdma_queue;
 
 	retval = mqd->load_mqd(mqd, q->mqd, 0, 0, &q->properties, NULL);
 	if (retval)
@@ -903,8 +598,6 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
 
 out_uninit_mqd:
 	mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
-out_deallocate_doorbell:
-	deallocate_doorbell(qpd, q);
 out_deallocate_sdma_queue:
 	deallocate_sdma_queue(dqm, q->sdma_id);
 
@@ -920,7 +613,8 @@ static int set_sched_resources(struct device_queue_manager *dqm)
 	int i, mec;
 	struct scheduling_resources res;
 
-	res.vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap;
+	res.vmid_mask = (1 << VMID_PER_DEVICE) - 1;
+	res.vmid_mask <<= KFD_VMID_START_OFFSET;
 
 	res.queue_mask = 0;
 	for (i = 0; i < KGD_MAX_QUEUES; ++i) {
@@ -958,6 +652,8 @@ static int set_sched_resources(struct device_queue_manager *dqm)
 
 static int initialize_cpsch(struct device_queue_manager *dqm)
 {
+	int retval;
+
 	pr_debug("num of pipes: %d\n", get_pipes_per_mec(dqm));
 
 	mutex_init(&dqm->lock);
@@ -965,18 +661,21 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
 	dqm->queue_count = dqm->processes_count = 0;
 	dqm->sdma_queue_count = 0;
 	dqm->active_runlist = false;
-	dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1;
+	retval = dqm->ops_asic_specific.initialize(dqm);
+	if (retval)
+		mutex_destroy(&dqm->lock);
 
-	return 0;
+	return retval;
 }
 
 static int start_cpsch(struct device_queue_manager *dqm)
 {
+	struct device_process_node *node;
 	int retval;
 
 	retval = 0;
 
-	retval = pm_init(&dqm->packets, dqm, dqm->dev->mec_fw_version);
+	retval = pm_init(&dqm->packets, dqm);
 	if (retval)
 		goto fail_packet_manager_init;
 
@@ -998,9 +697,12 @@ static int start_cpsch(struct device_queue_manager *dqm)
 
 	init_interrupts(dqm);
 
-	mutex_lock(&dqm->lock);
-	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
-	mutex_unlock(&dqm->lock);
+	list_for_each_entry(node, &dqm->queues, list)
+		if (node->qpd->pqm->process && dqm->dev)
+			kfd_bind_process_to_device(dqm->dev,
+						node->qpd->pqm->process);
+
+	execute_queues_cpsch(dqm, true);
 
 	return 0;
 fail_allocate_vidmem:
@@ -1012,12 +714,15 @@ static int start_cpsch(struct device_queue_manager *dqm)
 
 static int stop_cpsch(struct device_queue_manager *dqm)
 {
-	mutex_lock(&dqm->lock);
-
-	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+	struct device_process_node *node;
+	struct kfd_process_device *pdd;
 
-	mutex_unlock(&dqm->lock);
+	destroy_queues_cpsch(dqm, true, true);
 
+	list_for_each_entry(node, &dqm->queues, list) {
+		pdd = qpd_to_pdd(node->qpd);
+		pdd->bound = false;
+	}
 	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
 	pm_uninit(&dqm->packets);
 
@@ -1047,7 +752,7 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm,
 	list_add(&kq->list, &qpd->priv_queue_list);
 	dqm->queue_count++;
 	qpd->is_debug = true;
-	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+	execute_queues_cpsch(dqm, false);
 	mutex_unlock(&dqm->lock);
 
 	return 0;
@@ -1058,10 +763,12 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
 					struct qcm_process_device *qpd)
 {
 	mutex_lock(&dqm->lock);
+	/* here we actually preempt the DIQ */
+	destroy_queues_cpsch(dqm, true, false);
 	list_del(&kq->list);
 	dqm->queue_count--;
 	qpd->is_debug = false;
-	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+	execute_queues_cpsch(dqm, false);
 	/*
 	 * Unconditionally decrement this counter, regardless of the queue's
 	 * type.
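The next hunk reintroduces select_sdma_engine_id(), which spreads new SDMA queues across the chip's two engines with a function-local static counter. A stand-alone sketch of that round-robin pattern (hypothetical names, not the kernel code itself):

#include <stdio.h>

#define NUM_SDMA_ENGINES 2	/* stand-in for the two SDMA engines */

/* Round-robin pick, like the static counter in select_sdma_engine_id(). */
static int pick_sdma_engine(void)
{
	static int next;	/* persists across calls, starts at 0 */
	int id = next;

	next = (next + 1) % NUM_SDMA_ENGINES;
	return id;
}

int main(void)
{
	for (int i = 0; i < 5; i++)
		printf("queue %d -> engine %d\n", i, pick_sdma_engine());
	return 0;
}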
@@ -1072,68 +779,55 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
 	mutex_unlock(&dqm->lock);
 }
 
+static void select_sdma_engine_id(struct queue *q)
+{
+	static int sdma_id;
+
+	q->sdma_id = sdma_id;
+	sdma_id = (sdma_id + 1) % 2;
+}
+
 static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
-			struct qcm_process_device *qpd)
+			struct qcm_process_device *qpd, int *allocate_vmid)
 {
 	int retval;
 	struct mqd_manager *mqd;
 
 	retval = 0;
 
+	if (allocate_vmid)
+		*allocate_vmid = 0;
+
 	mutex_lock(&dqm->lock);
 
 	if (dqm->total_queue_count >= max_num_of_queues_per_device) {
 		pr_warn("Can't create new usermode queue because %d queues were already created\n",
 				dqm->total_queue_count);
 		retval = -EPERM;
-		goto out_unlock;
-	}
-
-	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
-		retval = allocate_sdma_queue(dqm, &q->sdma_id);
-		if (retval)
-			goto out_unlock;
-		q->properties.sdma_queue_id =
-			q->sdma_id / get_num_sdma_engines(dqm);
-		q->properties.sdma_engine_id =
-			q->sdma_id % get_num_sdma_engines(dqm);
+		goto out;
 	}
 
-	retval = allocate_doorbell(qpd, q);
-	if (retval)
-		goto out_deallocate_sdma_queue;
+	if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
+		select_sdma_engine_id(q);
 
 	mqd = dqm->ops.get_mqd_manager(dqm,
 			get_mqd_type_from_queue_type(q->properties.type));
 
 	if (!mqd) {
 		retval = -ENOMEM;
-		goto out_deallocate_doorbell;
+		goto out;
 	}
 
-	/*
-	 * Eviction state logic: we only mark active queues as evicted
-	 * to avoid the overhead of restoring inactive queues later
-	 */
-	if (qpd->evicted)
-		q->properties.is_evicted = (q->properties.queue_size > 0 &&
-					    q->properties.queue_percent > 0 &&
-					    q->properties.queue_address != 0);
-
-	dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
-
-	q->properties.tba_addr = qpd->tba_addr;
-	q->properties.tma_addr = qpd->tma_addr;
+	dqm->ops_asic_specific.init_sdma_vm(dqm, q, qpd);
 
 	retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
 				&q->gart_mqd_addr, &q->properties);
 	if (retval)
-		goto out_deallocate_doorbell;
+		goto out;
 
 	list_add(&q->list, &qpd->queues_list);
-	qpd->queue_count++;
 	if (q->properties.is_active) {
 		dqm->queue_count++;
-		retval = execute_queues_cpsch(dqm,
-				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+		retval = execute_queues_cpsch(dqm, false);
 	}
 
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
@@ -1147,28 +841,19 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 	pr_debug("Total of %d queues are accountable so far\n",
 			dqm->total_queue_count);
 
+out:
 	mutex_unlock(&dqm->lock);
 	return retval;
-
-out_deallocate_doorbell:
-	deallocate_doorbell(qpd, q);
-out_deallocate_sdma_queue:
-	if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
-		deallocate_sdma_queue(dqm, q->sdma_id);
-out_unlock:
-	mutex_unlock(&dqm->lock);
-
-	return retval;
 }
 
 int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
 				unsigned int fence_value,
-				unsigned int timeout_ms)
+				unsigned long timeout)
 {
-	unsigned long end_jiffies = msecs_to_jiffies(timeout_ms) + jiffies;
+	timeout += jiffies;
 
 	while (*fence_addr != fence_value) {
-		if (time_after(jiffies, end_jiffies)) {
+		if (time_after(jiffies, timeout)) {
 			pr_err("qcm fence wait loop timeout expired\n");
 			return -ETIME;
 		}
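amdkfd_fence_wait_timeout() above is a plain polling wait: spin on a fence word until it reaches the expected value or a deadline passes, and the reverted version takes the deadline in jiffies rather than milliseconds. A user-space analog using a monotonic clock in place of jiffies (an illustrative helper under assumed POSIX clock_gettime(), not the kernel API):

#include <stdio.h>
#include <time.h>

static long long now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000LL + ts.tv_nsec / 1000000;
}

/* Poll *fence_addr until it equals fence_value or timeout_ms elapses. */
static int fence_wait_timeout(volatile unsigned int *fence_addr,
			      unsigned int fence_value,
			      long long timeout_ms)
{
	long long deadline = now_ms() + timeout_ms;

	while (*fence_addr != fence_value) {
		if (now_ms() > deadline)
			return -1;	/* the kernel version returns -ETIME */
	}
	return 0;
}

int main(void)
{
	volatile unsigned int fence = 0;

	/* Nothing writes the fence here, so this demo simply times out. */
	printf("wait: %d\n", fence_wait_timeout(&fence, 1, 10));
	return 0;
}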
@@ -1178,57 +863,44 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
 	return 0;
 }
 
-static int unmap_sdma_queues(struct device_queue_manager *dqm,
+static int destroy_sdma_queues(struct device_queue_manager *dqm,
 				unsigned int sdma_engine)
 {
 	return pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA,
-			KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false,
+			KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES, 0, false,
 			sdma_engine);
 }
 
-/* dqm->lock mutex has to be locked before calling this function */
-static int map_queues_cpsch(struct device_queue_manager *dqm)
+static int destroy_queues_cpsch(struct device_queue_manager *dqm,
+				bool preempt_static_queues, bool lock)
 {
 	int retval;
+	enum kfd_preempt_type_filter preempt_type;
+	struct kfd_process_device *pdd;
 
-	if (dqm->queue_count <= 0 || dqm->processes_count <= 0)
-		return 0;
-
-	if (dqm->active_runlist)
-		return 0;
-
-	retval = pm_send_runlist(&dqm->packets, &dqm->queues);
-	if (retval) {
-		pr_err("failed to execute runlist\n");
-		return retval;
-	}
-	dqm->active_runlist = true;
-
-	return retval;
-}
-
-/* dqm->lock mutex has to be locked before calling this function */
-static int unmap_queues_cpsch(struct device_queue_manager *dqm,
-				enum kfd_unmap_queues_filter filter,
-				uint32_t filter_param)
-{
-	int retval = 0;
+	retval = 0;
 
+	if (lock)
+		mutex_lock(&dqm->lock);
 	if (!dqm->active_runlist)
-		return retval;
+		goto out;
 
 	pr_debug("Before destroying queues, sdma queue count is : %u\n",
 		dqm->sdma_queue_count);
 
 	if (dqm->sdma_queue_count > 0) {
-		unmap_sdma_queues(dqm, 0);
-		unmap_sdma_queues(dqm, 1);
+		destroy_sdma_queues(dqm, 0);
+		destroy_sdma_queues(dqm, 1);
 	}
 
+	preempt_type = preempt_static_queues ?
+			KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES :
+			KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES;
+
 	retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE,
-			filter, filter_param, false, 0);
+			preempt_type, 0, false, 0);
 	if (retval)
-		return retval;
+		goto out;
 
 	*dqm->fence_addr = KFD_FENCE_INIT;
 	pm_send_query_status(&dqm->packets, dqm->fence_gpu_addr,
@@ -1236,29 +908,55 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 	/* should be timed out */
 	retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
 				QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS);
-	if (retval)
-		return retval;
-
+	if (retval) {
+		pdd = kfd_get_process_device_data(dqm->dev,
+				kfd_get_process(current));
+		pdd->reset_wavefronts = true;
+		goto out;
+	}
 	pm_release_ib(&dqm->packets);
 	dqm->active_runlist = false;
 
+out:
+	if (lock)
+		mutex_unlock(&dqm->lock);
 	return retval;
 }
 
-/* dqm->lock mutex has to be locked before calling this function */
-static int execute_queues_cpsch(struct device_queue_manager *dqm,
-				enum kfd_unmap_queues_filter filter,
-				uint32_t filter_param)
+static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock)
 {
 	int retval;
 
-	retval = unmap_queues_cpsch(dqm, filter, filter_param);
+	if (lock)
+		mutex_lock(&dqm->lock);
+
+	retval = destroy_queues_cpsch(dqm, false, false);
 	if (retval) {
-		pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
-		return retval;
+		pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption");
+		goto out;
+	}
+
+	if (dqm->queue_count <= 0 || dqm->processes_count <= 0) {
+		retval = 0;
+		goto out;
+	}
+
+	if (dqm->active_runlist) {
+		retval = 0;
+		goto out;
 	}
 
-	return map_queues_cpsch(dqm);
+	retval = pm_send_runlist(&dqm->packets, &dqm->queues);
+	if (retval) {
+		pr_err("failed to execute runlist");
+		goto out;
+	}
+	dqm->active_runlist = true;
+
+out:
+	if (lock)
+		mutex_unlock(&dqm->lock);
	return retval;
 }
 
 static int destroy_queue_cpsch(struct device_queue_manager *dqm,
@@ -1293,22 +991,14 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 		goto failed;
 	}
 
-	deallocate_doorbell(qpd, q);
-
-	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
+	if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
 		dqm->sdma_queue_count--;
-		deallocate_sdma_queue(dqm, q->sdma_id);
-	}
 
 	list_del(&q->list);
-	qpd->queue_count--;
-	if (q->properties.is_active) {
+	if (q->properties.is_active)
 		dqm->queue_count--;
-		retval = execute_queues_cpsch(dqm,
-				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
-		if (retval == -ETIME)
-			qpd->reset_wavefronts = true;
-	}
+
+	execute_queues_cpsch(dqm, false);
 
 	mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
 
@@ -1322,7 +1012,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 
 	mutex_unlock(&dqm->lock);
 
-	return retval;
+	return 0;
 
 failed:
 failed_try_destroy_debugged_queue:
@@ -1346,10 +1036,7 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm,
 				   void __user *alternate_aperture_base,
 				   uint64_t alternate_aperture_size)
 {
-	bool retval = true;
-
-	if (!dqm->asic_ops.set_cache_memory_policy)
-		return retval;
+	bool retval;
 
 	mutex_lock(&dqm->lock);
 
@@ -1381,7 +1068,7 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm,
 		qpd->sh_mem_ape1_limit = limit >> 16;
 	}
 
-	retval = dqm->asic_ops.set_cache_memory_policy(
+	retval = dqm->ops_asic_specific.set_cache_memory_policy(
 			dqm,
 			qpd,
 			default_policy,
@@ -1389,7 +1076,7 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm,
 			alternate_aperture_base,
 			alternate_aperture_size);
 
-	if ((dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) && (qpd->vmid != 0))
+	if ((sched_policy == KFD_SCHED_POLICY_NO_HWS) && (qpd->vmid != 0))
 		program_sh_mem_settings(dqm, qpd);
 
 	pr_debug("sh_mem_config: 0x%x, ape1_base: 0x%x, ape1_limit: 0x%x\n",
@@ -1401,166 +1088,6 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm,
 	return retval;
 }
 
-static int set_trap_handler(struct device_queue_manager *dqm,
-				struct qcm_process_device *qpd,
-				uint64_t tba_addr,
-				uint64_t tma_addr)
-{
-	uint64_t *tma;
-
-	if (dqm->dev->cwsr_enabled) {
-		/* Jump from CWSR trap handler to user trap */
-		tma = (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
-		tma[0] = tba_addr;
-		tma[1] = tma_addr;
-	} else {
-		qpd->tba_addr = tba_addr;
-		qpd->tma_addr = tma_addr;
-	}
-
-	return 0;
-}
-
-static int process_termination_nocpsch(struct device_queue_manager *dqm,
-		struct qcm_process_device *qpd)
-{
-	struct queue *q, *next;
-	struct device_process_node *cur, *next_dpn;
-	int retval = 0;
-
-	mutex_lock(&dqm->lock);
-
-	/* Clear all user mode queues */
-	list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
-		int ret;
-
-		ret = destroy_queue_nocpsch_locked(dqm, qpd, q);
-		if (ret)
-			retval = ret;
-	}
-
-	/* Unregister process */
-	list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) {
-		if (qpd == cur->qpd) {
-			list_del(&cur->list);
-			kfree(cur);
-			dqm->processes_count--;
-			break;
-		}
-	}
-
-	mutex_unlock(&dqm->lock);
-	return retval;
-}
-
-static int get_wave_state(struct device_queue_manager *dqm,
-			  struct queue *q,
-			  void __user *ctl_stack,
-			  u32 *ctl_stack_used_size,
-			  u32 *save_area_used_size)
-{
-	struct mqd_manager *mqd;
-	int r;
-
-	mutex_lock(&dqm->lock);
-
-	if (q->properties.type != KFD_QUEUE_TYPE_COMPUTE ||
-	    q->properties.is_active || !q->device->cwsr_enabled) {
-		r = -EINVAL;
-		goto dqm_unlock;
-	}
-
-	mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE);
-	if (!mqd) {
-		r = -ENOMEM;
-		goto dqm_unlock;
-	}
-
-	if (!mqd->get_wave_state) {
-		r = -EINVAL;
-		goto dqm_unlock;
-	}
-
-	r = mqd->get_wave_state(mqd, q->mqd, ctl_stack, ctl_stack_used_size,
-				save_area_used_size);
-
-dqm_unlock:
-	mutex_unlock(&dqm->lock);
-	return r;
-}
-
-static int process_termination_cpsch(struct device_queue_manager *dqm,
-		struct qcm_process_device *qpd)
-{
-	int retval;
-	struct queue *q, *next;
-	struct kernel_queue *kq, *kq_next;
-	struct mqd_manager *mqd;
-	struct device_process_node *cur, *next_dpn;
-	enum kfd_unmap_queues_filter filter =
-		KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES;
-
-	retval = 0;
-
-	mutex_lock(&dqm->lock);
-
-	/* Clean all kernel queues */
-	list_for_each_entry_safe(kq, kq_next, &qpd->priv_queue_list, list) {
-		list_del(&kq->list);
-		dqm->queue_count--;
-		qpd->is_debug = false;
-		dqm->total_queue_count--;
-		filter = KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES;
-	}
-
-	/* Clear all user mode queues */
-	list_for_each_entry(q, &qpd->queues_list, list) {
-		if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
-			dqm->sdma_queue_count--;
-			deallocate_sdma_queue(dqm, q->sdma_id);
-		}
-
-		if (q->properties.is_active)
-			dqm->queue_count--;
-
-		dqm->total_queue_count--;
-	}
-
-	/* Unregister process */
-	list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) {
-		if (qpd == cur->qpd) {
-			list_del(&cur->list);
-			kfree(cur);
-			dqm->processes_count--;
-			break;
-		}
-	}
-
-	retval = execute_queues_cpsch(dqm, filter, 0);
-	if (retval || qpd->reset_wavefronts) {
-		pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev);
-		dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process);
-		qpd->reset_wavefronts = false;
-	}
-
-	/* lastly, free mqd resources */
-	list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
-		mqd = dqm->ops.get_mqd_manager(dqm,
-			get_mqd_type_from_queue_type(q->properties.type));
-		if (!mqd) {
-			retval = -ENOMEM;
-			goto out;
-		}
-		list_del(&q->list);
-		qpd->queue_count--;
-		mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
-	}
-
-out:
-	mutex_unlock(&dqm->lock);
-	return retval;
-}
-
 struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 {
 	struct device_queue_manager *dqm;
@@ -1571,18 +1098,8 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 	if (!dqm)
 		return NULL;
 
-	switch (dev->device_info->asic_family) {
-	case CHIP_HAWAII:
-	case CHIP_TONGA:
-		dqm->sched_policy = KFD_SCHED_POLICY_NO_HWS;
-		break;
-	default:
-		dqm->sched_policy = sched_policy;
-		break;
-	}
-
 	dqm->dev = dev;
-	switch (dqm->sched_policy) {
+	switch (sched_policy) {
 	case KFD_SCHED_POLICY_HWS:
 	case KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION:
 		/* initialize dqm for cp scheduling */
@@ -1592,16 +1109,13 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		dqm->ops.stop = stop_cpsch;
 		dqm->ops.destroy_queue = destroy_queue_cpsch;
 		dqm->ops.update_queue = update_queue;
-		dqm->ops.get_mqd_manager = get_mqd_manager;
-		dqm->ops.register_process = register_process;
-		dqm->ops.unregister_process = unregister_process;
-		dqm->ops.uninitialize = uninitialize;
+		dqm->ops.get_mqd_manager = get_mqd_manager_nocpsch;
+		dqm->ops.register_process = register_process_nocpsch;
+		dqm->ops.unregister_process = unregister_process_nocpsch;
+		dqm->ops.uninitialize = uninitialize_nocpsch;
 		dqm->ops.create_kernel_queue = create_kernel_queue_cpsch;
 		dqm->ops.destroy_kernel_queue = destroy_kernel_queue_cpsch;
 		dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
-		dqm->ops.set_trap_handler = set_trap_handler;
-		dqm->ops.process_termination = process_termination_cpsch;
-		dqm->ops.get_wave_state = get_wave_state;
 		break;
 	case KFD_SCHED_POLICY_NO_HWS:
 		/* initialize dqm for no cp scheduling */
@@ -1610,49 +1124,26 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		dqm->ops.create_queue = create_queue_nocpsch;
 		dqm->ops.destroy_queue = destroy_queue_nocpsch;
 		dqm->ops.update_queue = update_queue;
-		dqm->ops.get_mqd_manager = get_mqd_manager;
-		dqm->ops.register_process = register_process;
-		dqm->ops.unregister_process = unregister_process;
+		dqm->ops.get_mqd_manager = get_mqd_manager_nocpsch;
+		dqm->ops.register_process = register_process_nocpsch;
+		dqm->ops.unregister_process = unregister_process_nocpsch;
 		dqm->ops.initialize = initialize_nocpsch;
-		dqm->ops.uninitialize = uninitialize;
+		dqm->ops.uninitialize = uninitialize_nocpsch;
 		dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
-		dqm->ops.set_trap_handler = set_trap_handler;
-		dqm->ops.process_termination = process_termination_nocpsch;
-		dqm->ops.get_wave_state = get_wave_state;
 		break;
 	default:
-		pr_err("Invalid scheduling policy %d\n", dqm->sched_policy);
+		pr_err("Invalid scheduling policy %d\n", sched_policy);
 		goto out_free;
 	}
 
 	switch (dev->device_info->asic_family) {
 	case CHIP_CARRIZO:
-		device_queue_manager_init_vi(&dqm->asic_ops);
+		device_queue_manager_init_vi(&dqm->ops_asic_specific);
 		break;
 
 	case CHIP_KAVERI:
-		device_queue_manager_init_cik(&dqm->asic_ops);
+		device_queue_manager_init_cik(&dqm->ops_asic_specific);
 		break;
-
-	case CHIP_HAWAII:
-		device_queue_manager_init_cik_hawaii(&dqm->asic_ops);
-		break;
-
-	case CHIP_TONGA:
-	case CHIP_FIJI:
-	case CHIP_POLARIS10:
-	case CHIP_POLARIS11:
-		device_queue_manager_init_vi_tonga(&dqm->asic_ops);
-		break;
-
-	case CHIP_VEGA10:
-	case CHIP_RAVEN:
-		device_queue_manager_init_v9_vega10(&dqm->asic_ops);
-		break;
-	default:
-		WARN(1, "Unexpected ASIC family %u",
-		     dev->device_info->asic_family);
-		goto out_free;
 	}
 
 	if (!dqm->ops.initialize(dqm))
@@ -1668,87 +1159,3 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm)
 	dqm->ops.uninitialize(dqm);
 	kfree(dqm);
 }
-
-int kfd_process_vm_fault(struct device_queue_manager *dqm,
-			 unsigned int pasid)
-{
-	struct kfd_process_device *pdd;
-	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
-	int ret = 0;
-
-	if (!p)
-		return -EINVAL;
-	pdd = kfd_get_process_device_data(dqm->dev, p);
-	if (pdd)
-		ret = process_evict_queues(dqm, &pdd->qpd);
-	kfd_unref_process(p);
-
-	return ret;
-}
-
-static void seq_reg_dump(struct seq_file *m,
-			 uint32_t (*dump)[2], uint32_t n_regs)
-{
-	uint32_t i, count;
-
-	for (i = 0, count = 0; i < n_regs; i++) {
-		if (count == 0 ||
-		    dump[i-1][0] + sizeof(uint32_t) != dump[i][0]) {
-			seq_printf(m, "%s %08x: %08x",
-				   i ? "\n" : "",
-				   dump[i][0], dump[i][1]);
-			count = 7;
-		} else {
-			seq_printf(m, " %08x", dump[i][1]);
-			count--;
-		}
-	}
-
-	seq_puts(m, "\n");
-}
-
-int device_queue_manager_debugfs_hqds(struct seq_file *m, void *data)
-{
-	struct device_queue_manager *dqm = data;
-	uint32_t (*dump)[2], n_regs;
-	int pipe, queue;
-	int r = 0;
-
-	for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) {
-		int pipe_offset = pipe * get_queues_per_pipe(dqm);
-
-		for (queue = 0; queue < get_queues_per_pipe(dqm); queue++) {
-			if (!test_bit(pipe_offset + queue,
-				      dqm->dev->shared_resources.queue_bitmap))
-				continue;
-
-			r = dqm->dev->kfd2kgd->hqd_dump(
-				dqm->dev->kgd, pipe, queue, &dump, &n_regs);
-			if (r)
-				break;
-
-			seq_printf(m, " CP Pipe %d, Queue %d\n",
-				   pipe, queue);
-			seq_reg_dump(m, dump, n_regs);
-
-			kfree(dump);
-		}
-	}
-
-	for (pipe = 0; pipe < get_num_sdma_engines(dqm); pipe++) {
-		for (queue = 0; queue < KFD_SDMA_QUEUES_PER_ENGINE; queue++) {
-			r = dqm->dev->kfd2kgd->hqd_sdma_dump(
-				dqm->dev->kgd, pipe, queue, &dump, &n_regs);
-			if (r)
-				break;
-
-			seq_printf(m, " SDMA Engine %d, RLC %d\n",
-				   pipe, queue);
-			seq_reg_dump(m, dump, n_regs);
-
-			kfree(dump);
-		}
-	}
-
-	return r;
-}
*/ struct device_queue_manager_ops { int (*create_queue)(struct device_queue_manager *dqm, struct queue *q, - struct qcm_process_device *qpd); + struct qcm_process_device *qpd, + int *allocate_vmid); int (*destroy_queue)(struct device_queue_manager *dqm, struct qcm_process_device *qpd, @@ -121,25 +122,12 @@ struct device_queue_manager_ops { enum cache_policy alternate_policy, void __user *alternate_aperture_base, uint64_t alternate_aperture_size); - - int (*set_trap_handler)(struct device_queue_manager *dqm, - struct qcm_process_device *qpd, - uint64_t tba_addr, - uint64_t tma_addr); - - int (*process_termination)(struct device_queue_manager *dqm, - struct qcm_process_device *qpd); - - int (*get_wave_state)(struct device_queue_manager *dqm, - struct queue *q, - void __user *ctl_stack, - u32 *ctl_stack_used_size, - u32 *save_area_used_size); }; struct device_queue_manager_asic_ops { - int (*update_qpd)(struct device_queue_manager *dqm, + int (*register_process)(struct device_queue_manager *dqm, struct qcm_process_device *qpd); + int (*initialize)(struct device_queue_manager *dqm); bool (*set_cache_memory_policy)(struct device_queue_manager *dqm, struct qcm_process_device *qpd, enum cache_policy default_policy, @@ -165,7 +153,7 @@ struct device_queue_manager_asic_ops { struct device_queue_manager { struct device_queue_manager_ops ops; - struct device_queue_manager_asic_ops asic_ops; + struct device_queue_manager_asic_ops ops_asic_specific; struct mqd_manager *mqds[KFD_MQD_TYPE_MAX]; struct packet_manager packets; @@ -186,38 +174,21 @@ struct device_queue_manager { unsigned int *fence_addr; struct kfd_mem_obj *fence_mem; bool active_runlist; - int sched_policy; }; -void device_queue_manager_init_cik( - struct device_queue_manager_asic_ops *asic_ops); -void device_queue_manager_init_cik_hawaii( - struct device_queue_manager_asic_ops *asic_ops); -void device_queue_manager_init_vi( - struct device_queue_manager_asic_ops *asic_ops); -void device_queue_manager_init_vi_tonga( - struct device_queue_manager_asic_ops *asic_ops); -void device_queue_manager_init_v9_vega10( - struct device_queue_manager_asic_ops *asic_ops); +void device_queue_manager_init_cik(struct device_queue_manager_asic_ops *ops); +void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops); void program_sh_mem_settings(struct device_queue_manager *dqm, struct qcm_process_device *qpd); unsigned int get_queues_num(struct device_queue_manager *dqm); unsigned int get_queues_per_pipe(struct device_queue_manager *dqm); unsigned int get_pipes_per_mec(struct device_queue_manager *dqm); -unsigned int get_num_sdma_queues(struct device_queue_manager *dqm); - -int process_evict_queues(struct device_queue_manager *dqm, - struct qcm_process_device *qpd); -int process_restore_queues(struct device_queue_manager *dqm, - struct qcm_process_device *qpd); - static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) { return (pdd->lds_base >> 16) & 0xFF; } -/* This function is only useful for GFXv7 and v8 */ static inline unsigned int get_sh_mem_bases_nybble_64(struct kfd_process_device *pdd) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c index aed4c21..72c3cba 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c @@ -32,30 +32,18 @@ static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, enum cache_policy alternate_policy, void 
__user *alternate_aperture_base, uint64_t alternate_aperture_size); -static int update_qpd_cik(struct device_queue_manager *dqm, - struct qcm_process_device *qpd); -static int update_qpd_cik_hawaii(struct device_queue_manager *dqm, +static int register_process_cik(struct device_queue_manager *dqm, struct qcm_process_device *qpd); +static int initialize_cpsch_cik(struct device_queue_manager *dqm); static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); -static void init_sdma_vm_hawaii(struct device_queue_manager *dqm, - struct queue *q, - struct qcm_process_device *qpd); - -void device_queue_manager_init_cik( - struct device_queue_manager_asic_ops *asic_ops) -{ - asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik; - asic_ops->update_qpd = update_qpd_cik; - asic_ops->init_sdma_vm = init_sdma_vm; -} -void device_queue_manager_init_cik_hawaii( - struct device_queue_manager_asic_ops *asic_ops) +void device_queue_manager_init_cik(struct device_queue_manager_asic_ops *ops) { - asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik; - asic_ops->update_qpd = update_qpd_cik_hawaii; - asic_ops->init_sdma_vm = init_sdma_vm_hawaii; + ops->set_cache_memory_policy = set_cache_memory_policy_cik; + ops->register_process = register_process_cik; + ops->initialize = initialize_cpsch_cik; + ops->init_sdma_vm = init_sdma_vm; } static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) @@ -111,7 +99,7 @@ static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, return true; } -static int update_qpd_cik(struct device_queue_manager *dqm, +static int register_process_cik(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { struct kfd_process_device *pdd; @@ -145,36 +133,6 @@ static int update_qpd_cik(struct device_queue_manager *dqm, return 0; } -static int update_qpd_cik_hawaii(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) -{ - struct kfd_process_device *pdd; - unsigned int temp; - - pdd = qpd_to_pdd(qpd); - - /* check if sh_mem_config register already configured */ - if (qpd->sh_mem_config == 0) { - qpd->sh_mem_config = - ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) | - DEFAULT_MTYPE(MTYPE_NONCACHED) | - APE1_MTYPE(MTYPE_NONCACHED); - qpd->sh_mem_ape1_limit = 0; - qpd->sh_mem_ape1_base = 0; - } - - /* On dGPU we're always in GPUVM64 addressing mode with 64-bit - * aperture addresses. - */ - temp = get_sh_mem_bases_nybble_64(pdd); - qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); - - pr_debug("is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", - qpd->pqm->process->is_32bit_user_mode, temp, qpd->sh_mem_bases); - - return 0; -} - static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd) { @@ -191,15 +149,7 @@ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, q->properties.sdma_vm_addr = value; } -static void init_sdma_vm_hawaii(struct device_queue_manager *dqm, - struct queue *q, - struct qcm_process_device *qpd) +static int initialize_cpsch_cik(struct device_queue_manager *dqm) { - /* On dGPU we're always in GPUVM64 addressing mode with 64-bit - * aperture addresses. 
- */ - q->properties.sdma_vm_addr = - ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) << - SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) & - SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK; + return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c deleted file mode 100644 index 9c6c83a9..0000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright 2016 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - */ - -#include "kfd_device_queue_manager.h" -#include "vega10_enum.h" -#include "gc/gc_9_0_offset.h" -#include "gc/gc_9_0_sh_mask.h" -#include "sdma0/sdma0_4_0_sh_mask.h" - -static int update_qpd_v9(struct device_queue_manager *dqm, - struct qcm_process_device *qpd); -static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, - struct qcm_process_device *qpd); - -void device_queue_manager_init_v9_vega10( - struct device_queue_manager_asic_ops *asic_ops) -{ - asic_ops->update_qpd = update_qpd_v9; - asic_ops->init_sdma_vm = init_sdma_vm_v9; -} - -static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) -{ - uint32_t shared_base = pdd->lds_base >> 48; - uint32_t private_base = pdd->scratch_base >> 48; - - return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) | - private_base; -} - -static int update_qpd_v9(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) -{ - struct kfd_process_device *pdd; - - pdd = qpd_to_pdd(qpd); - - /* check if sh_mem_config register already configured */ - if (qpd->sh_mem_config == 0) { - qpd->sh_mem_config = - SH_MEM_ALIGNMENT_MODE_UNALIGNED << - SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT; - if (vega10_noretry) - qpd->sh_mem_config |= - 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT; - - qpd->sh_mem_ape1_limit = 0; - qpd->sh_mem_ape1_base = 0; - } - - qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd); - - pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases); - - return 0; -} - -static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, - struct qcm_process_device *qpd) -{ - /* Not needed on SDMAv4 any more */ - q->properties.sdma_vm_addr = 0; -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c index 030b014..40e9ddd 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c +++ 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c @@ -33,41 +33,18 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, enum cache_policy alternate_policy, void __user *alternate_aperture_base, uint64_t alternate_aperture_size); -static int update_qpd_vi(struct device_queue_manager *dqm, +static int register_process_vi(struct device_queue_manager *dqm, struct qcm_process_device *qpd); +static int initialize_cpsch_vi(struct device_queue_manager *dqm); static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); -/* - * Tonga device queue manager functions - */ -static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm, - struct qcm_process_device *qpd, - enum cache_policy default_policy, - enum cache_policy alternate_policy, - void __user *alternate_aperture_base, - uint64_t alternate_aperture_size); -static int update_qpd_vi_tonga(struct device_queue_manager *dqm, - struct qcm_process_device *qpd); -static void init_sdma_vm_tonga(struct device_queue_manager *dqm, - struct queue *q, - struct qcm_process_device *qpd); - -void device_queue_manager_init_vi_tonga( - struct device_queue_manager_asic_ops *asic_ops) +void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops) { - asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi_tonga; - asic_ops->update_qpd = update_qpd_vi_tonga; - asic_ops->init_sdma_vm = init_sdma_vm_tonga; -} - - -void device_queue_manager_init_vi( - struct device_queue_manager_asic_ops *asic_ops) -{ - asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi; - asic_ops->update_qpd = update_qpd_vi; - asic_ops->init_sdma_vm = init_sdma_vm; + ops->set_cache_memory_policy = set_cache_memory_policy_vi; + ops->register_process = register_process_vi; + ops->initialize = initialize_cpsch_vi; + ops->init_sdma_vm = init_sdma_vm; } static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) @@ -127,34 +104,7 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, return true; } -static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm, - struct qcm_process_device *qpd, - enum cache_policy default_policy, - enum cache_policy alternate_policy, - void __user *alternate_aperture_base, - uint64_t alternate_aperture_size) -{ - uint32_t default_mtype; - uint32_t ape1_mtype; - - default_mtype = (default_policy == cache_policy_coherent) ? - MTYPE_UC : - MTYPE_NC; - - ape1_mtype = (alternate_policy == cache_policy_coherent) ? 
- MTYPE_UC : - MTYPE_NC; - - qpd->sh_mem_config = - SH_MEM_ALIGNMENT_MODE_UNALIGNED << - SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | - default_mtype << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | - ape1_mtype << SH_MEM_CONFIG__APE1_MTYPE__SHIFT; - - return true; -} - -static int update_qpd_vi(struct device_queue_manager *dqm, +static int register_process_vi(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { struct kfd_process_device *pdd; @@ -195,40 +145,6 @@ static int update_qpd_vi(struct device_queue_manager *dqm, return 0; } -static int update_qpd_vi_tonga(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) -{ - struct kfd_process_device *pdd; - unsigned int temp; - - pdd = qpd_to_pdd(qpd); - - /* check if sh_mem_config register already configured */ - if (qpd->sh_mem_config == 0) { - qpd->sh_mem_config = - SH_MEM_ALIGNMENT_MODE_UNALIGNED << - SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | - MTYPE_UC << - SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | - MTYPE_UC << - SH_MEM_CONFIG__APE1_MTYPE__SHIFT; - - qpd->sh_mem_ape1_limit = 0; - qpd->sh_mem_ape1_base = 0; - } - - /* On dGPU we're always in GPUVM64 addressing mode with 64-bit - * aperture addresses. - */ - temp = get_sh_mem_bases_nybble_64(pdd); - qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); - - pr_debug("sh_mem_bases nybble: 0x%X and register 0x%X\n", - temp, qpd->sh_mem_bases); - - return 0; -} - static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd) { @@ -245,15 +161,7 @@ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, q->properties.sdma_vm_addr = value; } -static void init_sdma_vm_tonga(struct device_queue_manager *dqm, - struct queue *q, - struct qcm_process_device *qpd) +static int initialize_cpsch_vi(struct device_queue_manager *dqm) { - /* On dGPU we're always in GPUVM64 addressing mode with 64-bit - * aperture addresses. - */ - q->properties.sdma_vm_addr = - ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) << - SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) & - SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK; + return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c index fc41689..acf4d2a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c @@ -24,15 +24,17 @@ #include #include #include -#include /* - * This extension supports a kernel level doorbells management for the - * kernel queues using the first doorbell page reserved for the kernel. + * This extension supports a kernel level doorbells management for + * the kernel queues. + * Basically the last doorbells page is devoted to kernel queues + * and that assures that any user process won't get access to the + * kernel doorbells page */ -static DEFINE_IDA(doorbell_ida); -static unsigned int max_doorbell_slices; +#define KERNEL_DOORBELL_PASID 1 +#define KFD_SIZE_OF_DOORBELL_IN_BYTES 4 /* * Each device exposes a doorbell aperture, a PCI MMIO aperture that @@ -49,9 +51,9 @@ static unsigned int max_doorbell_slices; */ /* # of doorbell bytes allocated for each process.
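 * E.g. with 4-byte doorbells and the usual KFD_MAX_NUM_OF_QUEUES_PER_PROCESS * limit of 1024, this is 4 bytes * 1024 = 4 KiB, which roundup() keeps at * exactly one 4K page.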
*/ -size_t kfd_doorbell_process_slice(struct kfd_dev *kfd) +static inline size_t doorbell_process_allocation(void) { - return roundup(kfd->device_info->doorbell_size * + return roundup(KFD_SIZE_OF_DOORBELL_IN_BYTES * KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, PAGE_SIZE); } @@ -71,30 +73,27 @@ int kfd_doorbell_init(struct kfd_dev *kfd) doorbell_start_offset = roundup(kfd->shared_resources.doorbell_start_offset, - kfd_doorbell_process_slice(kfd)); + doorbell_process_allocation()); doorbell_aperture_size = rounddown(kfd->shared_resources.doorbell_aperture_size, - kfd_doorbell_process_slice(kfd)); + doorbell_process_allocation()); if (doorbell_aperture_size > doorbell_start_offset) doorbell_process_limit = (doorbell_aperture_size - doorbell_start_offset) / - kfd_doorbell_process_slice(kfd); + doorbell_process_allocation(); else - return -ENOSPC; - - if (!max_doorbell_slices || - doorbell_process_limit < max_doorbell_slices) - max_doorbell_slices = doorbell_process_limit; + doorbell_process_limit = 0; kfd->doorbell_base = kfd->shared_resources.doorbell_physical_address + doorbell_start_offset; kfd->doorbell_id_offset = doorbell_start_offset / sizeof(u32); + kfd->doorbell_process_limit = doorbell_process_limit - 1; kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base, - kfd_doorbell_process_slice(kfd)); + doorbell_process_allocation()); if (!kfd->doorbell_kernel_ptr) return -ENOMEM; @@ -115,7 +114,8 @@ int kfd_doorbell_init(struct kfd_dev *kfd) pr_debug("doorbell aperture size == 0x%08lX\n", kfd->shared_resources.doorbell_aperture_size); - pr_debug("doorbell kernel address == 0x%p\n", kfd->doorbell_kernel_ptr); + pr_debug("doorbell kernel address == 0x%08lX\n", + (uintptr_t)kfd->doorbell_kernel_ptr); return 0; } @@ -126,16 +126,21 @@ void kfd_doorbell_fini(struct kfd_dev *kfd) iounmap(kfd->doorbell_kernel_ptr); } -int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, - struct vm_area_struct *vma) +int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma) { phys_addr_t address; + struct kfd_dev *dev; /* * For simplicity we only allow mapping of the entire doorbell * allocation of a single device & process. */ - if (vma->vm_end - vma->vm_start != kfd_doorbell_process_slice(dev)) + if (vma->vm_end - vma->vm_start != doorbell_process_allocation()) + return -EINVAL; + + /* Find kfd device according to gpu id */ + dev = kfd_device_by_id(vma->vm_pgoff); + if (!dev) return -EINVAL; /* Calculate physical address of doorbell */ @@ -152,19 +157,19 @@ int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, " vm_flags == 0x%04lX\n" " size == 0x%04lX\n", (unsigned long long) vma->vm_start, address, vma->vm_flags, - kfd_doorbell_process_slice(dev)); + doorbell_process_allocation()); return io_remap_pfn_range(vma, vma->vm_start, address >> PAGE_SHIFT, - kfd_doorbell_process_slice(dev), + doorbell_process_allocation(), vma->vm_page_prot); } /* get kernel iomem pointer for a doorbell */ -void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, +u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, unsigned int *doorbell_off) { u32 inx; @@ -179,18 +184,17 @@ void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) return NULL; - inx *= kfd->device_info->doorbell_size / sizeof(u32); - /* - * Calculating the kernel doorbell offset using the first - * doorbell page.
+ * Calculating the kernel doorbell offset using "faked" kernel + * pasid that is allocated for kernel queues only */ - *doorbell_off = kfd->doorbell_id_offset + inx; + *doorbell_off = KERNEL_DOORBELL_PASID * (doorbell_process_allocation() / + sizeof(u32)) + inx; pr_debug("Get kernel queue doorbell\n" " doorbell offset == 0x%08X\n" - " kernel address == 0x%p\n", - *doorbell_off, (kfd->doorbell_kernel_ptr + inx)); + " kernel address == 0x%08lX\n", + *doorbell_off, (uintptr_t)(kfd->doorbell_kernel_ptr + inx)); return kfd->doorbell_kernel_ptr + inx; } @@ -206,7 +210,7 @@ void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr) mutex_unlock(&kfd->doorbell_mutex); } -void write_kernel_doorbell(void __iomem *db, u32 value) +inline void write_kernel_doorbell(u32 __iomem *db, u32 value) { if (db) { writel(value, db); @@ -214,40 +218,29 @@ void write_kernel_doorbell(void __iomem *db, u32 value) } } -void write_kernel_doorbell64(void __iomem *db, u64 value) -{ - if (db) { - WARN(((unsigned long)db & 7) != 0, - "Unaligned 64-bit doorbell"); - writeq(value, (u64 __iomem *)db); - pr_debug("writing %llu to doorbell address 0x%p\n", value, db); - } -} - /* * queue_ids are in the range [0,MAX_PROCESS_QUEUES) and are mapped 1:1 * to doorbells with the process's doorbell page */ -unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd, +unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, struct kfd_process *process, - unsigned int doorbell_id) + unsigned int queue_id) { /* * doorbell_id_offset accounts for doorbells taken by KGD. - * index * kfd_doorbell_process_slice/sizeof(u32) adjusts to - * the process's doorbells. The offset returned is in dword - * units regardless of the ASIC-dependent doorbell size. + * pasid * doorbell_process_allocation/sizeof(u32) adjusts + * to the process's doorbells */ return kfd->doorbell_id_offset + - process->doorbell_index * (kfd_doorbell_process_slice(kfd)/sizeof(u32)) + - doorbell_id * kfd->device_info->doorbell_size / sizeof(u32); + process->pasid * (doorbell_process_allocation()/sizeof(u32)) + + queue_id; } uint64_t kfd_get_number_elems(struct kfd_dev *kfd) { uint64_t num_of_elems = (kfd->shared_resources.doorbell_aperture_size - kfd->shared_resources.doorbell_start_offset) / - kfd_doorbell_process_slice(kfd) + 1; + doorbell_process_allocation() + 1; return num_of_elems; @@ -257,21 +250,5 @@ phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, struct kfd_process *process) { return dev->doorbell_base + - process->doorbell_index * kfd_doorbell_process_slice(dev); -} - -int kfd_alloc_process_doorbells(struct kfd_process *process) -{ - int r = ida_simple_get(&doorbell_ida, 1, max_doorbell_slices, - GFP_KERNEL); - if (r > 0) - process->doorbell_index = r; - - return r; -} - -void kfd_free_process_doorbells(struct kfd_process *process) -{ - if (process->doorbell_index) - ida_simple_remove(&doorbell_ida, process->doorbell_index); + process->pasid * doorbell_process_allocation(); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index ee3c288..5979158 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -23,9 +23,9 @@ #include #include #include -#include #include #include +#include #include #include #include "kfd_priv.h" @@ -33,105 +33,185 @@ #include /* - * Wrapper around wait_queue_entry_t + * A task can only be on a single wait_queue at a time, but we need to support + * waiting on multiple events (any/all).
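+ * (The any-vs-all choice is the 'all' argument taken by kfd_wait_on_events() + * below.)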
+ * Instead of each event simply having a wait_queue with sleeping tasks, it + * has a singly-linked list of tasks. + * A thread that wants to sleep creates an array of these, one for each event + * and adds one to each event's waiter chain. */ struct kfd_event_waiter { - wait_queue_entry_t wait; - struct kfd_event *event; /* Event to wait for */ - bool activated; /* Becomes true when event is signaled */ + struct list_head waiters; + struct task_struct *sleeping_task; + + /* Transitions to true when the event this belongs to is signaled. */ + bool activated; + + /* Event */ + struct kfd_event *event; + uint32_t input_index; }; /* + * Over-complicated pooled allocator for event notification slots. + * * Each signal event needs a 64-bit signal slot where the signaler will write - * a 1 before sending an interrupt. (This is needed because some interrupts + * a 1 before sending an interrupt. (This is needed because some interrupts * do not contain enough spare data bits to identify an event.) - * We get whole pages and map them to the process VA. - * Individual signal events use their event_id as slot index. + * We get whole pages from vmalloc and map them to the process VA. + * Individual signal events are then allocated a slot in a page. */ -struct kfd_signal_page { + +struct signal_page { + struct list_head event_pages; /* kfd_process.signal_event_pages */ uint64_t *kernel_address; - uint64_t handle; uint64_t __user *user_address; + uint32_t page_index; /* Index into the mmap aperture. */ + unsigned int free_slots; + unsigned long used_slot_bitmap[0]; }; #define SLOTS_PER_PAGE KFD_SIGNAL_EVENT_LIMIT #define SLOT_BITMAP_SIZE BITS_TO_LONGS(SLOTS_PER_PAGE) #define BITS_PER_PAGE (ilog2(SLOTS_PER_PAGE)+1) #define SIGNAL_PAGE_SIZE (sizeof(struct signal_page) + \ + SLOT_BITMAP_SIZE * sizeof(long)) -static uint64_t *page_slots(struct kfd_signal_page *page) +/* + * For signal events, the event ID is used as the interrupt user data. + * For SQ s_sendmsg interrupts, this is limited to 8 bits.
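+ * For example, an interrupt for event 0x1234 may arrive carrying only the + * low byte 0x34, so kfd_signal_event_interrupt() below has to fall back to + * scanning every allocated signal slot when the ID it receives is truncated.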
+ */ + +#define INTERRUPT_DATA_BITS 8 +#define SIGNAL_EVENT_ID_SLOT_SHIFT 0 + +static uint64_t *page_slots(struct signal_page *page) { return page->kernel_address; } -static struct kfd_signal_page *allocate_signal_page(struct kfd_process *p) +static bool allocate_free_slot(struct kfd_process *process, + struct signal_page **out_page, + unsigned int *out_slot_index) +{ + struct signal_page *page; + + list_for_each_entry(page, &process->signal_event_pages, event_pages) { + if (page->free_slots > 0) { + unsigned int slot = + find_first_zero_bit(page->used_slot_bitmap, + SLOTS_PER_PAGE); + + __set_bit(slot, page->used_slot_bitmap); + page->free_slots--; + + page_slots(page)[slot] = UNSIGNALED_EVENT_SLOT; + + *out_page = page; + *out_slot_index = slot; + + pr_debug("Allocated event signal slot in page %p, slot %d\n", + page, slot); + + return true; + } + } + + pr_debug("No free event signal slots were found for process %p\n", + process); + + return false; +} + +#define list_tail_entry(head, type, member) \ + list_entry((head)->prev, type, member) + +static bool allocate_signal_page(struct file *devkfd, struct kfd_process *p) { void *backing_store; - struct kfd_signal_page *page; + struct signal_page *page; - page = kzalloc(sizeof(*page), GFP_KERNEL); + page = kzalloc(SIGNAL_PAGE_SIZE, GFP_KERNEL); if (!page) - return NULL; + goto fail_alloc_signal_page; + + page->free_slots = SLOTS_PER_PAGE; - backing_store = (void *) __get_free_pages(GFP_KERNEL, + backing_store = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); if (!backing_store) goto fail_alloc_signal_store; - /* Initialize all events to unsignaled */ + /* prevent user-mode info leaks */ memset(backing_store, (uint8_t) UNSIGNALED_EVENT_SLOT, - KFD_SIGNAL_EVENT_LIMIT * 8); + KFD_SIGNAL_EVENT_LIMIT * 8); page->kernel_address = backing_store; + + if (list_empty(&p->signal_event_pages)) + page->page_index = 0; + else + page->page_index = list_tail_entry(&p->signal_event_pages, + struct signal_page, + event_pages)->page_index + 1; + pr_debug("Allocated new event signal page at %p, for process %p\n", page, p); + pr_debug("Page index is %d\n", page->page_index); + + list_add(&page->event_pages, &p->signal_event_pages); - return page; + return true; fail_alloc_signal_store: kfree(page); - return NULL; +fail_alloc_signal_page: + return false; } -static int allocate_event_notification_slot(struct kfd_process *p, - struct kfd_event *ev) +static bool allocate_event_notification_slot(struct file *devkfd, + struct kfd_process *p, + struct signal_page **page, + unsigned int *signal_slot_index) { - int id; + bool ret; - if (!p->signal_page) { - p->signal_page = allocate_signal_page(p); - if (!p->signal_page) - return -ENOMEM; + ret = allocate_free_slot(p, page, signal_slot_index); + if (!ret) { + ret = allocate_signal_page(devkfd, p); + if (ret) + ret = allocate_free_slot(p, page, signal_slot_index); } - id = idr_alloc(&p->event_idr, ev, 0, KFD_SIGNAL_EVENT_LIMIT, - GFP_KERNEL); - if (id < 0) - return id; - - ev->event_id = id; - page_slots(p->signal_page)[id] = UNSIGNALED_EVENT_SLOT; - - return 0; + return ret; } -static struct kfd_signal_page *allocate_signal_page_dgpu( - struct kfd_process *p, uint64_t *kernel_address, uint64_t handle) +/* Assumes that the process's event_mutex is locked. 
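+ * It serializes the used_slot_bitmap and free_slots updates done here.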
*/ +static void release_event_notification_slot(struct signal_page *page, + size_t slot_index) { - struct kfd_signal_page *my_page; + __clear_bit(slot_index, page->used_slot_bitmap); + page->free_slots++; - my_page = kzalloc(sizeof(*my_page), GFP_KERNEL); - if (!my_page) - return NULL; - - /* Initialize all events to unsignaled */ - memset(kernel_address, (uint8_t) UNSIGNALED_EVENT_SLOT, - KFD_SIGNAL_EVENT_LIMIT * 8); + /* We don't free signal pages, they are retained by the process + * and reused until it exits. + */ +} - my_page->kernel_address = kernel_address; - my_page->handle = handle; - my_page->user_address = NULL; +static struct signal_page *lookup_signal_page_by_index(struct kfd_process *p, + unsigned int page_index) +{ + struct signal_page *page; - pr_debug("Allocated new event signal page at %p, for process %p\n", - my_page, p); + /* + * This is safe because we don't delete signal pages until the + * process exits. + */ + list_for_each_entry(page, &p->signal_event_pages, event_pages) + if (page->page_index == page_index) + return page; - return my_page; + return NULL; } /* @@ -140,80 +220,96 @@ static struct kfd_signal_page *allocate_signal_page_dgpu( */ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id) { - return idr_find(&p->event_idr, id); + struct kfd_event *ev; + + hash_for_each_possible(p->events, ev, events, id) + if (ev->event_id == id) + return ev; + + return NULL; } -/** - * lookup_signaled_event_by_partial_id - Lookup signaled event from partial ID - * @p: Pointer to struct kfd_process - * @id: ID to look up - * @bits: Number of valid bits in @id - * - * Finds the first signaled event with a matching partial ID. If no - * matching signaled event is found, returns NULL. In that case the - * caller should assume that the partial ID is invalid and do an - * exhaustive search of all siglaned events. - * - * If multiple events with the same partial ID signal at the same - * time, they will be found one interrupt at a time, not necessarily - * in the same order the interrupts occurred. As long as the number of - * interrupts is correct, all signaled events will be seen by the - * driver. +static u32 make_signal_event_id(struct signal_page *page, + unsigned int signal_slot_index) +{ + return page->page_index | + (signal_slot_index << SIGNAL_EVENT_ID_SLOT_SHIFT); +} + +/* + * Produce a kfd event id for a nonsignal event. + * These are arbitrary numbers, so we do a sequential search through + * the hash table for an unused number. */ -static struct kfd_event *lookup_signaled_event_by_partial_id( - struct kfd_process *p, uint32_t id, uint32_t bits) +static u32 make_nonsignal_event_id(struct kfd_process *p) { - struct kfd_event *ev; + u32 id; - if (!p->signal_page || id >= KFD_SIGNAL_EVENT_LIMIT) - return NULL; + for (id = p->next_nonsignal_event_id; + id < KFD_LAST_NONSIGNAL_EVENT_ID && + lookup_event_by_id(p, id); + id++) + ; - /* Fast path for the common case that @id is not a partial ID - * and we only need a single lookup. - */ - if (bits > 31 || (1U << bits) >= KFD_SIGNAL_EVENT_LIMIT) { - if (page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT) - return NULL; + if (id < KFD_LAST_NONSIGNAL_EVENT_ID) { + + /* + * What if id == LAST_NONSIGNAL_EVENT_ID - 1? + * Then next_nonsignal_event_id = LAST_NONSIGNAL_EVENT_ID so + * the first loop fails immediately and we proceed with the + * wraparound loop below. 
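+ * (With KFD_LAST_NONSIGNAL_EVENT_ID defined as UINT_MAX in kfd_events.h the + * case is mostly theoretical, but the wraparound keeps allocation correct.)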
+ */ + p->next_nonsignal_event_id = id + 1; - return idr_find(&p->event_idr, id); + return id; } - /* General case for partial IDs: Iterate over all matching IDs - * and find the first one that has signaled. - */ - for (ev = NULL; id < KFD_SIGNAL_EVENT_LIMIT && !ev; id += 1U << bits) { - if (page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT) - continue; + for (id = KFD_FIRST_NONSIGNAL_EVENT_ID; + id < KFD_LAST_NONSIGNAL_EVENT_ID && + lookup_event_by_id(p, id); + id++) + ; + - ev = idr_find(&p->event_idr, id); + if (id < KFD_LAST_NONSIGNAL_EVENT_ID) { + p->next_nonsignal_event_id = id + 1; + return id; } - return ev; + p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID; + return 0; +} + +static struct kfd_event *lookup_event_by_page_slot(struct kfd_process *p, + struct signal_page *page, + unsigned int signal_slot) +{ + return lookup_event_by_id(p, make_signal_event_id(page, signal_slot)); } static int create_signal_event(struct file *devkfd, struct kfd_process *p, struct kfd_event *ev) { - int ret; - if (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT) { - if (!p->signal_event_limit_reached) { - pr_warn("Signal event wasn't created because limit was reached\n"); - p->signal_event_limit_reached = true; - } - return -ENOSPC; + pr_warn("Signal event wasn't created because limit was reached\n"); + return -ENOMEM; } - ret = allocate_event_notification_slot(p, ev); - if (ret) { + if (!allocate_event_notification_slot(devkfd, p, &ev->signal_page, + &ev->signal_slot_index)) { pr_warn("Signal event wasn't created because out of kernel memory\n"); - return ret; + return -ENOMEM; } p->signal_event_count++; - ev->user_signal_address = &p->signal_page->user_address[ev->event_id]; + ev->user_signal_address = + &ev->signal_page->user_address[ev->signal_slot_index]; + + ev->event_id = make_signal_event_id(ev->signal_page, + ev->signal_slot_index); + pr_debug("Signal event number %zu created with id %d, address %p\n", p->signal_event_count, ev->event_id, ev->user_signal_address); @@ -221,20 +317,16 @@ static int create_signal_event(struct file *devkfd, return 0; } +/* + * No non-signal events are supported yet. + * We create them as events that never signal. + * Set event calls from user-mode are failed. + */ static int create_other_event(struct kfd_process *p, struct kfd_event *ev) { - /* Cast KFD_LAST_NONSIGNAL_EVENT to uint32_t. This allows an - * intentional integer overflow to -1 without a compiler - * warning. idr_alloc treats a negative value as "maximum - * signed integer". - */ - int id = idr_alloc(&p->event_idr, ev, KFD_FIRST_NONSIGNAL_EVENT_ID, - (uint32_t)KFD_LAST_NONSIGNAL_EVENT_ID + 1, - GFP_KERNEL); - - if (id < 0) - return id; - ev->event_id = id; + ev->event_id = make_nonsignal_event_id(p); + if (ev->event_id == 0) + return -ENOMEM; return 0; } @@ -242,50 +334,52 @@ static int create_other_event(struct kfd_process *p, struct kfd_event *ev) void kfd_event_init_process(struct kfd_process *p) { mutex_init(&p->event_mutex); - idr_init(&p->event_idr); - p->signal_page = NULL; + hash_init(p->events); + INIT_LIST_HEAD(&p->signal_event_pages); + p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID; p->signal_event_count = 0; } static void destroy_event(struct kfd_process *p, struct kfd_event *ev) { - struct kfd_event_waiter *waiter; - - /* Wake up pending waiters. 
They will return failure */ - list_for_each_entry(waiter, &ev->wq.head, wait.entry) - waiter->event = NULL; - wake_up_all(&ev->wq); - - if (ev->type == KFD_EVENT_TYPE_SIGNAL || - ev->type == KFD_EVENT_TYPE_DEBUG) + if (ev->signal_page) { + release_event_notification_slot(ev->signal_page, + ev->signal_slot_index); p->signal_event_count--; + } + + /* + * Abandon the list of waiters. Individual waiting threads will + * clean up their own data. + */ + list_del(&ev->waiters); - idr_remove(&p->event_idr, ev->event_id); + hash_del(&ev->events); kfree(ev); } static void destroy_events(struct kfd_process *p) { struct kfd_event *ev; - uint32_t id; + struct hlist_node *tmp; + unsigned int hash_bkt; - idr_for_each_entry(&p->event_idr, ev, id) + hash_for_each_safe(p->events, hash_bkt, tmp, ev, events) destroy_event(p, ev); - idr_destroy(&p->event_idr); } /* * We assume that the process is being destroyed and there is no need to * unmap the pages or keep bookkeeping data in order. */ -static void shutdown_signal_page(struct kfd_process *p) +static void shutdown_signal_pages(struct kfd_process *p) { - struct kfd_signal_page *page = p->signal_page; + struct signal_page *page, *tmp; - if (page) { - if (page->user_address) - free_pages((unsigned long)page->kernel_address, - get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); + list_for_each_entry_safe(page, tmp, &p->signal_event_pages, + event_pages) { + free_pages((unsigned long)page->kernel_address, + get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); kfree(page); } } @@ -293,7 +387,7 @@ static void shutdown_signal_page(struct kfd_process *p) void kfd_event_free_process(struct kfd_process *p) { destroy_events(p); - shutdown_signal_page(p); + shutdown_signal_pages(p); } static bool event_can_be_gpu_signaled(const struct kfd_event *ev) @@ -310,8 +404,7 @@ static bool event_can_be_cpu_signaled(const struct kfd_event *ev) int kfd_event_create(struct file *devkfd, struct kfd_process *p, uint32_t event_type, bool auto_reset, uint32_t node_id, uint32_t *event_id, uint32_t *event_trigger_data, - uint64_t *event_page_offset, uint32_t *event_slot_index, - void *kern_addr) + uint64_t *event_page_offset, uint32_t *event_slot_index) { int ret = 0; struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL); @@ -323,29 +416,21 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, ev->auto_reset = auto_reset; ev->signaled = false; - init_waitqueue_head(&ev->wq); - - mutex_lock(&p->event_mutex); - - if (kern_addr && !p->signal_page) { - p->signal_page = allocate_signal_page_dgpu(p, kern_addr, - *event_page_offset); - if (!p->signal_page) { - ret = -ENOMEM; - goto out; - } - } + INIT_LIST_HEAD(&ev->waiters); *event_page_offset = 0; + mutex_lock(&p->event_mutex); + switch (event_type) { case KFD_EVENT_TYPE_SIGNAL: case KFD_EVENT_TYPE_DEBUG: ret = create_signal_event(devkfd, p, ev); if (!ret) { - *event_page_offset = KFD_MMAP_TYPE_EVENTS; + *event_page_offset = (ev->signal_page->page_index | + KFD_MMAP_EVENTS_MASK); *event_page_offset <<= PAGE_SHIFT; - *event_slot_index = ev->event_id; + *event_slot_index = ev->signal_slot_index; } break; default: @@ -354,13 +439,14 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, } if (!ret) { + hash_add(p->events, &ev->events, ev->event_id); + *event_id = ev->event_id; *event_trigger_data = ev->event_id; } else { kfree(ev); } -out: mutex_unlock(&p->event_mutex); return ret; @@ -388,14 +474,19 @@ int kfd_event_destroy(struct kfd_process *p, uint32_t event_id) static void set_event(struct kfd_event *ev) { struct kfd_event_waiter 
*waiter; + struct kfd_event_waiter *next; /* Auto reset if the list is non-empty and we're waking someone. */ - ev->signaled = !ev->auto_reset || !waitqueue_active(&ev->wq); + ev->signaled = !ev->auto_reset || list_empty(&ev->waiters); - list_for_each_entry(waiter, &ev->wq.head, wait.entry) + list_for_each_entry_safe(waiter, next, &ev->waiters, waiters) { waiter->activated = true; - wake_up_all(&ev->wq); + /* _init because free_waiters will call list_del */ + list_del_init(&waiter->waiters); + + wake_up_process(waiter->sleeping_task); + } } /* Assumes that p is current. */ @@ -444,7 +535,13 @@ int kfd_reset_event(struct kfd_process *p, uint32_t event_id) static void acknowledge_signal(struct kfd_process *p, struct kfd_event *ev) { - page_slots(p->signal_page)[ev->event_id] = UNSIGNALED_EVENT_SLOT; + page_slots(ev->signal_page)[ev->signal_slot_index] = + UNSIGNALED_EVENT_SLOT; +} + +static bool is_slot_signaled(struct signal_page *page, unsigned int index) +{ + return page_slots(page)[index] != UNSIGNALED_EVENT_SLOT; } static void set_event_from_interrupt(struct kfd_process *p, @@ -459,12 +556,12 @@ static void set_event_from_interrupt(struct kfd_process *p, void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, uint32_t valid_id_bits) { - struct kfd_event *ev = NULL; + struct kfd_event *ev; /* * Because we are called from arbitrary context (workqueue) as opposed * to process context, kfd_process could attempt to exit while we are - * running so the lookup function increments the process ref count. + * running so the lookup function returns a locked process. */ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); @@ -473,50 +570,30 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, mutex_lock(&p->event_mutex); - if (valid_id_bits) - ev = lookup_signaled_event_by_partial_id(p, partial_id, - valid_id_bits); - if (ev) { + if (valid_id_bits >= INTERRUPT_DATA_BITS) { + /* Partial ID is a full ID. */ + ev = lookup_event_by_id(p, partial_id); set_event_from_interrupt(p, ev); - } else if (p->signal_page) { + } else { /* - * Partial ID lookup failed. Assume that the event ID - * in the interrupt payload was invalid and do an - * exhaustive search of signaled events. + * Partial ID is in fact partial. For now we completely + * ignore it, but we could use any bits we did receive to + * search faster. */ - uint64_t *slots = page_slots(p->signal_page); - uint32_t id; - - if (valid_id_bits) - pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n", - partial_id, valid_id_bits); - - if (p->signal_event_count < KFD_SIGNAL_EVENT_LIMIT/64) { - /* With relatively few events, it's faster to - * iterate over the event IDR - */ - idr_for_each_entry(&p->event_idr, ev, id) { - if (id >= KFD_SIGNAL_EVENT_LIMIT) - break; - - if (slots[id] != UNSIGNALED_EVENT_SLOT) - set_event_from_interrupt(p, ev); - } - } else { - /* With relatively many events, it's faster to - * iterate over the signal slots and lookup - * only signaled events from the IDR. 
- */ - for (id = 0; id < KFD_SIGNAL_EVENT_LIMIT; id++) - if (slots[id] != UNSIGNALED_EVENT_SLOT) { - ev = lookup_event_by_id(p, id); + struct signal_page *page; + unsigned int i; + + list_for_each_entry(page, &p->signal_event_pages, event_pages) + for (i = 0; i < SLOTS_PER_PAGE; i++) + if (is_slot_signaled(page, i)) { + ev = lookup_event_by_page_slot(p, + page, i); set_event_from_interrupt(p, ev); } - } } mutex_unlock(&p->event_mutex); - kfd_unref_process(p); + mutex_unlock(&p->mutex); } static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) @@ -529,16 +606,18 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) GFP_KERNEL); for (i = 0; (event_waiters) && (i < num_events) ; i++) { - init_wait(&event_waiters[i].wait); + INIT_LIST_HEAD(&event_waiters[i].waiters); + event_waiters[i].sleeping_task = current; event_waiters[i].activated = false; } return event_waiters; } -static int init_event_waiter_get_status(struct kfd_process *p, +static int init_event_waiter(struct kfd_process *p, struct kfd_event_waiter *waiter, - uint32_t event_id) + uint32_t event_id, + uint32_t input_index) { struct kfd_event *ev = lookup_event_by_id(p, event_id); @@ -546,60 +625,38 @@ static int init_event_waiter_get_status(struct kfd_process *p, return -EINVAL; waiter->event = ev; + waiter->input_index = input_index; waiter->activated = ev->signaled; ev->signaled = ev->signaled && !ev->auto_reset; - return 0; -} + list_add(&waiter->waiters, &ev->waiters); -static void init_event_waiter_add_to_waitlist(struct kfd_event_waiter *waiter) -{ - struct kfd_event *ev = waiter->event; - - /* Only add to the wait list if we actually need to - * wait on this event. - */ - if (!waiter->activated) - add_wait_queue(&ev->wq, &waiter->wait); + return 0; } -/* test_event_condition - Test condition of events being waited for - * @all: Return completion only if all events have signaled - * @num_events: Number of events to wait for - * @event_waiters: Array of event waiters, one per event - * - * Returns KFD_IOC_WAIT_RESULT_COMPLETE if all (or one) event(s) have - * signaled. Returns KFD_IOC_WAIT_RESULT_TIMEOUT if no (or not all) - * events have signaled. Returns KFD_IOC_WAIT_RESULT_FAIL if any of - * the events have been destroyed. - */ -static uint32_t test_event_condition(bool all, uint32_t num_events, +static bool test_event_condition(bool all, uint32_t num_events, struct kfd_event_waiter *event_waiters) { uint32_t i; uint32_t activated_count = 0; for (i = 0; i < num_events; i++) { - if (!event_waiters[i].event) - return KFD_IOC_WAIT_RESULT_FAIL; - if (event_waiters[i].activated) { if (!all) - return KFD_IOC_WAIT_RESULT_COMPLETE; + return true; activated_count++; } } - return activated_count == num_events ? - KFD_IOC_WAIT_RESULT_COMPLETE : KFD_IOC_WAIT_RESULT_TIMEOUT; + return activated_count == num_events; } /* * Copy event specific data, if defined. 
* Currently only memory exception events have additional data to copy to user */ -static int copy_signaled_event_data(uint32_t num_events, +static bool copy_signaled_event_data(uint32_t num_events, struct kfd_event_waiter *event_waiters, struct kfd_event_data __user *data) { @@ -613,15 +670,15 @@ static int copy_signaled_event_data(uint32_t num_events, waiter = &event_waiters[i]; event = waiter->event; if (waiter->activated && event->type == KFD_EVENT_TYPE_MEMORY) { - dst = &data[i].memory_exception_data; + dst = &data[waiter->input_index].memory_exception_data; src = &event->memory_exception_data; if (copy_to_user(dst, src, sizeof(struct kfd_hsa_memory_exception_data))) - return -EFAULT; + return false; } } - return 0; + return true; } @@ -650,9 +707,7 @@ static void free_waiters(uint32_t num_events, struct kfd_event_waiter *waiters) uint32_t i; for (i = 0; i < num_events; i++) - if (waiters[i].event) - remove_wait_queue(&waiters[i].event->wq, - &waiters[i].wait); + list_del(&waiters[i].waiters); kfree(waiters); } @@ -660,56 +715,38 @@ static void free_waiters(uint32_t num_events, struct kfd_event_waiter *waiters) int kfd_wait_on_events(struct kfd_process *p, uint32_t num_events, void __user *data, bool all, uint32_t user_timeout_ms, - uint32_t *wait_result) + enum kfd_event_wait_result *wait_result) { struct kfd_event_data __user *events = (struct kfd_event_data __user *) data; uint32_t i; int ret = 0; - struct kfd_event_waiter *event_waiters = NULL; long timeout = user_timeout_to_jiffies(user_timeout_ms); + mutex_lock(&p->event_mutex); + event_waiters = alloc_event_waiters(num_events); if (!event_waiters) { ret = -ENOMEM; - goto out; + goto fail; } - mutex_lock(&p->event_mutex); - for (i = 0; i < num_events; i++) { struct kfd_event_data event_data; if (copy_from_user(&event_data, &events[i], sizeof(struct kfd_event_data))) { ret = -EFAULT; - goto out_unlock; + goto fail; } - ret = init_event_waiter_get_status(p, &event_waiters[i], - event_data.event_id); + ret = init_event_waiter(p, &event_waiters[i], + event_data.event_id, i); if (ret) - goto out_unlock; - } - - /* Check condition once. */ - *wait_result = test_event_condition(all, num_events, event_waiters); - if (*wait_result == KFD_IOC_WAIT_RESULT_COMPLETE) { - ret = copy_signaled_event_data(num_events, - event_waiters, events); - goto out_unlock; - } else if (WARN_ON(*wait_result == KFD_IOC_WAIT_RESULT_FAIL)) { - /* This should not happen. Events shouldn't be - * destroyed while we're holding the event_mutex - */ - goto out_unlock; + goto fail; } - /* Add to wait lists if we need to wait. */ - for (i = 0; i < num_events; i++) - init_event_waiter_add_to_waitlist(&event_waiters[i]); - mutex_unlock(&p->event_mutex); while (true) { @@ -731,45 +768,37 @@ int kfd_wait_on_events(struct kfd_process *p, break; } - /* Set task state to interruptible sleep before - * checking wake-up conditions. A concurrent wake-up - * will put the task back into runnable state. In that - * case schedule_timeout will not put the task to - * sleep and we'll get a chance to re-check the - * updated conditions almost immediately. Otherwise, - * this race condition would lead to a soft hang or a - * very long sleep. 
- */ - set_current_state(TASK_INTERRUPTIBLE); - - *wait_result = test_event_condition(all, num_events, - event_waiters); - if (*wait_result != KFD_IOC_WAIT_RESULT_TIMEOUT) + if (test_event_condition(all, num_events, event_waiters)) { + if (copy_signaled_event_data(num_events, + event_waiters, events)) + *wait_result = KFD_WAIT_COMPLETE; + else + *wait_result = KFD_WAIT_ERROR; break; + } - if (timeout <= 0) + if (timeout <= 0) { + *wait_result = KFD_WAIT_TIMEOUT; break; + } - timeout = schedule_timeout(timeout); + timeout = schedule_timeout_interruptible(timeout); } __set_current_state(TASK_RUNNING); - /* copy_signaled_event_data may sleep. So this has to happen - * after the task state is set back to RUNNING. - */ - if (!ret && *wait_result == KFD_IOC_WAIT_RESULT_COMPLETE) - ret = copy_signaled_event_data(num_events, - event_waiters, events); - mutex_lock(&p->event_mutex); -out_unlock: free_waiters(num_events, event_waiters); mutex_unlock(&p->event_mutex); -out: - if (ret) - *wait_result = KFD_IOC_WAIT_RESULT_FAIL; - else if (*wait_result == KFD_IOC_WAIT_RESULT_FAIL) - ret = -EIO; + + return ret; + +fail: + if (event_waiters) + free_waiters(num_events, event_waiters); + + mutex_unlock(&p->event_mutex); + + *wait_result = KFD_WAIT_ERROR; return ret; } @@ -777,8 +806,9 @@ int kfd_wait_on_events(struct kfd_process *p, int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) { + unsigned int page_index; unsigned long pfn; - struct kfd_signal_page *page; + struct signal_page *page; /* check required size is logical */ if (get_order(KFD_SIGNAL_EVENT_LIMIT * 8) != @@ -787,10 +817,13 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) return -EINVAL; } - page = p->signal_page; + page_index = vma->vm_pgoff; + + page = lookup_signal_page_by_index(p, page_index); if (!page) { /* Probably KFD bug, but mmap is user-accessible. */ - pr_debug("Signal page could not be found\n"); + pr_debug("Signal page could not be found for page_index %u\n", + page_index); return -EINVAL; } @@ -824,13 +857,12 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, { struct kfd_hsa_memory_exception_data *ev_data; struct kfd_event *ev; - uint32_t id; + int bkt; bool send_signal = true; ev_data = (struct kfd_hsa_memory_exception_data *) event_data; - id = KFD_FIRST_NONSIGNAL_EVENT_ID; - idr_for_each_entry_continue(&p->event_idr, ev, id) + hash_for_each(p->events, bkt, ev, events) if (ev->type == type) { send_signal = false; dev_dbg(kfd_device, @@ -841,13 +873,6 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, ev->memory_exception_data = *ev_data; } - if (type == KFD_EVENT_TYPE_MEMORY) { - dev_warn(kfd_device, - "Sending SIGSEGV to HSA Process with PID %d ", - p->lead_thread->pid); - send_sig(SIGSEGV, p->lead_thread, 0); - } - /* Send SIGTERM no event of type "type" has been found*/ if (send_signal) { if (send_sigterm) { @@ -863,7 +888,6 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, } } -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, unsigned long address, bool is_write_requested, bool is_execute_requested) @@ -874,27 +898,17 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, /* * Because we are called from arbitrary context (workqueue) as opposed * to process context, kfd_process could attempt to exit while we are - * running so the lookup function increments the process ref count. 
+ * running so the lookup function returns a locked process. */ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); - struct mm_struct *mm; if (!p) return; /* Presumably process exited. */ - /* Take a safe reference to the mm_struct, which may otherwise - * disappear even while the kfd_process is still referenced. - */ - mm = get_task_mm(p->lead_thread); - if (!mm) { - kfd_unref_process(p); - return; /* Process is exiting */ - } - memset(&memory_exception_data, 0, sizeof(memory_exception_data)); - down_read(&mm->mmap_sem); - vma = find_vma(mm, address); + down_read(&p->mm->mmap_sem); + vma = find_vma(p->mm, address); memory_exception_data.gpu_id = dev->id; memory_exception_data.va = address; @@ -920,8 +934,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, } } - up_read(&mm->mmap_sem); - mmput(mm); + up_read(&p->mm->mmap_sem); mutex_lock(&p->event_mutex); @@ -930,17 +943,15 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, &memory_exception_data); mutex_unlock(&p->event_mutex); - - kfd_unref_process(p); + mutex_unlock(&p->mutex); } -#endif /* CONFIG_AMD_IOMMU_V2_MODULE */ void kfd_signal_hw_exception_event(unsigned int pasid) { /* * Because we are called from arbitrary context (workqueue) as opposed * to process context, kfd_process could attempt to exit while we are - * running so the lookup function increments the process ref count. + * running so the lookup function returns a locked process. */ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); @@ -953,42 +964,5 @@ void kfd_signal_hw_exception_event(unsigned int pasid) lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_HW_EXCEPTION, NULL); mutex_unlock(&p->event_mutex); - kfd_unref_process(p); -} - -void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, - struct kfd_vm_fault_info *info) -{ - struct kfd_event *ev; - uint32_t id; - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); - struct kfd_hsa_memory_exception_data memory_exception_data; - - if (!p) - return; /* Presumably process exited. */ - memset(&memory_exception_data, 0, sizeof(memory_exception_data)); - memory_exception_data.gpu_id = dev->id; - memory_exception_data.failure.imprecise = true; - /* Set failure reason */ - if (info) { - memory_exception_data.va = (info->page_addr) << PAGE_SHIFT; - memory_exception_data.failure.NotPresent = - info->prot_valid ? 1 : 0; - memory_exception_data.failure.NoExecute = - info->prot_exec ? 1 : 0; - memory_exception_data.failure.ReadOnly = - info->prot_write ? 1 : 0; - memory_exception_data.failure.imprecise = 0; - } - mutex_lock(&p->event_mutex); - - id = KFD_FIRST_NONSIGNAL_EVENT_ID; - idr_for_each_entry_continue(&p->event_idr, ev, id) - if (ev->type == KFD_EVENT_TYPE_MEMORY) { - ev->memory_exception_data = memory_exception_data; - set_event(ev); - } - - mutex_unlock(&p->event_mutex); - kfd_unref_process(p); + mutex_unlock(&p->mutex); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_events.h index abca5bf..28f6838 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.h @@ -27,17 +27,12 @@ #include #include #include -#include #include "kfd_priv.h" #include -/* - * IDR supports non-negative integer IDs. Small IDs are used for - * signal events to match their signal slot. Use the upper half of the - * ID space for non-signal events. 
- */ -#define KFD_FIRST_NONSIGNAL_EVENT_ID ((INT_MAX >> 1) + 1) -#define KFD_LAST_NONSIGNAL_EVENT_ID INT_MAX +#define KFD_EVENT_ID_NONSIGNAL_MASK 0x80000000U +#define KFD_FIRST_NONSIGNAL_EVENT_ID KFD_EVENT_ID_NONSIGNAL_MASK +#define KFD_LAST_NONSIGNAL_EVENT_ID UINT_MAX /* * Written into kfd_signal_slot_t to indicate that the event is not signaled. @@ -51,6 +46,9 @@ struct kfd_event_waiter; struct signal_page; struct kfd_event { + /* All events in process, rooted at kfd_process.events. */ + struct hlist_node events; + u32 event_id; bool signaled; @@ -58,9 +56,11 @@ struct kfd_event { int type; - wait_queue_head_t wq; /* List of event waiters. */ + struct list_head waiters; /* List of kfd_event_waiter by waiters. */ /* Only for signal events. */ + struct signal_page *signal_page; + unsigned int signal_slot_index; uint64_t __user *user_signal_address; /* type specific data */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c index 499efa1..c59384b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c @@ -275,80 +275,24 @@ * for FLAT_* / S_LOAD operations. */ -#define MAKE_GPUVM_APP_BASE_VI(gpu_num) \ +#define MAKE_GPUVM_APP_BASE(gpu_num) \ (((uint64_t)(gpu_num) << 61) + 0x1000000000000L) -#define MAKE_GPUVM_APP_LIMIT(base, size) \ - (((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1) +#define MAKE_GPUVM_APP_LIMIT(base) \ + (((uint64_t)(base) & \ + 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL) -#define MAKE_SCRATCH_APP_BASE_VI() \ - (((uint64_t)(0x1UL) << 61) + 0x100000000L) +#define MAKE_SCRATCH_APP_BASE(gpu_num) \ + (((uint64_t)(gpu_num) << 61) + 0x100000000L) #define MAKE_SCRATCH_APP_LIMIT(base) \ (((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) -#define MAKE_LDS_APP_BASE_VI() \ - (((uint64_t)(0x1UL) << 61) + 0x0) - +#define MAKE_LDS_APP_BASE(gpu_num) \ + (((uint64_t)(gpu_num) << 61) + 0x0) #define MAKE_LDS_APP_LIMIT(base) \ (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) -/* On GFXv9 the LDS and scratch apertures are programmed independently - * using the high 16 bits of the 64-bit virtual address. They must be - * in the hole, which will be the case as long as the high 16 bits are - * not 0. - * - * The aperture sizes are still 4GB implicitly. - * - * A GPUVM aperture is not applicable on GFXv9. 
- */ -#define MAKE_LDS_APP_BASE_V9() ((uint64_t)(0x1UL) << 48) -#define MAKE_SCRATCH_APP_BASE_V9() ((uint64_t)(0x2UL) << 48) - -/* Some VM address space reserved for kernel use (CWSR trap handlers - * and kernel IBs) - */ -#define DGPU_VM_BASE_DEFAULT 0x100000 -#define DGPU_IB_BASE_DEFAULT (DGPU_VM_BASE_DEFAULT - PAGE_SIZE) - -int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd, - uint64_t base, uint64_t limit) -{ - if (base < (pdd->qpd.cwsr_base + KFD_CWSR_TBA_TMA_SIZE)) { - pr_err("Set dgpu vm base 0x%llx failed.\n", base); - return -EINVAL; - } - pdd->dgpu_base = base; - pdd->dgpu_limit = limit; - return 0; -} - -void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id) -{ - /* - * node id couldn't be 0 - the three MSB bits of - * aperture shoudn't be 0 - */ - pdd->lds_base = MAKE_LDS_APP_BASE_VI(); - pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); - - pdd->gpuvm_base = MAKE_GPUVM_APP_BASE_VI(id + 1); - pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT( - pdd->gpuvm_base, pdd->dev->shared_resources.gpuvm_size); - - pdd->scratch_base = MAKE_SCRATCH_APP_BASE_VI(); - pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); -} - -void kfd_init_apertures_v9(struct kfd_process_device *pdd, uint8_t id) -{ - pdd->lds_base = MAKE_LDS_APP_BASE_V9(); - pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); - - pdd->scratch_base = MAKE_SCRATCH_APP_BASE_V9(); - pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); -} - int kfd_init_apertures(struct kfd_process *process) { uint8_t id = 0; @@ -356,11 +300,8 @@ int kfd_init_apertures(struct kfd_process *process) struct kfd_process_device *pdd; /*Iterating over all devices*/ - while (kfd_topology_enum_kfd_devices(id, &dev) == 0) { - if (!dev) { - id++; /* Skip non GPU devices */ - continue; - } + while ((dev = kfd_topology_enum_kfd_devices(id)) != NULL && + id < NUM_OF_SUPPORTED_GPUS) { pdd = kfd_create_process_device_data(dev, process); if (!pdd) { @@ -377,29 +318,23 @@ int kfd_init_apertures(struct kfd_process *process) pdd->gpuvm_base = pdd->gpuvm_limit = 0; pdd->scratch_base = pdd->scratch_limit = 0; } else { - switch (dev->device_info->asic_family) { - case CHIP_KAVERI: - case CHIP_HAWAII: - case CHIP_CARRIZO: - case CHIP_TONGA: - case CHIP_FIJI: - case CHIP_POLARIS10: - case CHIP_POLARIS11: - kfd_init_apertures_vi(pdd, id); - break; - case CHIP_VEGA10: - case CHIP_RAVEN: - kfd_init_apertures_v9(pdd, id); - break; - default: - pr_err("Unknown chip in kfd_init_apertures\n"); - return -1; - } + /* + * node id couldn't be 0 - the three MSB bits of + * aperture shoudn't be 0 + */ + pdd->lds_base = MAKE_LDS_APP_BASE(id + 1); + + pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); + + pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1); + + pdd->gpuvm_limit = + MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base); - if (!dev->device_info->is_need_iommu_device) { - pdd->qpd.cwsr_base = DGPU_VM_BASE_DEFAULT; - pdd->qpd.ib_base = DGPU_IB_BASE_DEFAULT; - } + pdd->scratch_base = MAKE_SCRATCH_APP_BASE(id + 1); + + pdd->scratch_limit = + MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); } dev_dbg(kfd_device, "node id %u\n", id); @@ -416,3 +351,5 @@ int kfd_init_apertures(struct kfd_process *process) return 0; } + + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c deleted file mode 100644 index 009d6f4..0000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright 2016 Advanced Micro Devices, Inc. 
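
A quick worked example of the GPUVM aperture macros restored above, for the first GPU (id + 1 == 1): the limit keeps the base's top 24 bits and fills the low 40 bits, giving a 1 TB aperture. A standalone userspace rendering, with the printed values noted in comments:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MAKE_GPUVM_APP_BASE(gpu_num) \
            (((uint64_t)(gpu_num) << 61) + 0x1000000000000L)
    #define MAKE_GPUVM_APP_LIMIT(base) \
            (((uint64_t)(base) & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL)

    int main(void)
    {
            /* gpu_num == 1, so the three MSBs of the base are non-zero. */
            uint64_t base = MAKE_GPUVM_APP_BASE(1);

            printf("base  = 0x%016" PRIx64 "\n", base);
            /* base  = 0x2001000000000000 */
            printf("limit = 0x%016" PRIx64 "\n", MAKE_GPUVM_APP_LIMIT(base));
            /* limit = 0x200100ffffffffff */
            return 0;
    }
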
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include "kfd_priv.h" -#include "kfd_events.h" -#include "soc15_int.h" - - -static uint32_t kfd_get_pasid_from_vmid(struct kfd_dev *dev, uint8_t vmid) -{ - uint32_t pasid = 0; - const struct kfd2kgd_calls *f2g = dev->kfd2kgd; - - if (f2g->get_atc_vmid_pasid_mapping_valid(dev->kgd, vmid)) - pasid = f2g->get_atc_vmid_pasid_mapping_pasid(dev->kgd, vmid); - - return pasid; -} - -static bool event_interrupt_isr_v9(struct kfd_dev *dev, - const uint32_t *ih_ring_entry, - uint32_t *patched_ihre, - bool *patched_flag) -{ - uint16_t source_id, client_id, pasid, vmid; - bool result = false; - - source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); - client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); - pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); - vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); - - if (pasid) { - const uint32_t *data = ih_ring_entry; - - pr_debug("client id 0x%x, source id %d, pasid 0x%x. raw data:\n", - client_id, source_id, pasid); - pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", - data[0], data[1], data[2], data[3], - data[4], data[5], data[6], data[7]); - } - - if ((vmid >= dev->vm_info.first_vmid_kfd && - vmid <= dev->vm_info.last_vmid_kfd) && - (source_id == SOC15_INTSRC_CP_END_OF_PIPE || - source_id == SOC15_INTSRC_SDMA_TRAP || - source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG || - source_id == SOC15_INTSRC_CP_BAD_OPCODE || - client_id == SOC15_IH_CLIENTID_VMC || - client_id == SOC15_IH_CLIENTID_UTCL2)) { - - /* - * KFD want to handle this INT, but MEC firmware did - * not send pasid. Try to get it from vmid mapping - * and patch the ih entry. It's a temp workaround. - */ - WARN_ONCE((!pasid), "Fix me.\n"); - if (!pasid) { - uint32_t temp = le32_to_cpu(ih_ring_entry[3]); - - pasid = kfd_get_pasid_from_vmid(dev, vmid); - memcpy(patched_ihre, ih_ring_entry, - dev->device_info->ih_ring_entry_size); - patched_ihre[3] = cpu_to_le32(temp | pasid); - *patched_flag = true; - } - result = pasid ? true : false; - } - - /* Do not process in ISR, just request it to be forwarded to WQ. 
*/ - return result; - -} - -static void event_interrupt_wq_v9(struct kfd_dev *dev, - const uint32_t *ih_ring_entry) -{ - uint16_t source_id, client_id, pasid, vmid; - uint32_t context_id; - - source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); - client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); - pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); - vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); - context_id = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry); - - if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) - kfd_signal_event_interrupt(pasid, context_id, 32); - else if (source_id == SOC15_INTSRC_SDMA_TRAP) - kfd_signal_event_interrupt(pasid, context_id & 0xfffffff, 28); - else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) - kfd_signal_event_interrupt(pasid, context_id & 0xffffff, 24); - else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) - kfd_signal_hw_exception_event(pasid); - else if (client_id == SOC15_IH_CLIENTID_VMC || - client_id == SOC15_IH_CLIENTID_UTCL2) { - struct kfd_vm_fault_info info = {0}; - uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); - - info.vmid = vmid; - info.mc_id = client_id; - info.page_addr = ih_ring_entry[4] | - (uint64_t)(ih_ring_entry[5] & 0xf) << 32; - info.prot_valid = ring_id & 0x08; - info.prot_read = ring_id & 0x10; - info.prot_write = ring_id & 0x20; - - kfd_process_vm_fault(dev->dqm, pasid); - kfd_signal_vm_fault_event(dev, pasid, &info); - } -} - -const struct kfd_event_interrupt_class event_interrupt_class_v9 = { - .interrupt_isr = event_interrupt_isr_v9, - .interrupt_wq = event_interrupt_wq_v9, -}; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c index 92a277f..70b3a99c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c @@ -44,23 +44,24 @@ #include #include "kfd_priv.h" -#define KFD_IH_NUM_ENTRIES 8192 +#define KFD_INTERRUPT_RING_SIZE 1024 static void interrupt_wq(struct work_struct *); int kfd_interrupt_init(struct kfd_dev *kfd) { - int r; - - r = kfifo_alloc(&kfd->ih_fifo, - KFD_IH_NUM_ENTRIES * kfd->device_info->ih_ring_entry_size, - GFP_KERNEL); - if (r) { - dev_err(kfd_chardev(), "Failed to allocate IH fifo\n"); - return r; - } + void *interrupt_ring = kmalloc_array(KFD_INTERRUPT_RING_SIZE, + kfd->device_info->ih_ring_entry_size, + GFP_KERNEL); + if (!interrupt_ring) + return -ENOMEM; + + kfd->interrupt_ring = interrupt_ring; + kfd->interrupt_ring_size = + KFD_INTERRUPT_RING_SIZE * kfd->device_info->ih_ring_entry_size; + atomic_set(&kfd->interrupt_ring_wptr, 0); + atomic_set(&kfd->interrupt_ring_rptr, 0); - kfd->ih_wq = alloc_workqueue("KFD IH", WQ_HIGHPRI, 1); spin_lock_init(&kfd->interrupt_lock); INIT_WORK(&kfd->interrupt_work, interrupt_wq); @@ -91,47 +92,74 @@ void kfd_interrupt_exit(struct kfd_dev *kfd) spin_unlock_irqrestore(&kfd->interrupt_lock, flags); /* - * flush_work ensures that there are no outstanding + * Flush_scheduled_work ensures that there are no outstanding * work-queue items that will access interrupt_ring. New work items * can't be created because we stopped interrupt handling above. */ - flush_workqueue(kfd->ih_wq); + flush_scheduled_work(); - kfifo_free(&kfd->ih_fifo); + kfree(kfd->interrupt_ring); } /* - * Assumption: single reader/writer. This function is not re-entrant + * This assumes that it can't be called concurrently with itself + * but only with dequeue_ih_ring_entry. 
*/ bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry) { - int count; + unsigned int rptr = atomic_read(&kfd->interrupt_ring_rptr); + unsigned int wptr = atomic_read(&kfd->interrupt_ring_wptr); - count = kfifo_in(&kfd->ih_fifo, ih_ring_entry, - kfd->device_info->ih_ring_entry_size); - if (count != kfd->device_info->ih_ring_entry_size) { + if ((rptr - wptr) % kfd->interrupt_ring_size == + kfd->device_info->ih_ring_entry_size) { + /* This is very bad, the system is likely to hang. */ dev_err_ratelimited(kfd_chardev(), - "Interrupt ring overflow, dropping interrupt %d\n", - count); + "Interrupt ring overflow, dropping interrupt.\n"); return false; } + memcpy(kfd->interrupt_ring + wptr, ih_ring_entry, + kfd->device_info->ih_ring_entry_size); + + wptr = (wptr + kfd->device_info->ih_ring_entry_size) % + kfd->interrupt_ring_size; + smp_wmb(); /* Ensure memcpy'd data is visible before wptr update. */ + atomic_set(&kfd->interrupt_ring_wptr, wptr); + return true; } /* - * Assumption: single reader/writer. This function is not re-entrant + * This assumes that it can't be called concurrently with itself + * but only with enqueue_ih_ring_entry. */ static bool dequeue_ih_ring_entry(struct kfd_dev *kfd, void *ih_ring_entry) { - int count; + /* + * Assume that wait queues have an implicit barrier, i.e. anything that + * happened in the ISR before it queued work is visible. + */ + + unsigned int wptr = atomic_read(&kfd->interrupt_ring_wptr); + unsigned int rptr = atomic_read(&kfd->interrupt_ring_rptr); - count = kfifo_out(&kfd->ih_fifo, ih_ring_entry, - kfd->device_info->ih_ring_entry_size); + if (rptr == wptr) + return false; - WARN_ON(count && count != kfd->device_info->ih_ring_entry_size); + memcpy(ih_ring_entry, kfd->interrupt_ring + rptr, + kfd->device_info->ih_ring_entry_size); - return count == kfd->device_info->ih_ring_entry_size; + rptr = (rptr + kfd->device_info->ih_ring_entry_size) % + kfd->interrupt_ring_size; + + /* + * Ensure the rptr write update is not visible until + * memcpy has finished reading. + */ + smp_mb(); + atomic_set(&kfd->interrupt_ring_rptr, rptr); + + return true; } static void interrupt_wq(struct work_struct *work) @@ -148,15 +176,13 @@ static void interrupt_wq(struct work_struct *work) ih_ring_entry); } -bool interrupt_is_wanted(struct kfd_dev *dev, - const uint32_t *ih_ring_entry, - uint32_t *patched_ihre, bool *flag) +bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry) { /* integer and bitwise OR so there is no boolean short-circuiting */ unsigned int wanted = 0; wanted |= dev->device_info->event_interrupt_class->interrupt_isr(dev, - ih_ring_entry, patched_ihre, flag); + ih_ring_entry); return wanted != 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c deleted file mode 100644 index 0feb366..0000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c +++ /dev/null @@ -1,271 +0,0 @@ -/* - * Copyright 2014 Advanced Micro Devices, Inc. 
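
The interrupt ring restored in kfd_interrupt.c above is the classic single-producer/single-consumer pattern: the ISR owns the write pointer, the work queue owns the read pointer, and one entry is sacrificed so that rptr == wptr can only ever mean "empty". A compact userspace rendering of the producer side (names and sizes are illustrative; the ring size must be a power of two for the wrapped unsigned subtraction in the full-test to come out right):

    #include <stdatomic.h>
    #include <string.h>

    #define ENTRY_SIZE 16u
    #define RING_SIZE  (64u * ENTRY_SIZE)   /* power of two */

    static unsigned char ring[RING_SIZE];
    static _Atomic unsigned int rptr, wptr; /* byte offsets into ring[] */

    static int ring_push(const void *entry)
    {
            unsigned int w = atomic_load_explicit(&wptr, memory_order_relaxed);
            unsigned int r = atomic_load_explicit(&rptr, memory_order_acquire);

            /* Full when the next wptr step would collide with rptr. */
            if ((r - w) % RING_SIZE == ENTRY_SIZE)
                    return -1;

            memcpy(&ring[w], entry, ENTRY_SIZE);
            /* Release ordering plays the role of the smp_wmb() above:
             * the payload becomes visible before the new wptr does.
             */
            atomic_store_explicit(&wptr, (w + ENTRY_SIZE) % RING_SIZE,
                                  memory_order_release);
            return 0;
    }
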
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include -#include -#include - -#include "kfd_ipc.h" -#include "kfd_priv.h" - -#define KFD_IPC_HASH_TABLE_SIZE_SHIFT 4 -#define KFD_IPC_HASH_TABLE_SIZE_MASK ((1 << KFD_IPC_HASH_TABLE_SIZE_SHIFT) - 1) - -static struct kfd_ipc_handles { - DECLARE_HASHTABLE(handles, KFD_IPC_HASH_TABLE_SIZE_SHIFT); - struct mutex lock; -} kfd_ipc_handles; - -/* Since, handles are random numbers, it can be used directly as hashing key. - * The least 4 bits of the handle are used as key. However, during import all - * 128 bits of the handle are checked to prevent handle snooping. - */ -#define HANDLE_TO_KEY(sh) ((*(uint64_t *)sh) & KFD_IPC_HASH_TABLE_SIZE_MASK) - -static int ipc_store_insert(void *val, void *sh, struct kfd_ipc_obj **ipc_obj) -{ - struct kfd_ipc_obj *obj; - - obj = kmalloc(sizeof(*obj), GFP_KERNEL); - if (!obj) - return -ENOMEM; - - /* The initial ref belongs to the allocator process. - * The IPC object store itself does not hold a ref since - * there is no specific moment in time where that ref should - * be dropped, except "when there are no more userspace processes - * holding a ref to the object". Therefore the removal from IPC - * storage happens at ipc_obj release time. 
- */ - kref_init(&obj->ref); - obj->data = val; - get_random_bytes(obj->share_handle, sizeof(obj->share_handle)); - - memcpy(sh, obj->share_handle, sizeof(obj->share_handle)); - - mutex_lock(&kfd_ipc_handles.lock); - hlist_add_head(&obj->node, - &kfd_ipc_handles.handles[HANDLE_TO_KEY(obj->share_handle)]); - mutex_unlock(&kfd_ipc_handles.lock); - - if (ipc_obj) - *ipc_obj = obj; - - return 0; -} - -static void ipc_obj_release(struct kref *r) -{ - struct kfd_ipc_obj *obj; - - obj = container_of(r, struct kfd_ipc_obj, ref); - - mutex_lock(&kfd_ipc_handles.lock); - hash_del(&obj->node); - mutex_unlock(&kfd_ipc_handles.lock); - - dma_buf_put(obj->data); - kfree(obj); -} - -void ipc_obj_get(struct kfd_ipc_obj *obj) -{ - kref_get(&obj->ref); -} - -void ipc_obj_put(struct kfd_ipc_obj **obj) -{ - kref_put(&(*obj)->ref, ipc_obj_release); - *obj = NULL; -} - -int kfd_ipc_init(void) -{ - mutex_init(&kfd_ipc_handles.lock); - hash_init(kfd_ipc_handles.handles); - return 0; -} - -static int kfd_import_dmabuf_create_kfd_bo(struct kfd_dev *dev, - struct kfd_process *p, - uint32_t gpu_id, struct dma_buf *dmabuf, - uint64_t va_addr, uint64_t *handle, - uint64_t *mmap_offset, - struct kfd_ipc_obj *ipc_obj) -{ - int r; - void *mem; - uint64_t size; - int idr_handle; - struct kfd_process_device *pdd = NULL; - - if (!handle) - return -EINVAL; - - if (!dev || !dev->kfd2kgd->import_dmabuf) - return -EINVAL; - - mutex_lock(&p->mutex); - - pdd = kfd_bind_process_to_device(dev, p); - if (IS_ERR(pdd)) { - r = PTR_ERR(pdd); - goto err_unlock; - } - - r = dev->kfd2kgd->import_dmabuf(dev->kgd, dmabuf, - va_addr, pdd->vm, - (struct kgd_mem **)&mem, &size, - mmap_offset); - if (r) - goto err_unlock; - - idr_handle = kfd_process_device_create_obj_handle(pdd, mem, - va_addr, size, - ipc_obj); - if (idr_handle < 0) { - r = -EFAULT; - goto err_free; - } - - mutex_unlock(&p->mutex); - - *handle = MAKE_HANDLE(gpu_id, idr_handle); - - return 0; - -err_free: - dev->kfd2kgd->free_memory_of_gpu(dev->kgd, - (struct kgd_mem *)mem, - pdd->vm); -err_unlock: - mutex_unlock(&p->mutex); - return r; -} - -int kfd_ipc_import_dmabuf(struct kfd_dev *dev, - struct kfd_process *p, - uint32_t gpu_id, int dmabuf_fd, - uint64_t va_addr, uint64_t *handle, - uint64_t *mmap_offset) -{ - int r; - struct dma_buf *dmabuf = dma_buf_get(dmabuf_fd); - - if (!dmabuf) - return -EINVAL; - - r = kfd_import_dmabuf_create_kfd_bo(dev, p, gpu_id, dmabuf, - va_addr, handle, mmap_offset, - NULL); - dma_buf_put(dmabuf); - return r; -} - -int kfd_ipc_import_handle(struct kfd_dev *dev, struct kfd_process *p, - uint32_t gpu_id, uint32_t *share_handle, - uint64_t va_addr, uint64_t *handle, - uint64_t *mmap_offset) -{ - int r; - struct kfd_ipc_obj *entry, *found = NULL; - - mutex_lock(&kfd_ipc_handles.lock); - /* Convert the user provided handle to hash key and search only in that - * bucket - */ - hlist_for_each_entry(entry, - &kfd_ipc_handles.handles[HANDLE_TO_KEY(share_handle)], node) { - if (!memcmp(entry->share_handle, share_handle, - sizeof(entry->share_handle))) { - found = entry; - break; - } - } - mutex_unlock(&kfd_ipc_handles.lock); - - if (!found) - return -EINVAL; - ipc_obj_get(found); - - pr_debug("Found ipc_dma_buf: %p\n", found->data); - - r = kfd_import_dmabuf_create_kfd_bo(dev, p, gpu_id, found->data, - va_addr, handle, mmap_offset, - found); - if (r) - goto error_unref; - - return r; - -error_unref: - ipc_obj_put(&found); - return r; -} - -int kfd_ipc_export_as_handle(struct kfd_dev *dev, struct kfd_process *p, - uint64_t handle, uint32_t *ipc_handle) -{ 
- struct kfd_process_device *pdd = NULL; - struct kfd_ipc_obj *obj; - struct kfd_bo *kfd_bo = NULL; - struct dma_buf *dmabuf; - int r; - - if (!dev || !ipc_handle) - return -EINVAL; - - mutex_lock(&p->mutex); - pdd = kfd_bind_process_to_device(dev, p); - if (IS_ERR(pdd)) { - mutex_unlock(&p->mutex); - pr_err("Failed to get pdd\n"); - return PTR_ERR(pdd); - } - - kfd_bo = kfd_process_device_find_bo(pdd, GET_IDR_HANDLE(handle)); - mutex_unlock(&p->mutex); - - if (!kfd_bo) { - pr_err("Failed to get bo"); - return -EINVAL; - } - if (kfd_bo->kfd_ipc_obj) { - memcpy(ipc_handle, kfd_bo->kfd_ipc_obj->share_handle, - sizeof(kfd_bo->kfd_ipc_obj->share_handle)); - return 0; - } - - r = dev->kfd2kgd->export_dmabuf(dev->kgd, pdd->vm, - (struct kgd_mem *)kfd_bo->mem, - &dmabuf); - if (r) - return r; - - r = ipc_store_insert(dmabuf, ipc_handle, &obj); - if (r) - return r; - - kfd_bo->kfd_ipc_obj = obj; - - return r; -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.h b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.h deleted file mode 100644 index 9ee8627..0000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright 2014 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- * - */ - -#ifndef KFD_IPC_H_ -#define KFD_IPC_H_ - -#include -#include "kfd_priv.h" - -struct kfd_ipc_obj { - struct hlist_node node; - struct kref ref; - void *data; - uint32_t share_handle[4]; -}; - -int kfd_ipc_import_handle(struct kfd_dev *dev, struct kfd_process *p, - uint32_t gpu_id, uint32_t *share_handle, - uint64_t va_addr, uint64_t *handle, - uint64_t *mmap_offset); -int kfd_ipc_import_dmabuf(struct kfd_dev *kfd, struct kfd_process *p, - uint32_t gpu_id, int dmabuf_fd, - uint64_t va_addr, uint64_t *handle, - uint64_t *mmap_offset); -int kfd_ipc_export_as_handle(struct kfd_dev *dev, struct kfd_process *p, - uint64_t handle, uint32_t *ipc_handle); - -void ipc_obj_get(struct kfd_ipc_obj *obj); -void ipc_obj_put(struct kfd_ipc_obj **obj); - -#endif /* KFD_IPC_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index 8cf9d44..0649dd4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -99,7 +99,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, kq->rptr_kernel = kq->rptr_mem->cpu_ptr; kq->rptr_gpu_addr = kq->rptr_mem->gpu_addr; - retval = kfd_gtt_sa_allocate(dev, dev->device_info->doorbell_size, + retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->wptr_kernel), &kq->wptr_mem); if (retval != 0) @@ -123,7 +123,6 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, prop.write_ptr = (uint32_t *) kq->wptr_gpu_addr; prop.eop_ring_buffer_address = kq->eop_gpu_addr; prop.eop_ring_buffer_size = PAGE_SIZE; - prop.cu_mask = NULL; if (init_queue(&kq->queue, &prop) != 0) goto err_init_queue; @@ -185,8 +184,8 @@ static void uninitialize(struct kernel_queue *kq) if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ) kq->mqd->destroy_mqd(kq->mqd, kq->queue->mqd, - KFD_PREEMPT_TYPE_WAVEFRONT_RESET, - KFD_UNMAP_LATENCY_MS, + false, + QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS, kq->queue->pipe, kq->queue->queue); else if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ) @@ -209,90 +208,39 @@ static int acquire_packet_buffer(struct kernel_queue *kq, size_t available_size; size_t queue_size_dwords; uint32_t wptr, rptr; - uint64_t wptr64; unsigned int *queue_address; - /* When rptr == wptr, the buffer is empty. - * When rptr == wptr + 1, the buffer is full. - * It is always rptr that advances to the position of wptr, rather than - * the opposite. So we can only use up to queue_size_dwords - 1 dwords. - */ rptr = *kq->rptr_kernel; - wptr = kq->pending_wptr; - wptr64 = kq->pending_wptr64; + wptr = *kq->wptr_kernel; queue_address = (unsigned int *)kq->pq_kernel_addr; - queue_size_dwords = kq->queue->properties.queue_size / 4; + queue_size_dwords = kq->queue->properties.queue_size / sizeof(uint32_t); pr_debug("rptr: %d\n", rptr); pr_debug("wptr: %d\n", wptr); pr_debug("queue_address 0x%p\n", queue_address); - available_size = (rptr + queue_size_dwords - 1 - wptr) % + available_size = (rptr - 1 - wptr + queue_size_dwords) % queue_size_dwords; - if (packet_size_in_dwords > available_size) { + if (packet_size_in_dwords >= queue_size_dwords || + packet_size_in_dwords >= available_size) { /* * make sure calling functions know * acquire_packet_buffer() failed */ - goto err_no_space; + *buffer_ptr = NULL; + return -ENOMEM; } if (wptr + packet_size_in_dwords >= queue_size_dwords) { - /* make sure after rolling back to position 0, there is - * still enough space. 
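
Both the removed and the restored space checks in acquire_packet_buffer() compute the same quantity: the free dwords with one dword held back, so that rptr == wptr is unambiguously "empty" rather than "full". The arithmetic in a nutshell, with a couple of sanity checks:

    #include <assert.h>

    /* Free dwords in a ring of q dwords, keeping one dword unusable. */
    static unsigned int ring_space(unsigned int rptr, unsigned int wptr,
                                   unsigned int q)
    {
            return (rptr + q - 1 - wptr) % q;
    }

    int main(void)
    {
            assert(ring_space(0, 0, 256) == 255);   /* empty: all but one */
            assert(ring_space(10, 9, 256) == 0);    /* full: wptr + 1 == rptr */
            return 0;
    }
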
- */ - if (packet_size_in_dwords >= rptr) - goto err_no_space; - - /* fill nops, roll back and start at position 0 */ while (wptr > 0) { queue_address[wptr] = kq->nop_packet; wptr = (wptr + 1) % queue_size_dwords; - wptr64++; } } *buffer_ptr = &queue_address[wptr]; kq->pending_wptr = wptr + packet_size_in_dwords; - kq->pending_wptr64 = wptr64 + packet_size_in_dwords; - - return 0; - -err_no_space: - *buffer_ptr = NULL; - return -ENOMEM; -} - -static int acquire_inline_ib(struct kernel_queue *kq, - size_t size_in_dwords, - unsigned int **buffer_ptr, - uint64_t *gpu_addr) -{ - int ret; - unsigned int *buf; - union PM4_MES_TYPE_3_HEADER nop; - - if (size_in_dwords >= (1 << 14)) - return -EINVAL; - - /* Allocate size_in_dwords on the ring, plus an extra dword - * for a NOP packet header - */ - ret = acquire_packet_buffer(kq, size_in_dwords + 1, &buf); - if (ret) - return ret; - - /* Build a NOP packet that contains the IB as "payload". */ - nop.u32all = 0; - nop.opcode = IT_NOP; - nop.count = size_in_dwords - 1; - nop.type = PM4_TYPE_3; - - *buf = nop.u32all; - *buffer_ptr = buf + 1; - *gpu_addr = kq->pq_gpu_addr + ((unsigned long)*buffer_ptr - - (unsigned long)kq->pq_kernel_addr); return 0; } @@ -310,7 +258,9 @@ static void submit_packet(struct kernel_queue *kq) pr_debug("\n"); #endif - kq->ops_asic_specific.submit_packet(kq); + *kq->wptr_kernel = kq->pending_wptr; + write_kernel_doorbell(kq->queue->properties.doorbell_ptr, + kq->pending_wptr); } static void rollback_packet(struct kernel_queue *kq) @@ -330,42 +280,25 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, kq->ops.initialize = initialize; kq->ops.uninitialize = uninitialize; kq->ops.acquire_packet_buffer = acquire_packet_buffer; - kq->ops.acquire_inline_ib = acquire_inline_ib; kq->ops.submit_packet = submit_packet; kq->ops.rollback_packet = rollback_packet; switch (dev->device_info->asic_family) { case CHIP_CARRIZO: - case CHIP_TONGA: - case CHIP_FIJI: - case CHIP_POLARIS10: - case CHIP_POLARIS11: kernel_queue_init_vi(&kq->ops_asic_specific); break; case CHIP_KAVERI: - case CHIP_HAWAII: kernel_queue_init_cik(&kq->ops_asic_specific); break; - - case CHIP_VEGA10: - case CHIP_RAVEN: - kernel_queue_init_v9(&kq->ops_asic_specific); - break; - default: - WARN(1, "Unexpected ASIC family %u", - dev->device_info->asic_family); - goto out_free; } - if (kq->ops.initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE)) - return kq; - - pr_err("Failed to init kernel queue\n"); - -out_free: - kfree(kq); - return NULL; + if (!kq->ops.initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE)) { + pr_err("Failed to init kernel queue\n"); + kfree(kq); + return NULL; + } + return kq; } void kernel_queue_uninit(struct kernel_queue *kq) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h index 82c94a6..5940531 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h @@ -42,12 +42,6 @@ * pending write pointer to that location so subsequent calls to * acquire_packet_buffer will get a correct write pointer * - * @acquire_inline_ib: Returns a pointer to the location in the kernel - * queue ring buffer where the calling function can write an inline IB. It is - * Guaranteed that there is enough space for that IB. It also updates the - * pending write pointer to that location so subsequent calls to - * acquire_packet_buffer will get a correct write pointer - * * @submit_packet: Update the write pointer and doorbell of a kernel queue. 
* * @sync_with_hw: Wait until the write pointer and the read pointer of a kernel @@ -65,10 +59,6 @@ struct kernel_queue_ops { int (*acquire_packet_buffer)(struct kernel_queue *kq, size_t packet_size_in_dwords, unsigned int **buffer_ptr); - int (*acquire_inline_ib)(struct kernel_queue *kq, - size_t packet_size_in_dwords, - unsigned int **buffer_ptr, - uint64_t *gpu_addr); void (*submit_packet)(struct kernel_queue *kq); void (*rollback_packet)(struct kernel_queue *kq); @@ -82,7 +72,6 @@ struct kernel_queue { struct kfd_dev *dev; struct mqd_manager *mqd; struct queue *queue; - uint64_t pending_wptr64; uint32_t pending_wptr; unsigned int nop_packet; @@ -90,10 +79,7 @@ struct kernel_queue { uint32_t *rptr_kernel; uint64_t rptr_gpu_addr; struct kfd_mem_obj *wptr_mem; - union { - uint64_t *wptr64_kernel; - uint32_t *wptr_kernel; - }; + uint32_t *wptr_kernel; uint64_t wptr_gpu_addr; struct kfd_mem_obj *pq; uint64_t pq_gpu_addr; @@ -111,6 +97,5 @@ struct kernel_queue { void kernel_queue_init_cik(struct kernel_queue_ops *ops); void kernel_queue_init_vi(struct kernel_queue_ops *ops); -void kernel_queue_init_v9(struct kernel_queue_ops *ops); #endif /* KFD_KERNEL_QUEUE_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c index 2808422..a90eb44 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c @@ -22,19 +22,15 @@ */ #include "kfd_kernel_queue.h" -#include "kfd_pm4_headers.h" -#include "kfd_pm4_opcodes.h" static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, enum kfd_queue_type type, unsigned int queue_size); static void uninitialize_cik(struct kernel_queue *kq); -static void submit_packet_cik(struct kernel_queue *kq); void kernel_queue_init_cik(struct kernel_queue_ops *ops) { ops->initialize = initialize_cik; ops->uninitialize = uninitialize_cik; - ops->submit_packet = submit_packet_cik; } static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, @@ -46,127 +42,3 @@ static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, static void uninitialize_cik(struct kernel_queue *kq) { } - -static void submit_packet_cik(struct kernel_queue *kq) -{ - *kq->wptr_kernel = kq->pending_wptr; - write_kernel_doorbell(kq->queue->properties.doorbell_ptr, - kq->pending_wptr); -} - -static int pm_map_process_cik(struct packet_manager *pm, uint32_t *buffer, - struct qcm_process_device *qpd) -{ - struct pm4_map_process *packet; - - packet = (struct pm4_map_process *)buffer; - - memset(buffer, 0, sizeof(struct pm4_map_process)); - - packet->header.u32all = pm_build_pm4_header(IT_MAP_PROCESS, - sizeof(struct pm4_map_process)); - packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; - packet->bitfields2.process_quantum = 1; - packet->bitfields2.pasid = qpd->pqm->process->pasid; - packet->bitfields3.page_table_base = qpd->page_table_base; - packet->bitfields10.gds_size = qpd->gds_size; - packet->bitfields10.num_gws = qpd->num_gws; - packet->bitfields10.num_oac = qpd->num_oac; - packet->bitfields10.num_queues = (qpd->is_debug) ? 
0 : qpd->queue_count; - - packet->sh_mem_config = qpd->sh_mem_config; - packet->sh_mem_bases = qpd->sh_mem_bases; - packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; - packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; - - packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); - packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); - - return 0; -} - -static int pm_map_process_scratch_cik(struct packet_manager *pm, - uint32_t *buffer, struct qcm_process_device *qpd) -{ - struct pm4_map_process_scratch_kv *packet; - - packet = (struct pm4_map_process_scratch_kv *)buffer; - - memset(buffer, 0, sizeof(struct pm4_map_process_scratch_kv)); - - packet->header.u32all = pm_build_pm4_header(IT_MAP_PROCESS, - sizeof(struct pm4_map_process_scratch_kv)); - packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; - packet->bitfields2.process_quantum = 1; - packet->bitfields2.pasid = qpd->pqm->process->pasid; - packet->bitfields3.page_table_base = qpd->page_table_base; - packet->bitfields14.gds_size = qpd->gds_size; - packet->bitfields14.num_gws = qpd->num_gws; - packet->bitfields14.num_oac = qpd->num_oac; - packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; - - packet->sh_mem_config = qpd->sh_mem_config; - packet->sh_mem_bases = qpd->sh_mem_bases; - packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; - packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; - - packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; - - packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); - packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); - - return 0; -} - -static uint32_t pm_get_map_process_packet_size_cik(void) -{ - return sizeof(struct pm4_map_process); -} -static uint32_t pm_get_map_process_scratch_packet_size_cik(void) -{ - return sizeof(struct pm4_map_process_scratch_kv); -} - - -static struct packet_manager_funcs kfd_cik_pm_funcs = { - .map_process = pm_map_process_cik, - .runlist = pm_runlist_vi, - .set_resources = pm_set_resources_vi, - .map_queues = pm_map_queues_vi, - .unmap_queues = pm_unmap_queues_vi, - .query_status = pm_query_status_vi, - .release_mem = pm_release_mem_vi, - .get_map_process_packet_size = pm_get_map_process_packet_size_cik, - .get_runlist_packet_size = pm_get_runlist_packet_size_vi, - .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, - .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi, - .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi, - .get_query_status_packet_size = pm_get_query_status_packet_size_vi, - .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi, -}; - -static struct packet_manager_funcs kfd_cik_scratch_pm_funcs = { - .map_process = pm_map_process_scratch_cik, - .runlist = pm_runlist_vi, - .set_resources = pm_set_resources_vi, - .map_queues = pm_map_queues_vi, - .unmap_queues = pm_unmap_queues_vi, - .query_status = pm_query_status_vi, - .release_mem = pm_release_mem_vi, - .get_map_process_packet_size = - pm_get_map_process_scratch_packet_size_cik, - .get_runlist_packet_size = pm_get_runlist_packet_size_vi, - .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, - .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi, - .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi, - .get_query_status_packet_size = pm_get_query_status_packet_size_vi, - .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi, -}; - -void kfd_pm_func_init_cik(struct packet_manager *pm, uint16_t fw_ver) -{ - if (fw_ver >= 
KFD_SCRATCH_KV_FW_VER) - pm->pmf = &kfd_cik_scratch_pm_funcs; - else - pm->pmf = &kfd_cik_pm_funcs; -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c deleted file mode 100644 index 5fe4f60..0000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c +++ /dev/null @@ -1,377 +0,0 @@ -/* - * Copyright 2016 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - */ - -#include "kfd_kernel_queue.h" -#include "kfd_device_queue_manager.h" -#include "kfd_pm4_headers_ai.h" -#include "kfd_pm4_opcodes.h" - -static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev, - enum kfd_queue_type type, unsigned int queue_size); -static void uninitialize_v9(struct kernel_queue *kq); -static void submit_packet_v9(struct kernel_queue *kq); - -void kernel_queue_init_v9(struct kernel_queue_ops *ops) -{ - ops->initialize = initialize_v9; - ops->uninitialize = uninitialize_v9; - ops->submit_packet = submit_packet_v9; -} - -static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev, - enum kfd_queue_type type, unsigned int queue_size) -{ - int retval; - - retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem); - if (retval != 0) - return false; - - kq->eop_gpu_addr = kq->eop_mem->gpu_addr; - kq->eop_kernel_addr = kq->eop_mem->cpu_ptr; - - memset(kq->eop_kernel_addr, 0, PAGE_SIZE); - - return true; -} - -static void uninitialize_v9(struct kernel_queue *kq) -{ - kfd_gtt_sa_free(kq->dev, kq->eop_mem); -} - -static void submit_packet_v9(struct kernel_queue *kq) -{ - *kq->wptr64_kernel = kq->pending_wptr64; - write_kernel_doorbell64(kq->queue->properties.doorbell_ptr, - kq->pending_wptr64); -} - -static int pm_map_process_v9(struct packet_manager *pm, - uint32_t *buffer, struct qcm_process_device *qpd) -{ - struct pm4_mes_map_process *packet; - uint64_t vm_page_table_base_addr = - (uint64_t)(qpd->page_table_base) << 12; - - packet = (struct pm4_mes_map_process *)buffer; - memset(buffer, 0, sizeof(struct pm4_mes_map_process)); - - packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, - sizeof(struct pm4_mes_map_process)); - packet->bitfields2.diq_enable = (qpd->is_debug) ? 
1 : 0; - packet->bitfields2.process_quantum = 1; - packet->bitfields2.pasid = qpd->pqm->process->pasid; - packet->bitfields14.gds_size = qpd->gds_size; - packet->bitfields14.num_gws = qpd->num_gws; - packet->bitfields14.num_oac = qpd->num_oac; - packet->bitfields14.sdma_enable = 1; - packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; - - packet->sh_mem_config = qpd->sh_mem_config; - packet->sh_mem_bases = qpd->sh_mem_bases; - packet->sq_shader_tba_lo = lower_32_bits(qpd->tba_addr >> 8); - packet->sq_shader_tba_hi = upper_32_bits(qpd->tba_addr >> 8); - packet->sq_shader_tma_lo = lower_32_bits(qpd->tma_addr >> 8); - packet->sq_shader_tma_hi = upper_32_bits(qpd->tma_addr >> 8); - - packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); - packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); - - packet->vm_context_page_table_base_addr_lo32 = - lower_32_bits(vm_page_table_base_addr); - packet->vm_context_page_table_base_addr_hi32 = - upper_32_bits(vm_page_table_base_addr); - - return 0; -} - -static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer, - uint64_t ib, size_t ib_size_in_dwords, bool chain) -{ - struct pm4_mes_runlist *packet; - - int concurrent_proc_cnt = 0; - struct kfd_dev *kfd = pm->dqm->dev; - - /* Determine the number of processes to map together to HW: - * it can not exceed the number of VMIDs available to the - * scheduler, and it is determined by the smaller of the number - * of processes in the runlist and kfd module parameter - * hws_max_conc_proc. - * Note: the arbitration between the number of VMIDs and - * hws_max_conc_proc has been done in - * kgd2kfd_device_init(). - */ - concurrent_proc_cnt = min(pm->dqm->processes_count, - kfd->max_proc_per_quantum); - - - packet = (struct pm4_mes_runlist *)buffer; - - memset(buffer, 0, sizeof(struct pm4_mes_runlist)); - packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, - sizeof(struct pm4_mes_runlist)); - - packet->bitfields4.ib_size = ib_size_in_dwords; - packet->bitfields4.chain = chain ? 
1 : 0; - packet->bitfields4.offload_polling = 0; - packet->bitfields4.valid = 1; - packet->bitfields4.process_cnt = concurrent_proc_cnt; - packet->ordinal2 = lower_32_bits(ib); - packet->ib_base_hi = upper_32_bits(ib); - - return 0; -} - -static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, - struct queue *q, bool is_static) -{ - struct pm4_mes_map_queues *packet; - bool use_static = is_static; - - packet = (struct pm4_mes_map_queues *)buffer; - memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); - - packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, - sizeof(struct pm4_mes_map_queues)); - packet->bitfields2.alloc_format = - alloc_format__mes_map_queues__one_per_pipe_vi; - packet->bitfields2.num_queues = 1; - packet->bitfields2.queue_sel = - queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; - - packet->bitfields2.engine_sel = - engine_sel__mes_map_queues__compute_vi; - packet->bitfields2.queue_type = - queue_type__mes_map_queues__normal_compute_vi; - - switch (q->properties.type) { - case KFD_QUEUE_TYPE_COMPUTE: - if (use_static) - packet->bitfields2.queue_type = - queue_type__mes_map_queues__normal_latency_static_queue_vi; - break; - case KFD_QUEUE_TYPE_DIQ: - packet->bitfields2.queue_type = - queue_type__mes_map_queues__debug_interface_queue_vi; - break; - case KFD_QUEUE_TYPE_SDMA: - packet->bitfields2.engine_sel = q->properties.sdma_engine_id + - engine_sel__mes_map_queues__sdma0_vi; - use_static = false; /* no static queues under SDMA */ - break; - default: - WARN(1, "queue type %d", q->properties.type); - return -EINVAL; - } - packet->bitfields3.doorbell_offset = - q->properties.doorbell_off; - - packet->mqd_addr_lo = - lower_32_bits(q->gart_mqd_addr); - - packet->mqd_addr_hi = - upper_32_bits(q->gart_mqd_addr); - - packet->wptr_addr_lo = - lower_32_bits((uint64_t)q->properties.write_ptr); - - packet->wptr_addr_hi = - upper_32_bits((uint64_t)q->properties.write_ptr); - - return 0; -} - -static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer, - enum kfd_queue_type type, - enum kfd_unmap_queues_filter filter, - uint32_t filter_param, bool reset, - unsigned int sdma_engine) -{ - struct pm4_mes_unmap_queues *packet; - - packet = (struct pm4_mes_unmap_queues *)buffer; - memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); - - packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, - sizeof(struct pm4_mes_unmap_queues)); - switch (type) { - case KFD_QUEUE_TYPE_COMPUTE: - case KFD_QUEUE_TYPE_DIQ: - packet->bitfields2.engine_sel = - engine_sel__mes_unmap_queues__compute; - break; - case KFD_QUEUE_TYPE_SDMA: - packet->bitfields2.engine_sel = - engine_sel__mes_unmap_queues__sdma0 + sdma_engine; - break; - default: - WARN(1, "queue type %d", type); - return -EINVAL; - } - - if (reset) - packet->bitfields2.action = - action__mes_unmap_queues__reset_queues; - else - packet->bitfields2.action = - action__mes_unmap_queues__preempt_queues; - - switch (filter) { - case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: - packet->bitfields2.queue_sel = - queue_sel__mes_unmap_queues__perform_request_on_specified_queues; - packet->bitfields2.num_queues = 1; - packet->bitfields3b.doorbell_offset0 = filter_param; - break; - case KFD_UNMAP_QUEUES_FILTER_BY_PASID: - packet->bitfields2.queue_sel = - queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; - packet->bitfields3a.pasid = filter_param; - break; - case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: - packet->bitfields2.queue_sel = - queue_sel__mes_unmap_queues__unmap_all_queues; - break; - case 
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: - /* in this case, we do not preempt static queues */ - packet->bitfields2.queue_sel = - queue_sel__mes_unmap_queues__unmap_all_non_static_queues; - break; - default: - WARN(1, "filter %d", filter); - return -EINVAL; - } - - return 0; - -} - -static int pm_query_status_v9(struct packet_manager *pm, uint32_t *buffer, - uint64_t fence_address, uint32_t fence_value) -{ - struct pm4_mes_query_status *packet; - - packet = (struct pm4_mes_query_status *)buffer; - memset(buffer, 0, sizeof(struct pm4_mes_query_status)); - - - packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, - sizeof(struct pm4_mes_query_status)); - - packet->bitfields2.context_id = 0; - packet->bitfields2.interrupt_sel = - interrupt_sel__mes_query_status__completion_status; - packet->bitfields2.command = - command__mes_query_status__fence_only_after_write_ack; - - packet->addr_hi = upper_32_bits((uint64_t)fence_address); - packet->addr_lo = lower_32_bits((uint64_t)fence_address); - packet->data_hi = upper_32_bits((uint64_t)fence_value); - packet->data_lo = lower_32_bits((uint64_t)fence_value); - - return 0; -} - - -static uint32_t pm_release_mem_v9(uint64_t gpu_addr, uint32_t *buffer) -{ - struct pm4_mec_release_mem *packet; - - packet = (struct pm4_mec_release_mem *)buffer; - memset(buffer, 0, sizeof(struct pm4_mec_release_mem)); - - packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, - sizeof(struct pm4_mec_release_mem)); - - packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; - packet->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe; - packet->bitfields2.tcl1_action_ena = 1; - packet->bitfields2.tc_action_ena = 1; - packet->bitfields2.cache_policy = cache_policy__mec_release_mem__lru; - - packet->bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low; - packet->bitfields3.int_sel = - int_sel__mec_release_mem__send_interrupt_after_write_confirm; - - packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; - packet->address_hi = upper_32_bits(gpu_addr); - - packet->data_lo = 0; - - return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int); -} - -static uint32_t pm_get_map_process_packet_size_v9(void) -{ - return sizeof(struct pm4_mes_map_process); -} - -static uint32_t pm_get_runlist_packet_size_v9(void) -{ - return sizeof(struct pm4_mes_runlist); -} - -static uint32_t pm_get_map_queues_packet_size_v9(void) -{ - return sizeof(struct pm4_mes_map_queues); -} - -static uint32_t pm_get_unmap_queues_packet_size_v9(void) -{ - return sizeof(struct pm4_mes_unmap_queues); -} - -static uint32_t pm_get_query_status_packet_size_v9(void) -{ - return sizeof(struct pm4_mes_query_status); -} - -static uint32_t pm_get_release_mem_packet_size_v9(void) -{ - return sizeof(struct pm4_mec_release_mem); -} - -static struct packet_manager_funcs kfd_v9_pm_funcs = { - .map_process = pm_map_process_v9, - .runlist = pm_runlist_v9, - .set_resources = pm_set_resources_vi, - .map_queues = pm_map_queues_v9, - .unmap_queues = pm_unmap_queues_v9, - .query_status = pm_query_status_v9, - .release_mem = pm_release_mem_v9, - .get_map_process_packet_size = pm_get_map_process_packet_size_v9, - .get_runlist_packet_size = pm_get_runlist_packet_size_v9, - .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, - .get_map_queues_packet_size = pm_get_map_queues_packet_size_v9, - .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_v9, - .get_query_status_packet_size = pm_get_query_status_packet_size_v9, - .get_release_mem_packet_size = 
pm_get_release_mem_packet_size_v9, -}; - -void kfd_pm_func_init_v9(struct packet_manager *pm, uint16_t fw_ver) -{ - pm->pmf = &kfd_v9_pm_funcs; -} - diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c index 9022ecb..f1d4828 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c @@ -22,20 +22,15 @@ */ #include "kfd_kernel_queue.h" -#include "kfd_device_queue_manager.h" -#include "kfd_pm4_headers_vi.h" -#include "kfd_pm4_opcodes.h" static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, enum kfd_queue_type type, unsigned int queue_size); static void uninitialize_vi(struct kernel_queue *kq); -static void submit_packet_vi(struct kernel_queue *kq); void kernel_queue_init_vi(struct kernel_queue_ops *ops) { ops->initialize = initialize_vi; ops->uninitialize = uninitialize_vi; - ops->submit_packet = submit_packet_vi; } static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, @@ -59,359 +54,3 @@ static void uninitialize_vi(struct kernel_queue *kq) { kfd_gtt_sa_free(kq->dev, kq->eop_mem); } - -static void submit_packet_vi(struct kernel_queue *kq) -{ - *kq->wptr_kernel = kq->pending_wptr; - write_kernel_doorbell(kq->queue->properties.doorbell_ptr, - kq->pending_wptr); -} - -static int pm_map_process_vi(struct packet_manager *pm, - uint32_t *buffer, struct qcm_process_device *qpd) -{ - struct pm4_mes_map_process *packet; - - packet = (struct pm4_mes_map_process *)buffer; - memset(buffer, 0, sizeof(struct pm4_mes_map_process)); - - packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, - sizeof(struct pm4_mes_map_process)); - packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; - packet->bitfields2.process_quantum = 1; - packet->bitfields2.pasid = qpd->pqm->process->pasid; - packet->bitfields3.page_table_base = qpd->page_table_base; - packet->bitfields10.gds_size = qpd->gds_size; - packet->bitfields10.num_gws = qpd->num_gws; - packet->bitfields10.num_oac = qpd->num_oac; - packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; - - packet->sh_mem_config = qpd->sh_mem_config; - packet->sh_mem_bases = qpd->sh_mem_bases; - packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; - packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; - - packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; - - packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); - packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); - - return 0; -} - - -unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size) -{ - union PM4_MES_TYPE_3_HEADER header; - - header.u32All = 0; - header.opcode = opcode; - header.count = packet_size / 4 - 2; - header.type = PM4_TYPE_3; - - return header.u32All; -} - -int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer, - uint64_t ib, size_t ib_size_in_dwords, bool chain) -{ - struct pm4_mes_runlist *packet; - - int concurrent_proc_cnt = 0; - struct kfd_dev *kfd = pm->dqm->dev; - - /* Determine the number of processes to map together to HW: - * it can not exceed the number of VMIDs available to the - * scheduler, and it is determined by the smaller of the number - * of processes in the runlist and kfd module parameter - * hws_max_conc_proc. - * Note: the arbitration between the number of VMIDs and - * hws_max_conc_proc has been done in - * kgd2kfd_device_init(). 
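
All of these packet builders lean on pm_build_pm4_header() above, which packs the packet type, payload length and opcode into one dword. A re-derivation assuming the usual PM4 type-3 bit layout (type in bits 31:30, count in 29:16, opcode in 15:8, where "count" is the number of payload dwords minus one):

    #include <stddef.h>
    #include <stdint.h>

    static uint32_t demo_pm4_header(uint8_t opcode, size_t packet_size_bytes)
    {
            /* packet_size includes the header dword, hence the extra -1. */
            uint32_t count = (uint32_t)(packet_size_bytes / 4 - 2);

            return (3u << 30) | ((count & 0x3fffu) << 16) |
                   ((uint32_t)opcode << 8);
    }
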
- */ - concurrent_proc_cnt = min(pm->dqm->processes_count, - kfd->max_proc_per_quantum); - - - packet = (struct pm4_mes_runlist *)buffer; - - memset(buffer, 0, sizeof(struct pm4_mes_runlist)); - packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, - sizeof(struct pm4_mes_runlist)); - - packet->bitfields4.ib_size = ib_size_in_dwords; - packet->bitfields4.chain = chain ? 1 : 0; - packet->bitfields4.offload_polling = 0; - packet->bitfields4.valid = 1; - packet->bitfields4.process_cnt = concurrent_proc_cnt; - packet->ordinal2 = lower_32_bits(ib); - packet->bitfields3.ib_base_hi = upper_32_bits(ib); - - return 0; -} - -int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, - struct queue *q, bool is_static) -{ - struct pm4_mes_map_queues *packet; - bool use_static = is_static; - - packet = (struct pm4_mes_map_queues *)buffer; - memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); - - packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, - sizeof(struct pm4_mes_map_queues)); - packet->bitfields2.alloc_format = - alloc_format__mes_map_queues__one_per_pipe_vi; - packet->bitfields2.num_queues = 1; - packet->bitfields2.queue_sel = - queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; - - packet->bitfields2.engine_sel = - engine_sel__mes_map_queues__compute_vi; - packet->bitfields2.queue_type = - queue_type__mes_map_queues__normal_compute_vi; - - switch (q->properties.type) { - case KFD_QUEUE_TYPE_COMPUTE: - if (use_static) - packet->bitfields2.queue_type = - queue_type__mes_map_queues__normal_latency_static_queue_vi; - break; - case KFD_QUEUE_TYPE_DIQ: - packet->bitfields2.queue_type = - queue_type__mes_map_queues__debug_interface_queue_vi; - break; - case KFD_QUEUE_TYPE_SDMA: - packet->bitfields2.engine_sel = q->properties.sdma_engine_id + - engine_sel__mes_map_queues__sdma0_vi; - use_static = false; /* no static queues under SDMA */ - break; - default: - WARN(1, "queue type %d", q->properties.type); - return -EINVAL; - } - packet->bitfields3.doorbell_offset = - q->properties.doorbell_off; - - packet->mqd_addr_lo = - lower_32_bits(q->gart_mqd_addr); - - packet->mqd_addr_hi = - upper_32_bits(q->gart_mqd_addr); - - packet->wptr_addr_lo = - lower_32_bits((uint64_t)q->properties.write_ptr); - - packet->wptr_addr_hi = - upper_32_bits((uint64_t)q->properties.write_ptr); - - return 0; -} - -int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, - struct scheduling_resources *res) -{ - struct pm4_mes_set_resources *packet; - - packet = (struct pm4_mes_set_resources *)buffer; - memset(buffer, 0, sizeof(struct pm4_mes_set_resources)); - - packet->header.u32All = pm_build_pm4_header(IT_SET_RESOURCES, - sizeof(struct pm4_mes_set_resources)); - - packet->bitfields2.queue_type = - queue_type__mes_set_resources__hsa_interface_queue_hiq; - packet->bitfields2.vmid_mask = res->vmid_mask; - packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100; - packet->bitfields7.oac_mask = res->oac_mask; - packet->bitfields8.gds_heap_base = res->gds_heap_base; - packet->bitfields8.gds_heap_size = res->gds_heap_size; - - packet->gws_mask_lo = lower_32_bits(res->gws_mask); - packet->gws_mask_hi = upper_32_bits(res->gws_mask); - - packet->queue_mask_lo = lower_32_bits(res->queue_mask); - packet->queue_mask_hi = upper_32_bits(res->queue_mask); - - return 0; -} - -int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, - enum kfd_queue_type type, - enum kfd_unmap_queues_filter filter, - uint32_t filter_param, bool reset, - unsigned int sdma_engine) -{ - struct 
pm4_mes_unmap_queues *packet; - - packet = (struct pm4_mes_unmap_queues *)buffer; - memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); - - packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, - sizeof(struct pm4_mes_unmap_queues)); - switch (type) { - case KFD_QUEUE_TYPE_COMPUTE: - case KFD_QUEUE_TYPE_DIQ: - packet->bitfields2.engine_sel = - engine_sel__mes_unmap_queues__compute; - break; - case KFD_QUEUE_TYPE_SDMA: - packet->bitfields2.engine_sel = - engine_sel__mes_unmap_queues__sdma0 + sdma_engine; - break; - default: - WARN(1, "queue type %d", type); - return -EINVAL; - } - - if (reset) - packet->bitfields2.action = - action__mes_unmap_queues__reset_queues; - else - packet->bitfields2.action = - action__mes_unmap_queues__preempt_queues; - - switch (filter) { - case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: - packet->bitfields2.queue_sel = - queue_sel__mes_unmap_queues__perform_request_on_specified_queues; - packet->bitfields2.num_queues = 1; - packet->bitfields3b.doorbell_offset0 = filter_param; - break; - case KFD_UNMAP_QUEUES_FILTER_BY_PASID: - packet->bitfields2.queue_sel = - queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; - packet->bitfields3a.pasid = filter_param; - break; - case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: - packet->bitfields2.queue_sel = - queue_sel__mes_unmap_queues__unmap_all_queues; - break; - case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: - /* in this case, we do not preempt static queues */ - packet->bitfields2.queue_sel = - queue_sel__mes_unmap_queues__unmap_all_non_static_queues; - break; - default: - WARN(1, "filter %d", filter); - return -EINVAL; - } - - return 0; - -} - -int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer, - uint64_t fence_address, uint32_t fence_value) -{ - struct pm4_mes_query_status *packet; - - packet = (struct pm4_mes_query_status *)buffer; - memset(buffer, 0, sizeof(struct pm4_mes_query_status)); - - - packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, - sizeof(struct pm4_mes_query_status)); - - packet->bitfields2.context_id = 0; - packet->bitfields2.interrupt_sel = - interrupt_sel__mes_query_status__completion_status; - packet->bitfields2.command = - command__mes_query_status__fence_only_after_write_ack; - - packet->addr_hi = upper_32_bits((uint64_t)fence_address); - packet->addr_lo = lower_32_bits((uint64_t)fence_address); - packet->data_hi = upper_32_bits((uint64_t)fence_value); - packet->data_lo = lower_32_bits((uint64_t)fence_value); - - return 0; -} - - -uint32_t pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer) -{ - struct pm4_mec_release_mem *packet; - - packet = (struct pm4_mec_release_mem *)buffer; - memset(buffer, 0, sizeof(struct pm4_mec_release_mem)); - - packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, - sizeof(struct pm4_mec_release_mem)); - - packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; - packet->bitfields2.event_index = event_index___release_mem__end_of_pipe; - packet->bitfields2.tcl1_action_ena = 1; - packet->bitfields2.tc_action_ena = 1; - packet->bitfields2.cache_policy = cache_policy___release_mem__lru; - packet->bitfields2.atc = 0; - - packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low; - packet->bitfields3.int_sel = - int_sel___release_mem__send_interrupt_after_write_confirm; - - packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; - packet->address_hi = upper_32_bits(gpu_addr); - - packet->data_lo = 0; - - return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int); -} - -uint32_t 
pm_get_map_process_packet_size_vi(void) -{ - return sizeof(struct pm4_mes_map_process); -} - -uint32_t pm_get_runlist_packet_size_vi(void) -{ - return sizeof(struct pm4_mes_runlist); -} - -uint32_t pm_get_set_resources_packet_size_vi(void) -{ - return sizeof(struct pm4_mes_set_resources); -} - -uint32_t pm_get_map_queues_packet_size_vi(void) -{ - return sizeof(struct pm4_mes_map_queues); -} - -uint32_t pm_get_unmap_queues_packet_size_vi(void) -{ - return sizeof(struct pm4_mes_unmap_queues); -} - -uint32_t pm_get_query_status_packet_size_vi(void) -{ - return sizeof(struct pm4_mes_query_status); -} - -uint32_t pm_get_release_mem_packet_size_vi(void) -{ - return sizeof(struct pm4_mec_release_mem); -} - - -static struct packet_manager_funcs kfd_vi_pm_funcs = { - .map_process = pm_map_process_vi, - .runlist = pm_runlist_vi, - .set_resources = pm_set_resources_vi, - .map_queues = pm_map_queues_vi, - .unmap_queues = pm_unmap_queues_vi, - .query_status = pm_query_status_vi, - .release_mem = pm_release_mem_vi, - .get_map_process_packet_size = pm_get_map_process_packet_size_vi, - .get_runlist_packet_size = pm_get_runlist_packet_size_vi, - .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, - .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi, - .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi, - .get_query_status_packet_size = pm_get_query_status_packet_size_vi, - .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi, -}; - -void kfd_pm_func_init_vi(struct packet_manager *pm, uint16_t fw_ver) -{ - pm->pmf = &kfd_vi_pm_funcs; -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c index b6f9d23..0d73bea 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c @@ -29,10 +29,10 @@ #define KFD_DRIVER_AUTHOR "AMD Inc. 
and others" #define KFD_DRIVER_DESC "Standalone HSA driver for AMD's GPUs" -#define KFD_DRIVER_DATE "20160408" -#define KFD_DRIVER_MAJOR 2 -#define KFD_DRIVER_MINOR 0 -#define KFD_DRIVER_PATCHLEVEL 0 +#define KFD_DRIVER_DATE "20150421" +#define KFD_DRIVER_MAJOR 0 +#define KFD_DRIVER_MINOR 7 +#define KFD_DRIVER_PATCHLEVEL 2 static const struct kgd2kfd_calls kgd2kfd = { .exit = kgd2kfd_exit, @@ -42,12 +42,6 @@ static const struct kgd2kfd_calls kgd2kfd = { .interrupt = kgd2kfd_interrupt, .suspend = kgd2kfd_suspend, .resume = kgd2kfd_resume, - .quiesce_mm = kgd2kfd_quiesce_mm, - .resume_mm = kgd2kfd_resume_mm, - .schedule_evict_and_restore_process = - kgd2kfd_schedule_evict_and_restore_process, - .pre_reset = kgd2kfd_pre_reset, - .post_reset = kgd2kfd_post_reset, }; int sched_policy = KFD_SCHED_POLICY_HWS; @@ -55,15 +49,6 @@ module_param(sched_policy, int, 0444); MODULE_PARM_DESC(sched_policy, "Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)"); -int hws_max_conc_proc = 8; -module_param(hws_max_conc_proc, int, 0444); -MODULE_PARM_DESC(hws_max_conc_proc, - "Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency, #VMIDs for KFD = Maximum(default))"); - -int cwsr_enable = 1; -module_param(cwsr_enable, int, 0444); -MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))"); - int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT; module_param(max_num_of_queues_per_device, int, 0444); MODULE_PARM_DESC(max_num_of_queues_per_device, @@ -76,33 +61,6 @@ MODULE_PARM_DESC(send_sigterm, static int amdkfd_init_completed; -int debug_largebar; -module_param(debug_largebar, int, 0444); -MODULE_PARM_DESC(debug_largebar, - "Debug large-bar flag used to simulate large-bar capability on non-large bar machine (0 = disable, 1 = enable)"); - -int ignore_crat; -module_param(ignore_crat, int, 0444); -MODULE_PARM_DESC(ignore_crat, - "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)"); - -int vega10_noretry; -module_param_named(noretry, vega10_noretry, int, 0644); -MODULE_PARM_DESC(noretry, - "Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)"); - -int priv_cp_queues; -module_param(priv_cp_queues, int, 0644); -MODULE_PARM_DESC(priv_cp_queues, - "Enable privileged mode for CP queues (0 = off (default), 1 = on)"); - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 10, 0) && defined(BUILD_AS_DKMS) -int cma_enable; -module_param(cma_enable, int, 0644); -MODULE_PARM_DESC(cma_enable, - "Enable CMA (1 = enable, 0 = disable (default)). Warning! 
relaxed access check"); -#endif - int kgd2kfd_init(unsigned int interface_version, const struct kgd2kfd_calls **g2f) { @@ -145,6 +103,10 @@ static int __init kfd_module_init(void) return -1; } + err = kfd_pasid_init(); + if (err < 0) + return err; + err = kfd_chardev_init(); if (err < 0) goto err_ioctl; @@ -153,16 +115,8 @@ static int __init kfd_module_init(void) if (err < 0) goto err_topology; - err = kfd_ipc_init(); - if (err < 0) - goto err_topology; - kfd_process_create_wq(); - kfd_init_peer_direct(); - - kfd_debugfs_init(); - amdkfd_init_completed = 1; dev_info(kfd_device, "Initialized module\n"); @@ -172,6 +126,7 @@ static int __init kfd_module_init(void) err_topology: kfd_chardev_exit(); err_ioctl: + kfd_pasid_exit(); return err; } @@ -179,11 +134,10 @@ static void __exit kfd_module_exit(void) { amdkfd_init_completed = 0; - kfd_debugfs_fini(); - kfd_close_peer_direct(); kfd_process_destroy_wq(); kfd_topology_shutdown(); kfd_chardev_exit(); + kfd_pasid_exit(); dev_info(kfd_device, "Removed module\n"); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c index 8279b74..b1ef136 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -23,69 +23,14 @@ #include "kfd_priv.h" -/* Mapping queue priority to pipe priority, indexed by queue priority */ -int pipe_priority_map[] = { - KFD_PIPE_PRIORITY_CS_LOW, - KFD_PIPE_PRIORITY_CS_LOW, - KFD_PIPE_PRIORITY_CS_LOW, - KFD_PIPE_PRIORITY_CS_LOW, - KFD_PIPE_PRIORITY_CS_LOW, - KFD_PIPE_PRIORITY_CS_LOW, - KFD_PIPE_PRIORITY_CS_LOW, - KFD_PIPE_PRIORITY_CS_MEDIUM, - KFD_PIPE_PRIORITY_CS_MEDIUM, - KFD_PIPE_PRIORITY_CS_MEDIUM, - KFD_PIPE_PRIORITY_CS_MEDIUM, - KFD_PIPE_PRIORITY_CS_HIGH, - KFD_PIPE_PRIORITY_CS_HIGH, - KFD_PIPE_PRIORITY_CS_HIGH, - KFD_PIPE_PRIORITY_CS_HIGH, - KFD_PIPE_PRIORITY_CS_HIGH -}; - -/* Mapping queue priority to SPI priority, indexed by queue priority - * SPI priority 2 and 3 are reserved for trap handler context save - */ -int spi_priority_map[] = { - KFD_SPI_PRIORITY_EXTRA_LOW, - KFD_SPI_PRIORITY_EXTRA_LOW, - KFD_SPI_PRIORITY_EXTRA_LOW, - KFD_SPI_PRIORITY_EXTRA_LOW, - KFD_SPI_PRIORITY_EXTRA_LOW, - KFD_SPI_PRIORITY_EXTRA_LOW, - KFD_SPI_PRIORITY_EXTRA_LOW, - KFD_SPI_PRIORITY_EXTRA_LOW, - KFD_SPI_PRIORITY_LOW, - KFD_SPI_PRIORITY_LOW, - KFD_SPI_PRIORITY_LOW, - KFD_SPI_PRIORITY_LOW, - KFD_SPI_PRIORITY_LOW, - KFD_SPI_PRIORITY_LOW, - KFD_SPI_PRIORITY_LOW, - KFD_SPI_PRIORITY_LOW -}; - struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, struct kfd_dev *dev) { switch (dev->device_info->asic_family) { case CHIP_KAVERI: return mqd_manager_init_cik(type, dev); - case CHIP_HAWAII: - return mqd_manager_init_cik_hawaii(type, dev); case CHIP_CARRIZO: return mqd_manager_init_vi(type, dev); - case CHIP_TONGA: - case CHIP_FIJI: - case CHIP_POLARIS10: - case CHIP_POLARIS11: - return mqd_manager_init_vi_tonga(type, dev); - case CHIP_VEGA10: - case CHIP_RAVEN: - return mqd_manager_init_v9(type, dev); - default: - WARN(1, "Unexpected ASIC family %u", - dev->device_info->asic_family); } return NULL; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h index dcaeda8..1f3a6ba 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h @@ -43,9 +43,6 @@ * * @is_occupied: Checks if the relevant HQD slot is occupied. 
* - * @get_wave_state: Retrieves context save state and optionally copies the - * control stack, if kept in the MQD, to the given userspace address. - * * @mqd_mutex: Mqd manager mutex. * * @dev: The kfd device structure coupled with this module. @@ -62,8 +59,7 @@ * per KFD_MQD_TYPE for each device. * */ -extern int pipe_priority_map[]; -extern int spi_priority_map[]; + struct mqd_manager { int (*init_mqd)(struct mqd_manager *mm, void **mqd, struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, @@ -89,15 +85,6 @@ struct mqd_manager { uint64_t queue_address, uint32_t pipe_id, uint32_t queue_id); - int (*get_wave_state)(struct mqd_manager *mm, void *mqd, - void __user *ctl_stack, - u32 *ctl_stack_used_size, - u32 *save_area_used_size); - -#if defined(CONFIG_DEBUG_FS) - int (*debugfs_show_mqd)(struct seq_file *m, void *data); -#endif - struct mutex mqd_mutex; struct kfd_dev *dev; }; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c index 602da80..44ffd23 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c @@ -30,80 +30,12 @@ #include "cik_regs.h" #include "cik_structs.h" #include "oss/oss_2_4_sh_mask.h" -#include "gca/gfx_7_2_sh_mask.h" static inline struct cik_mqd *get_mqd(void *mqd) { return (struct cik_mqd *)mqd; } -static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) -{ - return (struct cik_sdma_rlc_registers *)mqd; -} - -static void update_cu_mask(struct mqd_manager *mm, void *mqd, - struct queue_properties *q) -{ - struct cik_mqd *m; - struct kfd_cu_info cu_info; - uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ - uint32_t cu_mask_count = q->cu_mask_count; - const uint32_t *cu_mask = q->cu_mask; - int se, cu_per_sh, cu_index, i; - - if (cu_mask_count == 0) - return; - - m = get_mqd(mqd); - m->compute_static_thread_mgmt_se0 = 0; - m->compute_static_thread_mgmt_se1 = 0; - m->compute_static_thread_mgmt_se2 = 0; - m->compute_static_thread_mgmt_se3 = 0; - - mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); - - /* If # CU mask bits > # CUs, set it to the # of CUs */ - if (cu_mask_count > cu_info.cu_active_number) - cu_mask_count = cu_info.cu_active_number; - - cu_index = 0; - for (se = 0; se < cu_info.num_shader_engines; se++) { - cu_per_sh = 0; - - /* Get the number of CUs on this Shader Engine */ - for (i = 0; i < 4; i++) - cu_per_sh += hweight32(cu_info.cu_bitmap[se][i]); - - se_mask[se] = cu_mask[cu_index / 32] >> (cu_index % 32); - if ((cu_per_sh + (cu_index % 32)) > 32) - se_mask[se] |= cu_mask[(cu_index / 32) + 1] - << (32 - (cu_index % 32)); - se_mask[se] &= (1 << cu_per_sh) - 1; - cu_index += cu_per_sh; - } - m->compute_static_thread_mgmt_se0 = se_mask[0]; - m->compute_static_thread_mgmt_se1 = se_mask[1]; - m->compute_static_thread_mgmt_se2 = se_mask[2]; - m->compute_static_thread_mgmt_se3 = se_mask[3]; - - pr_debug("Update cu mask to %#x %#x %#x %#x\n", - m->compute_static_thread_mgmt_se0, - m->compute_static_thread_mgmt_se1, - m->compute_static_thread_mgmt_se2, - m->compute_static_thread_mgmt_se3); -} - -static void set_priority(struct cik_mqd *m, struct queue_properties *q) -{ - m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; - m->cp_hqd_queue_priority = q->priority; - m->compute_pgm_rsrc1 = (m->compute_pgm_rsrc1 & - (~COMPUTE_PGM_RSRC1__PRIORITY_MASK)) | - (spi_priority_map[q->priority] << - COMPUTE_PGM_RSRC1__PRIORITY__SHIFT); -} - static int init_mqd(struct mqd_manager *mm, void **mqd, struct kfd_mem_obj 
**mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) @@ -142,6 +74,10 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, m->cp_mqd_base_addr_lo = lower_32_bits(addr); m->cp_mqd_base_addr_hi = upper_32_bits(addr); + m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE | IB_ATC_EN; + /* Although WinKFD writes this, I suspect it should not be necessary */ + m->cp_hqd_ib_control = IB_ATC_EN | DEFAULT_MIN_IB_AVAIL_SIZE; + m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS | QUANTUM_DURATION(10); @@ -154,15 +90,12 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, * 1 = CS_MEDIUM (typically between HP3D and GFX * 2 = CS_HIGH (typically above HP3D) */ - set_priority(m, q); + m->cp_hqd_pipe_priority = 1; + m->cp_hqd_queue_priority = 15; if (q->format == KFD_QUEUE_FORMAT_AQL) m->cp_hqd_iq_rptr = AQL_ENABLE; - if (priv_cp_queues) - m->cp_hqd_pq_control |= - 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT; - *mqd = m; if (gart_addr) *gart_addr = addr; @@ -216,7 +149,7 @@ static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, { /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0); - uint32_t wptr_mask = (uint32_t)((p->queue_size / 4) - 1); + uint32_t wptr_mask = (uint32_t)((p->queue_size / sizeof(uint32_t)) - 1); return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, (uint32_t __user *)p->write_ptr, @@ -227,30 +160,24 @@ static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, uint32_t queue_id, struct queue_properties *p, struct mm_struct *mms) { - return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, - (uint32_t __user *)p->write_ptr, - mms); + return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd); } -static int __update_mqd(struct mqd_manager *mm, void *mqd, - struct queue_properties *q, unsigned int atc_bit) +static int update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) { struct cik_mqd *m; m = get_mqd(mqd); m->cp_hqd_pq_control = DEFAULT_RPTR_BLOCK_SIZE | - DEFAULT_MIN_AVAIL_SIZE; - m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE; - if (atc_bit) { - m->cp_hqd_pq_control |= PQ_ATC_EN; - m->cp_hqd_ib_control |= IB_ATC_EN; - } + DEFAULT_MIN_AVAIL_SIZE | PQ_ATC_EN; /* * Calculating queue size which is log base 2 of actual queue size -1 * dwords and another -1 for ffs */ - m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; + m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int)) + - 1 - 1; m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); @@ -262,37 +189,24 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, if (q->format == KFD_QUEUE_FORMAT_AQL) m->cp_hqd_pq_control |= NO_UPDATE_RPTR; - update_cu_mask(mm, mqd, q); - set_priority(m, q); - - q->is_active = (q->queue_size > 0 && + q->is_active = false; + if (q->queue_size > 0 && q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted); + q->queue_percent > 0) { + q->is_active = true; + } return 0; } -static int update_mqd(struct mqd_manager *mm, void *mqd, - struct queue_properties *q) -{ - return __update_mqd(mm, mqd, q, 1); -} - -static int update_mqd_hawaii(struct mqd_manager *mm, void *mqd, - struct queue_properties *q) -{ - return __update_mqd(mm, mqd, q, 0); -} - static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { 
struct cik_sdma_rlc_registers *m; m = get_sdma_mqd(mqd); - m->sdma_rlc_rb_cntl = order_base_2(q->queue_size / 4) - << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | + m->sdma_rlc_rb_cntl = ffs(q->queue_size / sizeof(unsigned int)) << + SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; @@ -301,18 +215,24 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, m->sdma_rlc_rb_base_hi = upper_32_bits(q->queue_address >> 8); m->sdma_rlc_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); m->sdma_rlc_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); - m->sdma_rlc_doorbell = - q->doorbell_off << SDMA0_RLC0_DOORBELL__OFFSET__SHIFT; + m->sdma_rlc_doorbell = q->doorbell_off << + SDMA0_RLC0_DOORBELL__OFFSET__SHIFT | + 1 << SDMA0_RLC0_DOORBELL__ENABLE__SHIFT; m->sdma_rlc_virtual_addr = q->sdma_vm_addr; m->sdma_engine_id = q->sdma_engine_id; m->sdma_queue_id = q->sdma_queue_id; - q->is_active = (q->queue_size > 0 && + q->is_active = false; + if (q->queue_size > 0 && q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted); + q->queue_percent > 0) { + m->sdma_rlc_rb_cntl |= + 1 << SDMA0_RLC0_RB_CNTL__RB_ENABLE__SHIFT; + + q->is_active = true; + } return 0; } @@ -407,7 +327,8 @@ static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, * 1 = CS_MEDIUM (typically between HP3D and GFX * 2 = CS_HIGH (typically above HP3D) */ - set_priority(m, q); + m->cp_hqd_pipe_priority = 1; + m->cp_hqd_queue_priority = 15; *mqd = m; if (gart_addr) @@ -432,42 +353,37 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, * Calculating queue size which is log base 2 of actual queue * size -1 dwords */ - m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; + m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int)) + - 1 - 1; m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); - m->cp_hqd_pq_doorbell_control = DOORBELL_OFFSET(q->doorbell_off); + m->cp_hqd_pq_doorbell_control = DOORBELL_EN | + DOORBELL_OFFSET(q->doorbell_off); m->cp_hqd_vmid = q->vmid; - q->is_active = (q->queue_size > 0 && + m->cp_hqd_active = 0; + q->is_active = false; + if (q->queue_size > 0 && q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted); - - set_priority(m, q); - return 0; -} - -#if defined(CONFIG_DEBUG_FS) + q->queue_percent > 0) { + m->cp_hqd_active = 1; + q->is_active = true; + } -static int debugfs_show_mqd(struct seq_file *m, void *data) -{ - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct cik_mqd), false); return 0; } -static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) +struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) { - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct cik_sdma_rlc_registers), false); - return 0; -} + struct cik_sdma_rlc_registers *m; -#endif + m = (struct cik_sdma_rlc_registers *)mqd; + return m; +} struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, struct kfd_dev *dev) @@ -477,7 +393,7 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) return NULL; - mqd = kzalloc(sizeof(*mqd), GFP_NOIO); + mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); if (!mqd) return NULL; 
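[Aside, illustrative only and not part of the diff: throughout these mqd-manager files the revert replaces the new stack's order_base_2(size / 4) - 1 with the old stack's ffs(size / sizeof(unsigned int)) - 1 - 1 when programming the RB_SIZE-style ring fields. For the power-of-two queue sizes the hardware requires, the two forms program the same value. A minimal sketch of why, with rb_size_field() being a hypothetical helper name:]

#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/log2.h>
#include <linux/types.h>

static unsigned int rb_size_field(uint32_t queue_size_bytes)
{
	uint32_t dwords = queue_size_bytes / sizeof(uint32_t);

	/*
	 * ffs() is 1-based, so for a power-of-two dword count
	 * ffs(dwords) - 1 == order_base_2(dwords) == log2(dwords).
	 * The old stack's "ffs(...) - 1 - 1" and the new stack's
	 * "order_base_2(...) - 1" therefore encode the same field
	 * value; they differ only for non-power-of-two sizes, which
	 * the CP ring hardware does not accept anyway.
	 */
	WARN_ON(!is_power_of_2(dwords));

	return ffs(dwords) - 1 - 1;
}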
@@ -492,9 +408,6 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; -#if defined(CONFIG_DEBUG_FS) - mqd->debugfs_show_mqd = debugfs_show_mqd; -#endif break; case KFD_MQD_TYPE_HIQ: mqd->init_mqd = init_mqd_hiq; @@ -503,9 +416,6 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd_hiq; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; -#if defined(CONFIG_DEBUG_FS) - mqd->debugfs_show_mqd = debugfs_show_mqd; -#endif break; case KFD_MQD_TYPE_SDMA: mqd->init_mqd = init_mqd_sdma; @@ -514,9 +424,6 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd_sdma; mqd->destroy_mqd = destroy_mqd_sdma; mqd->is_occupied = is_occupied_sdma; -#if defined(CONFIG_DEBUG_FS) - mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; -#endif break; default: kfree(mqd); @@ -526,15 +433,3 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, return mqd; } -struct mqd_manager *mqd_manager_init_cik_hawaii(enum KFD_MQD_TYPE type, - struct kfd_dev *dev) -{ - struct mqd_manager *mqd; - - mqd = mqd_manager_init_cik(type, dev); - if (!mqd) - return NULL; - if ((type == KFD_MQD_TYPE_CP) || (type == KFD_MQD_TYPE_COMPUTE)) - mqd->update_mqd = update_mqd_hawaii; - return mqd; -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c deleted file mode 100644 index 25a20e1..0000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ /dev/null @@ -1,524 +0,0 @@ -/* - * Copyright 2016 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- * - */ - -#include <linux/printk.h> -#include <linux/slab.h> -#include <linux/uaccess.h> -#include "kfd_priv.h" -#include "kfd_mqd_manager.h" -#include "v9_structs.h" -#include "gc/gc_9_0_offset.h" -#include "gc/gc_9_0_sh_mask.h" -#include "sdma0/sdma0_4_0_sh_mask.h" - -static inline struct v9_mqd *get_mqd(void *mqd) -{ - return (struct v9_mqd *)mqd; -} - -static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) -{ - return (struct v9_sdma_mqd *)mqd; -} - -static void update_cu_mask(struct mqd_manager *mm, void *mqd, - struct queue_properties *q) -{ - struct v9_mqd *m; - struct kfd_cu_info cu_info; - uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ - uint32_t cu_mask_count = q->cu_mask_count; - const uint32_t *cu_mask = q->cu_mask; - int se, cu_per_sh, cu_index, i; - - if (cu_mask_count == 0) - return; - - m = get_mqd(mqd); - m->compute_static_thread_mgmt_se0 = 0; - m->compute_static_thread_mgmt_se1 = 0; - m->compute_static_thread_mgmt_se2 = 0; - m->compute_static_thread_mgmt_se3 = 0; - - mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); - - /* If # CU mask bits > # CUs, set it to the # of CUs */ - if (cu_mask_count > cu_info.cu_active_number) - cu_mask_count = cu_info.cu_active_number; - - cu_index = 0; - for (se = 0; se < cu_info.num_shader_engines; se++) { - cu_per_sh = 0; - - /* Get the number of CUs on this Shader Engine */ - for (i = 0; i < 4; i++) - cu_per_sh += hweight32(cu_info.cu_bitmap[se][i]); - - se_mask[se] = cu_mask[cu_index / 32] >> (cu_index % 32); - if ((cu_per_sh + (cu_index % 32)) > 32) - se_mask[se] |= cu_mask[(cu_index / 32) + 1] - << (32 - (cu_index % 32)); - se_mask[se] &= (1 << cu_per_sh) - 1; - cu_index += cu_per_sh; - } - m->compute_static_thread_mgmt_se0 = se_mask[0]; - m->compute_static_thread_mgmt_se1 = se_mask[1]; - m->compute_static_thread_mgmt_se2 = se_mask[2]; - m->compute_static_thread_mgmt_se3 = se_mask[3]; - - pr_debug("update cu mask to %#x %#x %#x %#x\n", - m->compute_static_thread_mgmt_se0, - m->compute_static_thread_mgmt_se1, - m->compute_static_thread_mgmt_se2, - m->compute_static_thread_mgmt_se3); -} - -static int init_mqd(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, - struct queue_properties *q) -{ - int retval; - uint64_t addr; - struct v9_mqd *m; - struct kfd_dev *kfd = mm->dev; - - /* From V9, for CWSR, the control stack is located on the next page - * boundary after the mqd, we will use the gtt allocation function - * instead of sub-allocation function. 
- */ - if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) { - *mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO); - if (!*mqd_mem_obj) - return -ENOMEM; - retval = kfd->kfd2kgd->init_gtt_mem_allocation(kfd->kgd, - ALIGN(q->ctl_stack_size, PAGE_SIZE) + - ALIGN(sizeof(struct v9_mqd), PAGE_SIZE), - &((*mqd_mem_obj)->gtt_mem), - &((*mqd_mem_obj)->gpu_addr), - (void *)&((*mqd_mem_obj)->cpu_ptr)); - } else - retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd), - mqd_mem_obj); - if (retval != 0) - return -ENOMEM; - - m = (struct v9_mqd *) (*mqd_mem_obj)->cpu_ptr; - addr = (*mqd_mem_obj)->gpu_addr; - - memset(m, 0, sizeof(struct v9_mqd)); - - m->header = 0xC0310800; - m->compute_pipelinestat_enable = 1; - m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; - m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; - m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; - m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; - - m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | - 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; - - m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; - - m->cp_mqd_base_addr_lo = lower_32_bits(addr); - m->cp_mqd_base_addr_hi = upper_32_bits(addr); - - m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | - 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | - 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; - - m->cp_hqd_pipe_priority = 1; - m->cp_hqd_queue_priority = 15; - - if (q->format == KFD_QUEUE_FORMAT_AQL) { - m->cp_hqd_aql_control = - 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT; - } - - if (q->tba_addr) { - m->compute_pgm_rsrc2 |= - (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); - } - - if (mm->dev->cwsr_enabled) { - m->cp_hqd_persistent_state |= - (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); - m->cp_hqd_ctx_save_base_addr_lo = - lower_32_bits(q->ctx_save_restore_area_address); - m->cp_hqd_ctx_save_base_addr_hi = - upper_32_bits(q->ctx_save_restore_area_address); - m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; - m->cp_hqd_cntl_stack_size = q->ctl_stack_size; - m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; - m->cp_hqd_wg_state_offset = q->ctl_stack_size; - } - - if (priv_cp_queues) - m->cp_hqd_pq_control |= - 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT; - - *mqd = m; - if (gart_addr) - *gart_addr = addr; - retval = mm->update_mqd(mm, m, q); - - return retval; -} - -static int load_mqd(struct mqd_manager *mm, void *mqd, - uint32_t pipe_id, uint32_t queue_id, - struct queue_properties *p, struct mm_struct *mms) -{ - /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ - uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 
4 : 0); - - return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, - (uint32_t __user *)p->write_ptr, - wptr_shift, 0, mms); -} - -static int update_mqd(struct mqd_manager *mm, void *mqd, - struct queue_properties *q) -{ - struct v9_mqd *m; - - m = get_mqd(mqd); - - m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; - m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; - pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); - - m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); - m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); - - m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); - m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); - m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); - m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); - - m->cp_hqd_pq_doorbell_control = - q->doorbell_off << - CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; - pr_debug("cp_hqd_pq_doorbell_control 0x%x\n", - m->cp_hqd_pq_doorbell_control); - - m->cp_hqd_ib_control = 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT | - 1 << CP_HQD_IB_CONTROL__IB_EXE_DISABLE__SHIFT; - - /* - * HW does not clamp this field correctly. Maximum EOP queue size - * is constrained by per-SE EOP done signal count, which is 8-bit. - * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit - * more than (EOP entry count - 1) so a queue size of 0x800 dwords - * is safe, giving a maximum field value of 0xA. - */ - m->cp_hqd_eop_control = min(0xA, - order_base_2(q->eop_ring_buffer_size / 4) - 1); - m->cp_hqd_eop_base_addr_lo = - lower_32_bits(q->eop_ring_buffer_address >> 8); - m->cp_hqd_eop_base_addr_hi = - upper_32_bits(q->eop_ring_buffer_address >> 8); - - m->cp_hqd_iq_timer = 0; - - m->cp_hqd_vmid = q->vmid; - - if (q->format == KFD_QUEUE_FORMAT_AQL) { - m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | - 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT | - 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT | - 1 << CP_HQD_PQ_CONTROL__WPP_CLAMP_EN__SHIFT; - m->cp_hqd_pq_doorbell_control |= - 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT; - } - if (mm->dev->cwsr_enabled) - m->cp_hqd_ctx_save_control = 0; - - update_cu_mask(mm, mqd, q); - - q->is_active = (q->queue_size > 0 && - q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted); - - return 0; -} - - -static int destroy_mqd(struct mqd_manager *mm, void *mqd, - enum kfd_preempt_type type, - unsigned int timeout, uint32_t pipe_id, - uint32_t queue_id) -{ - return mm->dev->kfd2kgd->hqd_destroy - (mm->dev->kgd, mqd, type, timeout, - pipe_id, queue_id); -} - -static void uninit_mqd(struct mqd_manager *mm, void *mqd, - struct kfd_mem_obj *mqd_mem_obj) -{ - struct kfd_dev *kfd = mm->dev; - - if (mqd_mem_obj->gtt_mem) { - kfd->kfd2kgd->free_gtt_mem(kfd->kgd, mqd_mem_obj->gtt_mem); - kfree(mqd_mem_obj); - } else { - kfd_gtt_sa_free(mm->dev, mqd_mem_obj); - } -} - -static bool is_occupied(struct mqd_manager *mm, void *mqd, - uint64_t queue_address, uint32_t pipe_id, - uint32_t queue_id) -{ - return mm->dev->kfd2kgd->hqd_is_occupied( - mm->dev->kgd, queue_address, - pipe_id, queue_id); -} - -static int get_wave_state(struct mqd_manager *mm, void *mqd, - void __user *ctl_stack, - u32 *ctl_stack_used_size, - u32 *save_area_used_size) -{ - struct v9_mqd *m; - - /* Control stack is located one page after MQD. 
*/ - void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE); - - m = get_mqd(mqd); - - *ctl_stack_used_size = m->cp_hqd_cntl_stack_size - - m->cp_hqd_cntl_stack_offset; - *save_area_used_size = m->cp_hqd_wg_state_offset - - m->cp_hqd_cntl_stack_size; - - if (copy_to_user(ctl_stack, mqd_ctl_stack, m->cp_hqd_cntl_stack_size)) - return -EFAULT; - - return 0; -} - -static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, - struct queue_properties *q) -{ - struct v9_mqd *m; - int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); - - if (retval != 0) - return retval; - - m = get_mqd(*mqd); - - m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | - 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; - - return retval; -} - -static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, - struct queue_properties *q) -{ - struct v9_mqd *m; - int retval = update_mqd(mm, mqd, q); - - if (retval != 0) - return retval; - - /* TODO: what's the point? update_mqd already does this. */ - m = get_mqd(mqd); - m->cp_hqd_vmid = q->vmid; - return retval; -} - -static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, - struct queue_properties *q) -{ - int retval; - struct v9_sdma_mqd *m; - - - retval = kfd_gtt_sa_allocate(mm->dev, - sizeof(struct v9_sdma_mqd), - mqd_mem_obj); - - if (retval != 0) - return -ENOMEM; - - m = (struct v9_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; - - memset(m, 0, sizeof(struct v9_sdma_mqd)); - - *mqd = m; - if (gart_addr) - *gart_addr = (*mqd_mem_obj)->gpu_addr; - - retval = mm->update_mqd(mm, m, q); - - return retval; -} - -static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, - struct kfd_mem_obj *mqd_mem_obj) -{ - kfd_gtt_sa_free(mm->dev, mqd_mem_obj); -} - -static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, - uint32_t pipe_id, uint32_t queue_id, - struct queue_properties *p, struct mm_struct *mms) -{ - return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, - (uint32_t __user *)p->write_ptr, - mms); -} - -#define SDMA_RLC_DUMMY_DEFAULT 0xf - -static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, - struct queue_properties *q) -{ - struct v9_sdma_mqd *m; - - m = get_sdma_mqd(mqd); - m->sdmax_rlcx_rb_cntl = order_base_2(q->queue_size / 4) - << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | - q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | - 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | - 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; - - m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); - m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); - m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); - m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); - m->sdmax_rlcx_doorbell_offset = - q->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT; - - m->sdma_engine_id = q->sdma_engine_id; - m->sdma_queue_id = q->sdma_queue_id; - m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT; - - q->is_active = (q->queue_size > 0 && - q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted); - - return 0; -} - -/* - * * preempt type here is ignored because there is only one way - * * to preempt sdma queue - */ -static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, - enum kfd_preempt_type type, - unsigned int timeout, uint32_t pipe_id, - uint32_t queue_id) -{ - return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); -} - -static bool 
is_occupied_sdma(struct mqd_manager *mm, void *mqd, - uint64_t queue_address, uint32_t pipe_id, - uint32_t queue_id) -{ - return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); -} - -#if defined(CONFIG_DEBUG_FS) - -static int debugfs_show_mqd(struct seq_file *m, void *data) -{ - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct v9_mqd), false); - return 0; -} - -static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) -{ - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct v9_sdma_mqd), false); - return 0; -} - -#endif - -struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, - struct kfd_dev *dev) -{ - struct mqd_manager *mqd; - - if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) - return NULL; - - mqd = kzalloc(sizeof(*mqd), GFP_NOIO); - if (!mqd) - return NULL; - - mqd->dev = dev; - - switch (type) { - case KFD_MQD_TYPE_CP: - case KFD_MQD_TYPE_COMPUTE: - mqd->init_mqd = init_mqd; - mqd->uninit_mqd = uninit_mqd; - mqd->load_mqd = load_mqd; - mqd->update_mqd = update_mqd; - mqd->destroy_mqd = destroy_mqd; - mqd->is_occupied = is_occupied; - mqd->get_wave_state = get_wave_state; -#if defined(CONFIG_DEBUG_FS) - mqd->debugfs_show_mqd = debugfs_show_mqd; -#endif - break; - case KFD_MQD_TYPE_HIQ: - mqd->init_mqd = init_mqd_hiq; - mqd->uninit_mqd = uninit_mqd; - mqd->load_mqd = load_mqd; - mqd->update_mqd = update_mqd_hiq; - mqd->destroy_mqd = destroy_mqd; - mqd->is_occupied = is_occupied; -#if defined(CONFIG_DEBUG_FS) - mqd->debugfs_show_mqd = debugfs_show_mqd; -#endif - break; - case KFD_MQD_TYPE_SDMA: - mqd->init_mqd = init_mqd_sdma; - mqd->uninit_mqd = uninit_mqd_sdma; - mqd->load_mqd = load_mqd_sdma; - mqd->update_mqd = update_mqd_sdma; - mqd->destroy_mqd = destroy_mqd_sdma; - mqd->is_occupied = is_occupied_sdma; -#if defined(CONFIG_DEBUG_FS) - mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; -#endif - break; - default: - kfree(mqd); - return NULL; - } - - return mqd; -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c index 9bf1212..73cbfe1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c @@ -30,7 +30,6 @@ #include "vi_structs.h" #include "gca/gfx_8_0_sh_mask.h" #include "gca/gfx_8_0_enum.h" -#include "oss/oss_3_0_sh_mask.h" #define CP_MQD_CONTROL__PRIV_STATE__SHIFT 0x8 @@ -39,73 +38,6 @@ static inline struct vi_mqd *get_mqd(void *mqd) return (struct vi_mqd *)mqd; } -static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) -{ - return (struct vi_sdma_mqd *)mqd; -} - -static void update_cu_mask(struct mqd_manager *mm, void *mqd, - struct queue_properties *q) -{ - struct vi_mqd *m; - struct kfd_cu_info cu_info; - uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ - uint32_t cu_mask_count = q->cu_mask_count; - const uint32_t *cu_mask = q->cu_mask; - int se, cu_per_sh, cu_index, i; - - if (cu_mask_count == 0) - return; - - m = get_mqd(mqd); - m->compute_static_thread_mgmt_se0 = 0; - m->compute_static_thread_mgmt_se1 = 0; - m->compute_static_thread_mgmt_se2 = 0; - m->compute_static_thread_mgmt_se3 = 0; - - mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); - - /* If # CU mask bits > # CUs, set it to the # of CUs */ - if (cu_mask_count > cu_info.cu_active_number) - cu_mask_count = cu_info.cu_active_number; - - cu_index = 0; - for (se = 0; se < cu_info.num_shader_engines; se++) { - cu_per_sh = 0; - - /* Get the number of CUs on this Shader Engine */ - for (i = 0; i < 4; i++) - cu_per_sh += 
hweight32(cu_info.cu_bitmap[se][i]); - - se_mask[se] = cu_mask[cu_index / 32] >> (cu_index % 32); - if ((cu_per_sh + (cu_index % 32)) > 32) - se_mask[se] |= cu_mask[(cu_index / 32) + 1] - << (32 - (cu_index % 32)); - se_mask[se] &= (1 << cu_per_sh) - 1; - cu_index += cu_per_sh; - } - m->compute_static_thread_mgmt_se0 = se_mask[0]; - m->compute_static_thread_mgmt_se1 = se_mask[1]; - m->compute_static_thread_mgmt_se2 = se_mask[2]; - m->compute_static_thread_mgmt_se3 = se_mask[3]; - - pr_debug("Update cu mask to %#x %#x %#x %#x\n", - m->compute_static_thread_mgmt_se0, - m->compute_static_thread_mgmt_se1, - m->compute_static_thread_mgmt_se2, - m->compute_static_thread_mgmt_se3); -} - -static void set_priority(struct vi_mqd *m, struct queue_properties *q) -{ - m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; - m->cp_hqd_queue_priority = q->priority; - m->compute_pgm_rsrc1 = (m->compute_pgm_rsrc1 & - (~COMPUTE_PGM_RSRC1__PRIORITY_MASK)) | - (spi_priority_map[q->priority] << - COMPUTE_PGM_RSRC1__PRIORITY__SHIFT); -} - static int init_mqd(struct mqd_manager *mm, void **mqd, struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) @@ -144,38 +76,14 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; - set_priority(m, q); + m->cp_hqd_pipe_priority = 1; + m->cp_hqd_queue_priority = 15; + m->cp_hqd_eop_rptr = 1 << CP_HQD_EOP_RPTR__INIT_FETCHER__SHIFT; if (q->format == KFD_QUEUE_FORMAT_AQL) m->cp_hqd_iq_rptr = 1; - if (q->tba_addr) { - m->compute_tba_lo = lower_32_bits(q->tba_addr >> 8); - m->compute_tba_hi = upper_32_bits(q->tba_addr >> 8); - m->compute_tma_lo = lower_32_bits(q->tma_addr >> 8); - m->compute_tma_hi = upper_32_bits(q->tma_addr >> 8); - m->compute_pgm_rsrc2 |= - (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); - } - - if (mm->dev->cwsr_enabled) { - m->cp_hqd_persistent_state |= - (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); - m->cp_hqd_ctx_save_base_addr_lo = - lower_32_bits(q->ctx_save_restore_area_address); - m->cp_hqd_ctx_save_base_addr_hi = - upper_32_bits(q->ctx_save_restore_area_address); - m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; - m->cp_hqd_cntl_stack_size = q->ctl_stack_size; - m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; - m->cp_hqd_wg_state_offset = q->ctl_stack_size; - } - - if (priv_cp_queues) - m->cp_hqd_pq_control |= - 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT; - *mqd = m; if (gart_addr) *gart_addr = addr; @@ -190,7 +98,7 @@ static int load_mqd(struct mqd_manager *mm, void *mqd, { /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 
4 : 0); - uint32_t wptr_mask = (uint32_t)((p->queue_size / 4) - 1); + uint32_t wptr_mask = (uint32_t)((p->queue_size / sizeof(uint32_t)) - 1); return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, (uint32_t __user *)p->write_ptr, @@ -208,7 +116,8 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT | atc_bit << CP_HQD_PQ_CONTROL__PQ_ATC__SHIFT | mtype << CP_HQD_PQ_CONTROL__MTYPE__SHIFT; - m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; + m->cp_hqd_pq_control |= + ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); @@ -216,8 +125,6 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); - m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); - m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); m->cp_hqd_pq_doorbell_control = q->doorbell_off << @@ -240,7 +147,7 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, * is safe, giving a maximum field value of 0xA. */ m->cp_hqd_eop_control |= min(0xA, - order_base_2(q->eop_ring_buffer_size / 4) - 1); + ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1); m->cp_hqd_eop_base_addr_lo = lower_32_bits(q->eop_ring_buffer_address >> 8); m->cp_hqd_eop_base_addr_hi = @@ -255,18 +162,13 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT; } - if (mm->dev->cwsr_enabled) - m->cp_hqd_ctx_save_control = - atc_bit << CP_HQD_CTX_SAVE_CONTROL__ATC__SHIFT | - mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT; - - update_cu_mask(mm, mqd, q); - set_priority(m, q); - q->is_active = (q->queue_size > 0 && + q->is_active = false; + if (q->queue_size > 0 && q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted); + q->queue_percent > 0) { + q->is_active = true; + } return 0; } @@ -278,12 +180,6 @@ static int update_mqd(struct mqd_manager *mm, void *mqd, return __update_mqd(mm, mqd, q, MTYPE_CC, 1); } -static int update_mqd_tonga(struct mqd_manager *mm, void *mqd, - struct queue_properties *q) -{ - return __update_mqd(mm, mqd, q, MTYPE_UC, 0); -} - static int destroy_mqd(struct mqd_manager *mm, void *mqd, enum kfd_preempt_type type, unsigned int timeout, uint32_t pipe_id, @@ -309,28 +205,6 @@ static bool is_occupied(struct mqd_manager *mm, void *mqd, pipe_id, queue_id); } -static int get_wave_state(struct mqd_manager *mm, void *mqd, - void __user *ctl_stack, - u32 *ctl_stack_used_size, - u32 *save_area_used_size) -{ - struct vi_mqd *m; - - m = get_mqd(mqd); - - *ctl_stack_used_size = m->cp_hqd_cntl_stack_size - - m->cp_hqd_cntl_stack_offset; - *save_area_used_size = m->cp_hqd_wg_state_offset - - m->cp_hqd_cntl_stack_size; - - /* Control stack is not copied to user mode for GFXv8 because - * it's part of the context save area that is already - * accessible to user mode - */ - - return 0; -} - static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) @@ -363,118 +237,6 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, return retval; } -static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, - struct 
kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, - struct queue_properties *q) -{ - int retval; - struct vi_sdma_mqd *m; - - - retval = kfd_gtt_sa_allocate(mm->dev, - sizeof(struct vi_sdma_mqd), - mqd_mem_obj); - - if (retval != 0) - return -ENOMEM; - - m = (struct vi_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; - - memset(m, 0, sizeof(struct vi_sdma_mqd)); - - *mqd = m; - if (gart_addr) - *gart_addr = (*mqd_mem_obj)->gpu_addr; - - retval = mm->update_mqd(mm, m, q); - - return retval; -} - -static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, - struct kfd_mem_obj *mqd_mem_obj) -{ - kfd_gtt_sa_free(mm->dev, mqd_mem_obj); -} - -static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, - uint32_t pipe_id, uint32_t queue_id, - struct queue_properties *p, struct mm_struct *mms) -{ - return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, - (uint32_t __user *)p->write_ptr, - mms); -} - -static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, - struct queue_properties *q) -{ - struct vi_sdma_mqd *m; - - m = get_sdma_mqd(mqd); - m->sdmax_rlcx_rb_cntl = order_base_2(q->queue_size / 4) - << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | - q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | - 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | - 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; - - m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); - m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); - m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); - m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); - m->sdmax_rlcx_doorbell = - q->doorbell_off << SDMA0_RLC0_DOORBELL__OFFSET__SHIFT; - - m->sdmax_rlcx_virtual_addr = q->sdma_vm_addr; - - m->sdma_engine_id = q->sdma_engine_id; - m->sdma_queue_id = q->sdma_queue_id; - - q->is_active = (q->queue_size > 0 && - q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted); - - return 0; -} - -/* - * * preempt type here is ignored because there is only one way - * * to preempt sdma queue - */ -static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, - enum kfd_preempt_type type, - unsigned int timeout, uint32_t pipe_id, - uint32_t queue_id) -{ - return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); -} - -static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, - uint64_t queue_address, uint32_t pipe_id, - uint32_t queue_id) -{ - return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); -} - -#if defined(CONFIG_DEBUG_FS) - -static int debugfs_show_mqd(struct seq_file *m, void *data) -{ - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct vi_mqd), false); - return 0; -} - -static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) -{ - seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, - data, sizeof(struct vi_sdma_mqd), false); - return 0; -} - -#endif - struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, struct kfd_dev *dev) { @@ -483,7 +245,7 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) return NULL; - mqd = kzalloc(sizeof(*mqd), GFP_NOIO); + mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); if (!mqd) return NULL; @@ -498,10 +260,6 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; - mqd->get_wave_state = get_wave_state; -#if defined(CONFIG_DEBUG_FS) - mqd->debugfs_show_mqd = debugfs_show_mqd; -#endif break; case KFD_MQD_TYPE_HIQ: mqd->init_mqd = 
init_mqd_hiq; @@ -510,20 +268,8 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd_hiq; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; -#if defined(CONFIG_DEBUG_FS) - mqd->debugfs_show_mqd = debugfs_show_mqd; -#endif break; case KFD_MQD_TYPE_SDMA: - mqd->init_mqd = init_mqd_sdma; - mqd->uninit_mqd = uninit_mqd_sdma; - mqd->load_mqd = load_mqd_sdma; - mqd->update_mqd = update_mqd_sdma; - mqd->destroy_mqd = destroy_mqd_sdma; - mqd->is_occupied = is_occupied_sdma; -#if defined(CONFIG_DEBUG_FS) - mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; -#endif break; default: kfree(mqd); @@ -532,17 +278,3 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, return mqd; } - -struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, - struct kfd_dev *dev) -{ - struct mqd_manager *mqd; - - mqd = mqd_manager_init_vi(type, dev); - if (!mqd) - return NULL; - if ((type == KFD_MQD_TYPE_CP) || (type == KFD_MQD_TYPE_COMPUTE)) - mqd->update_mqd = update_mqd_tonga; - return mqd; -} - diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index 98c89d2..1d31260 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -26,6 +26,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_kernel_queue.h" #include "kfd_priv.h" +#include "kfd_pm4_headers_vi.h" #include "kfd_pm4_opcodes.h" static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, @@ -38,40 +39,38 @@ static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, *wptr = temp; } +static unsigned int build_pm4_header(unsigned int opcode, size_t packet_size) +{ + union PM4_MES_TYPE_3_HEADER header; + + header.u32All = 0; + header.opcode = opcode; + header.count = packet_size/sizeof(uint32_t) - 2; + header.type = PM4_TYPE_3; + + return header.u32All; +} + static void pm_calc_rlib_size(struct packet_manager *pm, unsigned int *rlib_size, bool *over_subscription) { - unsigned int process_count, queue_count, compute_queue_count; + unsigned int process_count, queue_count; unsigned int map_queue_size; - unsigned int max_proc_per_quantum = 1; - - struct kfd_dev *dev = pm->dqm->dev; process_count = pm->dqm->processes_count; queue_count = pm->dqm->queue_count; - compute_queue_count = queue_count - pm->dqm->sdma_queue_count; - - /* check if there is over subscription - * Note: the arbitration between the number of VMIDs and - * hws_max_conc_proc has been done in - * kgd2kfd_device_init(). 
- */ + /* check if there is over subscription*/ *over_subscription = false; - - if (dev->max_proc_per_quantum > 1) - max_proc_per_quantum = dev->max_proc_per_quantum; - - if ((process_count > max_proc_per_quantum) || - compute_queue_count > get_queues_num(pm->dqm)) { + if ((process_count > 1) || queue_count > get_queues_num(pm->dqm)) { *over_subscription = true; pr_debug("Over subscribed runlist\n"); } - map_queue_size = pm->pmf->get_map_queues_packet_size(); + map_queue_size = sizeof(struct pm4_mes_map_queues); /* calculate run list ib allocation size */ - *rlib_size = process_count * pm->pmf->get_map_process_packet_size() + + *rlib_size = process_count * sizeof(struct pm4_mes_map_process) + queue_count * map_queue_size; /* @@ -79,7 +78,7 @@ static void pm_calc_rlib_size(struct packet_manager *pm, * when over subscription */ if (*over_subscription) - *rlib_size += pm->pmf->get_runlist_packet_size(); + *rlib_size += sizeof(struct pm4_mes_runlist); pr_debug("runlist ib size %d\n", *rlib_size); } @@ -97,14 +96,12 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription); - mutex_lock(&pm->lock); - retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size, &pm->ib_buffer_obj); if (retval) { pr_err("Failed to allocate runlist IB\n"); - goto out; + return retval; } *(void **)rl_buffer = pm->ib_buffer_obj->cpu_ptr; @@ -112,12 +109,131 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, memset(*rl_buffer, 0, *rl_buffer_size); pm->allocated = true; - -out: - mutex_unlock(&pm->lock); return retval; } +static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer, + uint64_t ib, size_t ib_size_in_dwords, bool chain) +{ + struct pm4_mes_runlist *packet; + + if (WARN_ON(!ib)) + return -EFAULT; + + packet = (struct pm4_mes_runlist *)buffer; + + memset(buffer, 0, sizeof(struct pm4_mes_runlist)); + packet->header.u32All = build_pm4_header(IT_RUN_LIST, + sizeof(struct pm4_mes_runlist)); + + packet->bitfields4.ib_size = ib_size_in_dwords; + packet->bitfields4.chain = chain ? 1 : 0; + packet->bitfields4.offload_polling = 0; + packet->bitfields4.valid = 1; + packet->ordinal2 = lower_32_bits(ib); + packet->bitfields3.ib_base_hi = upper_32_bits(ib); + + return 0; +} + +static int pm_create_map_process(struct packet_manager *pm, uint32_t *buffer, + struct qcm_process_device *qpd) +{ + struct pm4_mes_map_process *packet; + struct queue *cur; + uint32_t num_queues; + + packet = (struct pm4_mes_map_process *)buffer; + + memset(buffer, 0, sizeof(struct pm4_mes_map_process)); + + packet->header.u32All = build_pm4_header(IT_MAP_PROCESS, + sizeof(struct pm4_mes_map_process)); + packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; + packet->bitfields2.process_quantum = 1; + packet->bitfields2.pasid = qpd->pqm->process->pasid; + packet->bitfields3.page_table_base = qpd->page_table_base; + packet->bitfields10.gds_size = qpd->gds_size; + packet->bitfields10.num_gws = qpd->num_gws; + packet->bitfields10.num_oac = qpd->num_oac; + num_queues = 0; + list_for_each_entry(cur, &qpd->queues_list, list) + num_queues++; + packet->bitfields10.num_queues = (qpd->is_debug) ? 
0 : num_queues; + + packet->sh_mem_config = qpd->sh_mem_config; + packet->sh_mem_bases = qpd->sh_mem_bases; + packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; + packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; + + /* TODO: scratch support */ + packet->sh_hidden_private_base_vmid = 0; + + packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); + packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); + + return 0; +} + +static int pm_create_map_queue(struct packet_manager *pm, uint32_t *buffer, + struct queue *q, bool is_static) +{ + struct pm4_mes_map_queues *packet; + bool use_static = is_static; + + packet = (struct pm4_mes_map_queues *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); + + packet->header.u32All = build_pm4_header(IT_MAP_QUEUES, + sizeof(struct pm4_mes_map_queues)); + packet->bitfields2.alloc_format = + alloc_format__mes_map_queues__one_per_pipe_vi; + packet->bitfields2.num_queues = 1; + packet->bitfields2.queue_sel = + queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; + + packet->bitfields2.engine_sel = + engine_sel__mes_map_queues__compute_vi; + packet->bitfields2.queue_type = + queue_type__mes_map_queues__normal_compute_vi; + + switch (q->properties.type) { + case KFD_QUEUE_TYPE_COMPUTE: + if (use_static) + packet->bitfields2.queue_type = + queue_type__mes_map_queues__normal_latency_static_queue_vi; + break; + case KFD_QUEUE_TYPE_DIQ: + packet->bitfields2.queue_type = + queue_type__mes_map_queues__debug_interface_queue_vi; + break; + case KFD_QUEUE_TYPE_SDMA: + packet->bitfields2.engine_sel = + engine_sel__mes_map_queues__sdma0_vi; + use_static = false; /* no static queues under SDMA */ + break; + default: + WARN(1, "queue type %d", q->properties.type); + return -EINVAL; + } + packet->bitfields3.doorbell_offset = + q->properties.doorbell_off; + + packet->mqd_addr_lo = + lower_32_bits(q->gart_mqd_addr); + + packet->mqd_addr_hi = + upper_32_bits(q->gart_mqd_addr); + + packet->wptr_addr_lo = + lower_32_bits((uint64_t)q->properties.write_ptr); + + packet->wptr_addr_hi = + upper_32_bits((uint64_t)q->properties.write_ptr); + + return 0; +} + static int pm_create_runlist_ib(struct packet_manager *pm, struct list_head *queues, uint64_t *rl_gpu_addr, @@ -140,7 +256,6 @@ static int pm_create_runlist_ib(struct packet_manager *pm, return retval; *rl_size_bytes = alloc_size_bytes; - pm->ib_size_bytes = alloc_size_bytes; pr_debug("Building runlist ib process count: %d queues count %d\n", pm->dqm->processes_count, pm->dqm->queue_count); @@ -155,12 +270,12 @@ static int pm_create_runlist_ib(struct packet_manager *pm, return -ENOMEM; } - retval = pm->pmf->map_process(pm, &rl_buffer[rl_wptr], qpd); + retval = pm_create_map_process(pm, &rl_buffer[rl_wptr], qpd); if (retval) return retval; proccesses_mapped++; - inc_wptr(&rl_wptr, pm->pmf->get_map_process_packet_size(), + inc_wptr(&rl_wptr, sizeof(struct pm4_mes_map_process), alloc_size_bytes); list_for_each_entry(kq, &qpd->priv_queue_list, list) { @@ -170,7 +285,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, pr_debug("static_queue, mapping kernel q %d, is debug status %d\n", kq->queue->queue, qpd->is_debug); - retval = pm->pmf->map_queues(pm, + retval = pm_create_map_queue(pm, &rl_buffer[rl_wptr], kq->queue, qpd->is_debug); @@ -178,7 +293,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, return retval; inc_wptr(&rl_wptr, - pm->pmf->get_map_queues_packet_size(), + sizeof(struct pm4_mes_map_queues), alloc_size_bytes); } @@ -189,15 +304,16 @@ static int 
pm_create_runlist_ib(struct packet_manager *pm, pr_debug("static_queue, mapping user queue %d, is debug status %d\n", q->queue, qpd->is_debug); - retval = pm->pmf->map_queues(pm, + retval = pm_create_map_queue(pm, &rl_buffer[rl_wptr], q, qpd->is_debug); + if (retval) return retval; inc_wptr(&rl_wptr, - pm->pmf->get_map_queues_packet_size(), + sizeof(struct pm4_mes_map_queues), alloc_size_bytes); } } @@ -205,7 +321,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, pr_debug("Finished map process and queues to runlist\n"); if (is_over_subscription) - retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr], + retval = pm_create_runlist(pm, &rl_buffer[rl_wptr], *rl_gpu_addr, alloc_size_bytes / sizeof(uint32_t), true); @@ -217,8 +333,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, return retval; } -int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, - uint16_t fw_ver) +int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) { pm->dqm = dqm; mutex_init(&pm->lock); @@ -229,26 +344,6 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, } pm->allocated = false; - switch (pm->dqm->dev->device_info->asic_family) { - case CHIP_KAVERI: - case CHIP_HAWAII: - kfd_pm_func_init_cik(pm, fw_ver); - break; - case CHIP_CARRIZO: - case CHIP_TONGA: - case CHIP_FIJI: - case CHIP_POLARIS10: - case CHIP_POLARIS11: - kfd_pm_func_init_vi(pm, fw_ver); - break; - case CHIP_VEGA10: - case CHIP_RAVEN: - kfd_pm_func_init_v9(pm, fw_ver); - break; - default: - BUG(); - } - return 0; } @@ -261,25 +356,38 @@ void pm_uninit(struct packet_manager *pm) int pm_send_set_resources(struct packet_manager *pm, struct scheduling_resources *res) { - uint32_t *buffer, size; + struct pm4_mes_set_resources *packet; int retval = 0; - size = pm->pmf->get_set_resources_packet_size(); mutex_lock(&pm->lock); pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, - size / sizeof(uint32_t), - (unsigned int **)&buffer); - if (!buffer) { + sizeof(*packet) / sizeof(uint32_t), + (unsigned int **)&packet); + if (!packet) { pr_err("Failed to allocate buffer on kernel queue\n"); retval = -ENOMEM; goto out; } - retval = pm->pmf->set_resources(pm, buffer, res); - if (!retval) - pm->priv_queue->ops.submit_packet(pm->priv_queue); - else - pm->priv_queue->ops.rollback_packet(pm->priv_queue); + memset(packet, 0, sizeof(struct pm4_mes_set_resources)); + packet->header.u32All = build_pm4_header(IT_SET_RESOURCES, + sizeof(struct pm4_mes_set_resources)); + + packet->bitfields2.queue_type = + queue_type__mes_set_resources__hsa_interface_queue_hiq; + packet->bitfields2.vmid_mask = res->vmid_mask; + packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY; + packet->bitfields7.oac_mask = res->oac_mask; + packet->bitfields8.gds_heap_base = res->gds_heap_base; + packet->bitfields8.gds_heap_size = res->gds_heap_size; + + packet->gws_mask_lo = lower_32_bits(res->gws_mask); + packet->gws_mask_hi = upper_32_bits(res->gws_mask); + + packet->queue_mask_lo = lower_32_bits(res->queue_mask); + packet->queue_mask_hi = upper_32_bits(res->queue_mask); + + pm->priv_queue->ops.submit_packet(pm->priv_queue); out: mutex_unlock(&pm->lock); @@ -301,8 +409,7 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) pr_debug("runlist IB address: 0x%llX\n", rl_gpu_ib_addr); - packet_size_dwords = pm->pmf->get_runlist_packet_size() / - sizeof(uint32_t); + packet_size_dwords = sizeof(struct pm4_mes_runlist) / sizeof(uint32_t); mutex_lock(&pm->lock); retval = 
 	retval = pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
@@ -310,8 +417,8 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues)
 	if (retval)
 		goto fail_acquire_packet_buffer;
 
-	retval = pm->pmf->runlist(pm, rl_buffer, rl_gpu_ib_addr,
-			rl_ib_size / sizeof(uint32_t), false);
+	retval = pm_create_runlist(pm, rl_buffer, rl_gpu_ib_addr,
+					rl_ib_size / sizeof(uint32_t), false);
 	if (retval)
 		goto fail_create_runlist;
 
@@ -333,59 +440,122 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues)
 int pm_send_query_status(struct packet_manager *pm,
 			uint64_t fence_address, uint32_t fence_value)
 {
-	uint32_t *buffer, size;
-	int retval = 0;
+	int retval;
+	struct pm4_mes_query_status *packet;
 
 	if (WARN_ON(!fence_address))
 		return -EFAULT;
 
-	size = pm->pmf->get_query_status_packet_size();
 	mutex_lock(&pm->lock);
-	pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
-			size / sizeof(uint32_t), (unsigned int **)&buffer);
-	if (!buffer) {
-		pr_err("Failed to allocate buffer on kernel queue\n");
-		retval = -ENOMEM;
-		goto out;
-	}
+	retval = pm->priv_queue->ops.acquire_packet_buffer(
+			pm->priv_queue,
+			sizeof(struct pm4_mes_query_status) / sizeof(uint32_t),
+			(unsigned int **)&packet);
+	if (retval)
+		goto fail_acquire_packet_buffer;
 
-	retval = pm->pmf->query_status(pm, buffer, fence_address, fence_value);
-	if (!retval)
-		pm->priv_queue->ops.submit_packet(pm->priv_queue);
-	else
-		pm->priv_queue->ops.rollback_packet(pm->priv_queue);
+	packet->header.u32All = build_pm4_header(IT_QUERY_STATUS,
+					sizeof(struct pm4_mes_query_status));
 
-out:
+	packet->bitfields2.context_id = 0;
+	packet->bitfields2.interrupt_sel =
+			interrupt_sel__mes_query_status__completion_status;
+	packet->bitfields2.command =
+			command__mes_query_status__fence_only_after_write_ack;
+
+	packet->addr_hi = upper_32_bits((uint64_t)fence_address);
+	packet->addr_lo = lower_32_bits((uint64_t)fence_address);
+	packet->data_hi = upper_32_bits((uint64_t)fence_value);
+	packet->data_lo = lower_32_bits((uint64_t)fence_value);
+
+	pm->priv_queue->ops.submit_packet(pm->priv_queue);
+
+fail_acquire_packet_buffer:
 	mutex_unlock(&pm->lock);
 
 	return retval;
 }
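Note that pm_send_query_status() above only enqueues the fence write; completion is observed by the caller polling the fence location until the scheduler writes the expected value (the KFD_FENCE_INIT/KFD_FENCE_COMPLETED markers and the amdkfd_fence_wait_timeout() declaration appear in the kfd_priv.h hunks later in this patch). A rough userspace analogue of that polling loop, assuming nothing about the real kernel timeout implementation:

#include <stdbool.h>
#include <time.h>

/* Illustrative stand-in for amdkfd_fence_wait_timeout(): spin until
 * *fence_addr holds fence_value (e.g. 100, KFD_FENCE_COMPLETED) or
 * timeout_ms elapses. In the driver the write comes from the CP in
 * response to the QUERY_STATUS packet built above.
 */
static bool fence_wait_timeout(volatile unsigned int *fence_addr,
				unsigned int fence_value,
				unsigned long timeout_ms)
{
	struct timespec start, now;

	clock_gettime(CLOCK_MONOTONIC, &start);
	while (*fence_addr != fence_value) {
		clock_gettime(CLOCK_MONOTONIC, &now);
		if ((now.tv_sec - start.tv_sec) * 1000L +
		    (now.tv_nsec - start.tv_nsec) / 1000000L > (long)timeout_ms)
			return false;	/* timed out */
	}
	return true;
}

int main(void)
{
	unsigned int fence = 10;	/* KFD_FENCE_INIT */

	fence = 100;			/* pretend the CP completed the fence */
	return fence_wait_timeout(&fence, 100, 500) ? 0 : 1;
}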
 
 int pm_send_unmap_queue(struct packet_manager *pm,
 			enum kfd_queue_type type,
-			enum kfd_unmap_queues_filter filter,
+			enum kfd_preempt_type_filter mode,
 			uint32_t filter_param, bool reset,
 			unsigned int sdma_engine)
 {
-	uint32_t *buffer, size;
-	int retval = 0;
+	int retval;
+	uint32_t *buffer;
+	struct pm4_mes_unmap_queues *packet;
 
-	size = pm->pmf->get_unmap_queues_packet_size();
 	mutex_lock(&pm->lock);
-	pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
-			size / sizeof(uint32_t), (unsigned int **)&buffer);
-	if (!buffer) {
-		pr_err("Failed to allocate buffer on kernel queue\n");
-		retval = -ENOMEM;
-		goto out;
+	retval = pm->priv_queue->ops.acquire_packet_buffer(
+			pm->priv_queue,
+			sizeof(struct pm4_mes_unmap_queues) / sizeof(uint32_t),
+			&buffer);
+	if (retval)
+		goto err_acquire_packet_buffer;
+
+	packet = (struct pm4_mes_unmap_queues *)buffer;
+	memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues));
+	pr_debug("static_queue: unmapping queues: mode is %d, reset is %d, type is %d\n",
+		mode, reset, type);
+	packet->header.u32All = build_pm4_header(IT_UNMAP_QUEUES,
+					sizeof(struct pm4_mes_unmap_queues));
+	switch (type) {
+	case KFD_QUEUE_TYPE_COMPUTE:
+	case KFD_QUEUE_TYPE_DIQ:
+		packet->bitfields2.engine_sel =
+			engine_sel__mes_unmap_queues__compute;
+		break;
+	case KFD_QUEUE_TYPE_SDMA:
+		packet->bitfields2.engine_sel =
+			engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
+		break;
+	default:
+		WARN(1, "queue type %d", type);
+		retval = -EINVAL;
+		goto err_invalid;
 	}
 
-	retval = pm->pmf->unmap_queues(pm, buffer, type, filter, filter_param,
-			reset, sdma_engine);
-	if (!retval)
-		pm->priv_queue->ops.submit_packet(pm->priv_queue);
+	if (reset)
+		packet->bitfields2.action =
+				action__mes_unmap_queues__reset_queues;
 	else
-		pm->priv_queue->ops.rollback_packet(pm->priv_queue);
+		packet->bitfields2.action =
+				action__mes_unmap_queues__preempt_queues;
+
+	switch (mode) {
+	case KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE:
+		packet->bitfields2.queue_sel =
+			queue_sel__mes_unmap_queues__perform_request_on_specified_queues;
+		packet->bitfields2.num_queues = 1;
+		packet->bitfields3b.doorbell_offset0 = filter_param;
+		break;
+	case KFD_PREEMPT_TYPE_FILTER_BY_PASID:
+		packet->bitfields2.queue_sel =
+			queue_sel__mes_unmap_queues__perform_request_on_pasid_queues;
+		packet->bitfields3a.pasid = filter_param;
+		break;
+	case KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES:
+		packet->bitfields2.queue_sel =
+			queue_sel__mes_unmap_queues__unmap_all_queues;
+		break;
+	case KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES:
+		/* in this case, we do not preempt static queues */
+		packet->bitfields2.queue_sel =
+			queue_sel__mes_unmap_queues__unmap_all_non_static_queues;
+		break;
+	default:
+		WARN(1, "filter %d", mode);
+		retval = -EINVAL;
+		goto err_invalid;
+	}
+
+	pm->priv_queue->ops.submit_packet(pm->priv_queue);
 
-out:
+	mutex_unlock(&pm->lock);
+	return 0;
+
+err_invalid:
+	pm->priv_queue->ops.rollback_packet(pm->priv_queue);
+err_acquire_packet_buffer:
 	mutex_unlock(&pm->lock);
 	return retval;
 }
@@ -399,18 +569,3 @@ void pm_release_ib(struct packet_manager *pm)
 	}
 	mutex_unlock(&pm->lock);
 }
-
-int pm_debugfs_runlist(struct seq_file *m, void *data)
-{
-	struct packet_manager *pm = data;
-
-	if (!pm->allocated) {
-		seq_puts(m, " No active runlist\n");
-		return 0;
-	}
-
-	seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
-		pm->ib_buffer_obj->cpu_ptr, pm->ib_size_bytes, false);
-
-	return 0;
-}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c
index 15fff44..1e06de0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c
@@ -20,64 +20,78 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 
+#include <linux/slab.h>
 #include <linux/types.h>
 #include "kfd_priv.h"
 
-static unsigned int pasid_bits = 16;
-static const struct kfd2kgd_calls *kfd2kgd;
+static unsigned long *pasid_bitmap;
+static unsigned int pasid_limit;
+static DEFINE_MUTEX(pasid_mutex);
+
+int kfd_pasid_init(void)
+{
+	pasid_limit = KFD_MAX_NUM_OF_PROCESSES;
+
+	pasid_bitmap = kcalloc(BITS_TO_LONGS(pasid_limit), sizeof(long),
+				GFP_KERNEL);
+	if (!pasid_bitmap)
+		return -ENOMEM;
+
+	set_bit(0, pasid_bitmap); /* PASID 0 is reserved. */
+
+	return 0;
+}
+
+void kfd_pasid_exit(void)
+{
+	kfree(pasid_bitmap);
+}
 
 bool kfd_set_pasid_limit(unsigned int new_limit)
 {
-	if (new_limit < 2)
-		return false;
-
-	if (new_limit < (1U << pasid_bits)) {
-		if (kfd2kgd)
-			/* We've already allocated user PASIDs, too late to
-			 * change the limit
-			 */
-			return false;
-
-		while (new_limit < (1U << pasid_bits))
-			pasid_bits--;
+	if (new_limit < pasid_limit) {
+		bool ok;
+
+		mutex_lock(&pasid_mutex);
+
+		/* ensure that no pasids >= new_limit are in-use */
+		ok = (find_next_bit(pasid_bitmap, pasid_limit, new_limit) ==
+			pasid_limit);
+		if (ok)
+			pasid_limit = new_limit;
+
+		mutex_unlock(&pasid_mutex);
+
+		return ok;
 	}
 
 	return true;
 }
 
-unsigned int kfd_get_pasid_limit(void)
+inline unsigned int kfd_get_pasid_limit(void)
 {
-	return 1U << pasid_bits;
+	return pasid_limit;
 }
 
 unsigned int kfd_pasid_alloc(void)
 {
-	int r;
-
-	/* Find the first best KFD device for calling KGD */
-	if (!kfd2kgd) {
-		struct kfd_dev *dev = NULL;
-		unsigned int i = 0;
-
-		while ((kfd_topology_enum_kfd_devices(i, &dev)) == 0) {
-			if (dev && dev->kfd2kgd) {
-				kfd2kgd = dev->kfd2kgd;
-				break;
-			}
-			i++;
-		}
-
-		if (!kfd2kgd)
-			return false;
-	}
+	unsigned int found;
+
+	mutex_lock(&pasid_mutex);
+
+	found = find_first_zero_bit(pasid_bitmap, pasid_limit);
+	if (found == pasid_limit)
+		found = 0;
+	else
+		set_bit(found, pasid_bitmap);
 
-	r = kfd2kgd->alloc_pasid(pasid_bits);
+	mutex_unlock(&pasid_mutex);
 
-	return r > 0 ? r : 0;
+	return found;
 }
 
 void kfd_pasid_free(unsigned int pasid)
 {
-	if (kfd2kgd)
-		kfd2kgd->free_pasid(pasid);
+	if (!WARN_ON(pasid == 0 || pasid >= pasid_limit))
+		clear_bit(pasid, pasid_bitmap);
 }
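One subtlety of the bitmap allocator restored above: because kfd_pasid_init() sets bit 0 aside, kfd_pasid_alloc() can return 0 to mean "no PASID free", and kfd_pasid_free() warns if PASID 0 ever comes back. A self-contained userspace sketch of that reserve-bit-zero pattern, with plain C stand-ins for the kernel bitmap helpers (the kernel version additionally serializes everything with pasid_mutex):

#include <limits.h>
#include <stdio.h>

#define PASID_LIMIT	512
#define BITS_PER_LONG	(sizeof(unsigned long) * CHAR_BIT)

static unsigned long bitmap[(PASID_LIMIT + BITS_PER_LONG - 1) / BITS_PER_LONG];

static void set_bit_(unsigned int n)
{
	bitmap[n / BITS_PER_LONG] |= 1UL << (n % BITS_PER_LONG);
}

static int test_bit_(unsigned int n)
{
	return !!(bitmap[n / BITS_PER_LONG] & (1UL << (n % BITS_PER_LONG)));
}

/* Same contract as kfd_pasid_alloc(): returning 0 means "none free",
 * which is unambiguous only because PASID 0 was reserved up front.
 */
static unsigned int pasid_alloc(void)
{
	for (unsigned int i = 0; i < PASID_LIMIT; i++) {
		if (!test_bit_(i)) {
			set_bit_(i);
			return i;
		}
	}
	return 0;
}

int main(void)
{
	set_bit_(0);	/* reserve PASID 0, as kfd_pasid_init() does */
	printf("first pasid: %u\n", pasid_alloc());	/* prints 1 */
	return 0;
}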
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c b/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c
deleted file mode 100644
index 543ed83..0000000
--- a/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c
+++ /dev/null
@@ -1,513 +0,0 @@
-/*
- * Copyright 2016 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-
-/* NOTE:
- *
- * This file contains logic to dynamically detect and enable PeerDirect
- * support. PeerDirect support is delivered e.g. as part of OFED
- * from Mellanox. Because we are not able to rely on the fact that the
- * corresponding OFED will be installed we should:
- *	- copy PeerDirect definitions locally to avoid dependency on
- *	  corresponding header file
- *	- try to dynamically detect the addresses of the PeerDirect
- *	  function pointers.
- *
- * If dynamic detection fails then PeerDirect support should be
- * enabled using the standard PeerDirect bridge driver from:
- * https://github.com/RadeonOpenCompute/ROCnRDMA
- *
- *
- * Logic to support PeerDirect relies only on the official public API,
- * so as to be as non-intrusive as possible.
- *
- **/
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "kfd_priv.h"
-
-
-
-/* ----------------------- PeerDirect interface ------------------------------*/
-
-/*
- * Copyright (c) 2013, Mellanox Technologies. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */ -#define IB_PEER_MEMORY_NAME_MAX 64 -#define IB_PEER_MEMORY_VER_MAX 16 - -struct peer_memory_client { - char name[IB_PEER_MEMORY_NAME_MAX]; - char version[IB_PEER_MEMORY_VER_MAX]; - /* acquire return code: 1-mine, 0-not mine */ - int (*acquire)(unsigned long addr, size_t size, - void *peer_mem_private_data, - char *peer_mem_name, - void **client_context); - int (*get_pages)(unsigned long addr, - size_t size, int write, int force, - struct sg_table *sg_head, - void *client_context, void *core_context); - int (*dma_map)(struct sg_table *sg_head, void *client_context, - struct device *dma_device, int dmasync, int *nmap); - int (*dma_unmap)(struct sg_table *sg_head, void *client_context, - struct device *dma_device); - void (*put_pages)(struct sg_table *sg_head, void *client_context); - unsigned long (*get_page_size)(void *client_context); - void (*release)(void *client_context); - void* (*get_context_private_data)(u64 peer_id); - void (*put_context_private_data)(void *context); -}; - -typedef int (*invalidate_peer_memory)(void *reg_handle, - void *core_context); - -void *ib_register_peer_memory_client(struct peer_memory_client *peer_client, - invalidate_peer_memory *invalidate_callback); -void ib_unregister_peer_memory_client(void *reg_handle); - - -/*------------------- PeerDirect bridge driver ------------------------------*/ - -#define AMD_PEER_BRIDGE_DRIVER_VERSION "1.0" -#define AMD_PEER_BRIDGE_DRIVER_NAME "amdkfd" - - -static void* (*pfn_ib_register_peer_memory_client)(struct peer_memory_client - *peer_client, - invalidate_peer_memory - *invalidate_callback); - -static void (*pfn_ib_unregister_peer_memory_client)(void *reg_handle); - -static const struct amd_rdma_interface *rdma_interface; - -static invalidate_peer_memory ib_invalidate_callback; -static void *ib_reg_handle; - -struct amd_mem_context { - uint64_t va; - uint64_t size; - struct pid *pid; - - struct amd_p2p_info *p2p_info; - - /* Flag that free callback was called */ - int free_callback_called; - - /* Context received from PeerDirect call */ - void *core_context; -}; - - -static void free_callback(void *client_priv) -{ - struct amd_mem_context *mem_context = - (struct amd_mem_context *)client_priv; - - pr_debug("data 0x%p\n", mem_context); - - if (!mem_context) { - pr_warn("Invalid client context\n"); - return; - } - - pr_debug("mem_context->core_context 0x%p\n", mem_context->core_context); - - /* Call back IB stack asking to invalidate memory */ - (*ib_invalidate_callback) (ib_reg_handle, mem_context->core_context); - - /* amdkfd will free resources when we return from this callback. - * Set flag to inform that there is nothing to do on "put_pages", etc. 
- */
-	ACCESS_ONCE(mem_context->free_callback_called) = 1;
-}
-
-
-static int amd_acquire(unsigned long addr, size_t size,
-			void *peer_mem_private_data,
-			char *peer_mem_name, void **client_context)
-{
-	int ret;
-	struct amd_mem_context *mem_context;
-	struct pid *pid;
-
-	/* Get pointer to structure describing current process */
-	pid = get_task_pid(current, PIDTYPE_PID);
-
-	pr_debug("addr:0x%lx,size:0x%x, pid 0x%p\n",
-			addr, (unsigned int)size, pid);
-
-	/* Check if address is handled by AMD GPU driver */
-	ret = rdma_interface->is_gpu_address(addr, pid);
-
-	if (!ret) {
-		pr_debug("Not GPU Address\n");
-		/* This is not GPU address */
-		return 0;
-	}
-
-	pr_debug("GPU address\n");
-
-	/* Initialize context used for operation with given address */
-	mem_context = kzalloc(sizeof(*mem_context), GFP_KERNEL);
-
-	if (!mem_context)
-		return 0;	/* Error case handled as not GPU address */
-
-	mem_context->free_callback_called = 0;
-	mem_context->va = addr;
-	mem_context->size = size;
-
-	/* Save PID. It is guaranteed that the function will be
-	 * called in the correct process context as opposed to others.
-	 */
-	mem_context->pid = pid;
-
-	pr_debug("Client context %p\n", mem_context);
-
-	/* Return pointer to allocated context */
-	*client_context = mem_context;
-
-	/* Return 1 to inform that this address will be handled
-	 * by the AMD GPU driver
-	 */
-	return 1;
-}
-
-static int amd_get_pages(unsigned long addr, size_t size, int write, int force,
-			  struct sg_table *sg_head,
-			  void *client_context, void *core_context)
-{
-	int ret;
-	struct amd_mem_context *mem_context =
-		(struct amd_mem_context *)client_context;
-
-	pr_debug("addr:0x%lx,size:0x%x, core_context:%p\n",
-			addr, (unsigned int)size, core_context);
-
-	if (!mem_context) {
-		pr_warn("Invalid client context");
-		return -EINVAL;
-	}
-
-	pr_debug("pid :0x%p\n", mem_context->pid);
-
-
-	if (addr != mem_context->va) {
-		pr_warn("Context address (0x%llx) is not the same\n",
-			mem_context->va);
-		return -EINVAL;
-	}
-
-	if (size != mem_context->size) {
-		pr_warn("Context size (0x%llx) is not the same\n",
-			mem_context->size);
-		return -EINVAL;
-	}
-
-	ret = rdma_interface->get_pages(addr,
-					size,
-					mem_context->pid,
-					&mem_context->p2p_info,
-					free_callback,
-					mem_context);
-
-	if (ret || !mem_context->p2p_info) {
-		pr_err("rdma::get_pages failed: %d\n", ret);
-		return ret;
-	}
-
-	mem_context->core_context = core_context;
-
-	/* Note: At this stage it is OK not to fill sg_table */
-	return 0;
-}
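The revocation protocol in this deleted file hinges on the free_callback_called flag set just above and consumed in amd_put_pages() further down: when amdkfd tears a mapping down, free_callback() asks the IB core to invalidate and marks the context so the later put_pages()/dma_unmap() calls become no-ops. A stripped-down, hypothetical sketch of that handshake (single-threaded; the real code publishes the flag with ACCESS_ONCE):

#include <stdio.h>

struct mem_ctx {
	int free_callback_called;	/* set on the driver revoke path */
};

/* Analogue of free_callback(): the GPU memory is going away. */
static void revoke(struct mem_ctx *ctx)
{
	/* real code: (*ib_invalidate_callback)(ib_reg_handle, core_context); */
	ctx->free_callback_called = 1;
}

/* Analogue of amd_put_pages(): must not touch revoked resources. */
static void put_pages(struct mem_ctx *ctx)
{
	if (ctx->free_callback_called) {
		printf("free callback already ran, nothing to do\n");
		return;
	}
	printf("releasing pages normally\n");
}

int main(void)
{
	struct mem_ctx ctx = { 0 };

	revoke(&ctx);
	put_pages(&ctx);
	return 0;
}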
-
-
-static int amd_dma_map(struct sg_table *sg_head, void *client_context,
-			struct device *dma_device, int dmasync, int *nmap)
-{
-	/*
-	 * NOTE/TODO:
-	 * We could have potentially three cases for real memory
-	 * location:
-	 *	- all memory in the local (device) memory
-	 *	- all memory in the system (RAM)
-	 *	- memory is spread (s/g) between local and system.
-	 *
-	 * In the case of all memory in the system we could use
-	 * iommu driver to build DMA addresses but not in the case
-	 * of local memory because currently iommu driver doesn't
-	 * deal with local/device memory addresses (it requires "struct
-	 * page").
-	 *
-	 * Accordingly, returning here assumes that iommu functionality
-	 * should be disabled so we can assume that sg_table already
-	 * contains DMA addresses.
-	 *
-	 */
-	struct amd_mem_context *mem_context =
-		(struct amd_mem_context *)client_context;
-
-	pr_debug("Context 0x%p, sg_head 0x%p\n",
-			client_context, sg_head);
-
-	pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n",
-			mem_context->pid,
-			mem_context->va,
-			mem_context->size);
-
-	if (!mem_context->p2p_info) {
-		pr_err("No sg_table was allocated\n");
-		return -EINVAL;
-	}
-
-	/* Copy information about previously allocated sg_table */
-	*sg_head = *mem_context->p2p_info->pages;
-
-	/* Return number of pages */
-	*nmap = mem_context->p2p_info->pages->nents;
-
-	return 0;
-}
-
-static int amd_dma_unmap(struct sg_table *sg_head, void *client_context,
-			   struct device *dma_device)
-{
-	struct amd_mem_context *mem_context =
-		(struct amd_mem_context *)client_context;
-
-	pr_debug("Context 0x%p, sg_table 0x%p\n",
-			client_context, sg_head);
-
-	pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n",
-			mem_context->pid,
-			mem_context->va,
-			mem_context->size);
-
-	/* Assume success */
-	return 0;
-}
-static void amd_put_pages(struct sg_table *sg_head, void *client_context)
-{
-	int ret = 0;
-	struct amd_mem_context *mem_context =
-		(struct amd_mem_context *)client_context;
-
-	pr_debug("sg_head %p client_context: 0x%p\n",
-			sg_head, client_context);
-	pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n",
-			mem_context->pid,
-			mem_context->va,
-			mem_context->size);
-
-	pr_debug("mem_context->p2p_info %p\n",
-			mem_context->p2p_info);
-
-	if (ACCESS_ONCE(mem_context->free_callback_called)) {
-		pr_debug("Free callback was called\n");
-		return;
-	}
-
-	if (mem_context->p2p_info) {
-		ret = rdma_interface->put_pages(&mem_context->p2p_info);
-		mem_context->p2p_info = NULL;
-
-		if (ret)
-			pr_err("Failure: %d (callback status %d)\n",
-					ret, mem_context->free_callback_called);
-	} else
-		pr_err("Pointer to p2p info is null\n");
-}
-static unsigned long amd_get_page_size(void *client_context)
-{
-	unsigned long page_size;
-	int result;
-	struct amd_mem_context *mem_context =
-		(struct amd_mem_context *)client_context;
-
-	pr_debug("context: %p\n", client_context);
-	pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n",
-			mem_context->pid,
-			mem_context->va,
-			mem_context->size);
-
-
-	result = rdma_interface->get_page_size(
-				mem_context->va,
-				mem_context->size,
-				mem_context->pid,
-				&page_size);
-
-	if (result) {
-		pr_err("Could not get page size. %d\n", result);
-		/* If we failed to get the page size then we do not know what
-		 * to do. Let's return some default value
-		 */
-		return PAGE_SIZE;
-	}
-
-	return page_size;
-}
-
-static void amd_release(void *client_context)
-{
-	struct amd_mem_context *mem_context =
-		(struct amd_mem_context *)client_context;
-
-	pr_debug("context: 0x%p\n", client_context);
-	pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n",
-			mem_context->pid,
-			mem_context->va,
-			mem_context->size);
-
-	kfree(mem_context);
-}
-
-
-static struct peer_memory_client amd_mem_client = {
-	.acquire = amd_acquire,
-	.get_pages = amd_get_pages,
-	.dma_map = amd_dma_map,
-	.dma_unmap = amd_dma_unmap,
-	.put_pages = amd_put_pages,
-	.get_page_size = amd_get_page_size,
-	.release = amd_release,
-	.get_context_private_data = NULL,
-	.put_context_private_data = NULL,
-};
-
-/** Initialize PeerDirect interface with RDMA Network stack.
- *
- * Because the network stack could potentially be loaded later, we check
- * for the presence of PeerDirect when an HSA process is created. If
- * PeerDirect was already initialized we do nothing; otherwise we try to
- * detect and register it.
- */ -void kfd_init_peer_direct(void) -{ - int result; - - if (pfn_ib_unregister_peer_memory_client) { - pr_debug("PeerDirect support was already initialized\n"); - return; - } - - pr_debug("Try to initialize PeerDirect support\n"); - - pfn_ib_register_peer_memory_client = - (void *(*)(struct peer_memory_client *, - invalidate_peer_memory *)) - symbol_request(ib_register_peer_memory_client); - - pfn_ib_unregister_peer_memory_client = (void (*)(void *)) - symbol_request(ib_unregister_peer_memory_client); - - if (!pfn_ib_register_peer_memory_client || - !pfn_ib_unregister_peer_memory_client) { - pr_debug("PeerDirect interface was not detected\n"); - /* Do cleanup */ - kfd_close_peer_direct(); - return; - } - - result = amdkfd_query_rdma_interface(&rdma_interface); - - if (result < 0) { - pr_err("Cannot get RDMA Interface (result = %d)\n", result); - return; - } - - strcpy(amd_mem_client.name, AMD_PEER_BRIDGE_DRIVER_NAME); - strcpy(amd_mem_client.version, AMD_PEER_BRIDGE_DRIVER_VERSION); - - ib_reg_handle = pfn_ib_register_peer_memory_client(&amd_mem_client, - &ib_invalidate_callback); - - if (!ib_reg_handle) { - pr_err("Cannot register peer memory client\n"); - /* Do cleanup */ - kfd_close_peer_direct(); - return; - } - - pr_info("PeerDirect support was initialized successfully\n"); -} - -/** - * Close connection with PeerDirect interface with RDMA Network stack. - * - */ -void kfd_close_peer_direct(void) -{ - if (pfn_ib_unregister_peer_memory_client) { - if (ib_reg_handle) - pfn_ib_unregister_peer_memory_client(ib_reg_handle); - - symbol_put(ib_unregister_peer_memory_client); - } - - if (pfn_ib_register_peer_memory_client) - symbol_put(ib_register_peer_memory_client); - - - /* Reset pointers to be safe */ - pfn_ib_unregister_peer_memory_client = NULL; - pfn_ib_register_peer_memory_client = NULL; - ib_reg_handle = NULL; -} - diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h deleted file mode 100644 index ddad9be..0000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h +++ /dev/null @@ -1,583 +0,0 @@ -/* - * Copyright 2016 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- * - */ - -#ifndef F32_MES_PM4_PACKETS_H -#define F32_MES_PM4_PACKETS_H - -#ifndef PM4_MES_HEADER_DEFINED -#define PM4_MES_HEADER_DEFINED -union PM4_MES_TYPE_3_HEADER { - struct { - uint32_t reserved1 : 8; /* < reserved */ - uint32_t opcode : 8; /* < IT opcode */ - uint32_t count : 14;/* < number of DWORDs - 1 in the - * information body. - */ - uint32_t type : 2; /* < packet identifier. - * It should be 3 for type 3 packets - */ - }; - uint32_t u32All; -}; -#endif /* PM4_MES_HEADER_DEFINED */ - -/*--------------------MES_SET_RESOURCES--------------------*/ - -#ifndef PM4_MES_SET_RESOURCES_DEFINED -#define PM4_MES_SET_RESOURCES_DEFINED -enum mes_set_resources_queue_type_enum { - queue_type__mes_set_resources__kernel_interface_queue_kiq = 0, - queue_type__mes_set_resources__hsa_interface_queue_hiq = 1, - queue_type__mes_set_resources__hsa_debug_interface_queue = 4 -}; - - -struct pm4_mes_set_resources { - union { - union PM4_MES_TYPE_3_HEADER header; /* header */ - uint32_t ordinal1; - }; - - union { - struct { - uint32_t vmid_mask:16; - uint32_t unmap_latency:8; - uint32_t reserved1:5; - enum mes_set_resources_queue_type_enum queue_type:3; - } bitfields2; - uint32_t ordinal2; - }; - - uint32_t queue_mask_lo; - uint32_t queue_mask_hi; - uint32_t gws_mask_lo; - uint32_t gws_mask_hi; - - union { - struct { - uint32_t oac_mask:16; - uint32_t reserved2:16; - } bitfields7; - uint32_t ordinal7; - }; - - union { - struct { - uint32_t gds_heap_base:6; - uint32_t reserved3:5; - uint32_t gds_heap_size:6; - uint32_t reserved4:15; - } bitfields8; - uint32_t ordinal8; - }; - -}; -#endif - -/*--------------------MES_RUN_LIST--------------------*/ - -#ifndef PM4_MES_RUN_LIST_DEFINED -#define PM4_MES_RUN_LIST_DEFINED - -struct pm4_mes_runlist { - union { - union PM4_MES_TYPE_3_HEADER header; /* header */ - uint32_t ordinal1; - }; - - union { - struct { - uint32_t reserved1:2; - uint32_t ib_base_lo:30; - } bitfields2; - uint32_t ordinal2; - }; - - uint32_t ib_base_hi; - - union { - struct { - uint32_t ib_size:20; - uint32_t chain:1; - uint32_t offload_polling:1; - uint32_t reserved2:1; - uint32_t valid:1; - uint32_t process_cnt:4; - uint32_t reserved3:4; - } bitfields4; - uint32_t ordinal4; - }; - -}; -#endif - -/*--------------------MES_MAP_PROCESS--------------------*/ - -#ifndef PM4_MES_MAP_PROCESS_DEFINED -#define PM4_MES_MAP_PROCESS_DEFINED - -struct pm4_mes_map_process { - union { - union PM4_MES_TYPE_3_HEADER header; /* header */ - uint32_t ordinal1; - }; - - union { - struct { - uint32_t pasid:16; - uint32_t reserved1:8; - uint32_t diq_enable:1; - uint32_t process_quantum:7; - } bitfields2; - uint32_t ordinal2; - }; - - uint32_t vm_context_page_table_base_addr_lo32; - - uint32_t vm_context_page_table_base_addr_hi32; - - uint32_t sh_mem_bases; - - uint32_t sh_mem_config; - - uint32_t sq_shader_tba_lo; - - uint32_t sq_shader_tba_hi; - - uint32_t sq_shader_tma_lo; - - uint32_t sq_shader_tma_hi; - - uint32_t reserved6; - - uint32_t gds_addr_lo; - - uint32_t gds_addr_hi; - - union { - struct { - uint32_t num_gws:6; - uint32_t reserved7:1; - uint32_t sdma_enable:1; - uint32_t num_oac:4; - uint32_t reserved8:4; - uint32_t gds_size:6; - uint32_t num_queues:10; - } bitfields14; - uint32_t ordinal14; - }; - - uint32_t completion_signal_lo; - - uint32_t completion_signal_hi; - -}; - -#endif - -/*--------------------MES_MAP_PROCESS_VM--------------------*/ - -#ifndef PM4_MES_MAP_PROCESS_VM_DEFINED -#define PM4_MES_MAP_PROCESS_VM_DEFINED - -struct PM4_MES_MAP_PROCESS_VM { - union { - union PM4_MES_TYPE_3_HEADER 
header; /* header */ - uint32_t ordinal1; - }; - - uint32_t reserved1; - - uint32_t vm_context_cntl; - - uint32_t reserved2; - - uint32_t vm_context_page_table_end_addr_lo32; - - uint32_t vm_context_page_table_end_addr_hi32; - - uint32_t vm_context_page_table_start_addr_lo32; - - uint32_t vm_context_page_table_start_addr_hi32; - - uint32_t reserved3; - - uint32_t reserved4; - - uint32_t reserved5; - - uint32_t reserved6; - - uint32_t reserved7; - - uint32_t reserved8; - - uint32_t completion_signal_lo32; - - uint32_t completion_signal_hi32; - -}; -#endif - -/*--------------------MES_MAP_QUEUES--------------------*/ - -#ifndef PM4_MES_MAP_QUEUES_VI_DEFINED -#define PM4_MES_MAP_QUEUES_VI_DEFINED -enum mes_map_queues_queue_sel_enum { - queue_sel__mes_map_queues__map_to_specified_queue_slots_vi = 0, -queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi = 1 -}; - -enum mes_map_queues_queue_type_enum { - queue_type__mes_map_queues__normal_compute_vi = 0, - queue_type__mes_map_queues__debug_interface_queue_vi = 1, - queue_type__mes_map_queues__normal_latency_static_queue_vi = 2, -queue_type__mes_map_queues__low_latency_static_queue_vi = 3 -}; - -enum mes_map_queues_alloc_format_enum { - alloc_format__mes_map_queues__one_per_pipe_vi = 0, -alloc_format__mes_map_queues__all_on_one_pipe_vi = 1 -}; - -enum mes_map_queues_engine_sel_enum { - engine_sel__mes_map_queues__compute_vi = 0, - engine_sel__mes_map_queues__sdma0_vi = 2, - engine_sel__mes_map_queues__sdma1_vi = 3 -}; - - -struct pm4_mes_map_queues { - union { - union PM4_MES_TYPE_3_HEADER header; /* header */ - uint32_t ordinal1; - }; - - union { - struct { - uint32_t reserved1:4; - enum mes_map_queues_queue_sel_enum queue_sel:2; - uint32_t reserved2:15; - enum mes_map_queues_queue_type_enum queue_type:3; - enum mes_map_queues_alloc_format_enum alloc_format:2; - enum mes_map_queues_engine_sel_enum engine_sel:3; - uint32_t num_queues:3; - } bitfields2; - uint32_t ordinal2; - }; - - union { - struct { - uint32_t reserved3:1; - uint32_t check_disable:1; - uint32_t doorbell_offset:26; - uint32_t reserved4:4; - } bitfields3; - uint32_t ordinal3; - }; - - uint32_t mqd_addr_lo; - uint32_t mqd_addr_hi; - uint32_t wptr_addr_lo; - uint32_t wptr_addr_hi; -}; -#endif - -/*--------------------MES_QUERY_STATUS--------------------*/ - -#ifndef PM4_MES_QUERY_STATUS_DEFINED -#define PM4_MES_QUERY_STATUS_DEFINED -enum mes_query_status_interrupt_sel_enum { - interrupt_sel__mes_query_status__completion_status = 0, - interrupt_sel__mes_query_status__process_status = 1, - interrupt_sel__mes_query_status__queue_status = 2 -}; - -enum mes_query_status_command_enum { - command__mes_query_status__interrupt_only = 0, - command__mes_query_status__fence_only_immediate = 1, - command__mes_query_status__fence_only_after_write_ack = 2, - command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3 -}; - -enum mes_query_status_engine_sel_enum { - engine_sel__mes_query_status__compute = 0, - engine_sel__mes_query_status__sdma0_queue = 2, - engine_sel__mes_query_status__sdma1_queue = 3 -}; - -struct pm4_mes_query_status { - union { - union PM4_MES_TYPE_3_HEADER header; /* header */ - uint32_t ordinal1; - }; - - union { - struct { - uint32_t context_id:28; - enum mes_query_status_interrupt_sel_enum interrupt_sel:2; - enum mes_query_status_command_enum command:2; - } bitfields2; - uint32_t ordinal2; - }; - - union { - struct { - uint32_t pasid:16; - uint32_t reserved1:16; - } bitfields3a; - struct { - uint32_t reserved2:2; - uint32_t doorbell_offset:26; - enum 
mes_query_status_engine_sel_enum engine_sel:3; - uint32_t reserved3:1; - } bitfields3b; - uint32_t ordinal3; - }; - - uint32_t addr_lo; - uint32_t addr_hi; - uint32_t data_lo; - uint32_t data_hi; -}; -#endif - -/*--------------------MES_UNMAP_QUEUES--------------------*/ - -#ifndef PM4_MES_UNMAP_QUEUES_DEFINED -#define PM4_MES_UNMAP_QUEUES_DEFINED -enum mes_unmap_queues_action_enum { - action__mes_unmap_queues__preempt_queues = 0, - action__mes_unmap_queues__reset_queues = 1, - action__mes_unmap_queues__disable_process_queues = 2, - action__mes_unmap_queues__reserved = 3 -}; - -enum mes_unmap_queues_queue_sel_enum { - queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0, - queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1, - queue_sel__mes_unmap_queues__unmap_all_queues = 2, - queue_sel__mes_unmap_queues__unmap_all_non_static_queues = 3 -}; - -enum mes_unmap_queues_engine_sel_enum { - engine_sel__mes_unmap_queues__compute = 0, - engine_sel__mes_unmap_queues__sdma0 = 2, - engine_sel__mes_unmap_queues__sdmal = 3 -}; - -struct pm4_mes_unmap_queues { - union { - union PM4_MES_TYPE_3_HEADER header; /* header */ - uint32_t ordinal1; - }; - - union { - struct { - enum mes_unmap_queues_action_enum action:2; - uint32_t reserved1:2; - enum mes_unmap_queues_queue_sel_enum queue_sel:2; - uint32_t reserved2:20; - enum mes_unmap_queues_engine_sel_enum engine_sel:3; - uint32_t num_queues:3; - } bitfields2; - uint32_t ordinal2; - }; - - union { - struct { - uint32_t pasid:16; - uint32_t reserved3:16; - } bitfields3a; - struct { - uint32_t reserved4:2; - uint32_t doorbell_offset0:26; - int32_t reserved5:4; - } bitfields3b; - uint32_t ordinal3; - }; - - union { - struct { - uint32_t reserved6:2; - uint32_t doorbell_offset1:26; - uint32_t reserved7:4; - } bitfields4; - uint32_t ordinal4; - }; - - union { - struct { - uint32_t reserved8:2; - uint32_t doorbell_offset2:26; - uint32_t reserved9:4; - } bitfields5; - uint32_t ordinal5; - }; - - union { - struct { - uint32_t reserved10:2; - uint32_t doorbell_offset3:26; - uint32_t reserved11:4; - } bitfields6; - uint32_t ordinal6; - }; -}; -#endif - -#ifndef PM4_MEC_RELEASE_MEM_DEFINED -#define PM4_MEC_RELEASE_MEM_DEFINED - -enum mec_release_mem_event_index_enum { - event_index__mec_release_mem__end_of_pipe = 5, - event_index__mec_release_mem__shader_done = 6 -}; - -enum mec_release_mem_cache_policy_enum { - cache_policy__mec_release_mem__lru = 0, - cache_policy__mec_release_mem__stream = 1 -}; - -enum mec_release_mem_pq_exe_status_enum { - pq_exe_status__mec_release_mem__default = 0, - pq_exe_status__mec_release_mem__phase_update = 1 -}; - -enum mec_release_mem_dst_sel_enum { - dst_sel__mec_release_mem__memory_controller = 0, - dst_sel__mec_release_mem__tc_l2 = 1, - dst_sel__mec_release_mem__queue_write_pointer_register = 2, - dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit = 3 -}; - -enum mec_release_mem_int_sel_enum { - int_sel__mec_release_mem__none = 0, - int_sel__mec_release_mem__send_interrupt_only = 1, - int_sel__mec_release_mem__send_interrupt_after_write_confirm = 2, - int_sel__mec_release_mem__send_data_after_write_confirm = 3, - int_sel__mec_release_mem__unconditionally_send_int_ctxid = 4, - int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare = 5, - int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare = 6 -}; - -enum mec_release_mem_data_sel_enum { - data_sel__mec_release_mem__none = 0, - data_sel__mec_release_mem__send_32_bit_low = 1, - 
data_sel__mec_release_mem__send_64_bit_data = 2, - data_sel__mec_release_mem__send_gpu_clock_counter = 3, - data_sel__mec_release_mem__send_cp_perfcounter_hi_lo = 4, - data_sel__mec_release_mem__store_gds_data_to_memory = 5 -}; - -struct pm4_mec_release_mem { - union { - union PM4_MES_TYPE_3_HEADER header; /*header */ - unsigned int ordinal1; - }; - - union { - struct { - unsigned int event_type:6; - unsigned int reserved1:2; - enum mec_release_mem_event_index_enum event_index:4; - unsigned int tcl1_vol_action_ena:1; - unsigned int tc_vol_action_ena:1; - unsigned int reserved2:1; - unsigned int tc_wb_action_ena:1; - unsigned int tcl1_action_ena:1; - unsigned int tc_action_ena:1; - uint32_t reserved3:1; - uint32_t tc_nc_action_ena:1; - uint32_t tc_wc_action_ena:1; - uint32_t tc_md_action_ena:1; - uint32_t reserved4:3; - enum mec_release_mem_cache_policy_enum cache_policy:2; - uint32_t reserved5:2; - enum mec_release_mem_pq_exe_status_enum pq_exe_status:1; - uint32_t reserved6:2; - } bitfields2; - unsigned int ordinal2; - }; - - union { - struct { - uint32_t reserved7:16; - enum mec_release_mem_dst_sel_enum dst_sel:2; - uint32_t reserved8:6; - enum mec_release_mem_int_sel_enum int_sel:3; - uint32_t reserved9:2; - enum mec_release_mem_data_sel_enum data_sel:3; - } bitfields3; - unsigned int ordinal3; - }; - - union { - struct { - uint32_t reserved10:2; - unsigned int address_lo_32b:30; - } bitfields4; - struct { - uint32_t reserved11:3; - uint32_t address_lo_64b:29; - } bitfields4b; - uint32_t reserved12; - unsigned int ordinal4; - }; - - union { - uint32_t address_hi; - uint32_t reserved13; - uint32_t ordinal5; - }; - - union { - uint32_t data_lo; - uint32_t cmp_data_lo; - struct { - uint32_t dw_offset:16; - uint32_t num_dwords:16; - } bitfields6c; - uint32_t reserved14; - uint32_t ordinal6; - }; - - union { - uint32_t data_hi; - uint32_t cmp_data_hi; - uint32_t reserved15; - uint32_t reserved16; - uint32_t ordinal7; - }; - - uint32_t int_ctxid; - -}; - -#endif - -enum { - CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 -}; -#endif - diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h index 0b314a8..a0ff348 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h @@ -77,6 +77,103 @@ struct pm4__indirect_buffer_pasid { #endif +/*--------------------_RELEASE_MEM-------------------- */ + +#ifndef _PM4__RELEASE_MEM_DEFINED +#define _PM4__RELEASE_MEM_DEFINED +enum _RELEASE_MEM_event_index_enum { + event_index___release_mem__end_of_pipe = 5, + event_index___release_mem__shader_done = 6 +}; + +enum _RELEASE_MEM_cache_policy_enum { + cache_policy___release_mem__lru = 0, + cache_policy___release_mem__stream = 1, + cache_policy___release_mem__bypass = 2 +}; + +enum _RELEASE_MEM_dst_sel_enum { + dst_sel___release_mem__memory_controller = 0, + dst_sel___release_mem__tc_l2 = 1, + dst_sel___release_mem__queue_write_pointer_register = 2, + dst_sel___release_mem__queue_write_pointer_poll_mask_bit = 3 +}; + +enum _RELEASE_MEM_int_sel_enum { + int_sel___release_mem__none = 0, + int_sel___release_mem__send_interrupt_only = 1, + int_sel___release_mem__send_interrupt_after_write_confirm = 2, + int_sel___release_mem__send_data_after_write_confirm = 3 +}; + +enum _RELEASE_MEM_data_sel_enum { + data_sel___release_mem__none = 0, + data_sel___release_mem__send_32_bit_low = 1, + data_sel___release_mem__send_64_bit_data = 2, + data_sel___release_mem__send_gpu_clock_counter = 3, + 
data_sel___release_mem__send_cp_perfcounter_hi_lo = 4, + data_sel___release_mem__store_gds_data_to_memory = 5 +}; + +struct pm4__release_mem { + union { + union PM4_MES_TYPE_3_HEADER header; /*header */ + unsigned int ordinal1; + }; + + union { + struct { + unsigned int event_type:6; + unsigned int reserved1:2; + enum _RELEASE_MEM_event_index_enum event_index:4; + unsigned int tcl1_vol_action_ena:1; + unsigned int tc_vol_action_ena:1; + unsigned int reserved2:1; + unsigned int tc_wb_action_ena:1; + unsigned int tcl1_action_ena:1; + unsigned int tc_action_ena:1; + unsigned int reserved3:6; + unsigned int atc:1; + enum _RELEASE_MEM_cache_policy_enum cache_policy:2; + unsigned int reserved4:5; + } bitfields2; + unsigned int ordinal2; + }; + + union { + struct { + unsigned int reserved5:16; + enum _RELEASE_MEM_dst_sel_enum dst_sel:2; + unsigned int reserved6:6; + enum _RELEASE_MEM_int_sel_enum int_sel:3; + unsigned int reserved7:2; + enum _RELEASE_MEM_data_sel_enum data_sel:3; + } bitfields3; + unsigned int ordinal3; + }; + + union { + struct { + unsigned int reserved8:2; + unsigned int address_lo_32b:30; + } bitfields4; + struct { + unsigned int reserved9:3; + unsigned int address_lo_64b:29; + } bitfields5; + unsigned int ordinal4; + }; + + unsigned int address_hi; + + unsigned int data_lo; + + unsigned int data_hi; + +}; +#endif + + /*--------------------_SET_CONFIG_REG-------------------- */ #ifndef _PM4__SET_CONFIG_REG_DEFINED diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 28fac2d..b397ec7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -24,55 +24,19 @@ #define KFD_PRIV_H_INCLUDED #include -#include #include #include #include #include #include #include -#include #include -#include -#include -#include -#include -#include #include -#include -#include "amd_shared.h" - #define KFD_SYSFS_FILE_MODE 0444 -/* GPU ID hash width in bits */ -#define KFD_GPU_ID_HASH_WIDTH 16 - -/* Use upper bits of mmap offset to store KFD driver specific information. - * BITS[63:62] - Encode MMAP type - * BITS[61:46] - Encode gpu_id. To identify to which GPU the offset belongs to - * BITS[45:40] - Reserved. Not Used. - * BITS[39:0] - MMAP offset value. Used by TTM. - * - * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. 
Hence, these - * defines are w.r.t to PAGE_SIZE - */ -#define KFD_MMAP_TYPE_SHIFT (62 - PAGE_SHIFT) -#define KFD_MMAP_TYPE_MASK (0x3ULL << KFD_MMAP_TYPE_SHIFT) -#define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT) -#define KFD_MMAP_TYPE_EVENTS (0x2ULL << KFD_MMAP_TYPE_SHIFT) -#define KFD_MMAP_TYPE_RESERVED_MEM (0x1ULL << KFD_MMAP_TYPE_SHIFT) - -#define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT) -#define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \ - << KFD_MMAP_GPU_ID_SHIFT) -#define KFD_MMAP_GPU_ID(gpu_id) ((((uint64_t)gpu_id) << KFD_MMAP_GPU_ID_SHIFT)\ - & KFD_MMAP_GPU_ID_MASK) -#define KFD_MMAP_GPU_ID_GET(offset) ((offset & KFD_MMAP_GPU_ID_MASK) \ - >> KFD_MMAP_GPU_ID_SHIFT) - -#define KFD_MMAP_OFFSET_VALUE_MASK (0xFFFFFFFFFFULL >> PAGE_SHIFT) -#define KFD_MMAP_OFFSET_VALUE_GET(offset) (offset & KFD_MMAP_OFFSET_VALUE_MASK) +#define KFD_MMAP_DOORBELL_MASK 0x8000000000000 +#define KFD_MMAP_EVENTS_MASK 0x4000000000000 /* * When working with cp scheduler we should assign the HIQ manually or via @@ -84,6 +48,8 @@ #define KFD_CIK_HIQ_PIPE 4 #define KFD_CIK_HIQ_QUEUE 0 +/* GPU ID hash width in bits */ +#define KFD_GPU_ID_HASH_WIDTH 16 /* Macro for allocating structures */ #define kfd_alloc_struct(ptr_to_struct) \ @@ -93,15 +59,6 @@ #define KFD_MAX_NUM_OF_QUEUES_PER_PROCESS 1024 /* - * Size of the per-process TBA+TMA buffer: 2 pages - * - * The first page is the TBA used for the CWSR ISA code. The second - * page is used as TMA for daisy changing a user-mode trap handler. - */ -#define KFD_CWSR_TBA_TMA_SIZE (PAGE_SIZE * 2) -#define KFD_CWSR_TMA_OFFSET PAGE_SIZE - -/* * Kernel module parameter to specify maximum number of supported queues per * device */ @@ -117,50 +74,12 @@ extern int max_num_of_queues_per_device; /* Kernel module parameter to specify the scheduling policy */ extern int sched_policy; -extern int cwsr_enable; - -/* - * Kernel module parameter to specify the maximum process - * number per HW scheduler - */ -extern int hws_max_conc_proc; - /* * Kernel module parameter to specify whether to send sigterm to HSA process on * unhandled exception */ extern int send_sigterm; -/* - * This kernel module is used to simulate large bar machine on non-large bar - * enabled machines. - */ -extern int debug_largebar; - -/* - * Ignore CRAT table during KFD initialization, can be used to work around - * broken CRAT tables on some AMD systems - */ -extern int ignore_crat; - -/* - * Set sh_mem_config.retry_disable on Vega10 - */ -extern int vega10_noretry; - -/* - * Enable privileged mode for all CP queues including user queues - */ -extern int priv_cp_queues; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 10, 0) && defined(BUILD_AS_DKMS) -/* - * Currently, mm_access() function is not exported. So for DKMS build, - * CMA will be enabled only if module param is set. 
- */ -extern int cma_enable; -#endif - /** * enum kfd_sched_policy * @@ -193,30 +112,26 @@ enum cache_policy { cache_policy_noncoherent }; -#define KFD_IS_SOC15(chip) ((chip) >= CHIP_VEGA10) +enum asic_family_type { + CHIP_KAVERI = 0, + CHIP_CARRIZO +}; struct kfd_event_interrupt_class { bool (*interrupt_isr)(struct kfd_dev *dev, - const uint32_t *ih_ring_entry, uint32_t *patched_ihre, - bool *patched_flag); + const uint32_t *ih_ring_entry); void (*interrupt_wq)(struct kfd_dev *dev, - const uint32_t *ih_ring_entry); + const uint32_t *ih_ring_entry); }; struct kfd_device_info { - enum amd_asic_type asic_family; + unsigned int asic_family; const struct kfd_event_interrupt_class *event_interrupt_class; unsigned int max_pasid_bits; unsigned int max_no_of_hqd; - unsigned int doorbell_size; size_t ih_ring_entry_size; uint8_t num_of_watch_points; uint16_t mqd_size_aligned; - bool is_need_iommu_device; - bool supports_cwsr; - bool needs_pci_atomics; - /* obtain from adev->sdma.num_instances */ - unsigned int num_sdma_engines; }; struct kfd_mem_obj { @@ -224,13 +139,6 @@ struct kfd_mem_obj { uint32_t range_end; uint64_t gpu_addr; uint32_t *cpu_ptr; - void *gtt_mem; -}; - -struct kfd_vmid_info { - uint32_t first_vmid_kfd; - uint32_t last_vmid_kfd; - uint32_t vmid_num_kfd; }; struct kfd_dev { @@ -249,12 +157,14 @@ struct kfd_dev { * to HW doorbell, GFX reserved some * at the start) */ + size_t doorbell_process_limit; /* Number of processes we have doorbell + * space for. + */ u32 __iomem *doorbell_kernel_ptr; /* This is a pointer for a doorbells * page used by kernel queue */ struct kgd2kfd_shared_resources shared_resources; - struct kfd_vmid_info vm_info; const struct kfd2kgd_calls *kfd2kgd; struct mutex doorbell_mutex; @@ -270,8 +180,10 @@ struct kfd_dev { unsigned int gtt_sa_num_of_chunks; /* Interrupts */ - struct kfifo ih_fifo; - struct workqueue_struct *ih_wq; + void *interrupt_ring; + size_t interrupt_ring_size; + atomic_t interrupt_ring_rptr; + atomic_t interrupt_ring_wptr; struct work_struct interrupt_work; spinlock_t interrupt_lock; @@ -279,7 +191,6 @@ struct kfd_dev { struct device_queue_manager *dqm; bool init_complete; - /* * Interrupts of interest to KFD are copied * from the HW ring into a SW ring. @@ -287,31 +198,7 @@ struct kfd_dev { bool interrupts_active; /* Debug manager */ - struct kfd_dbgmgr *dbgmgr; - - /* MEC firmware version*/ - uint16_t mec_fw_version; - - /* Maximum process number mapped to HW scheduler */ - unsigned int max_proc_per_quantum; - - /* CWSR */ - bool cwsr_enabled; - const void *cwsr_isa; - unsigned int cwsr_isa_size; - - /* IB usage */ - uint32_t ib_size; -}; - -struct kfd_ipc_obj; - -struct kfd_bo { - void *mem; - struct interval_tree_node it; - struct kfd_dev *dev; - struct list_head cb_data_head; - struct kfd_ipc_obj *kfd_ipc_obj; + struct kfd_dbgmgr *dbgmgr; }; /* KGD2KFD callbacks */ @@ -334,22 +221,22 @@ void kfd_chardev_exit(void); struct device *kfd_chardev(void); /** - * enum kfd_unmap_queues_filter + * enum kfd_preempt_type_filter * - * @KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: Preempts single queue. + * @KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE: Preempts single queue. * - * @KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: Preempts all queues in the + * @KFD_PRERMPT_TYPE_FILTER_ALL_QUEUES: Preempts all queues in the * running queues list. * - * @KFD_UNMAP_QUEUES_FILTER_BY_PASID: Preempts queues that belongs to + * @KFD_PRERMPT_TYPE_FILTER_BY_PASID: Preempts queues that belongs to * specific process. 
* */ -enum kfd_unmap_queues_filter { - KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE, - KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, - KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, - KFD_UNMAP_QUEUES_FILTER_BY_PASID +enum kfd_preempt_type_filter { + KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE, + KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES, + KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES, + KFD_PREEMPT_TYPE_FILTER_BY_PASID }; /** @@ -375,11 +262,6 @@ enum kfd_queue_format { KFD_QUEUE_FORMAT_AQL }; -enum KFD_QUEUE_PRIORITY { - KFD_QUEUE_PRIORITY_MINIMUM = 0, - KFD_QUEUE_PRIORITY_MAXIMUM = 15 -}; - /** * struct queue_properties * @@ -434,10 +316,9 @@ struct queue_properties { uint32_t queue_percent; uint32_t *read_ptr; uint32_t *write_ptr; - void __iomem *doorbell_ptr; + uint32_t __iomem *doorbell_ptr; uint32_t doorbell_off; bool is_interop; - bool is_evicted; /* true -> queue is evicted */ bool is_active; /* Not relevant for user mode queues in cp scheduling */ unsigned int vmid; @@ -450,12 +331,6 @@ struct queue_properties { uint32_t eop_ring_buffer_size; uint64_t ctx_save_restore_area_address; uint32_t ctx_save_restore_area_size; - uint32_t ctl_stack_size; - uint64_t tba_addr; - uint64_t tma_addr; - /* Relevant for CU */ - uint32_t cu_mask_count; /* Must be a multiple of 32 */ - uint32_t *cu_mask; }; /** @@ -500,7 +375,6 @@ struct queue { uint32_t queue; unsigned int sdma_id; - unsigned int doorbell_id; struct kfd_process *process; struct kfd_dev *device; @@ -517,19 +391,6 @@ enum KFD_MQD_TYPE { KFD_MQD_TYPE_MAX }; -enum KFD_PIPE_PRIORITY { - KFD_PIPE_PRIORITY_CS_LOW = 0, - KFD_PIPE_PRIORITY_CS_MEDIUM, - KFD_PIPE_PRIORITY_CS_HIGH -}; - -enum KFD_SPI_PRIORITY { - KFD_SPI_PRIORITY_EXTRA_LOW = 0, - KFD_SPI_PRIORITY_LOW, - KFD_SPI_PRIORITY_MEDIUM, - KFD_SPI_PRIORITY_HIGH -}; - struct scheduling_resources { unsigned int vmid_mask; enum kfd_queue_type type; @@ -543,6 +404,7 @@ struct scheduling_resources { struct process_queue_manager { /* data */ struct kfd_process *process; + unsigned int num_concurrent_processes; struct list_head queues; unsigned long *queue_slot_bitmap; }; @@ -556,16 +418,8 @@ struct qcm_process_device { struct list_head priv_queue_list; unsigned int queue_count; - /* a data field only meaningful for non-HWS case */ unsigned int vmid; bool is_debug; - unsigned int evicted; /* eviction counter, 0=active */ - - /* This flag tells if we should reset all wavefronts on - * process termination - */ - bool reset_wavefronts; - /* * All the memory management data should be here too */ @@ -579,49 +433,6 @@ struct qcm_process_device { uint32_t num_gws; uint32_t num_oac; uint32_t sh_hidden_private_base; - - /* CWSR memory */ - void *cwsr_kaddr; - uint64_t cwsr_base; - uint64_t tba_addr; - uint64_t tma_addr; - - /* IB memory */ - uint64_t ib_base; /* ib_base+ib_size must be below cwsr_base */ - void *ib_kaddr; - - /*doorbell resources per process per device*/ - unsigned long *doorbell_bitmap; -}; - -/* KFD Memory Eviction */ - -/* Approx. wait time before attempting to restore evicted BOs */ -#define PROCESS_RESTORE_TIME_MS 100 -/* Approx. back off time if restore fails due to lack of memory */ -#define PROCESS_BACK_OFF_TIME_MS 100 -/* Approx. 
time before evicting the process again */ -#define PROCESS_ACTIVE_TIME_MS 10 - -void kfd_evict_bo_worker(struct work_struct *work); -void kfd_restore_bo_worker(struct work_struct *work); -int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, - struct dma_fence *fence); -int quiesce_process_mm(struct kfd_process *p); - - -/* 8 byte handle containing GPU ID in the most significant 4 bytes and - * idr_handle in the least significant 4 bytes - */ -#define MAKE_HANDLE(gpu_id, idr_handle) \ - (((uint64_t)(gpu_id) << 32) + idr_handle) -#define GET_GPU_ID(handle) (handle >> 32) -#define GET_IDR_HANDLE(handle) (handle & 0xFFFFFFFF) - -enum kfd_pdd_bound { - PDD_UNBOUND = 0, - PDD_BOUND, - PDD_BOUND_SUSPENDED, }; /* Data that is per-process-per device. */ @@ -635,8 +446,6 @@ struct kfd_process_device { /* The device that owns this data. */ struct kfd_dev *dev; - /* The process that owns this kfd_process_device. */ - struct kfd_process *process; /* per-process-per device QCM data structure */ struct qcm_process_device qpd; @@ -648,24 +457,14 @@ struct kfd_process_device { uint64_t gpuvm_limit; uint64_t scratch_base; uint64_t scratch_limit; - uint64_t dgpu_base; - uint64_t dgpu_limit; /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */ - enum kfd_pdd_bound bound; - - /* VM context for GPUVM allocations */ - void *vm; + bool bound; - /* GPUVM allocations storage */ - struct idr alloc_idr; - - /* Flag used to tell the pdd has dequeued from the dqm. - * This is used to prevent dev->dqm->ops.process_termination() from - * being called twice when it is already called in IOMMU callback - * function. + /* This flag tells if we should reset all + * wavefronts on process termination */ - bool already_dequeued; + bool reset_wavefronts; }; #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) @@ -678,15 +477,7 @@ struct kfd_process { */ struct hlist_node kfd_processes; - /* - * Opaque pointer to mm_struct. We don't hold a reference to - * it so it should never be dereferenced from here. This is - * only used for looking up processes by their mm. - */ - void *mm; - - struct kref ref; - struct work_struct release_work; + struct mm_struct *mm; struct mutex mutex; @@ -694,8 +485,6 @@ struct kfd_process { * In any process, the thread that started main() is the lead * thread and outlives the rest. * It is here because amd_iommu_bind_pasid wants a task_struct. - * It can also be used for safely getting a reference to the - * mm_struct of the process. */ struct task_struct *lead_thread; @@ -706,7 +495,6 @@ struct kfd_process { struct rcu_head rcu; unsigned int pasid; - unsigned int doorbell_index; /* * List of kfd_process_device structures, @@ -716,37 +504,23 @@ struct kfd_process { struct process_queue_manager pqm; + /* The process's queues. */ + size_t queue_array_size; + + /* Size is queue_array_size, up to MAX_PROCESS_QUEUES. */ + struct kfd_queue **queues; + /*Is the user space process 32 bit?*/ bool is_32bit_user_mode; /* Event-related data */ struct mutex event_mutex; - /* Event ID allocator and lookup */ - struct idr event_idr; - /* Event page */ - struct kfd_signal_page *signal_page; + /* All events in process hashed by ID, linked on kfd_event.events. 
*/ + DECLARE_HASHTABLE(events, 4); + /* struct slot_page_header.event_pages */ + struct list_head signal_event_pages; + u32 next_nonsignal_event_id; size_t signal_event_count; - bool signal_event_limit_reached; - - struct rb_root_cached bo_interval_tree; - - /* Information used for memory eviction */ - void *process_info; - /* Eviction fence that is attached to all the BOs of this process. The - * fence will be triggered during eviction and new one will be created - * during restore - */ - struct dma_fence *ef; - - /* Work items for evicting and restoring BOs */ - struct delayed_work eviction_work; - struct delayed_work restore_work; - /* seqno of the last scheduled eviction */ - unsigned int last_eviction_seqno; - /* Approx. the last timestamp (in jiffies) when the process was - * restored after an eviction - */ - unsigned long last_restore_timestamp; }; /** @@ -769,50 +543,18 @@ struct amdkfd_ioctl_desc { void kfd_process_create_wq(void); void kfd_process_destroy_wq(void); -struct kfd_process *kfd_create_process(struct file *filep); -struct kfd_process *kfd_get_process(const struct task_struct *task); +struct kfd_process *kfd_create_process(const struct task_struct *); +struct kfd_process *kfd_get_process(const struct task_struct *); struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); -struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm); -void kfd_unref_process(struct kfd_process *p); -void kfd_suspend_all_processes(void); -int kfd_resume_all_processes(void); struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, struct kfd_process *p); -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) -int kfd_bind_processes_to_device(struct kfd_dev *dev); -void kfd_unbind_processes_from_device(struct kfd_dev *dev); -#endif -void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid); +void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid); struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, struct kfd_process *p); struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, struct kfd_process *p); -int kfd_reserved_mem_mmap(struct kfd_process *process, - struct vm_area_struct *vma); - -/* KFD process API for creating and translating handles */ -int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, - void *mem, uint64_t start, - uint64_t length, - struct kfd_ipc_obj *ipc_obj); -void *kfd_process_device_translate_handle(struct kfd_process_device *p, - int handle); -struct kfd_bo *kfd_process_device_find_bo(struct kfd_process_device *pdd, - int handle); -void *kfd_process_find_bo_from_interval(struct kfd_process *p, - uint64_t start_addr, - uint64_t last_addr); -void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, - int handle); - -void run_rdma_free_callback(struct kfd_bo *buf_obj); -struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid); - -/* kfd dgpu memory */ -int kfd_unmap_memory_from_gpu(void *mem, struct kfd_process_device *pdd); - /* Process device data iterator */ struct kfd_process_device *kfd_get_first_process_device_data( struct kfd_process *p); @@ -830,24 +572,17 @@ unsigned int kfd_pasid_alloc(void); void kfd_pasid_free(unsigned int pasid); /* Doorbells */ -size_t kfd_doorbell_process_slice(struct kfd_dev *kfd); int kfd_doorbell_init(struct kfd_dev *kfd); void kfd_doorbell_fini(struct kfd_dev *kfd); -int kfd_doorbell_mmap(struct kfd_dev *kfd, struct kfd_process *process, - struct 
vm_area_struct *vma); -void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, +int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma); +u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, unsigned int *doorbell_off); void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr); u32 read_kernel_doorbell(u32 __iomem *db); -void write_kernel_doorbell(void __iomem *db, u32 value); -void write_kernel_doorbell64(void __iomem *db, u64 value); -unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd, +void write_kernel_doorbell(u32 __iomem *db, u32 value); +unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, struct kfd_process *process, - unsigned int doorbell_id); -phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, - struct kfd_process *process); -int kfd_alloc_process_doorbells(struct kfd_process *process); -void kfd_free_process_doorbells(struct kfd_process *process); + unsigned int queue_id); /* GTT Sub-Allocator */ @@ -863,37 +598,27 @@ int kfd_topology_init(void); void kfd_topology_shutdown(void); int kfd_topology_add_device(struct kfd_dev *gpu); int kfd_topology_remove_device(struct kfd_dev *gpu); -struct kfd_topology_device *kfd_topology_device_by_proximity_domain( - uint32_t proximity_domain); struct kfd_dev *kfd_device_by_id(uint32_t gpu_id); struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev); -struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd); -int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev); -int kfd_numa_node_to_apic_id(int numa_node_id); +struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx); /* Interrupts */ int kfd_interrupt_init(struct kfd_dev *dev); void kfd_interrupt_exit(struct kfd_dev *dev); void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry); bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry); -bool interrupt_is_wanted(struct kfd_dev *dev, - const uint32_t *ih_ring_entry, - uint32_t *patched_ihre, bool *flag); +bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry); /* Power Management */ void kgd2kfd_suspend(struct kfd_dev *kfd); int kgd2kfd_resume(struct kfd_dev *kfd); -/* GPU reset */ -int kgd2kfd_pre_reset(struct kfd_dev *kfd); -int kgd2kfd_post_reset(struct kfd_dev *kfd); - /* amdkfd Apertures */ int kfd_init_apertures(struct kfd_process *process); -int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd, - uint64_t base, uint64_t limit); /* Queue Context Management */ +struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd); + int init_queue(struct queue **q, const struct queue_properties *properties); void uninit_queue(struct queue *q); void print_queue_properties(struct queue_properties *q); @@ -903,20 +628,13 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, struct kfd_dev *dev); struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, struct kfd_dev *dev); -struct mqd_manager *mqd_manager_init_cik_hawaii(enum KFD_MQD_TYPE type, - struct kfd_dev *dev); struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, struct kfd_dev *dev); -struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, - struct kfd_dev *dev); -struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, - struct kfd_dev *dev); struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); void device_queue_manager_uninit(struct device_queue_manager *dqm); struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, enum kfd_queue_type type); void 
kernel_queue_uninit(struct kernel_queue *kq); -int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid); /* Process Queue Manager */ struct process_queue_node { @@ -925,40 +643,32 @@ struct process_queue_node { struct list_head process_queue_list; }; -void kfd_process_dequeue_from_device(struct kfd_process_device *pdd); -void kfd_process_dequeue_from_all_devices(struct kfd_process *p); int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p); void pqm_uninit(struct process_queue_manager *pqm); int pqm_create_queue(struct process_queue_manager *pqm, struct kfd_dev *dev, struct file *f, struct queue_properties *properties, + unsigned int flags, + enum kfd_queue_type type, unsigned int *qid); int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid); int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, struct queue_properties *p); -int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, - struct queue_properties *p); struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm, unsigned int qid); -int pqm_get_wave_state(struct process_queue_manager *pqm, - unsigned int qid, - void __user *ctl_stack, - u32 *ctl_stack_used_size, - u32 *save_area_used_size); -int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm); -int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm); int amdkfd_fence_wait_timeout(unsigned int *fence_addr, unsigned int fence_value, - unsigned int timeout_ms); + unsigned long timeout); /* Packet Manager */ +#define KFD_HIQ_TIMEOUT (500) + #define KFD_FENCE_COMPLETED (100) #define KFD_FENCE_INIT (10) - -struct packet_manager_func; +#define KFD_UNMAP_LATENCY (150) struct packet_manager { struct device_queue_manager *dqm; @@ -966,42 +676,9 @@ struct packet_manager { struct mutex lock; bool allocated; struct kfd_mem_obj *ib_buffer_obj; - unsigned int ib_size_bytes; - - struct packet_manager_funcs *pmf; }; -struct packet_manager_funcs { - /* Support different firmware versions for PM4 packets */ - int (*map_process)(struct packet_manager *pm, uint32_t *buffer, - struct qcm_process_device *qpd); - int (*runlist)(struct packet_manager *pm, uint32_t *buffer, - uint64_t ib, size_t ib_size_in_dwords, bool chain); - int (*set_resources)(struct packet_manager *pm, uint32_t *buffer, - struct scheduling_resources *res); - int (*map_queues)(struct packet_manager *pm, uint32_t *buffer, - struct queue *q, bool is_static); - int (*unmap_queues)(struct packet_manager *pm, uint32_t *buffer, - enum kfd_queue_type type, - enum kfd_unmap_queues_filter mode, - uint32_t filter_param, bool reset, - unsigned int sdma_engine); - int (*query_status)(struct packet_manager *pm, uint32_t *buffer, - uint64_t fence_address, uint32_t fence_value); - uint32_t (*release_mem)(uint64_t gpu_addr, uint32_t *buffer); - - uint32_t (*get_map_process_packet_size)(void); - uint32_t (*get_runlist_packet_size)(void); - uint32_t (*get_set_resources_packet_size)(void); - uint32_t (*get_map_queues_packet_size)(void); - uint32_t (*get_unmap_queues_packet_size)(void); - uint32_t (*get_query_status_packet_size)(void); - uint32_t (*get_release_mem_packet_size)(void); - -}; - -int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, - uint16_t fw_ver); +int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); void pm_uninit(struct packet_manager *pm); int pm_send_set_resources(struct packet_manager *pm, struct scheduling_resources *res); @@ -1010,109 +687,47 @@ int 
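/*
 * A simplified userspace analogue of the fence wait declared above:
 * poll a fence location until it holds the expected value or a deadline
 * passes. Single-threaded, so the fence is "signaled" up front; the
 * demo_* names, the seconds-based timeout, and the boolean return are
 * all invented for illustration (the kernel helper uses jiffies).
 */
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define DEMO_FENCE_INIT      (10)
#define DEMO_FENCE_COMPLETED (100)

static bool demo_fence_wait_timeout(volatile unsigned int *fence_addr,
                                    unsigned int fence_value,
                                    double timeout_sec)
{
        clock_t deadline = clock() + (clock_t)(timeout_sec * CLOCKS_PER_SEC);

        while (*fence_addr != fence_value) {
                if (clock() > deadline)
                        return false; /* timed out */
        }
        return true;
}

int main(void)
{
        volatile unsigned int fence = DEMO_FENCE_INIT;

        fence = DEMO_FENCE_COMPLETED; /* normally written by the GPU */
        printf("wait %s\n",
               demo_fence_wait_timeout(&fence, DEMO_FENCE_COMPLETED, 0.5)
               ? "completed" : "timed out");
        return 0;
}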
pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, uint32_t fence_value); int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, - enum kfd_unmap_queues_filter mode, + enum kfd_preempt_type_filter mode, uint32_t filter_param, bool reset, unsigned int sdma_engine); void pm_release_ib(struct packet_manager *pm); -/* Following PM funcs can be shared among CIK and VI */ -unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size); -int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer, - uint64_t ib, size_t ib_size_in_dwords, bool chain); -int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, - struct queue *q, bool is_static); -int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, - struct scheduling_resources *res); -int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, - enum kfd_queue_type type, - enum kfd_unmap_queues_filter filter, - uint32_t filter_param, bool reset, - unsigned int sdma_engine); -int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer, - uint64_t fence_address, uint32_t fence_value); -uint32_t pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer); - -uint32_t pm_get_map_process_packet_size_vi(void); -uint32_t pm_get_runlist_packet_size_vi(void); -uint32_t pm_get_set_resources_packet_size_vi(void); -uint32_t pm_get_map_queues_packet_size_vi(void); -uint32_t pm_get_unmap_queues_packet_size_vi(void); -uint32_t pm_get_query_status_packet_size_vi(void); -uint32_t pm_get_release_mem_packet_size_vi(void); - - -void kfd_pm_func_init_vi(struct packet_manager *pm, uint16_t fw_ver); -void kfd_pm_func_init_cik(struct packet_manager *pm, uint16_t fw_ver); - -void kfd_pm_func_init_v9(struct packet_manager *pm, uint16_t fw_ver); - - uint64_t kfd_get_number_elems(struct kfd_dev *kfd); +phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, + struct kfd_process *process); /* Events */ extern const struct kfd_event_interrupt_class event_interrupt_class_cik; -extern const struct kfd_event_interrupt_class event_interrupt_class_v9; - extern const struct kfd_device_global_init_class device_global_init_class_cik; +enum kfd_event_wait_result { + KFD_WAIT_COMPLETE, + KFD_WAIT_TIMEOUT, + KFD_WAIT_ERROR +}; + void kfd_event_init_process(struct kfd_process *p); void kfd_event_free_process(struct kfd_process *p); int kfd_event_mmap(struct kfd_process *process, struct vm_area_struct *vma); int kfd_wait_on_events(struct kfd_process *p, uint32_t num_events, void __user *data, bool all, uint32_t user_timeout_ms, - uint32_t *wait_result); + enum kfd_event_wait_result *wait_result); void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, uint32_t valid_id_bits); -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, unsigned long address, bool is_write_requested, bool is_execute_requested); -#endif void kfd_signal_hw_exception_event(unsigned int pasid); int kfd_set_event(struct kfd_process *p, uint32_t event_id); int kfd_reset_event(struct kfd_process *p, uint32_t event_id); int kfd_event_create(struct file *devkfd, struct kfd_process *p, uint32_t event_type, bool auto_reset, uint32_t node_id, uint32_t *event_id, uint32_t *event_trigger_data, - uint64_t *event_page_offset, uint32_t *event_slot_index, - void *kern_addr); + uint64_t *event_page_offset, uint32_t *event_slot_index); int kfd_event_destroy(struct kfd_process *p, uint32_t event_id); -void kfd_signal_vm_fault_event(struct 
kfd_dev *dev, unsigned int pasid, - struct kfd_vm_fault_info *info); - -void kfd_flush_tlb(struct kfd_dev *dev, struct kfd_process *p); - int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); -#define KFD_SCRATCH_KV_FW_VER 413 - -/* PeerDirect support */ -void kfd_init_peer_direct(void); -void kfd_close_peer_direct(void); - -/* IPC Support */ -int kfd_ipc_init(void); - -/* Debugfs */ -#if defined(CONFIG_DEBUG_FS) - -void kfd_debugfs_init(void); -void kfd_debugfs_fini(void); -int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data); -int pqm_debugfs_mqds(struct seq_file *m, void *data); -int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data); -int device_queue_manager_debugfs_hqds(struct seq_file *m, void *data); -int kfd_debugfs_rls_by_device(struct seq_file *m, void *data); -int pm_debugfs_runlist(struct seq_file *m, void *data); - -#else - -static inline void kfd_debugfs_init(void) {} -static inline void kfd_debugfs_fini(void) {} - -#endif - #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 71438ac..c74cf22 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -24,24 +24,24 @@ #include #include #include -#include #include -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) #include -#endif #include #include -#include -#include -#include "kfd_ipc.h" struct mm_struct; #include "kfd_priv.h" -#include "kfd_device_queue_manager.h" #include "kfd_dbgmgr.h" /* + * Initial size for the array of queues. + * The allocated size is doubled each time + * it is exceeded up to MAX_PROCESS_QUEUES. + */ +#define INITIAL_QUEUE_ARRAY_SIZE 16 + +/* * List of struct kfd_process (field kfd_process). * Unique/indexed by mm_struct* */ @@ -53,16 +53,13 @@ DEFINE_STATIC_SRCU(kfd_processes_srcu); static struct workqueue_struct *kfd_process_wq; -#define MIN_IDR_ID 1 -#define MAX_IDR_ID 0 /*0 - for unlimited*/ - -static struct kfd_process *find_process(const struct task_struct *thread, - bool ref); -static void kfd_process_ref_release(struct kref *ref); -static struct kfd_process *create_process(const struct task_struct *thread, - struct file *filep); -static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep); +struct kfd_process_release_work { + struct work_struct kfd_work; + struct kfd_process *p; +}; +static struct kfd_process *find_process(const struct task_struct *thread); +static struct kfd_process *create_process(const struct task_struct *thread); void kfd_process_create_wq(void) { @@ -78,135 +75,10 @@ void kfd_process_destroy_wq(void) } } -static void kfd_process_free_gpuvm(struct kgd_mem *mem, - struct kfd_process_device *pdd) -{ - kfd_unmap_memory_from_gpu(mem, pdd); - pdd->dev->kfd2kgd->free_memory_of_gpu(pdd->dev->kgd, mem, pdd->vm); -} - -/* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process - * This function should be only called right after the process - * is created and when kfd_processes_mutex is still being held - * to avoid concurrency. Because of that exclusiveness, we do - * not need to take p->mutex. 
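/*
 * A minimal standalone version of the growth policy described by the
 * INITIAL_QUEUE_ARRAY_SIZE comment above: the queue array is doubled
 * whenever it fills up, capped at MAX_PROCESS_QUEUES. The demo_* names
 * and the 1024 cap are invented for illustration.
 */
#include <stdlib.h>
#include <string.h>

#define DEMO_INITIAL_QUEUE_ARRAY_SIZE 16
#define DEMO_MAX_PROCESS_QUEUES       1024

/* Grow *array to twice its current size; returns 0 on success. */
static int demo_grow_queue_array(void ***array, size_t *size)
{
        size_t new_size = *size * 2;
        void **tmp;

        if (new_size > DEMO_MAX_PROCESS_QUEUES)
                return -1; /* hard cap reached */

        tmp = realloc(*array, new_size * sizeof(*tmp));
        if (!tmp)
                return -1;

        /* Zero the newly added half so unused slots stay NULL. */
        memset(tmp + *size, 0, (*size) * sizeof(*tmp));
        *array = tmp;
        *size = new_size;
        return 0;
}

int main(void)
{
        size_t size = DEMO_INITIAL_QUEUE_ARRAY_SIZE;
        void **queues = calloc(size, sizeof(*queues));

        if (!queues)
                return 1;
        while (demo_grow_queue_array(&queues, &size) == 0)
                ; /* grows 16 -> 32 -> ... -> 1024 */
        free(queues);
        return 0;
}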
- */ -static int kfd_process_alloc_gpuvm(struct kfd_process *p, - struct kfd_dev *kdev, uint64_t gpu_va, uint32_t size, - void **kptr, struct kfd_process_device *pdd, uint32_t flags) -{ - int err; - void *mem = NULL; - int handle; - - err = kdev->kfd2kgd->alloc_memory_of_gpu(kdev->kgd, gpu_va, size, - pdd->vm, - (struct kgd_mem **)&mem, NULL, flags); - if (err) - goto err_alloc_mem; - - err = kdev->kfd2kgd->map_memory_to_gpu( - kdev->kgd, (struct kgd_mem *)mem, pdd->vm); - if (err) - goto err_map_mem; - - err = kdev->kfd2kgd->sync_memory(kdev->kgd, (struct kgd_mem *) mem, - true); - if (err) { - pr_debug("Sync memory failed, wait interrupted by user signal\n"); - goto sync_memory_failed; - } - - /* Create an obj handle so kfd_process_device_remove_obj_handle - * will take care of the bo removal when the process finishes. - * We do not need to take p->mutex, because the process is just - * created and the ioctls have not had the chance to run. - */ - handle = kfd_process_device_create_obj_handle( - pdd, mem, gpu_va, size, NULL); - - if (handle < 0) { - err = handle; - goto free_gpuvm; - } - - if (kptr) { - err = kdev->kfd2kgd->map_gtt_bo_to_kernel(kdev->kgd, - (struct kgd_mem *)mem, kptr); - if (err) { - pr_debug("Map GTT BO to kernel failed\n"); - goto free_obj_handle; - } - } - - return err; - -free_obj_handle: - kfd_process_device_remove_obj_handle(pdd, handle); -free_gpuvm: -sync_memory_failed: - kfd_process_free_gpuvm(mem, pdd); - return err; - -err_map_mem: - kdev->kfd2kgd->free_memory_of_gpu(kdev->kgd, mem, pdd->vm); -err_alloc_mem: - *kptr = NULL; - return err; -} - -/* kfd_process_reserve_ib_mem - Reserve memory inside the process for IB usage - * The memory reserved is for KFD to submit IB to AMDGPU from kernel. - * If the memory is reserved successfully, ib_kaddr_assigned will have - * the CPU/kernel address. Check ib_kaddr_assigned before accessing the - * memory. - */ -static int kfd_process_reserve_ib_mem(struct kfd_process *p) -{ - int ret = 0; - struct kfd_process_device *temp, *pdd = NULL; - struct kfd_dev *kdev = NULL; - struct qcm_process_device *qpd = NULL; - void *kaddr; - uint32_t flags = ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_NONPAGED | - ALLOC_MEM_FLAGS_NO_SUBSTITUTE | - ALLOC_MEM_FLAGS_EXECUTE_ACCESS; - - list_for_each_entry_safe(pdd, temp, &p->per_device_data, - per_device_list) { - kdev = pdd->dev; - qpd = &pdd->qpd; - if (!kdev->ib_size || qpd->ib_kaddr) - continue; - - if (qpd->ib_base) { /* is dGPU */ - ret = kfd_process_alloc_gpuvm(p, kdev, - qpd->ib_base, kdev->ib_size, - &kaddr, pdd, flags); - if (!ret) - qpd->ib_kaddr = kaddr; - else - /* In case of error, the kfd_bos for some pdds - * which are already allocated successfully - * will be freed in upper level function - * i.e. create_process(). 
- */ - return ret; - } else { - /* FIXME: Support APU */ - continue; - } - } - - return 0; -} - -struct kfd_process *kfd_create_process(struct file *filep) +struct kfd_process *kfd_create_process(const struct task_struct *thread) { struct kfd_process *process; - struct task_struct *thread = current; - if (!thread->mm) return ERR_PTR(-EINVAL); @@ -214,6 +86,9 @@ struct kfd_process *kfd_create_process(struct file *filep) if (thread->group_leader->mm != thread->mm) return ERR_PTR(-EINVAL); + /* Take mmap_sem because we call __mmu_notifier_register inside */ + down_write(&thread->mm->mmap_sem); + /* * take kfd processes mutex before starting of process creation * so there won't be a case where two threads of the same process @@ -222,14 +97,17 @@ struct kfd_process *kfd_create_process(struct file *filep) mutex_lock(&kfd_processes_mutex); /* A prior open of /dev/kfd could have already created the process. */ - process = find_process(thread, false); + process = find_process(thread); if (process) pr_debug("Process already found\n"); - else - process = create_process(thread, filep); + + if (!process) + process = create_process(thread); mutex_unlock(&kfd_processes_mutex); + up_write(&thread->mm->mmap_sem); + return process; } @@ -244,7 +122,7 @@ struct kfd_process *kfd_get_process(const struct task_struct *thread) if (thread->group_leader->mm != thread->mm) return ERR_PTR(-EINVAL); - process = find_process(thread, false); + process = find_process(thread); return process; } @@ -261,156 +139,79 @@ static struct kfd_process *find_process_by_mm(const struct mm_struct *mm) return NULL; } -static struct kfd_process *find_process(const struct task_struct *thread, - bool ref) +static struct kfd_process *find_process(const struct task_struct *thread) { struct kfd_process *p; int idx; idx = srcu_read_lock(&kfd_processes_srcu); p = find_process_by_mm(thread->mm); - if (p && ref) - kref_get(&p->ref); srcu_read_unlock(&kfd_processes_srcu, idx); return p; } -void kfd_unref_process(struct kfd_process *p) -{ - kref_put(&p->ref, kfd_process_ref_release); -} - -/* This increments the process->ref counter. 
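/*
 * kfd_create_process() above takes kfd_processes_mutex and re-checks for
 * an existing entry before creating one, so two threads of the same
 * process cannot race and create two kfd_process structures. A sketch of
 * that find-or-create-under-lock shape in standalone C with pthreads;
 * the demo_* names and toy table are invented for illustration.
 */
#include <pthread.h>
#include <stdlib.h>

struct demo_process { int pasid; };

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;
static struct demo_process *demo_table[64]; /* toy lookup table */

static struct demo_process *demo_find(int key)
{
        return demo_table[key % 64];
}

static struct demo_process *demo_find_or_create(int key)
{
        struct demo_process *p;

        pthread_mutex_lock(&demo_lock);
        p = demo_find(key);     /* a prior caller may already have won */
        if (!p) {
                p = calloc(1, sizeof(*p));
                if (p) {
                        p->pasid = key;
                        demo_table[key % 64] = p;
                }
        }
        pthread_mutex_unlock(&demo_lock);
        return p;
}

int main(void)
{
        return demo_find_or_create(42) ? 0 : 1;
}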
*/ -struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid) +static void kfd_process_wq_release(struct work_struct *work) { - struct task_struct *task = NULL; - struct kfd_process *p = NULL; - - if (!pid) - task = current; - else - task = get_pid_task(pid, PIDTYPE_PID); - - if (task) - p = find_process(task, true); + struct kfd_process_release_work *my_work; + struct kfd_process_device *pdd, *temp; + struct kfd_process *p; - return p; -} + my_work = (struct kfd_process_release_work *) work; -static void kfd_process_free_outstanding_kfd_bos(struct kfd_process *p) -{ - struct kfd_process_device *pdd, *peer_pdd; - struct kfd_bo *buf_obj; - int id; + p = my_work->p; - list_for_each_entry(pdd, &p->per_device_data, per_device_list) { - /* - * Remove all handles from idr and release appropriate - * local memory object - */ - idr_for_each_entry(&pdd->alloc_idr, buf_obj, id) { - list_for_each_entry(peer_pdd, &p->per_device_data, - per_device_list) { - peer_pdd->dev->kfd2kgd->unmap_memory_to_gpu( - peer_pdd->dev->kgd, - buf_obj->mem, peer_pdd->vm); - } - - run_rdma_free_callback(buf_obj); - pdd->dev->kfd2kgd->free_memory_of_gpu( - pdd->dev->kgd, buf_obj->mem, pdd->vm); - kfd_process_device_remove_obj_handle(pdd, id); - } - } -} + pr_debug("Releasing process (pasid %d) in workqueue\n", + p->pasid); -static void kfd_process_destroy_pdds(struct kfd_process *p) -{ - struct kfd_process_device *pdd, *temp; + mutex_lock(&p->mutex); list_for_each_entry_safe(pdd, temp, &p->per_device_data, - per_device_list) { - /* Destroy the GPUVM VM context */ - if (pdd->vm) - pdd->dev->kfd2kgd->destroy_process_vm( - pdd->dev->kgd, pdd->vm); - - list_del(&pdd->per_device_list); + per_device_list) { + pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", + pdd->dev->id, p->pasid); - if (pdd->qpd.cwsr_kaddr && !pdd->qpd.cwsr_base) - free_pages((unsigned long)pdd->qpd.cwsr_kaddr, - get_order(KFD_CWSR_TBA_TMA_SIZE)); + if (pdd->reset_wavefronts) + dbgdev_wave_reset_wavefronts(pdd->dev, p); - kfree(pdd->qpd.doorbell_bitmap); - idr_destroy(&pdd->alloc_idr); + amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); + list_del(&pdd->per_device_list); kfree(pdd); } -} - -/* No process locking is needed in this function, because the process - * is not findable any more. We must assume that no other thread is - * using it any more, otherwise we couldn't safely free the process - * structure in the end. 
- */ -static void kfd_process_wq_release(struct work_struct *work) -{ - struct kfd_process *p = container_of(work, struct kfd_process, - release_work); -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - struct kfd_process_device *pdd; - - pr_debug("Releasing process (pasid %d)\n", - p->pasid); - - list_for_each_entry(pdd, &p->per_device_data, per_device_list) { - pr_debug("Releasing pdd (topology id %d) for process (pasid %d)\n", - pdd->dev->id, p->pasid); - - if (pdd->dev->device_info->is_need_iommu_device) { - if (pdd->bound == PDD_BOUND) { - amd_iommu_unbind_pasid(pdd->dev->pdev, - p->pasid); - pdd->bound = PDD_UNBOUND; - } - } - } -#endif - - kfd_process_free_outstanding_kfd_bos(p); - - kfd_process_destroy_pdds(p); - dma_fence_put(p->ef); kfd_event_free_process(p); kfd_pasid_free(p->pasid); - kfd_free_process_doorbells(p); + + mutex_unlock(&p->mutex); mutex_destroy(&p->mutex); - put_task_struct(p->lead_thread); + kfree(p->queues); kfree(p); + + kfree(work); } -static void kfd_process_ref_release(struct kref *ref) +static void kfd_process_destroy_delayed(struct rcu_head *rcu) { - struct kfd_process *p = container_of(ref, struct kfd_process, ref); + struct kfd_process_release_work *work; + struct kfd_process *p; - if (WARN_ON(!kfd_process_wq)) - return; + p = container_of(rcu, struct kfd_process, rcu); + WARN_ON(atomic_read(&p->mm->mm_count) <= 0); - INIT_WORK(&p->release_work, kfd_process_wq_release); - queue_work(kfd_process_wq, &p->release_work); -} + mmdrop(p->mm); -static void kfd_process_destroy_delayed(struct rcu_head *rcu) -{ - struct kfd_process *p = container_of(rcu, struct kfd_process, rcu); + work = kmalloc(sizeof(struct kfd_process_release_work), GFP_ATOMIC); - kfd_unref_process(p); + if (work) { + INIT_WORK((struct work_struct *) work, kfd_process_wq_release); + work->p = p; + queue_work(kfd_process_wq, (struct work_struct *) work); + } } static void kfd_process_notifier_release(struct mmu_notifier *mn, @@ -432,37 +233,35 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, mutex_unlock(&kfd_processes_mutex); synchronize_srcu(&kfd_processes_srcu); - cancel_delayed_work_sync(&p->eviction_work); - cancel_delayed_work_sync(&p->restore_work); - mutex_lock(&p->mutex); - /* Iterate over all process device data structures and if the - * pdd is in debug mode, we should first force unregistration, - * then we will be able to destroy the queues + /* In case our notifier is called before IOMMU notifier */ + pqm_uninit(&p->pqm); + + /* Iterate over all process device data structure and check + * if we should delete debug managers and reset all wavefronts */ list_for_each_entry(pdd, &p->per_device_data, per_device_list) { - struct kfd_dev *dev = pdd->dev; - - mutex_lock(kfd_get_dbgmgr_mutex()); - if (dev && dev->dbgmgr && dev->dbgmgr->pasid == p->pasid) { - if (!kfd_dbgmgr_unregister(dev->dbgmgr, p)) { - kfd_dbgmgr_destroy(dev->dbgmgr); - dev->dbgmgr = NULL; - } + if ((pdd->dev->dbgmgr) && + (pdd->dev->dbgmgr->pasid == p->pasid)) + kfd_dbgmgr_destroy(pdd->dev->dbgmgr); + + if (pdd->reset_wavefronts) { + pr_warn("Resetting all wave fronts\n"); + dbgdev_wave_reset_wavefronts(pdd->dev, p); + pdd->reset_wavefronts = false; } - mutex_unlock(kfd_get_dbgmgr_mutex()); } - kfd_process_dequeue_from_all_devices(p); - pqm_uninit(&p->pqm); - - /* Indicate to other users that MM is no longer valid */ - p->mm = NULL; - mutex_unlock(&p->mutex); - mmu_notifier_unregister_no_release(&p->mmu_notifier, mm); + /* + * Because we drop mm_count inside 
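/*
 * Several helpers above (kfd_process_ref_release,
 * kfd_process_destroy_delayed) recover the enclosing kfd_process from a
 * pointer to an embedded member with container_of(). A standalone
 * userspace equivalent built on offsetof; the demo_* names are invented
 * for illustration.
 */
#include <stddef.h>
#include <stdio.h>

#define demo_container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct demo_rcu_head { void (*func)(struct demo_rcu_head *); };

struct demo_process {
        int pasid;
        struct demo_rcu_head rcu; /* embedded member, as in kfd_process */
};

static void demo_destroy(struct demo_rcu_head *rcu)
{
        struct demo_process *p =
                demo_container_of(rcu, struct demo_process, rcu);

        printf("destroying pasid %d\n", p->pasid);
}

int main(void)
{
        struct demo_process proc = { .pasid = 7 };

        demo_destroy(&proc.rcu); /* callback only sees the member pointer */
        return 0;
}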
kfd_process_destroy_delayed + * and because the mmu_notifier_unregister function also drop + * mm_count we need to take an extra count here. + */ + mmgrab(p->mm); + mmu_notifier_unregister_no_release(&p->mmu_notifier, p->mm); mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed); } @@ -470,67 +269,7 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = { .release = kfd_process_notifier_release, }; -static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep) -{ - int ret; - unsigned long offset; - struct kfd_process_device *temp, *pdd = NULL; - struct kfd_dev *dev = NULL; - struct qcm_process_device *qpd = NULL; - void *kaddr; - uint32_t flags = ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_NONPAGED | - ALLOC_MEM_FLAGS_NO_SUBSTITUTE | - ALLOC_MEM_FLAGS_READONLY | - ALLOC_MEM_FLAGS_EXECUTE_ACCESS; - - list_for_each_entry_safe(pdd, temp, &p->per_device_data, - per_device_list) { - dev = pdd->dev; - qpd = &pdd->qpd; - if (!dev->cwsr_enabled || qpd->cwsr_kaddr) - continue; - if (qpd->cwsr_base) { - /* cwsr_base is only set for DGPU */ - ret = kfd_process_alloc_gpuvm(p, dev, qpd->cwsr_base, - KFD_CWSR_TBA_TMA_SIZE, &kaddr, pdd, flags); - if (!ret) { - qpd->cwsr_kaddr = kaddr; - qpd->tba_addr = qpd->cwsr_base; - } else - /* In case of error, the kfd_bos for some pdds - * which are already allocated successfully - * will be freed in upper level function - * i.e. create_process(). - */ - return ret; - } else { - offset = (dev->id | - KFD_MMAP_TYPE_RESERVED_MEM) << PAGE_SHIFT; - qpd->tba_addr = (uint64_t)vm_mmap(filep, 0, - KFD_CWSR_TBA_TMA_SIZE, PROT_READ | PROT_EXEC, - MAP_SHARED, offset); - - if (IS_ERR_VALUE(qpd->tba_addr)) { - pr_err("Failure to set tba address. error -%d.\n", - (int)qpd->tba_addr); - qpd->tba_addr = 0; - qpd->cwsr_kaddr = NULL; - return -ENOMEM; - } - } - - memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size); - - qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET; - pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n", - qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr); - } - - return 0; -} - -static struct kfd_process *create_process(const struct task_struct *thread, - struct file *filep) +static struct kfd_process *create_process(const struct task_struct *thread) { struct kfd_process *process; int err = -ENOMEM; @@ -540,24 +279,22 @@ static struct kfd_process *create_process(const struct task_struct *thread, if (!process) goto err_alloc_process; - process->bo_interval_tree = RB_ROOT_CACHED; + process->queues = kmalloc_array(INITIAL_QUEUE_ARRAY_SIZE, + sizeof(process->queues[0]), GFP_KERNEL); + if (!process->queues) + goto err_alloc_queues; process->pasid = kfd_pasid_alloc(); if (process->pasid == 0) goto err_alloc_pasid; - if (kfd_alloc_process_doorbells(process) < 0) - goto err_alloc_doorbells; - - kref_init(&process->ref); - mutex_init(&process->mutex); process->mm = thread->mm; /* register notifier */ process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops; - err = mmu_notifier_register(&process->mmu_notifier, process->mm); + err = __mmu_notifier_register(&process->mmu_notifier, process->mm); if (err) goto err_mmu_notifier; @@ -565,7 +302,8 @@ static struct kfd_process *create_process(const struct task_struct *thread, (uintptr_t)process->mm); process->lead_thread = thread->group_leader; - get_task_struct(process->lead_thread); + + process->queue_array_size = INITIAL_QUEUE_ARRAY_SIZE; INIT_LIST_HEAD(&process->per_device_data); @@ -581,28 +319,8 @@ static struct kfd_process *create_process(const struct task_struct 
*thread, if (err != 0) goto err_init_apertures; - INIT_DELAYED_WORK(&process->eviction_work, kfd_evict_bo_worker); - INIT_DELAYED_WORK(&process->restore_work, kfd_restore_bo_worker); - process->last_restore_timestamp = get_jiffies_64(); - - err = kfd_process_reserve_ib_mem(process); - if (err) - goto err_reserve_ib_mem; - err = kfd_process_init_cwsr(process, filep); - if (err) - goto err_init_cwsr; - - /* If PeerDirect interface was not detected try to detect it again - * in case if network driver was loaded later. - */ - kfd_init_peer_direct(); - return process; -err_init_cwsr: -err_reserve_ib_mem: - kfd_process_free_outstanding_kfd_bos(process); - kfd_process_destroy_pdds(process); err_init_apertures: pqm_uninit(&process->pqm); err_process_pqm_init: @@ -611,40 +329,15 @@ static struct kfd_process *create_process(const struct task_struct *thread, mmu_notifier_unregister_no_release(&process->mmu_notifier, process->mm); err_mmu_notifier: mutex_destroy(&process->mutex); - kfd_free_process_doorbells(process); -err_alloc_doorbells: kfd_pasid_free(process->pasid); err_alloc_pasid: + kfree(process->queues); +err_alloc_queues: kfree(process); err_alloc_process: return ERR_PTR(err); } -static int init_doorbell_bitmap(struct qcm_process_device *qpd, - struct kfd_dev *dev) -{ - unsigned int i; - - if (!KFD_IS_SOC15(dev->device_info->asic_family)) - return 0; - - qpd->doorbell_bitmap = - kzalloc(DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, - BITS_PER_BYTE), GFP_KERNEL); - if (!qpd->doorbell_bitmap) - return -ENOMEM; - - /* Mask out any reserved doorbells */ - for (i = 0; i < KFD_MAX_NUM_OF_QUEUES_PER_PROCESS; i++) - if ((dev->shared_resources.reserved_doorbell_mask & i) == - dev->shared_resources.reserved_doorbell_val) { - set_bit(i, qpd->doorbell_bitmap); - pr_debug("reserved doorbell 0x%03x\n", i); - } - - return 0; -} - struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, struct kfd_process *p) { @@ -652,9 +345,9 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, list_for_each_entry(pdd, &p->per_device_data, per_device_list) if (pdd->dev == dev) - return pdd; + break; - return NULL; + return pdd; } struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, @@ -663,41 +356,16 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, struct kfd_process_device *pdd = NULL; pdd = kzalloc(sizeof(*pdd), GFP_KERNEL); - if (!pdd) - return NULL; - - pdd->dev = dev; - INIT_LIST_HEAD(&pdd->qpd.queues_list); - INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); - pdd->qpd.dqm = dev->dqm; - pdd->qpd.pqm = &p->pqm; - pdd->qpd.evicted = 0; - pdd->process = p; - pdd->bound = PDD_UNBOUND; - pdd->already_dequeued = false; - list_add(&pdd->per_device_list, &p->per_device_data); - - /* Init idr used for memory handle translation */ - idr_init(&pdd->alloc_idr); - if (init_doorbell_bitmap(&pdd->qpd, dev)) { - pr_err("Failed to init doorbell for process\n"); - goto err_create_pdd; + if (pdd != NULL) { + pdd->dev = dev; + INIT_LIST_HEAD(&pdd->qpd.queues_list); + INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); + pdd->qpd.dqm = dev->dqm; + pdd->reset_wavefronts = false; + list_add(&pdd->per_device_list, &p->per_device_data); } - /* Create the GPUVM context for this specific device */ - if (dev->kfd2kgd->create_process_vm(dev->kgd, &pdd->vm, - &p->process_info, &p->ef)) { - pr_err("Failed to create process VM object\n"); - goto err_create_pdd; - } return pdd; - -err_create_pdd: - kfree(pdd->qpd.doorbell_bitmap); - 
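/*
 * create_process() and kfd_create_process_device_data() above unwind
 * partial initialization with a ladder of goto labels, releasing
 * resources in reverse order of acquisition. The same shape in
 * miniature; the demo_* names are invented for illustration.
 */
#include <stdlib.h>

struct demo_ctx { void *a; void *b; };

static struct demo_ctx *demo_create(void)
{
        struct demo_ctx *ctx = calloc(1, sizeof(*ctx));

        if (!ctx)
                goto err_alloc_ctx;
        ctx->a = malloc(32);
        if (!ctx->a)
                goto err_alloc_a;
        ctx->b = malloc(32);
        if (!ctx->b)
                goto err_alloc_b;
        return ctx;

        /* Unwind strictly in reverse order of the steps above. */
err_alloc_b:
        free(ctx->a);
err_alloc_a:
        free(ctx);
err_alloc_ctx:
        return NULL;
}

int main(void)
{
        struct demo_ctx *ctx = demo_create();

        if (ctx) {
                free(ctx->b);
                free(ctx->a);
                free(ctx);
        }
        return 0;
}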
idr_destroy(&pdd->alloc_idr); - list_del(&pdd->per_device_list); - kfree(pdd); - return NULL; } /* @@ -711,6 +379,7 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, struct kfd_process *p) { struct kfd_process_device *pdd; + int err; pdd = kfd_get_process_device_data(dev, p); if (!pdd) { @@ -718,94 +387,19 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, return ERR_PTR(-ENOMEM); } - if (pdd->bound == PDD_BOUND) { + if (pdd->bound) return pdd; - } else if (unlikely(pdd->bound == PDD_BOUND_SUSPENDED)) { - pr_err("Binding PDD_BOUND_SUSPENDED pdd is unexpected!\n"); - return ERR_PTR(-EINVAL); - } -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - if (dev->device_info->is_need_iommu_device) { - int err = amd_iommu_bind_pasid(dev->pdev, p->pasid, - p->lead_thread); - if (err < 0) - return ERR_PTR(err); - } -#endif + err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread); + if (err < 0) + return ERR_PTR(err); - pdd->bound = PDD_BOUND; + pdd->bound = true; return pdd; } -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) -/* - * Bind processes do the device that have been temporarily unbound - * (PDD_BOUND_SUSPENDED) in kfd_unbind_processes_from_device. - */ -int kfd_bind_processes_to_device(struct kfd_dev *dev) -{ - struct kfd_process_device *pdd; - struct kfd_process *p; - unsigned int temp; - int err = 0; - - int idx = srcu_read_lock(&kfd_processes_srcu); - - hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { - mutex_lock(&p->mutex); - pdd = kfd_get_process_device_data(dev, p); - if (pdd->bound != PDD_BOUND_SUSPENDED) { - mutex_unlock(&p->mutex); - continue; - } - - err = amd_iommu_bind_pasid(dev->pdev, p->pasid, - p->lead_thread); - if (err < 0) { - pr_err("Unexpected pasid %d binding failure\n", - p->pasid); - mutex_unlock(&p->mutex); - break; - } - - pdd->bound = PDD_BOUND; - mutex_unlock(&p->mutex); - } - - srcu_read_unlock(&kfd_processes_srcu, idx); - - return err; -} - -/* - * Mark currently bound processes as PDD_BOUND_SUSPENDED. These - * processes will be restored to PDD_BOUND state in - * kfd_bind_processes_to_device. 
- */ -void kfd_unbind_processes_from_device(struct kfd_dev *dev) -{ - struct kfd_process_device *pdd; - struct kfd_process *p; - unsigned int temp; - - int idx = srcu_read_lock(&kfd_processes_srcu); - - - hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { - mutex_lock(&p->mutex); - pdd = kfd_get_process_device_data(dev, p); - - if (pdd->bound == PDD_BOUND) - pdd->bound = PDD_BOUND_SUSPENDED; - mutex_unlock(&p->mutex); - } - - srcu_read_unlock(&kfd_processes_srcu, idx); -} - -void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid) +void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid) { struct kfd_process *p; struct kfd_process_device *pdd; @@ -821,31 +415,34 @@ void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid) pr_debug("Unbinding process %d from IOMMU\n", pasid); - mutex_lock(kfd_get_dbgmgr_mutex()); + if ((dev->dbgmgr) && (dev->dbgmgr->pasid == p->pasid)) + kfd_dbgmgr_destroy(dev->dbgmgr); - if (dev->dbgmgr && dev->dbgmgr->pasid == p->pasid) { - if (!kfd_dbgmgr_unregister(dev->dbgmgr, p)) { - kfd_dbgmgr_destroy(dev->dbgmgr); - dev->dbgmgr = NULL; - } - } + pqm_uninit(&p->pqm); - mutex_unlock(kfd_get_dbgmgr_mutex()); + pdd = kfd_get_process_device_data(dev, p); - mutex_lock(&p->mutex); + if (!pdd) { + mutex_unlock(&p->mutex); + return; + } - pdd = kfd_get_process_device_data(dev, p); - if (pdd) - /* For GPU relying on IOMMU, we need to dequeue here - * when PASID is still bound. - */ - kfd_process_dequeue_from_device(pdd); + if (pdd->reset_wavefronts) { + dbgdev_wave_reset_wavefronts(pdd->dev, p); + pdd->reset_wavefronts = false; + } - mutex_unlock(&p->mutex); + /* + * Just mark pdd as unbound, because we still need it + * to call amd_iommu_unbind_pasid() in when the + * process exits. + * We don't call amd_iommu_unbind_pasid() here + * because the IOMMU called us. + */ + pdd->bound = false; - kfd_unref_process(p); + mutex_unlock(&p->mutex); } -#endif /* CONFIG_AMD_IOMMU_V2 */ struct kfd_process_device *kfd_get_first_process_device_data( struct kfd_process *p) @@ -869,277 +466,22 @@ bool kfd_has_process_device_data(struct kfd_process *p) return !(list_empty(&p->per_device_data)); } -/* Create specific handle mapped to mem from process local memory idr - * Assumes that the process lock is held. - */ -int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, - void *mem, uint64_t start, - uint64_t length, - struct kfd_ipc_obj *ipc_obj) -{ - int handle; - struct kfd_bo *buf_obj; - struct kfd_process *p; - - p = pdd->process; - - buf_obj = kzalloc(sizeof(*buf_obj), GFP_KERNEL); - - if (!buf_obj) - return -ENOMEM; - - buf_obj->it.start = start; - buf_obj->it.last = start + length - 1; - interval_tree_insert(&buf_obj->it, &p->bo_interval_tree); - - buf_obj->mem = mem; - buf_obj->dev = pdd->dev; - buf_obj->kfd_ipc_obj = ipc_obj; - - INIT_LIST_HEAD(&buf_obj->cb_data_head); - - idr_preload(GFP_KERNEL); - - handle = idr_alloc(&pdd->alloc_idr, buf_obj, MIN_IDR_ID, MAX_IDR_ID, - GFP_NOWAIT); - - idr_preload_end(); - - if (handle < 0) - kfree(buf_obj); - - return handle; -} - -struct kfd_bo *kfd_process_device_find_bo(struct kfd_process_device *pdd, - int handle) -{ - if (handle < 0) - return NULL; - - return (struct kfd_bo *)idr_find(&pdd->alloc_idr, handle); -} - -/* Translate specific handle from process local memory idr - * Assumes that the process lock is held. 
- */ -void *kfd_process_device_translate_handle(struct kfd_process_device *pdd, - int handle) -{ - struct kfd_bo *buf_obj; - - buf_obj = kfd_process_device_find_bo(pdd, handle); - - return buf_obj->mem; -} - -void *kfd_process_find_bo_from_interval(struct kfd_process *p, - uint64_t start_addr, - uint64_t last_addr) -{ - struct interval_tree_node *it_node; - struct kfd_bo *buf_obj; - - it_node = interval_tree_iter_first(&p->bo_interval_tree, - start_addr, last_addr); - if (!it_node) { - pr_err("0x%llx-0x%llx does not relate to an existing buffer\n", - start_addr, last_addr); - return NULL; - } - - if (interval_tree_iter_next(it_node, start_addr, last_addr)) { - pr_err("0x%llx-0x%llx spans more than a single BO\n", - start_addr, last_addr); - return NULL; - } - - buf_obj = container_of(it_node, struct kfd_bo, it); - - return buf_obj; -} - -/* Remove specific handle from process local memory idr - * Assumes that the process lock is held. - */ -void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, - int handle) -{ - struct kfd_bo *buf_obj; - struct kfd_process *p; - - p = pdd->process; - - if (handle < 0) - return; - - buf_obj = kfd_process_device_find_bo(pdd, handle); - - if (buf_obj->kfd_ipc_obj) - ipc_obj_put(&buf_obj->kfd_ipc_obj); - - idr_remove(&pdd->alloc_idr, handle); - - interval_tree_remove(&buf_obj->it, &p->bo_interval_tree); - - kfree(buf_obj); -} - -/* This increments the process->ref counter. */ +/* This returns with process->mutex locked. */ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) { - struct kfd_process *p, *ret_p = NULL; + struct kfd_process *p; unsigned int temp; int idx = srcu_read_lock(&kfd_processes_srcu); hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { if (p->pasid == pasid) { - kref_get(&p->ref); - ret_p = p; + mutex_lock(&p->mutex); break; } } srcu_read_unlock(&kfd_processes_srcu, idx); - return ret_p; -} - -void kfd_suspend_all_processes(void) -{ - struct kfd_process *p; - unsigned int temp; - int idx = srcu_read_lock(&kfd_processes_srcu); - - hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { - cancel_delayed_work_sync(&p->eviction_work); - cancel_delayed_work_sync(&p->restore_work); - - if (quiesce_process_mm(p)) - pr_err("Failed to suspend process %d\n", p->pasid); - dma_fence_signal(p->ef); - dma_fence_put(p->ef); - p->ef = NULL; - } - srcu_read_unlock(&kfd_processes_srcu, idx); -} - -int kfd_resume_all_processes(void) -{ - struct kfd_process *p; - unsigned int temp; - int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu); - - hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { - if (!schedule_delayed_work(&p->restore_work, 0)) { - pr_err("Restore process %d failed during resume\n", - p->pasid); - ret = -EFAULT; - } - } - srcu_read_unlock(&kfd_processes_srcu, idx); - return ret; -} - -/* This increments the process->ref counter. 
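/*
 * kfd_process_find_bo_from_interval() above accepts a [start, last]
 * range only if exactly one buffer object overlaps it, rejecting ranges
 * that span several BOs. The same check over a plain array instead of an
 * interval tree; the demo_* names are invented for illustration.
 */
#include <stdint.h>
#include <stdio.h>

struct demo_bo { uint64_t start, last; };

static const struct demo_bo *demo_find_bo(const struct demo_bo *bos, int n,
                                          uint64_t start, uint64_t last)
{
        const struct demo_bo *hit = NULL;
        int i;

        for (i = 0; i < n; i++) {
                if (bos[i].last < start || bos[i].start > last)
                        continue; /* no overlap */
                if (hit)
                        return NULL; /* range spans more than a single BO */
                hit = &bos[i];
        }
        return hit; /* NULL if nothing overlapped */
}

int main(void)
{
        const struct demo_bo bos[] = {
                { 0x1000, 0x1fff },
                { 0x3000, 0x3fff },
        };

        printf("%s\n", demo_find_bo(bos, 2, 0x1100, 0x11ff) ?
               "one BO" : "reject");
        printf("%s\n", demo_find_bo(bos, 2, 0x1100, 0x30ff) ?
               "one BO" : "reject");
        return 0;
}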
*/ -struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm) -{ - struct kfd_process *p; - - int idx = srcu_read_lock(&kfd_processes_srcu); - - p = find_process_by_mm(mm); - if (p) - kref_get(&p->ref); - - srcu_read_unlock(&kfd_processes_srcu, idx); - return p; } - -int kfd_reserved_mem_mmap(struct kfd_process *process, - struct vm_area_struct *vma) -{ - struct kfd_dev *dev = kfd_device_by_id(vma->vm_pgoff); - struct kfd_process_device *pdd; - struct qcm_process_device *qpd; - - if (!dev) - return -EINVAL; - if ((vma->vm_end - vma->vm_start) != KFD_CWSR_TBA_TMA_SIZE) { - pr_err("Incorrect CWSR mapping size.\n"); - return -EINVAL; - } - - pdd = kfd_get_process_device_data(dev, process); - if (!pdd) - return -EINVAL; - qpd = &pdd->qpd; - - qpd->cwsr_kaddr = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - get_order(KFD_CWSR_TBA_TMA_SIZE)); - if (!qpd->cwsr_kaddr) { - pr_err("Error allocating per process CWSR buffer.\n"); - return -ENOMEM; - } - - vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND - | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP; - /* Mapping pages to user process */ - return remap_pfn_range(vma, vma->vm_start, - PFN_DOWN(__pa(qpd->cwsr_kaddr)), - KFD_CWSR_TBA_TMA_SIZE, vma->vm_page_prot); -} - -#if defined(CONFIG_DEBUG_FS) - -int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data) -{ - struct kfd_process *p; - unsigned int temp; - int r = 0; - - int idx = srcu_read_lock(&kfd_processes_srcu); - - hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { - seq_printf(m, "Process %d PASID %d:\n", - p->lead_thread->tgid, p->pasid); - - mutex_lock(&p->mutex); - r = pqm_debugfs_mqds(m, &p->pqm); - mutex_unlock(&p->mutex); - - if (r != 0) - break; - } - - srcu_read_unlock(&kfd_processes_srcu, idx); - - return r; -} - -#endif - -void kfd_flush_tlb(struct kfd_dev *dev, struct kfd_process *p) -{ - const struct kfd2kgd_calls *f2g = dev->kfd2kgd; - - if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) { - struct kfd_process_device *pdd = - kfd_get_process_device_data(dev, p); - if (!pdd) { - pr_err("could not find pdd for pasid %d\n", p->pasid); - return; - } - - /* vmid allocation is delayed to the creation of the first - * queue of the process. For buffers allocated and mapped - * before queue creation, vmid is still no allocated (valued 0). - * Ignore tlb invalidation request for this case. 
- */ - if (pdd->qpd.vmid) - f2g->invalidate_tlbs_vmid(dev->kgd, pdd->qpd.vmid); - } else - f2g->invalidate_tlbs(dev->kgd, p->pasid); -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index a7ec177..5f82905 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -63,25 +63,6 @@ static int find_available_queue_slot(struct process_queue_manager *pqm, return 0; } -void kfd_process_dequeue_from_device(struct kfd_process_device *pdd) -{ - struct kfd_dev *dev = pdd->dev; - - if (pdd->already_dequeued) - return; - - dev->dqm->ops.process_termination(dev->dqm, &pdd->qpd); - pdd->already_dequeued = true; -} - -void kfd_process_dequeue_from_all_devices(struct kfd_process *p) -{ - struct kfd_process_device *pdd; - - list_for_each_entry(pdd, &p->per_device_data, per_device_list) - kfd_process_dequeue_from_device(pdd); -} - int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p) { INIT_LIST_HEAD(&pqm->queues); @@ -97,14 +78,21 @@ int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p) void pqm_uninit(struct process_queue_manager *pqm) { + int retval; struct process_queue_node *pqn, *next; list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) { - uninit_queue(pqn->q); - list_del(&pqn->process_queue_list); - kfree(pqn); + retval = pqm_destroy_queue( + pqm, + (pqn->q != NULL) ? + pqn->q->properties.queue_id : + pqn->kq->queue->properties.queue_id); + + if (retval != 0) { + pr_err("failed to destroy queue\n"); + return; + } } - kfree(pqm->queue_slot_bitmap); pqm->queue_slot_bitmap = NULL; } @@ -119,6 +107,9 @@ static int create_cp_queue(struct process_queue_manager *pqm, /* Doorbell initialized in user space*/ q_properties->doorbell_ptr = NULL; + q_properties->doorbell_off = + kfd_queue_id_to_doorbell(dev, pqm->process, qid); + /* let DQM handle it*/ q_properties->vmid = 0; q_properties->queue_id = qid; @@ -139,16 +130,20 @@ int pqm_create_queue(struct process_queue_manager *pqm, struct kfd_dev *dev, struct file *f, struct queue_properties *properties, + unsigned int flags, + enum kfd_queue_type type, unsigned int *qid) { int retval; struct kfd_process_device *pdd; + struct queue_properties q_properties; struct queue *q; struct process_queue_node *pqn; struct kernel_queue *kq; - enum kfd_queue_type type = properties->type; - unsigned int max_queues = 127; /* HWS limit */ + int num_queues = 0; + struct queue *cur; + memcpy(&q_properties, properties, sizeof(struct queue_properties)); q = NULL; kq = NULL; @@ -164,19 +159,22 @@ int pqm_create_queue(struct process_queue_manager *pqm, * If we are just about to create DIQ, the is_debug flag is not set yet * Hence we also check the type as well */ - if ((pdd->qpd.is_debug) || (type == KFD_QUEUE_TYPE_DIQ)) - max_queues = dev->device_info->max_no_of_hqd/2; - - if (pdd->qpd.queue_count >= max_queues) - return -ENOSPC; + if ((pdd->qpd.is_debug) || + (type == KFD_QUEUE_TYPE_DIQ)) { + list_for_each_entry(cur, &pdd->qpd.queues_list, list) + num_queues++; + if (num_queues >= dev->device_info->max_no_of_hqd/2) + return -ENOSPC; + } retval = find_available_queue_slot(pqm, qid); if (retval != 0) return retval; - if (list_empty(&pdd->qpd.queues_list) && - list_empty(&pdd->qpd.priv_queue_list)) + if (list_empty(&pqm->queues)) { + pdd->qpd.pqm = pqm; dev->dqm->ops.register_process(dev->dqm, &pdd->qpd); + } pqn = kzalloc(sizeof(*pqn), GFP_KERNEL); if (!pqn) { @@ -186,40 +184,23 @@ 
int pqm_create_queue(struct process_queue_manager *pqm, switch (type) { case KFD_QUEUE_TYPE_SDMA: - if (dev->dqm->sdma_queue_count - >= get_num_sdma_queues(dev->dqm)) { - pr_debug("Over-subscription is not allowed for SDMA\n"); - retval = -EPERM; - goto err_create_queue; - } - - retval = create_cp_queue(pqm, dev, &q, properties, f, *qid); - if (retval != 0) - goto err_create_queue; - pqn->q = q; - pqn->kq = NULL; - retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd); - pr_debug("DQM returned %d for create_queue\n", retval); - print_queue(q); - break; - case KFD_QUEUE_TYPE_COMPUTE: /* check if there is over subscription */ - if ((dev->dqm->sched_policy == - KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) && - ((dev->dqm->processes_count >= dev->vm_info.vmid_num_kfd) || + if ((sched_policy == KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) && + ((dev->dqm->processes_count >= VMID_PER_DEVICE) || (dev->dqm->queue_count >= get_queues_num(dev->dqm)))) { - pr_debug("Over-subscription is not allowed in radeon_kfd.sched_policy == 1\n"); + pr_err("Over-subscription is not allowed in radeon_kfd.sched_policy == 1\n"); retval = -EPERM; goto err_create_queue; } - retval = create_cp_queue(pqm, dev, &q, properties, f, *qid); + retval = create_cp_queue(pqm, dev, &q, &q_properties, f, *qid); if (retval != 0) goto err_create_queue; pqn->q = q; pqn->kq = NULL; - retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd); + retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd, + &q->properties.vmid); pr_debug("DQM returned %d for create_queue\n", retval); print_queue(q); break; @@ -245,22 +226,14 @@ int pqm_create_queue(struct process_queue_manager *pqm, goto err_create_queue; } - if (q) - /* Return the doorbell offset within the doorbell page - * to the caller so it can be passed up to user mode - * (in bytes). 
- */ - properties->doorbell_off = - (q->properties.doorbell_off * sizeof(uint32_t)) & - (kfd_doorbell_process_slice(dev) - 1); - pr_debug("PQM After DQM create queue\n"); list_add(&pqn->process_queue_list, &pqm->queues); if (q) { + *properties = q->properties; pr_debug("PQM done creating queue\n"); - print_queue_properties(&q->properties); + print_queue_properties(properties); } return retval; @@ -270,8 +243,7 @@ int pqm_create_queue(struct process_queue_manager *pqm, err_allocate_pqn: /* check if queues list is empty unregister process from device */ clear_bit(*qid, pqm->queue_slot_bitmap); - if (list_empty(&pdd->qpd.queues_list) && - list_empty(&pdd->qpd.priv_queue_list)) + if (list_empty(&pqm->queues)) dev->dqm->ops.unregister_process(dev->dqm, &pdd->qpd); return retval; } @@ -317,13 +289,10 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) if (pqn->q) { dqm = pqn->q->device->dqm; - kfree(pqn->q->properties.cu_mask); - pqn->q->properties.cu_mask = NULL; retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); - if (retval) { - pr_debug("Destroy queue failed, returned %d\n", retval); - goto err_destroy_queue; - } + if (retval != 0) + return retval; + uninit_queue(pqn->q); } @@ -331,11 +300,9 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) kfree(pqn); clear_bit(qid, pqm->queue_slot_bitmap); - if (list_empty(&pdd->qpd.queues_list) && - list_empty(&pdd->qpd.priv_queue_list)) + if (list_empty(&pqm->queues)) dqm->ops.unregister_process(dqm, &pdd->qpd); -err_destroy_queue: return retval; } @@ -364,34 +331,6 @@ int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, return 0; } -int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, - struct queue_properties *p) -{ - int retval; - struct process_queue_node *pqn; - - pqn = get_queue_by_qid(pqm, qid); - if (!pqn) { - pr_debug("No queue %d exists for update operation\n", qid); - return -EFAULT; - } - - /* Free the old CU mask memory if it is already allocated, then - * allocate memory for the new CU mask. 
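/*
 * The removed code above converts a doorbell dword index to a byte
 * offset and masks it into the per-process doorbell slice, which must be
 * a power of two for the mask to work. A standalone version; the demo_*
 * names and the 4 KiB slice size are invented for illustration.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_DOORBELL_PROCESS_SLICE 4096u /* power-of-two slice size */

static uint32_t demo_doorbell_byte_offset(uint32_t doorbell_index)
{
        /* Dword index -> byte offset, wrapped into the process slice. */
        return (doorbell_index * (uint32_t)sizeof(uint32_t)) &
               (DEMO_DOORBELL_PROCESS_SLICE - 1);
}

int main(void)
{
        /* Index 3 lands at byte 12; index 1027 wraps back to byte 12. */
        printf("%u %u\n", demo_doorbell_byte_offset(3),
               demo_doorbell_byte_offset(1027));
        return 0;
}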
- */ - kfree(pqn->q->properties.cu_mask); - - pqn->q->properties.cu_mask_count = p->cu_mask_count; - pqn->q->properties.cu_mask = p->cu_mask; - - retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm, - pqn->q); - if (retval != 0) - return retval; - - return 0; -} - struct kernel_queue *pqm_get_kernel_queue( struct process_queue_manager *pqm, unsigned int qid) @@ -405,89 +344,4 @@ struct kernel_queue *pqm_get_kernel_queue( return NULL; } -int pqm_get_wave_state(struct process_queue_manager *pqm, - unsigned int qid, - void __user *ctl_stack, - u32 *ctl_stack_used_size, - u32 *save_area_used_size) -{ - struct process_queue_node *pqn; - - pqn = get_queue_by_qid(pqm, qid); - if (!pqn) { - pr_debug("amdkfd: No queue %d exists for operation\n", - qid); - return -EFAULT; - } - - return pqn->q->device->dqm->ops.get_wave_state(pqn->q->device->dqm, - pqn->q, - ctl_stack, - ctl_stack_used_size, - save_area_used_size); -} - -#if defined(CONFIG_DEBUG_FS) - -int pqm_debugfs_mqds(struct seq_file *m, void *data) -{ - struct process_queue_manager *pqm = data; - struct process_queue_node *pqn; - struct queue *q; - enum KFD_MQD_TYPE mqd_type; - struct mqd_manager *mqd_manager; - int r = 0; - - list_for_each_entry(pqn, &pqm->queues, process_queue_list) { - if (pqn->q) { - q = pqn->q; - switch (q->properties.type) { - case KFD_QUEUE_TYPE_SDMA: - seq_printf(m, " SDMA queue on device %x\n", - q->device->id); - mqd_type = KFD_MQD_TYPE_SDMA; - break; - case KFD_QUEUE_TYPE_COMPUTE: - seq_printf(m, " Compute queue on device %x\n", - q->device->id); - mqd_type = KFD_MQD_TYPE_CP; - break; - default: - seq_printf(m, - " Bad user queue type %d on device %x\n", - q->properties.type, q->device->id); - continue; - } - mqd_manager = q->device->dqm->ops.get_mqd_manager( - q->device->dqm, mqd_type); - } else if (pqn->kq) { - q = pqn->kq->queue; - mqd_manager = pqn->kq->mqd; - switch (q->properties.type) { - case KFD_QUEUE_TYPE_DIQ: - seq_printf(m, " DIQ on device %x\n", - pqn->kq->dev->id); - mqd_type = KFD_MQD_TYPE_HIQ; - break; - default: - seq_printf(m, - " Bad kernel queue type %d on device %x\n", - q->properties.type, - pqn->kq->dev->id); - continue; - } - } else { - seq_printf(m, - " Weird: Queue node with neither kernel nor user queue\n"); - continue; - } - - r = mqd_manager->debugfs_show_mqd(m, q->mqd); - if (r != 0) - break; - } - - return r; -} -#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c b/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c deleted file mode 100644 index 2f5cdb9..0000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c +++ /dev/null @@ -1,294 +0,0 @@ -/* - * Copyright 2015 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include -#include -#include -#include -#include -#include "kfd_priv.h" - - -struct rdma_cb { - struct list_head node; - struct amd_p2p_info amd_p2p_data; - void (*free_callback)(void *client_priv); - void *client_priv; -}; - -/** - * This function makes the pages underlying a range of GPU virtual memory - * accessible for DMA operations from another PCIe device - * - * \param address - The start address in the Unified Virtual Address - * space in the specified process - * \param length - The length of the requested mapping - * \param pid - Pointer to structure pid to which address belongs. - * Could be NULL for current process address space. - * \param p2p_data - On return: Pointer to structure describing - * underlying pages/locations - * \param free_callback - Pointer to callback which will be called when access - * to such memory must be stopped immediately: Memory - * was freed, GECC events, etc. - * The client should immediately stop any transfer - * operations and return as soon as possible. - * After return, all resources associated with the - * address will be released and no access will be - * allowed. - * \param client_priv - Pointer to be passed as parameter on - * 'free_callback' - * - * \return 0 if operation was successful - */ -static int get_pages(uint64_t address, uint64_t length, struct pid *pid, - struct amd_p2p_info **amd_p2p_data, - void (*free_callback)(void *client_priv), - void *client_priv) -{ - struct kfd_bo *buf_obj; - struct kgd_mem *mem; - struct sg_table *sg_table_tmp; - struct kfd_dev *dev; - uint64_t last = address + length - 1; - uint64_t offset; - struct kfd_process *p; - struct rdma_cb *rdma_cb_data; - int ret = 0; - - p = kfd_lookup_process_by_pid(pid); - if (!p) { - pr_err("Could not find the process\n"); - return -EINVAL; - } - mutex_lock(&p->mutex); - - buf_obj = kfd_process_find_bo_from_interval(p, address, last); - if (!buf_obj) { - pr_err("Cannot find a kfd_bo for the range\n"); - ret = -EINVAL; - goto out; - } - - rdma_cb_data = kmalloc(sizeof(*rdma_cb_data), GFP_KERNEL); - if (!rdma_cb_data) { - *amd_p2p_data = NULL; - ret = -ENOMEM; - goto out; - } - - mem = buf_obj->mem; - dev = buf_obj->dev; - offset = address - buf_obj->it.start; - - ret = dev->kfd2kgd->pin_get_sg_table_bo(dev->kgd, mem, - offset, length, &sg_table_tmp); - - if (ret) { - pr_err("pin_get_sg_table_bo failed.\n"); - *amd_p2p_data = NULL; - goto free_mem; - } - - rdma_cb_data->amd_p2p_data.va = address; - rdma_cb_data->amd_p2p_data.size = length; - rdma_cb_data->amd_p2p_data.pid = pid; - rdma_cb_data->amd_p2p_data.priv = buf_obj; - rdma_cb_data->amd_p2p_data.pages = sg_table_tmp; - - rdma_cb_data->free_callback = free_callback; - rdma_cb_data->client_priv = client_priv; - - list_add(&rdma_cb_data->node, &buf_obj->cb_data_head); - - *amd_p2p_data = &rdma_cb_data->amd_p2p_data; - - goto out; - -free_mem: - kfree(rdma_cb_data); -out: - mutex_unlock(&p->mutex); - kfd_unref_process(p); - - return ret; -} - -static int put_pages_helper(struct amd_p2p_info *p2p_data) -{ - struct kfd_bo *buf_obj; - struct kfd_dev *dev; - struct sg_table *sg_table_tmp; - struct rdma_cb *rdma_cb_data; - - if (!p2p_data) { - pr_err("amd_p2p_info pointer is invalid.\n"); - return -EINVAL; - } - - rdma_cb_data = container_of(p2p_data, struct rdma_cb, amd_p2p_data);
- - buf_obj = p2p_data->priv; - dev = buf_obj->dev; - sg_table_tmp = p2p_data->pages; - - list_del(&rdma_cb_data->node); - kfree(rdma_cb_data); - - dev->kfd2kgd->unpin_put_sg_table_bo(buf_obj->mem, sg_table_tmp); - - - return 0; -} - -void run_rdma_free_callback(struct kfd_bo *buf_obj) -{ - struct rdma_cb *tmp, *rdma_cb_data; - - list_for_each_entry_safe(rdma_cb_data, tmp, - &buf_obj->cb_data_head, node) { - if (rdma_cb_data->free_callback) - rdma_cb_data->free_callback( - rdma_cb_data->client_priv); - - put_pages_helper(&rdma_cb_data->amd_p2p_data); - } -} - -/** - * - * This function releases resources previously allocated by a get_pages() - * call. - * - * \param p_p2p_data - A pointer to a pointer to amd_p2p_info entries - * allocated by a get_pages() call. - * - * \return 0 if operation was successful - */ -static int put_pages(struct amd_p2p_info **p_p2p_data) -{ - struct kfd_process *p = NULL; - int ret = 0; - - if (!(*p_p2p_data)) { - pr_err("amd_p2p_info pointer is invalid.\n"); - return -EINVAL; - } - - p = kfd_lookup_process_by_pid((*p_p2p_data)->pid); - if (!p) { - pr_err("Could not find the process\n"); - return -EINVAL; - } - - ret = put_pages_helper(*p_p2p_data); - - if (!ret) - *p_p2p_data = NULL; - - kfd_unref_process(p); - - return ret; -} - -/** - * Check if given address belongs to GPU address space. - * - * \param address - Address to check - * \param pid - Process to which given address belongs. - * Could be NULL for the current process. - * - * \return 0 - This is not a GPU address managed by the AMD driver - * 1 - This is a GPU address managed by the AMD driver - */ -static int is_gpu_address(uint64_t address, struct pid *pid) -{ - struct kfd_bo *buf_obj; - struct kfd_process *p; - - p = kfd_lookup_process_by_pid(pid); - if (!p) { - pr_debug("Could not find the process\n"); - return 0; - } - - buf_obj = kfd_process_find_bo_from_interval(p, address, address); - - kfd_unref_process(p); - if (!buf_obj) - return 0; - - return 1; -} - -/** - * Return the single page size to be used when building the scatter/gather - * table for the given range. - * - * \param address - Address - * \param length - Range length - * \param pid - Process id structure. Could be NULL for the - * current process. - * \param page_size - On return: Page size - * - * \return 0 if operation was successful - */ -static int get_page_size(uint64_t address, uint64_t length, struct pid *pid, - unsigned long *page_size) -{ - /* - * Local memory is physically contiguous, so any page size would do. - * For now, report the same page size as system memory, which is 4KB. - */ - *page_size = PAGE_SIZE; - - return 0; -} - - -/** - * Singleton object: rdma interface function pointers - */ -static const struct amd_rdma_interface rdma_ops = { - .get_pages = get_pages, - .put_pages = put_pages, - .is_gpu_address = is_gpu_address, - .get_page_size = get_page_size, -}; - -/** - * amdkfd_query_rdma_interface - Return interface (function pointers table) for - * rdma interface - * - * - * \param interface - OUT: Pointer to interface - * - * \return 0 if operation was successful.
- */ -int amdkfd_query_rdma_interface(const struct amd_rdma_interface **ops) -{ - *ops = &rdma_ops; - - return 0; -} -EXPORT_SYMBOL(amdkfd_query_rdma_interface); - - - diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index 58a5bef..19ce590 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -28,32 +28,27 @@ #include #include #include -#include -#include #include "kfd_priv.h" #include "kfd_crat.h" #include "kfd_topology.h" -#include "kfd_device_queue_manager.h" -/* topology_device_list - Master list of all topology devices */ static struct list_head topology_device_list; +static int topology_crat_parsed; static struct kfd_system_properties sys_props; static DECLARE_RWSEM(topology_lock); -static atomic_t topology_crat_proximity_domain; -struct kfd_topology_device *kfd_topology_device_by_proximity_domain( - uint32_t proximity_domain) +struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) { struct kfd_topology_device *top_dev; - struct kfd_topology_device *device = NULL; + struct kfd_dev *device = NULL; down_read(&topology_lock); list_for_each_entry(top_dev, &topology_device_list, list) - if (top_dev->proximity_domain == proximity_domain) { - device = top_dev; + if (top_dev->gpu_id == gpu_id) { + device = top_dev->gpu; break; } @@ -62,7 +57,7 @@ struct kfd_topology_device *kfd_topology_device_by_proximity_domain( return device; } -struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) +struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) { struct kfd_topology_device *top_dev; struct kfd_dev *device = NULL; @@ -70,7 +65,7 @@ struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) down_read(&topology_lock); list_for_each_entry(top_dev, &topology_device_list, list) - if (top_dev->gpu_id == gpu_id) { + if (top_dev->gpu->pdev == pdev) { device = top_dev->gpu; break; } @@ -80,49 +75,282 @@ struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) return device; } -struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) +static int kfd_topology_get_crat_acpi(void *crat_image, size_t *size) { - struct kfd_topology_device *top_dev; - struct kfd_dev *device = NULL; + struct acpi_table_header *crat_table; + acpi_status status; - down_read(&topology_lock); + if (!size) + return -EINVAL; - list_for_each_entry(top_dev, &topology_device_list, list) - if (top_dev->gpu && top_dev->gpu->pdev == pdev) { - device = top_dev->gpu; + /* + * Fetch the CRAT table from ACPI + */ + status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); + if (status == AE_NOT_FOUND) { + pr_warn("CRAT table not found\n"); + return -ENODATA; + } else if (ACPI_FAILURE(status)) { + const char *err = acpi_format_exception(status); + + pr_err("CRAT table error: %s\n", err); + return -EINVAL; + } + + if (*size >= crat_table->length && crat_image != NULL) + memcpy(crat_image, crat_table, crat_table->length); + + *size = crat_table->length; + + return 0; +} + +static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, + struct crat_subtype_computeunit *cu) +{ + dev->node_props.cpu_cores_count = cu->num_cpu_cores; + dev->node_props.cpu_core_id_base = cu->processor_id_low; + if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) + dev->node_props.capability |= HSA_CAP_ATS_PRESENT; + + pr_info("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, + cu->processor_id_low); +} + +static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, + struct crat_subtype_computeunit *cu) +{ + dev->node_props.simd_id_base = 
cu->processor_id_low; + dev->node_props.simd_count = cu->num_simd_cores; + dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; + dev->node_props.max_waves_per_simd = cu->max_waves_simd; + dev->node_props.wave_front_size = cu->wave_front_size; + dev->node_props.mem_banks_count = cu->num_banks; + dev->node_props.array_count = cu->num_arrays; + dev->node_props.cu_per_simd_array = cu->num_cu_per_array; + dev->node_props.simd_per_cu = cu->num_simd_per_cu; + dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; + if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) + dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; + pr_info("CU GPU: simds=%d id_base=%d\n", cu->num_simd_cores, + cu->processor_id_low); +} + +/* kfd_parse_subtype_cu is called when the topology mutex is already acquired */ +static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu) +{ + struct kfd_topology_device *dev; + int i = 0; + + pr_info("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", + cu->proximity_domain, cu->hsa_capability); + list_for_each_entry(dev, &topology_device_list, list) { + if (cu->proximity_domain == i) { + if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) + kfd_populated_cu_info_cpu(dev, cu); + + if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) + kfd_populated_cu_info_gpu(dev, cu); break; } + i++; + } - up_read(&topology_lock); + return 0; +} - return device; +/* + * kfd_parse_subtype_mem is called when the topology mutex is + * already acquired + */ +static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem) +{ + struct kfd_mem_properties *props; + struct kfd_topology_device *dev; + int i = 0; + + pr_info("Found memory entry in CRAT table with proximity_domain=%d\n", + mem->promixity_domain); + list_for_each_entry(dev, &topology_device_list, list) { + if (mem->promixity_domain == i) { + props = kfd_alloc_struct(props); + if (props == NULL) + return -ENOMEM; + + if (dev->node_props.cpu_cores_count == 0) + props->heap_type = HSA_MEM_HEAP_TYPE_FB_PRIVATE; + else + props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; + + if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) + props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; + if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) + props->flags |= HSA_MEM_FLAGS_NON_VOLATILE; + + props->size_in_bytes = + ((uint64_t)mem->length_high << 32) + + mem->length_low; + props->width = mem->width; + + dev->mem_bank_count++; + list_add_tail(&props->list, &dev->mem_props); + + break; + } + i++; + } + + return 0; } -struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd) +/* + * kfd_parse_subtype_cache is called when the topology mutex + * is already acquired + */ +static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache) { - struct kfd_topology_device *top_dev; - struct kfd_dev *device = NULL; + struct kfd_cache_properties *props; + struct kfd_topology_device *dev; + uint32_t id; - down_read(&topology_lock); + id = cache->processor_id_low; + + pr_info("Found cache entry in CRAT table with processor_id=%d\n", id); + list_for_each_entry(dev, &topology_device_list, list) + if (id == dev->node_props.cpu_core_id_base || + id == dev->node_props.simd_id_base) { + props = kfd_alloc_struct(props); + if (props == NULL) + return -ENOMEM; + + props->processor_id_low = id; + props->cache_level = cache->cache_level; + props->cache_size = cache->cache_size; + props->cacheline_size = cache->cache_line_size; + props->cachelines_per_tag = cache->lines_per_tag; + props->cache_assoc = cache->associativity; + props->cache_latency = cache->cache_latency; + + if 
(cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) + props->cache_type |= HSA_CACHE_TYPE_DATA; + if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) + props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; + if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) + props->cache_type |= HSA_CACHE_TYPE_CPU; + if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) + props->cache_type |= HSA_CACHE_TYPE_HSACU; + + dev->cache_count++; + dev->node_props.caches_count++; + list_add_tail(&props->list, &dev->cache_props); - list_for_each_entry(top_dev, &topology_device_list, list) - if (top_dev->gpu && top_dev->gpu->kgd == kgd) { - device = top_dev->gpu; break; } - up_read(&topology_lock); + return 0; +} - return device; +/* + * kfd_parse_subtype_iolink is called when the topology mutex + * is already acquired + */ +static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink) +{ + struct kfd_iolink_properties *props; + struct kfd_topology_device *dev; + uint32_t i = 0; + uint32_t id_from; + uint32_t id_to; + + id_from = iolink->proximity_domain_from; + id_to = iolink->proximity_domain_to; + + pr_info("Found IO link entry in CRAT table with id_from=%d\n", id_from); + list_for_each_entry(dev, &topology_device_list, list) { + if (id_from == i) { + props = kfd_alloc_struct(props); + if (props == NULL) + return -ENOMEM; + + props->node_from = id_from; + props->node_to = id_to; + props->ver_maj = iolink->version_major; + props->ver_min = iolink->version_minor; + + /* + * weight factor (derived from CDIR), currently always 1 + */ + props->weight = 1; + + props->min_latency = iolink->minimum_latency; + props->max_latency = iolink->maximum_latency; + props->min_bandwidth = iolink->minimum_bandwidth_mbs; + props->max_bandwidth = iolink->maximum_bandwidth_mbs; + props->rec_transfer_size = + iolink->recommended_transfer_size; + + dev->io_link_count++; + dev->node_props.io_links_count++; + list_add_tail(&props->list, &dev->io_link_props); + + break; + } + i++; + } + + return 0; +} + +static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr) +{ + struct crat_subtype_computeunit *cu; + struct crat_subtype_memory *mem; + struct crat_subtype_cache *cache; + struct crat_subtype_iolink *iolink; + int ret = 0; + + switch (sub_type_hdr->type) { + case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: + cu = (struct crat_subtype_computeunit *)sub_type_hdr; + ret = kfd_parse_subtype_cu(cu); + break; + case CRAT_SUBTYPE_MEMORY_AFFINITY: + mem = (struct crat_subtype_memory *)sub_type_hdr; + ret = kfd_parse_subtype_mem(mem); + break; + case CRAT_SUBTYPE_CACHE_AFFINITY: + cache = (struct crat_subtype_cache *)sub_type_hdr; + ret = kfd_parse_subtype_cache(cache); + break; + case CRAT_SUBTYPE_TLB_AFFINITY: + /* + * For now, nothing to do here + */ + pr_info("Found TLB entry in CRAT table (not processing)\n"); + break; + case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: + /* + * For now, nothing to do here + */ + pr_info("Found CCOMPUTE entry in CRAT table (not processing)\n"); + break; + case CRAT_SUBTYPE_IOLINK_AFFINITY: + iolink = (struct crat_subtype_iolink *)sub_type_hdr; + ret = kfd_parse_subtype_iolink(iolink); + break; + default: + pr_warn("Unknown subtype (%d) in CRAT\n", + sub_type_hdr->type); + } + + return ret; } -/* Called with write topology_lock acquired */ static void kfd_release_topology_device(struct kfd_topology_device *dev) { struct kfd_mem_properties *mem; struct kfd_cache_properties *cache; struct kfd_iolink_properties *iolink; - struct kfd_perf_properties *perf; list_del(&dev->list); @@ -147,35 +375,25 @@ static void 
kfd_release_topology_device(struct kfd_topology_device *dev) kfree(iolink); } - while (dev->perf_props.next != &dev->perf_props) { - perf = container_of(dev->perf_props.next, - struct kfd_perf_properties, list); - list_del(&perf->list); - kfree(perf); - } - kfree(dev); + + sys_props.num_devices--; } -void kfd_release_topology_device_list(struct list_head *device_list) +static void kfd_release_live_view(void) { struct kfd_topology_device *dev; - while (!list_empty(device_list)) { - dev = list_first_entry(device_list, - struct kfd_topology_device, list); + while (topology_device_list.next != &topology_device_list) { + dev = container_of(topology_device_list.next, + struct kfd_topology_device, list); kfd_release_topology_device(dev); - } } -static void kfd_release_live_view(void) -{ - kfd_release_topology_device_list(&topology_device_list); memset(&sys_props, 0, sizeof(sys_props)); } -struct kfd_topology_device *kfd_create_topology_device( - struct list_head *device_list) +static struct kfd_topology_device *kfd_create_topology_device(void) { struct kfd_topology_device *dev; @@ -188,13 +406,66 @@ struct kfd_topology_device *kfd_create_topology_device( INIT_LIST_HEAD(&dev->mem_props); INIT_LIST_HEAD(&dev->cache_props); INIT_LIST_HEAD(&dev->io_link_props); - INIT_LIST_HEAD(&dev->perf_props); - list_add_tail(&dev->list, device_list); + list_add_tail(&dev->list, &topology_device_list); + sys_props.num_devices++; return dev; } +static int kfd_parse_crat_table(void *crat_image) +{ + struct kfd_topology_device *top_dev; + struct crat_subtype_generic *sub_type_hdr; + uint16_t node_id; + int ret; + struct crat_header *crat_table = (struct crat_header *)crat_image; + uint16_t num_nodes; + uint32_t image_len; + + if (!crat_image) + return -EINVAL; + + num_nodes = crat_table->num_domains; + image_len = crat_table->length; + + pr_info("Parsing CRAT table with %d nodes\n", num_nodes); + + for (node_id = 0; node_id < num_nodes; node_id++) { + top_dev = kfd_create_topology_device(); + if (!top_dev) { + kfd_release_live_view(); + return -ENOMEM; + } + } + + sys_props.platform_id = + (*((uint64_t *)crat_table->oem_id)) & CRAT_OEMID_64BIT_MASK; + sys_props.platform_oem = *((uint64_t *)crat_table->oem_table_id); + sys_props.platform_rev = crat_table->revision; + + sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); + while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) < + ((char *)crat_image) + image_len) { + if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) { + ret = kfd_parse_subtype(sub_type_hdr); + if (ret != 0) { + kfd_release_live_view(); + return ret; + } + } + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + } + + sys_props.generation_count++; + topology_crat_parsed = 1; + + return 0; +} + + #define sysfs_show_gen_prop(buffer, fmt, ...) 
\ snprintf(buffer, PAGE_SIZE, "%s"fmt, buffer, __VA_ARGS__) #define sysfs_show_32bit_prop(buffer, name, value) \ @@ -203,8 +474,6 @@ struct kfd_topology_device *kfd_create_topology_device( sysfs_show_gen_prop(buffer, "%s %llu\n", name, value) #define sysfs_show_32bit_val(buffer, value) \ sysfs_show_gen_prop(buffer, "%u\n", value) -#define sysfs_show_64bit_val(buffer, value) \ - sysfs_show_gen_prop(buffer, "%llu\n", value) #define sysfs_show_str_val(buffer, value) \ sysfs_show_gen_prop(buffer, "%s\n", value) @@ -232,17 +501,11 @@ static ssize_t sysprops_show(struct kobject *kobj, struct attribute *attr, return ret; } -static void kfd_topology_kobj_release(struct kobject *kobj) -{ - kfree(kobj); -} - static const struct sysfs_ops sysprops_ops = { .show = sysprops_show, }; static struct kobj_type sysprops_type = { - .release = kfd_topology_kobj_release, .sysfs_ops = &sysprops_ops, }; @@ -278,7 +541,6 @@ static const struct sysfs_ops iolink_ops = { }; static struct kobj_type iolink_type = { - .release = kfd_topology_kobj_release, .sysfs_ops = &iolink_ops, }; @@ -287,23 +549,11 @@ static ssize_t mem_show(struct kobject *kobj, struct attribute *attr, { ssize_t ret; struct kfd_mem_properties *mem; - uint64_t used_mem; /* Making sure that the buffer is an empty string */ buffer[0] = 0; - if (strcmp(attr->name, "used_memory") == 0) { - mem = container_of(attr, struct kfd_mem_properties, - attr_used); - if (mem->gpu) { - used_mem = mem->gpu->kfd2kgd->get_vram_usage(mem->gpu->kgd); - return sysfs_show_64bit_val(buffer, used_mem); - } - /* TODO: Report APU/CPU-allocated memory; For now return 0 */ - return 0; - } - - mem = container_of(attr, struct kfd_mem_properties, attr_props); + mem = container_of(attr, struct kfd_mem_properties, attr); sysfs_show_32bit_prop(buffer, "heap_type", mem->heap_type); sysfs_show_64bit_prop(buffer, "size_in_bytes", mem->size_in_bytes); sysfs_show_32bit_prop(buffer, "flags", mem->flags); @@ -318,7 +568,6 @@ static const struct sysfs_ops mem_ops = { }; static struct kobj_type mem_type = { - .release = kfd_topology_kobj_release, .sysfs_ops = &mem_ops, }; @@ -326,7 +575,7 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, char *buffer) { ssize_t ret; - uint32_t i, j; + uint32_t i; struct kfd_cache_properties *cache; /* Making sure that the buffer is an empty string */ @@ -344,18 +593,12 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, sysfs_show_32bit_prop(buffer, "latency", cache->cache_latency); sysfs_show_32bit_prop(buffer, "type", cache->cache_type); snprintf(buffer, PAGE_SIZE, "%ssibling_map ", buffer); - for (i = 0; i < CRAT_SIBLINGMAP_SIZE; i++) - for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++) { - /* Check each bit */ - if (cache->sibling_map[i] & (1 << j)) - ret = snprintf(buffer, PAGE_SIZE, - "%s%d%s", buffer, 1, ","); - else - ret = snprintf(buffer, PAGE_SIZE, - "%s%d%s", buffer, 0, ","); - } - /* Replace the last "," with end of line */ - *(buffer + strlen(buffer) - 1) = 0xA; + for (i = 0; i < KFD_TOPOLOGY_CPU_SIBLINGS; i++) + ret = snprintf(buffer, PAGE_SIZE, "%s%d%s", + buffer, cache->sibling_map[i], + (i == KFD_TOPOLOGY_CPU_SIBLINGS-1) ? 
+ "\n" : ","); + return ret; } @@ -364,43 +607,9 @@ static const struct sysfs_ops cache_ops = { }; static struct kobj_type cache_type = { - .release = kfd_topology_kobj_release, .sysfs_ops = &cache_ops, }; -/****** Sysfs of Performance Counters ******/ - -struct kfd_perf_attr { - struct kobj_attribute attr; - uint32_t data; -}; - -static ssize_t perf_show(struct kobject *kobj, struct kobj_attribute *attrs, - char *buf) -{ - struct kfd_perf_attr *attr; - - buf[0] = 0; - attr = container_of(attrs, struct kfd_perf_attr, attr); - if (!attr->data) /* invalid data for PMC */ - return 0; - else - return sysfs_show_32bit_val(buf, attr->data); -} - -#define KFD_PERF_DESC(_name, _data) \ -{ \ - .attr = __ATTR(_name, 0444, perf_show, NULL), \ - .data = _data, \ -} - -static struct kfd_perf_attr perf_attr_iommu[] = { - KFD_PERF_DESC(max_concurrent, 0), - KFD_PERF_DESC(num_counters, 0), - KFD_PERF_DESC(counter_ids, 0), -}; -/****************************************/ - static ssize_t node_show(struct kobject *kobj, struct attribute *attr, char *buffer) { @@ -408,7 +617,6 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, char public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; uint32_t i; uint32_t log_max_watch_addr; - struct kfd_local_mem_info local_mem_info; /* Making sure that the buffer is an empty string */ buffer[0] = 0; @@ -438,8 +646,18 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, dev->node_props.cpu_cores_count); sysfs_show_32bit_prop(buffer, "simd_count", dev->node_props.simd_count); - sysfs_show_32bit_prop(buffer, "mem_banks_count", - dev->node_props.mem_banks_count); + + if (dev->mem_bank_count < dev->node_props.mem_banks_count) { + pr_info_once("mem_banks_count truncated from %d to %d\n", + dev->node_props.mem_banks_count, + dev->mem_bank_count); + sysfs_show_32bit_prop(buffer, "mem_banks_count", + dev->mem_bank_count); + } else { + sysfs_show_32bit_prop(buffer, "mem_banks_count", + dev->node_props.mem_banks_count); + } + sysfs_show_32bit_prop(buffer, "caches_count", dev->node_props.caches_count); sysfs_show_32bit_prop(buffer, "io_links_count", @@ -472,8 +690,6 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, dev->node_props.device_id); sysfs_show_32bit_prop(buffer, "location_id", dev->node_props.location_id); - sysfs_show_32bit_prop(buffer, "drm_render_minor", - dev->node_props.drm_render_minor); if (dev->gpu) { log_max_watch_addr = @@ -489,29 +705,17 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, HSA_CAP_WATCH_POINTS_TOTALBITS_MASK); } - if (dev->gpu->device_info->asic_family == CHIP_TONGA) - dev->node_props.capability |= - HSA_CAP_AQL_QUEUE_DOUBLE_MAP; - sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute", - dev->node_props.max_engine_clk_fcompute); + dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz( + dev->gpu->kgd)); - /* - * If the ASIC is APU except Kaveri, set local memory size - * to 0 to disable local memory support - */ - if (!dev->gpu->device_info->is_need_iommu_device - || dev->gpu->device_info->asic_family == CHIP_KAVERI) { - dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd, - &local_mem_info); - sysfs_show_64bit_prop(buffer, "local_mem_size", - local_mem_info.local_mem_size_private + - local_mem_info.local_mem_size_public); - } else - sysfs_show_64bit_prop(buffer, "local_mem_size", 0ULL); + sysfs_show_64bit_prop(buffer, "local_mem_size", + (unsigned long long int) 0); sysfs_show_32bit_prop(buffer, "fw_version", - dev->gpu->mec_fw_version); + dev->gpu->kfd2kgd->get_fw_version( 
+ dev->gpu->kgd, + KGD_ENGINE_MEC1)); sysfs_show_32bit_prop(buffer, "capability", dev->node_props.capability); } @@ -525,7 +729,6 @@ static const struct sysfs_ops node_ops = { }; static struct kobj_type node_type = { - .release = kfd_topology_kobj_release, .sysfs_ops = &node_ops, }; @@ -541,7 +744,6 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) struct kfd_iolink_properties *iolink; struct kfd_cache_properties *cache; struct kfd_mem_properties *mem; - struct kfd_perf_properties *perf; if (dev->kobj_iolink) { list_for_each_entry(iolink, &dev->io_link_props, list) @@ -570,12 +772,7 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) if (dev->kobj_mem) { list_for_each_entry(mem, &dev->mem_props, list) if (mem->kobj) { - /* TODO: Remove when CPU/APU supported */ - if (dev->node_props.cpu_cores_count == 0) - sysfs_remove_file(mem->kobj, - &mem->attr_used); - kfd_remove_sysfs_file(mem->kobj, - &mem->attr_props); + kfd_remove_sysfs_file(mem->kobj, &mem->attr); mem->kobj = NULL; } kobject_del(dev->kobj_mem); @@ -583,16 +780,6 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) dev->kobj_mem = NULL; } - if (dev->kobj_perf) { - list_for_each_entry(perf, &dev->perf_props, list) { - kfree(perf->attr_group); - perf->attr_group = NULL; - } - kobject_del(dev->kobj_perf); - kobject_put(dev->kobj_perf); - dev->kobj_perf = NULL; - } - if (dev->kobj_node) { sysfs_remove_file(dev->kobj_node, &dev->attr_gpuid); sysfs_remove_file(dev->kobj_node, &dev->attr_name); @@ -609,9 +796,6 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, struct kfd_iolink_properties *iolink; struct kfd_cache_properties *cache; struct kfd_mem_properties *mem; - struct kfd_perf_properties *perf; - uint32_t num_attrs; - struct attribute **attrs; int ret; uint32_t i; @@ -642,10 +826,6 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, if (!dev->kobj_iolink) return -ENOMEM; - dev->kobj_perf = kobject_create_and_add("perf", dev->kobj_node); - if (!dev->kobj_perf) - return -ENOMEM; - /* * Creating sysfs files for node properties */ @@ -678,23 +858,12 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, if (ret < 0) return ret; - mem->attr_props.name = "properties"; - mem->attr_props.mode = KFD_SYSFS_FILE_MODE; - sysfs_attr_init(&mem->attr_props); - ret = sysfs_create_file(mem->kobj, &mem->attr_props); + mem->attr.name = "properties"; + mem->attr.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&mem->attr); + ret = sysfs_create_file(mem->kobj, &mem->attr); if (ret < 0) return ret; - - /* TODO: Support APU/CPU memory usage */ - if (dev->node_props.cpu_cores_count == 0) { - mem->attr_used.name = "used_memory"; - mem->attr_used.mode = KFD_SYSFS_FILE_MODE; - sysfs_attr_init(&mem->attr_used); - ret = sysfs_create_file(mem->kobj, &mem->attr_used); - if (ret < 0) - return ret; - } - i++; } @@ -734,38 +903,11 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, if (ret < 0) return ret; i++; - } - - /* All hardware blocks have the same number of attributes. 
*/ - num_attrs = sizeof(perf_attr_iommu)/sizeof(struct kfd_perf_attr); - list_for_each_entry(perf, &dev->perf_props, list) { - perf->attr_group = kzalloc(sizeof(struct kfd_perf_attr) - * num_attrs + sizeof(struct attribute_group), - GFP_KERNEL); - if (!perf->attr_group) - return -ENOMEM; - - attrs = (struct attribute **)(perf->attr_group + 1); - if (!strcmp(perf->block_name, "iommu")) { - /* Information of IOMMU's num_counters and counter_ids is shown - * under /sys/bus/event_source/devices/amd_iommu. We don't - * duplicate it here. - */ - perf_attr_iommu[0].data = perf->max_concurrent; - for (i = 0; i < num_attrs; i++) - attrs[i] = &perf_attr_iommu[i].attr.attr; - } - perf->attr_group->name = perf->block_name; - perf->attr_group->attrs = attrs; - ret = sysfs_create_group(dev->kobj_perf, perf->attr_group); - if (ret < 0) - return ret; - } +} return 0; } -/* Called with write topology lock acquired */ static int kfd_build_sysfs_node_tree(void) { struct kfd_topology_device *dev; @@ -782,7 +924,6 @@ static int kfd_build_sysfs_node_tree(void) return 0; } -/* Called with write topology lock acquired */ static void kfd_remove_sysfs_node_tree(void) { struct kfd_topology_device *dev; @@ -854,251 +995,75 @@ static void kfd_topology_release_sysfs(void) } } -/* Called with write topology_lock acquired */ -static void kfd_topology_update_device_list(struct list_head *temp_list, - struct list_head *master_list) -{ - while (!list_empty(temp_list)) { - list_move_tail(temp_list->next, master_list); - sys_props.num_devices++; - } -} - -static void kfd_debug_print_topology(void) -{ - struct kfd_topology_device *dev; - - down_read(&topology_lock); - - dev = list_last_entry(&topology_device_list, - struct kfd_topology_device, list); - if (dev) { - if (dev->node_props.cpu_cores_count && - dev->node_props.simd_count) { - pr_info("Topology: Add APU node [0x%0x:0x%0x]\n", - dev->node_props.device_id, - dev->node_props.vendor_id); - } else if (dev->node_props.cpu_cores_count) - pr_info("Topology: Add CPU node\n"); - else if (dev->node_props.simd_count) - pr_info("Topology: Add dGPU node [0x%0x:0x%0x]\n", - dev->node_props.device_id, - dev->node_props.vendor_id); - } - up_read(&topology_lock); -} - -/* Helper function for initializing platform_xx members of kfd_system_properties - */ -static void kfd_update_system_properties(void) -{ - struct kfd_topology_device *dev; - - down_read(&topology_lock); - dev = list_last_entry(&topology_device_list, - struct kfd_topology_device, list); - if (dev) { - sys_props.platform_id = - (*((uint64_t *)dev->oem_id)) & CRAT_OEMID_64BIT_MASK; - sys_props.platform_oem = *((uint64_t *)dev->oem_table_id); - sys_props.platform_rev = dev->oem_revision; - } - up_read(&topology_lock); -} - -static void find_system_memory(const struct dmi_header *dm, - void *private) -{ - struct kfd_mem_properties *mem; - u16 mem_width, mem_clock; - struct kfd_topology_device *kdev = - (struct kfd_topology_device *)private; - const u8 *dmi_data = (const u8 *)(dm + 1); - - if (dm->type == DMI_ENTRY_MEM_DEVICE && dm->length >= 0x15) { - mem_width = (u16)(*(const u16 *)(dmi_data + 0x6)); - mem_clock = (u16)(*(const u16 *)(dmi_data + 0x11)); - list_for_each_entry(mem, &kdev->mem_props, list) { - if (mem_width != 0xFFFF && mem_width != 0) - mem->width = mem_width; - if (mem_clock != 0) - mem->mem_clk_max = mem_clock; - } - } -} - -/* - * Performance counters information is not part of CRAT but we would like to - * put it in sysfs under the topology directory for the Thunk to get the data.
- * This function is called before updating the sysfs. - */ -static int kfd_add_perf_to_topology(struct kfd_topology_device *kdev) -{ -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) - struct kfd_perf_properties *props; - - if (amd_iommu_pc_supported()) { - props = kfd_alloc_struct(props); - if (!props) - return -ENOMEM; - strcpy(props->block_name, "iommu"); - props->max_concurrent = amd_iommu_pc_get_max_banks(0) * - amd_iommu_pc_get_max_counters(0); /* assume one iommu */ - list_add_tail(&props->list, &kdev->perf_props); - } -#endif - - return 0; -} - -/* kfd_add_non_crat_information - Add information that is not currently - * defined in CRAT but is necessary for KFD topology - * @dev - topology device to which additional info is added - */ -static void kfd_add_non_crat_information(struct kfd_topology_device *kdev) -{ - /* Check if CPU only node. */ - if (!kdev->gpu) { - /* Add system memory information */ - dmi_walk(find_system_memory, kdev); - } - /* TODO: For GPU node, rearrange code from kfd_topology_add_device */ -} - -#ifdef CONFIG_ACPI -/* kfd_is_acpi_crat_invalid - CRAT from ACPI is valid only for AMD APU devices. - * Ignore CRAT for all other devices. An AMD APU is identified if both CPU - * and GPU cores are present. - * @device_list - topology device list created by parsing ACPI CRAT table. - * @return - TRUE if invalid, FALSE if valid. - */ -static bool kfd_is_acpi_crat_invalid(struct list_head *device_list) -{ - struct kfd_topology_device *dev; - - list_for_each_entry(dev, device_list, list) { - if (dev->node_props.cpu_cores_count && - dev->node_props.simd_count) - return false; - } - pr_info("Ignoring ACPI CRAT on non-APU system\n"); - return true; -} -#endif - int kfd_topology_init(void) { void *crat_image = NULL; size_t image_size = 0; int ret; - struct list_head temp_topology_device_list; - int cpu_only_node = 0; - struct kfd_topology_device *kdev; - int proximity_domain; - - /* topology_device_list - Master list of all topology devices - * temp_topology_device_list - temporary list created while parsing CRAT - * or VCRAT. Once parsing is complete, the contents of the list are moved - * to topology_device_list - */ - /* Initialize the heads for both the lists */ + /* + * Initialize the head for the topology device list + */ INIT_LIST_HEAD(&topology_device_list); - INIT_LIST_HEAD(&temp_topology_device_list); init_rwsem(&topology_lock); + topology_crat_parsed = 0; memset(&sys_props, 0, sizeof(sys_props)); - /* Proximity domains in ACPI CRAT tables start counting at - * 0. The same should be true for virtual CRAT tables created - * at this stage. GPUs added later in kfd_topology_add_device - * use a counter. - */ - proximity_domain = 0; - /* - * Get the CRAT image from the ACPI. If ACPI doesn't have one - * or if ACPI CRAT is invalid create a virtual CRAT. - * NOTE: The current implementation expects all AMD APUs to have - * CRAT.
If no CRAT is available, it is assumed to be a CPU + * Get the CRAT image from the ACPI */ -#ifdef CONFIG_ACPI - ret = kfd_create_crat_image_acpi(&crat_image, &image_size); - if (ret == 0) { - ret = kfd_parse_crat_table(crat_image, - &temp_topology_device_list, - proximity_domain); - if (ret || - kfd_is_acpi_crat_invalid(&temp_topology_device_list)) { - - kfd_release_topology_device_list( - &temp_topology_device_list); - kfd_destroy_crat_image(crat_image); - crat_image = NULL; - } - } -#endif - if (!crat_image) { - ret = kfd_create_crat_image_virtual(&crat_image, &image_size, - COMPUTE_UNIT_CPU, NULL, - proximity_domain); - cpu_only_node = 1; - if (ret) { - pr_err("Error creating VCRAT table for CPU\n"); - return ret; - } - - ret = kfd_parse_crat_table(crat_image, - &temp_topology_device_list, - proximity_domain); - if (ret) { - pr_err("Error parsing VCRAT table for CPU\n"); + ret = kfd_topology_get_crat_acpi(crat_image, &image_size); + if (ret == 0 && image_size > 0) { + pr_info("Found CRAT image with size=%zd\n", image_size); + crat_image = kmalloc(image_size, GFP_KERNEL); + if (!crat_image) { + ret = -ENOMEM; + pr_err("No memory for allocating CRAT image\n"); goto err; } - } - - kdev = list_first_entry(&temp_topology_device_list, - struct kfd_topology_device, list); - kfd_add_perf_to_topology(kdev); - - down_write(&topology_lock); - kfd_topology_update_device_list(&temp_topology_device_list, - &topology_device_list); - atomic_set(&topology_crat_proximity_domain, sys_props.num_devices-1); - ret = kfd_topology_update_sysfs(); - up_write(&topology_lock); - - if (ret == 0) { - sys_props.generation_count++; - kfd_update_system_properties(); - kfd_debug_print_topology(); - pr_info("Finished initializing topology\n"); - } else - pr_err("Failed to update topology in sysfs ret=%d\n", ret); - - /* For nodes with GPU, this information gets added - * when GPU is detected (kfd_topology_add_device). - */ - if (cpu_only_node) { - /* Add additional information to CPU only node created above */ - down_write(&topology_lock); - kdev = list_first_entry(&topology_device_list, - struct kfd_topology_device, list); - up_write(&topology_lock); - kfd_add_non_crat_information(kdev); + ret = kfd_topology_get_crat_acpi(crat_image, &image_size); + + if (ret == 0) { + down_write(&topology_lock); + ret = kfd_parse_crat_table(crat_image); + if (ret == 0) + ret = kfd_topology_update_sysfs(); + up_write(&topology_lock); + } else { + pr_err("Couldn't get CRAT table size from ACPI\n"); + } + kfree(crat_image); + } else if (ret == -ENODATA) { + ret = 0; + } else { + pr_err("Couldn't get CRAT table size from ACPI\n"); } err: - kfd_destroy_crat_image(crat_image); + pr_info("Finished initializing topology ret=%d\n", ret); return ret; } void kfd_topology_shutdown(void) { - down_write(&topology_lock); kfd_topology_release_sysfs(); kfd_release_live_view(); - up_write(&topology_lock); +} + +static void kfd_debug_print_topology(void) +{ + struct kfd_topology_device *dev; + uint32_t i = 0; + + pr_info("DEBUG PRINT OF TOPOLOGY:"); + list_for_each_entry(dev, &topology_device_list, list) { + pr_info("Node: %d\n", i); + pr_info("\tGPU assigned: %s\n", (dev->gpu ? 
"yes" : "no")); + pr_info("\tCPU count: %d\n", dev->node_props.cpu_cores_count); + pr_info("\tSIMD count: %d", dev->node_props.simd_count); + i++; + } } static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) @@ -1107,15 +1072,11 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) uint32_t buf[7]; uint64_t local_mem_size; int i; - struct kfd_local_mem_info local_mem_info; if (!gpu) return 0; - gpu->kfd2kgd->get_local_mem_info(gpu->kgd, &local_mem_info); - - local_mem_size = local_mem_info.local_mem_size_private + - local_mem_info.local_mem_size_public; + local_mem_size = gpu->kfd2kgd->get_vmem_size(gpu->kgd); buf[0] = gpu->pdev->devfn; buf[1] = gpu->pdev->subsystem_vendor; @@ -1130,32 +1091,18 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) return hashout; } -/* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If - * the GPU device is not already present in the topology device - * list then return NULL. This means a new topology device has to - * be created for this GPU. - * TODO: Rather than assiging @gpu to first topology device withtout - * gpu attached, it will better to have more stringent check. - */ + static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu) { struct kfd_topology_device *dev; struct kfd_topology_device *out_dev = NULL; - struct kfd_mem_properties *mem; - down_write(&topology_lock); list_for_each_entry(dev, &topology_device_list, list) if (!dev->gpu && (dev->node_props.simd_count > 0)) { dev->gpu = gpu; out_dev = dev; - - /* Assign mem->gpu */ - list_for_each_entry(mem, &dev->mem_props, list) - mem->gpu = dev->gpu; - break; } - up_write(&topology_lock); return out_dev; } @@ -1168,204 +1115,84 @@ static void kfd_notify_gpu_change(uint32_t gpu_id, int arrival) */ } -/* kfd_fill_mem_clk_max_info - Since CRAT doesn't have memory clock info, - * patch this after CRAT parsing. - */ -static void kfd_fill_mem_clk_max_info(struct kfd_topology_device *dev) -{ - struct kfd_mem_properties *mem; - struct kfd_local_mem_info local_mem_info; - - if (!dev) - return; - - /* Currently, amdgpu driver (amdgpu_mc) deals only with GPUs with - * single bank of VRAM local memory. 
- * for dGPUs - VCRAT reports only one bank of Local Memory - * for APUs - If CRAT from ACPI reports more than one bank, then - * all the banks will report the same mem_clk_max information - */ - dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd, - &local_mem_info); - - list_for_each_entry(mem, &dev->mem_props, list) - mem->mem_clk_max = local_mem_info.mem_clk_max; -} - -static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev) -{ - struct kfd_iolink_properties *link; - - if (!dev || !dev->gpu) - return; - - /* GPU only creates direck links so apply flags setting to all */ - if (dev->gpu->device_info->asic_family == CHIP_HAWAII) - list_for_each_entry(link, &dev->io_link_props, list) - link->flags = CRAT_IOLINK_FLAGS_ENABLED | - CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT | - CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT; -} - int kfd_topology_add_device(struct kfd_dev *gpu) { uint32_t gpu_id; struct kfd_topology_device *dev; - struct kfd_cu_info cu_info; - int res = 0; - struct list_head temp_topology_device_list; - void *crat_image = NULL; - size_t image_size = 0; - int proximity_domain; - - INIT_LIST_HEAD(&temp_topology_device_list); + int res; gpu_id = kfd_generate_gpu_id(gpu); pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id); - proximity_domain = atomic_inc_return(& - topology_crat_proximity_domain); - - /* Check to see if this gpu device exists in the topology_device_list. - * If so, assign the gpu to that device, - * else create a Virtual CRAT for this gpu device and then parse that - * CRAT to create a new topology device. Once created assign the gpu to - * that topology device + down_write(&topology_lock); + /* + * Try to assign the GPU to existing topology device (generated from + * CRAT table */ dev = kfd_assign_gpu(gpu); if (!dev) { - res = kfd_create_crat_image_virtual(&crat_image, &image_size, - COMPUTE_UNIT_GPU, - gpu, proximity_domain); - if (res) { - pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n", - gpu_id); - return res; - } - res = kfd_parse_crat_table(crat_image, - &temp_topology_device_list, proximity_domain); - if (res) { - pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n", - gpu_id); + pr_info("GPU was not found in the current topology. Extending.\n"); + kfd_debug_print_topology(); + dev = kfd_create_topology_device(); + if (!dev) { + res = -ENOMEM; goto err; } + dev->gpu = gpu; - down_write(&topology_lock); - kfd_topology_update_device_list(&temp_topology_device_list, - &topology_device_list); + /* + * TODO: Make a call to retrieve topology information from the + * GPU vBIOS + */ /* Update the SYSFS tree, since we added another topology * device */ - res = kfd_topology_update_sysfs(); - up_write(&topology_lock); - - if (res == 0) - sys_props.generation_count++; - else - pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. 
res=%d\n", - gpu_id, res); - dev = kfd_assign_gpu(gpu); - if (!dev) { - pr_err("Could not assign GPU\n"); - res = -ENODEV; - goto err; - } + if (kfd_topology_update_sysfs() < 0) + kfd_topology_release_sysfs(); + } dev->gpu_id = gpu_id; gpu->id = gpu_id; - - /* TODO: Move the following lines to function - * kfd_add_non_crat_information - */ - - /* Fill-in additional information that is not available in CRAT but - * needed for the topology - */ - - dev->gpu->kfd2kgd->get_cu_info(dev->gpu->kgd, &cu_info); - dev->node_props.simd_arrays_per_engine = - cu_info.num_shader_arrays_per_engine; - dev->node_props.vendor_id = gpu->pdev->vendor; dev->node_props.device_id = gpu->pdev->device; - dev->node_props.location_id = PCI_DEVID(gpu->pdev->bus->number, - gpu->pdev->devfn); - dev->node_props.max_engine_clk_fcompute = - dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz(dev->gpu->kgd); - dev->node_props.max_engine_clk_ccompute = - cpufreq_quick_get_max(0) / 1000; - dev->node_props.drm_render_minor = - gpu->shared_resources.drm_render_minor; - - kfd_fill_mem_clk_max_info(dev); - kfd_fill_iolink_non_crat_info(dev); - - switch (dev->gpu->device_info->asic_family) { - case CHIP_KAVERI: - case CHIP_HAWAII: - case CHIP_TONGA: - dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_PRE_1_0 << - HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & - HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); - break; - case CHIP_CARRIZO: - case CHIP_FIJI: - case CHIP_POLARIS10: - case CHIP_POLARIS11: - pr_debug("Adding doorbell packet type capability\n"); - dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_1_0 << - HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & - HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); - break; - case CHIP_VEGA10: - case CHIP_RAVEN: - dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 << - HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & - HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); - break; - default: - BUG(); - } - - /* Fix errors in CZ CRAT. - * simd_count: Carrizo CRAT reports wrong simd_count, probably because - * it doesn't consider masked out CUs - * max_waves_per_simd: Carrizo reports wrong max_waves_per_simd. - * capability flag: Carrizo CRAT doesn't report IOMMU flags. 
+ dev->node_props.location_id = (gpu->pdev->bus->number << 24) + + (gpu->pdev->devfn & 0xffffff); + /* + * TODO: Retrieve max engine clock values from KGD */ + if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) { - dev->node_props.simd_count = - cu_info.simd_per_cu * cu_info.cu_active_number; - dev->node_props.max_waves_per_simd = 10; - dev->node_props.capability |= HSA_CAP_ATS_PRESENT; + dev->node_props.capability |= HSA_CAP_DOORBELL_PACKET_TYPE; + pr_info("Adding doorbell packet type capability\n"); } - kfd_debug_print_topology(); + res = 0; - if (!res) - kfd_notify_gpu_change(gpu_id, 1); err: - kfd_destroy_crat_image(crat_image); + up_write(&topology_lock); + + if (res == 0) + kfd_notify_gpu_change(gpu_id, 1); + return res; } int kfd_topology_remove_device(struct kfd_dev *gpu) { - struct kfd_topology_device *dev, *tmp; + struct kfd_topology_device *dev; uint32_t gpu_id; int res = -ENODEV; down_write(&topology_lock); - list_for_each_entry_safe(dev, tmp, &topology_device_list, list) + list_for_each_entry(dev, &topology_device_list, list) if (dev->gpu == gpu) { gpu_id = dev->gpu_id; kfd_remove_sysfs_node_entry(dev); kfd_release_topology_device(dev); - sys_props.num_devices--; res = 0; if (kfd_topology_update_sysfs() < 0) kfd_topology_release_sysfs(); @@ -1380,26 +1207,22 @@ int kfd_topology_remove_device(struct kfd_dev *gpu) return res; } -/* kfd_topology_enum_kfd_devices - Enumerate through all devices in KFD - * topology. If GPU device is found @idx, then valid kfd_dev pointer is - * returned through @kdev - * Return - 0: On success (@kdev will be NULL for non GPU nodes) - * -1: If end of list +/* + * When idx is out of bounds, the function will return NULL */ -int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev) +struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx) { struct kfd_topology_device *top_dev; + struct kfd_dev *device = NULL; uint8_t device_idx = 0; - *kdev = NULL; down_read(&topology_lock); list_for_each_entry(top_dev, &topology_device_list, list) { if (device_idx == idx) { - *kdev = top_dev->gpu; - up_read(&topology_lock); - return 0; + device = top_dev->gpu; + break; } device_idx++; @@ -1407,89 +1230,6 @@ int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev) up_read(&topology_lock); - return -1; - -} - -static int kfd_cpumask_to_apic_id(const struct cpumask *cpumask) -{ - int first_cpu_of_numa_node; - - if (!cpumask || (cpumask == cpu_none_mask)) - return -1; - first_cpu_of_numa_node = cpumask_first(cpumask); - if (first_cpu_of_numa_node >= nr_cpu_ids) - return -1; -#ifdef CONFIG_X86_64 - return cpu_data(first_cpu_of_numa_node).apicid; -#else - return first_cpu_of_numa_node; -#endif -} - -/* kfd_numa_node_to_apic_id - Returns the APIC ID of the first logical processor - * of the given NUMA node (numa_node_id) - * Return -1 on failure - */ -int kfd_numa_node_to_apic_id(int numa_node_id) -{ - if (numa_node_id == -1) { - pr_warn("Invalid NUMA Node. 
Use online CPU mask\n"); - return kfd_cpumask_to_apic_id(cpu_online_mask); - } - return kfd_cpumask_to_apic_id(cpumask_of_node(numa_node_id)); -} - -#if defined(CONFIG_DEBUG_FS) - -int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data) -{ - struct kfd_topology_device *dev; - unsigned int i = 0; - int r = 0; - - down_read(&topology_lock); - - list_for_each_entry(dev, &topology_device_list, list) { - if (!dev->gpu) { - i++; - continue; - } - - seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id); - r = device_queue_manager_debugfs_hqds(m, dev->gpu->dqm); - if (r != 0) - break; - } - - up_read(&topology_lock); - - return r; -} - -int kfd_debugfs_rls_by_device(struct seq_file *m, void *data) -{ - struct kfd_topology_device *dev; - unsigned int i = 0; - int r = 0; - - down_read(&topology_lock); - - list_for_each_entry(dev, &topology_device_list, list) { - if (!dev->gpu) { - i++; - continue; - } - - seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id); - r = pm_debugfs_runlist(m, &dev->gpu->dqm->packets); - if (r != 0) - break; - } - - up_read(&topology_lock); + return device; - return r; } - -#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h index d1c9ba3..c3ddb9b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h @@ -39,17 +39,8 @@ #define HSA_CAP_WATCH_POINTS_SUPPORTED 0x00000080 #define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00 #define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8 -#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK 0x00003000 -#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT 12 -#define HSA_CAP_RESERVED 0xffffc000 - -#define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0 -#define HSA_CAP_DOORBELL_TYPE_1_0 0x1 -#define HSA_CAP_DOORBELL_TYPE_2_0 0x2 -#define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00 -#define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8 +#define HSA_CAP_RESERVED 0xfffff000 #define HSA_CAP_DOORBELL_PACKET_TYPE 0x00001000 -#define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000 struct kfd_node_properties { uint32_t cpu_cores_count; @@ -75,7 +66,6 @@ struct kfd_node_properties { uint32_t location_id; uint32_t max_engine_clk_fcompute; uint32_t max_engine_clk_ccompute; - int32_t drm_render_minor; uint16_t marketing_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; }; @@ -98,11 +88,11 @@ struct kfd_mem_properties { uint32_t width; uint32_t mem_clk_max; struct kobject *kobj; - struct kfd_dev *gpu; - struct attribute attr_props; - struct attribute attr_used; + struct attribute attr; }; +#define KFD_TOPOLOGY_CPU_SIBLINGS 256 + #define HSA_CACHE_TYPE_DATA 0x00000001 #define HSA_CACHE_TYPE_INSTRUCTION 0x00000002 #define HSA_CACHE_TYPE_CPU 0x00000004 @@ -119,7 +109,7 @@ struct kfd_cache_properties { uint32_t cache_assoc; uint32_t cache_latency; uint32_t cache_type; - uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE]; + uint8_t sibling_map[KFD_TOPOLOGY_CPU_SIBLINGS]; struct kobject *kobj; struct attribute attr; }; @@ -142,36 +132,24 @@ struct kfd_iolink_properties { struct attribute attr; }; -struct kfd_perf_properties { - struct list_head list; - char block_name[16]; - uint32_t max_concurrent; - struct attribute_group *attr_group; -}; - struct kfd_topology_device { struct list_head list; uint32_t gpu_id; - uint32_t proximity_domain; struct kfd_node_properties node_props; + uint32_t mem_bank_count; struct list_head mem_props; uint32_t cache_count; struct list_head cache_props; uint32_t io_link_count; struct list_head io_link_props; - struct list_head perf_props; struct kfd_dev *gpu; struct kobject 
*kobj_node; struct kobject *kobj_mem; struct kobject *kobj_cache; struct kobject *kobj_iolink; - struct kobject *kobj_perf; struct attribute attr_gpuid; struct attribute attr_name; struct attribute attr_props; - uint8_t oem_id[CRAT_OEMID_LENGTH]; - uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH]; - uint32_t oem_revision; }; struct kfd_system_properties { @@ -186,14 +164,6 @@ struct kfd_system_properties { struct attribute attr_props; }; -struct kfd_topology_device *kfd_create_topology_device( - struct list_head *device_list); -void kfd_release_topology_device_list(struct list_head *device_list); -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) -extern bool amd_iommu_pc_supported(void); -extern u8 amd_iommu_pc_get_max_banks(u16 devid); -extern u8 amd_iommu_pc_get_max_counters(u16 devid); -#endif #endif /* __KFD_TOPOLOGY_H__ */ diff --git a/drivers/gpu/drm/amd/amdkfd/soc15_int.h b/drivers/gpu/drm/amd/amdkfd/soc15_int.h deleted file mode 100644 index e00d03d..0000000 --- a/drivers/gpu/drm/amd/amdkfd/soc15_int.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright 2016 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -#ifndef HSA_SOC15_INT_H_INCLUDED -#define HSA_SOC15_INT_H_INCLUDED -/* - * vega10+ IH clients - */ -enum soc15_ih_client_id { - SOC15_IH_CLIENTID_IH = 0x00, - SOC15_IH_CLIENTID_ACP = 0x01, - SOC15_IH_CLIENTID_ATHUB = 0x02, - SOC15_IH_CLIENTID_BIF = 0x03, - SOC15_IH_CLIENTID_DCE = 0x04, - SOC15_IH_CLIENTID_ISP = 0x05, - SOC15_IH_CLIENTID_PCIE0 = 0x06, - SOC15_IH_CLIENTID_RLC = 0x07, - SOC15_IH_CLIENTID_SDMA0 = 0x08, - SOC15_IH_CLIENTID_SDMA1 = 0x09, - SOC15_IH_CLIENTID_SE0SH = 0x0a, - SOC15_IH_CLIENTID_SE1SH = 0x0b, - SOC15_IH_CLIENTID_SE2SH = 0x0c, - SOC15_IH_CLIENTID_SE3SH = 0x0d, - SOC15_IH_CLIENTID_SYSHUB = 0x0e, - SOC15_IH_CLIENTID_THM = 0x0f, - SOC15_IH_CLIENTID_UVD = 0x10, - SOC15_IH_CLIENTID_VCE0 = 0x11, - SOC15_IH_CLIENTID_VMC = 0x12, - SOC15_IH_CLIENTID_XDMA = 0x13, - SOC15_IH_CLIENTID_GRBM_CP = 0x14, - SOC15_IH_CLIENTID_ATS = 0x15, - SOC15_IH_CLIENTID_ROM_SMUIO = 0x16, - SOC15_IH_CLIENTID_DF = 0x17, - SOC15_IH_CLIENTID_VCE1 = 0x18, - SOC15_IH_CLIENTID_PWR = 0x19, - SOC15_IH_CLIENTID_UTCL2 = 0x1b, - SOC15_IH_CLIENTID_EA = 0x1c, - SOC15_IH_CLIENTID_UTCL2LOG = 0x1d, - SOC15_IH_CLIENTID_MP0 = 0x1e, - SOC15_IH_CLIENTID_MP1 = 0x1f, - - SOC15_IH_CLIENTID_MAX -}; - - -#define SOC15_INTSRC_CP_END_OF_PIPE 181 -#define SOC15_INTSRC_CP_BAD_OPCODE 183 -#define SOC15_INTSRC_SQ_INTERRUPT_MSG 239 -#define SOC15_INTSRC_VMC_FAULT 0 -#define SOC15_INTSRC_SDMA_TRAP 224 - - -#define SOC15_CLIENT_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) & 0xff) -#define SOC15_SOURCE_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 8 & 0xff) -#define SOC15_RING_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 16 & 0xff) -#define SOC15_VMID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 24 & 0xf) -#define SOC15_VMID_TYPE_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 31 & 0x1) -#define SOC15_PASID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[3]) & 0xffff) -#define SOC15_CONTEXT_ID0_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[4])) -#define SOC15_CONTEXT_ID1_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[5])) -#define SOC15_CONTEXT_ID2_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[6])) -#define SOC15_CONTEXT_ID3_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[7])) - -#endif - -- 2.7.4
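For context on the amd_rdma_interface removed from kfd_rdma.c above: a peer PCIe driver was expected to query the function-pointer table once and then pin and unpin GPU ranges through it. The following is a minimal sketch of that calling pattern, not part of this patch; it assumes the amd_rdma.h header that declares the interface, and the peer_* names are hypothetical.

	#include <linux/module.h>
	#include "amd_rdma.h"	/* declares struct amd_rdma_interface et al. */

	static void peer_invalidate_cb(void *client_priv)
	{
		/* KFD is revoking the mapping (BO freed, GECC event, ...);
		 * the peer must quiesce all DMA to these pages right away.
		 */
		pr_info("peer: GPU mapping invalidated\n");
	}

	static int peer_map_gpu_range(uint64_t va, uint64_t size)
	{
		const struct amd_rdma_interface *rdma;
		struct amd_p2p_info *p2p = NULL;
		int ret;

		ret = amdkfd_query_rdma_interface(&rdma);
		if (ret)
			return ret;

		/* A NULL pid means the current process, per get_pages() */
		if (!rdma->is_gpu_address(va, NULL))
			return -EINVAL;

		ret = rdma->get_pages(va, size, NULL, &p2p,
				      peer_invalidate_cb, NULL);
		if (ret)
			return ret;

		/* p2p->pages is a struct sg_table describing the pinned
		 * range; a real peer would dma_map_sg() it and start
		 * transfers here.
		 */

		return rdma->put_pages(&p2p);	/* unpins; p2p NULLed on success */
	}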
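Similarly, the kfd_parse_crat_table() restored in kfd_topology.c walks the CRAT's variable-length subtables by advancing a generic header by its own length field until the next header would run past the image. A self-contained userspace sketch of the same walk, using simplified stand-in types rather than the real crat_header/crat_subtype_generic structs:

	#include <stdint.h>
	#include <stdio.h>

	/* Simplified stand-in for struct crat_subtype_generic */
	struct sub_hdr {
		uint8_t type;
		uint8_t length;		/* total bytes in this subtable, header included */
		uint8_t reserved;
		uint8_t flags;
	};

	static void walk(const uint8_t *image, uint32_t image_len,
			 uint32_t crat_hdr_len)
	{
		/* The first subtable starts right after the fixed CRAT header */
		const struct sub_hdr *sub =
			(const struct sub_hdr *)(image + crat_hdr_len);

		/* Same loop condition as kfd_parse_crat_table(): stop when the
		 * next generic header would run past the end of the image. */
		while ((const uint8_t *)sub + sizeof(*sub) < image + image_len) {
			printf("subtype %u, %u bytes\n", sub->type, sub->length);
			sub = (const struct sub_hdr *)
				((const uint8_t *)sub + sub->length);
		}
	}

	int main(void)
	{
		uint8_t image[16] = { 0 };	/* fake 4-byte header + two 6-byte subtables */

		image[4] = 0; image[5] = 6;	/* subtable 0: type 0, length 6 */
		image[10] = 1; image[11] = 6;	/* subtable 1: type 1, length 6 */

		walk(image, sizeof(image), 4);
		return 0;
	}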