Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1624-drm-amdkfd-Add-Vega10-support-for-KFD.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1624-drm-amdkfd-Add-Vega10-support-for-KFD.patch | 6577
1 file changed, 6577 insertions(+), 0 deletions(-)
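The most interesting driver-side change in the patch below (in the kfd_chardev.c hunks) is the reworked doorbell mmap offset: instead of encoding only the gpu_id, the returned offset now packs the mapping type and GPU ID into the high bits, and on SOC15 ASICs such as Vega10 it additionally carries the per-device doorbell offset, since doorbells there are allocated per device rather than derived from the per-process queue_id. The following is a minimal C sketch of that encode/decode path; the bit positions and masks are assumptions for illustration (the real KFD_MMAP_* definitions live in kfd_priv.h, which this patch excerpt does not show), and doorbell_mmap_offset()/doorbell_gpu_id_from_pgoff() are hypothetical stand-ins for the logic in kfd_ioctl_create_queue() and kfd_mmap().

#include <stdint.h>
#include <stdio.h>

/* Illustrative bit layout -- the real masks live in kfd_priv.h. */
#define PAGE_SHIFT		12
#define KFD_MMAP_TYPE_SHIFT	48
#define KFD_MMAP_TYPE_MASK	(0xFULL << KFD_MMAP_TYPE_SHIFT)
#define KFD_MMAP_TYPE_DOORBELL	(0x3ULL << KFD_MMAP_TYPE_SHIFT)
#define KFD_MMAP_GPU_ID_SHIFT	16
#define KFD_MMAP_GPU_ID(gpu_id)	((uint64_t)(gpu_id) << KFD_MMAP_GPU_ID_SHIFT)
#define KFD_MMAP_GPU_ID_GET(pgoff) \
	(uint32_t)(((pgoff) >> KFD_MMAP_GPU_ID_SHIFT) & 0xFFFFFFFFULL)

/* What kfd_ioctl_create_queue() hands back to user mode as doorbell_offset */
static uint64_t doorbell_mmap_offset(uint32_t gpu_id, uint32_t soc15_doorbell_off)
{
	uint64_t off = KFD_MMAP_TYPE_DOORBELL | KFD_MMAP_GPU_ID(gpu_id);

	off <<= PAGE_SHIFT;		/* mmap offsets are page-granular */
	off |= soc15_doorbell_off;	/* SOC15 only: per-device doorbell offset */
	return off;
}

/* kfd_mmap() sees the offset as vma->vm_pgoff == offset >> PAGE_SHIFT */
static uint32_t doorbell_gpu_id_from_pgoff(uint64_t vm_pgoff)
{
	return KFD_MMAP_GPU_ID_GET(vm_pgoff);
}

int main(void)
{
	uint64_t off = doorbell_mmap_offset(0x4203, 0x800);

	printf("offset 0x%llx -> gpu_id 0x%x\n", (unsigned long long)off,
	       doorbell_gpu_id_from_pgoff(off >> PAGE_SHIFT));
	return 0;
}

Encoding the GPU ID into the offset is what lets kfd_mmap() route the mapping to the right device via kfd_device_by_id(KFD_MMAP_GPU_ID_GET(vm_pgoff)) in the hunk below, instead of assuming a single per-process doorbell slice.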
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1624-drm-amdkfd-Add-Vega10-support-for-KFD.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1624-drm-amdkfd-Add-Vega10-support-for-KFD.patch new file mode 100644 index 00000000..1b2fa380 --- /dev/null +++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1624-drm-amdkfd-Add-Vega10-support-for-KFD.patch @@ -0,0 +1,6577 @@ +From 6cd5da3b1655f692cc68c402546fba401095b059 Mon Sep 17 00:00:00 2001 +From: Felix Kuehling <Felix.Kuehling@amd.com> +Date: Tue, 14 Mar 2017 23:38:24 -0400 +Subject: [PATCH 1624/4131] drm/amdkfd: Add Vega10 support for KFD + +Change-Id: Id024a9fed3bf233142a9e747e4c77659cf7ab7c1 +Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com> + + Conflicts: + drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +--- + drivers/gpu/drm/amd/amdkfd/Makefile | 10 +- + .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 1392 ++++++++++++++++++++ + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 19 +- + drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 6 + + drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c | 10 +- + drivers/gpu/drm/amd/amdkfd/kfd_device.c | 52 +- + .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 196 ++- + .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 4 + + .../drm/amd/amdkfd/kfd_device_queue_manager_v9.c | 87 ++ + drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 68 +- + drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 81 +- + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 130 ++ + drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 14 +- + drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h | 7 +- + drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c | 125 ++ + drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c | 385 ++++++ + drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c | 368 ++++++ + drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 2 + + drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 509 +++++++ + drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 567 +------- + drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h | 469 +------ + drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h | 583 ++++++++ + drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h | 130 +- + drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 96 +- + .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 12 +- + drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 5 + + drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 1 + + drivers/gpu/drm/amd/amdkfd/soc15_int.h | 84 ++ + 28 files changed, 4285 insertions(+), 1127 deletions(-) + create mode 100644 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm + create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c + create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c + create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c + create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c + create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h + create mode 100644 drivers/gpu/drm/amd/amdkfd/soc15_int.h + +diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile +index 84646ed..fde693c 100644 +--- a/drivers/gpu/drm/amd/amdkfd/Makefile ++++ b/drivers/gpu/drm/amd/amdkfd/Makefile +@@ -11,11 +11,13 @@ amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \ + kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \ + kfd_process.o kfd_queue.o kfd_mqd_manager.o \ + kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \ ++ kfd_mqd_manager_v9.o \ + kfd_kernel_queue.o kfd_kernel_queue_cik.o \ +- kfd_kernel_queue_vi.o kfd_packet_manager.o \ +- kfd_process_queue_manager.o 
kfd_device_queue_manager.o \ +- kfd_device_queue_manager_cik.o kfd_device_queue_manager_vi.o \ +- kfd_interrupt.o kfd_events.o cik_event_interrupt.o \ ++ kfd_kernel_queue_vi.o kfd_kernel_queue_v9.o \ ++ kfd_packet_manager.o kfd_process_queue_manager.o \ ++ kfd_device_queue_manager.o kfd_device_queue_manager_cik.o \ ++ kfd_device_queue_manager_vi.o kfd_device_queue_manager_v9.o \ ++ kfd_interrupt.o kfd_events.o cik_event_interrupt.o kfd_int_process_v9.o \ + kfd_dbgdev.o kfd_dbgmgr.o kfd_flat_memory.o kfd_crat.o kfd_rdma.o \ + kfd_peerdirect.o kfd_ipc.o + +diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +new file mode 100644 +index 0000000..0106e77 +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +@@ -0,0 +1,1392 @@ ++/* ++ * Copyright 2016 Advanced Micro Devices, Inc. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#if 0 ++HW (GFX9) source code for CWSR trap handler ++#Version 18 + multiple trap handler ++ ++// this performance-optimal version was originally from Seven Xu at SRDC ++ ++// Revison #18 --... ++/* Rev History ++** #1. Branch from gc dv. //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) ++** #4. SR Memory Layout: ++** 1. VGPR-SGPR-HWREG-{LDS} ++** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern.. ++** #5. Update: 1. Accurate g8sr_ts_save_d timestamp ++** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation) ++** #7. Update: 1. don't barrier if noLDS ++** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version ++** 2. Fix SQ issue by s_sleep 2 ++** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last ++** 2. optimize s_buffer save by burst 16sgprs... ++** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs. ++** #11. Update 1. Add 2 more timestamp for debug version ++** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance ++** #13. Integ 1. Always use MUBUF for PV trap shader... ++** #14. Update 1. s_buffer_store soft clause... ++** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot. ++** #16. Update 1. 
PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree ++** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part] ++** 2. PERF - Save LDS before save VGPR to cover LDS save long latency... ++** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32 ++** 2. FUNC - Handle non-CWSR traps ++*/ ++ ++var G8SR_WDMEM_HWREG_OFFSET = 0 ++var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes ++ ++// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore. ++ ++var G8SR_DEBUG_TIMESTAMP = 0 ++var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset ++var s_g8sr_ts_save_s = s[34:35] // save start ++var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi ++var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ ++var s_g8sr_ts_save_d = s[40:41] // save end ++var s_g8sr_ts_restore_s = s[42:43] // restore start ++var s_g8sr_ts_restore_d = s[44:45] // restore end ++ ++var G8SR_VGPR_SR_IN_DWX4 = 0 ++var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes ++var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 ++ ++ ++/*************************************************************************/ ++/* control on how to run the shader */ ++/*************************************************************************/ ++//any hack that needs to be made to run this code in EMU (either becasue various EMU code are not ready or no compute save & restore in EMU run) ++var EMU_RUN_HACK = 0 ++var EMU_RUN_HACK_RESTORE_NORMAL = 0 ++var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 ++var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 ++var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK ++var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK ++var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK ++var SAVE_LDS = 1 ++var WG_BASE_ADDR_LO = 0x9000a000 ++var WG_BASE_ADDR_HI = 0x0 ++var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem ++var CTX_SAVE_CONTROL = 0x0 ++var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL ++var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either becasue various RTL code are not ready or no compute save & restore in RTL run) ++var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write ++var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //becasue TC EMU curently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes ++var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing ++ ++/**************************************************************************/ ++/* variables */ ++/**************************************************************************/ ++var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 ++var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 ++var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 ++ ++var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 ++var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 ++var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 ++var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 ++var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 ++var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits ++ ++var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 
++var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask ++var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 ++var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 ++var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 ++var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF ++var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 ++var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 ++var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 ++var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 ++var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 ++ ++var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME ++var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME ++var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME ++ ++var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 ++var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 ++ ++ ++/* Save */ ++var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes ++var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE ++ ++var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit ++var S_SAVE_SPI_INIT_ATC_SHIFT = 27 ++var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype ++var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 ++var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG ++var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 ++ ++var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used ++var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME ++var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME ++var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME ++ ++var s_save_spi_init_lo = exec_lo ++var s_save_spi_init_hi = exec_hi ++ ++ //tba_lo and tba_hi need to be saved/restored ++var tba_lo = ttmp12 ++var tba_hi = ttmp13 ++var tma_lo = ttmp14 ++var tma_hi = ttmp15 ++ ++var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3¡¯h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} ++var s_save_pc_hi = ttmp1 ++var s_save_exec_lo = ttmp2 ++var s_save_exec_hi = ttmp3 ++var s_save_status = ttmp4 ++var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine ++var s_save_xnack_mask_lo = ttmp6 ++var s_save_xnack_mask_hi = ttmp7 ++var s_save_buf_rsrc0 = ttmp8 ++var s_save_buf_rsrc1 = ttmp9 ++var s_save_buf_rsrc2 = ttmp10 ++var s_save_buf_rsrc3 = ttmp11 ++ ++var s_save_mem_offset = tma_lo ++var s_save_alloc_size = s_save_trapsts //conflict ++var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time) ++var s_save_m0 = tma_hi ++ ++/* Restore */ ++var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE ++var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC ++ ++var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit ++var S_RESTORE_SPI_INIT_ATC_SHIFT = 27 ++var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype ++var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28 ++var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG ++var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 ++ ++var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT ++var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK ++var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT ++var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK ++ ++var s_restore_spi_init_lo = exec_lo ++var s_restore_spi_init_hi = exec_hi ++ ++var s_restore_mem_offset = ttmp2 ++var s_restore_alloc_size = ttmp3 ++var s_restore_tmp = ttmp6 //tba_lo/hi need to be restored ++var 
s_restore_mem_offset_save = s_restore_tmp //no conflict ++ ++var s_restore_m0 = s_restore_alloc_size //no conflict ++ ++var s_restore_mode = ttmp7 ++ ++var s_restore_pc_lo = ttmp0 ++var s_restore_pc_hi = ttmp1 ++var s_restore_exec_lo = tma_lo //no conflict ++var s_restore_exec_hi = tma_hi //no conflict ++var s_restore_status = ttmp4 ++var s_restore_trapsts = ttmp5 ++var s_restore_xnack_mask_lo = xnack_mask_lo ++var s_restore_xnack_mask_hi = xnack_mask_hi ++var s_restore_buf_rsrc0 = ttmp8 ++var s_restore_buf_rsrc1 = ttmp9 ++var s_restore_buf_rsrc2 = ttmp10 ++var s_restore_buf_rsrc3 = ttmp11 ++ ++/**************************************************************************/ ++/* trap handler entry points */ ++/**************************************************************************/ ++/* Shader Main*/ ++ ++shader main ++ asic(GFX9) ++ type(CS) ++ ++ ++ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore ++ //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC ++ s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC ++ s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. ++ s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE ++ //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE ++ s_branch L_SKIP_RESTORE //NOT restore, SAVE actually ++ else ++ s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save ++ end ++ ++L_JUMP_TO_RESTORE: ++ s_branch L_RESTORE //restore ++ ++L_SKIP_RESTORE: ++ ++ s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC ++ s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save ++ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) ++ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save ++ s_cbranch_scc1 L_SAVE //this is the operation for save ++ ++ // ********* Handle non-CWSR traps ******************* ++if (!EMU_RUN_HACK) ++ /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */ ++ s_getreg_b32 tma_lo,hwreg(HW_REG_SQ_SHADER_TMA_LO) ++ s_getreg_b32 tma_hi,hwreg(HW_REG_SQ_SHADER_TMA_HI) ++ s_load_dwordx4 [tba_lo,tba_hi,tma_lo, tma_hi], [tma_lo,tma_hi], 0 ++ s_waitcnt lgkmcnt(0) ++ s_or_b32 ttmp11, tba_lo, tba_hi ++ s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set ++ s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) ++ s_setpc_b64 [tba_lo,tba_hi] //jump to next level trap handler ++ ++L_NO_NEXT_TRAP: ++ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) ++ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception ++ s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. 
++ s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 ++ s_addc_u32 ttmp1, ttmp1, 0 ++L_EXCP_CASE: ++ s_and_b32 ttmp1, ttmp1, 0xFFFF ++ s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) ++ s_rfe_b64 [ttmp0, ttmp1] ++end ++ // ********* End handling of non-CWSR traps ******************* ++ ++/**************************************************************************/ ++/* save routine */ ++/**************************************************************************/ ++ ++L_SAVE: ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_save_s ++ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? ++end ++ ++ //check whether there is mem_viol ++ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) ++ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK ++ s_cbranch_scc0 L_NO_PC_REWIND ++ ++ //if so, need rewind PC assuming GDS operation gets NACKed ++ s_mov_b32 s_save_tmp, 0 //clear mem_viol bit ++ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit ++ s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] ++ s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8 ++ s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc ++ ++L_NO_PC_REWIND: ++ s_mov_b32 s_save_tmp, 0 //clear saveCtx bit ++ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit ++ ++ s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK ++ s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //save XNACK must before any memory operation ++ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT ++ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT ++ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp ++ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY ++ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT ++ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp ++ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS ++ s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG ++ ++ s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp ++ ++ /* inform SPI the readiness and wait for SPI's go signal */ ++ s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI ++ s_mov_b32 s_save_exec_hi, exec_hi ++ s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_sq_save_msg ++ s_waitcnt lgkmcnt(0) ++end ++ ++ if (EMU_RUN_HACK) ++ ++ else ++ s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC ++ end ++ ++ L_SLEEP: ++ s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 ++ ++ if (EMU_RUN_HACK) ++ ++ else ++ s_cbranch_execz L_SLEEP ++ end ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_spi_wrexec ++ s_waitcnt lgkmcnt(0) ++end ++ ++ /* setup Resource Contants */ ++ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) ++ //calculate wd_addr using absolute thread id ++ v_readlane_b32 s_save_tmp, v9, 0 ++ s_lshr_b32 s_save_tmp, s_save_tmp, 6 ++ s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE ++ s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO ++ s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI ++ s_and_b32 
s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL ++ else ++ end ++ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) ++ s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO ++ s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI ++ s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL ++ else ++ end ++ ++ ++ s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo ++ s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi ++ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE ++ s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited ++ s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC ++ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK ++ s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position ++ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC ++ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK ++ s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position ++ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE ++ ++ //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?) ++ s_mov_b32 s_save_m0, m0 //save M0 ++ ++ /* global mem offset */ ++ s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 ++ ++ ++ ++ ++ /* save HW registers */ ++ ////////////////////////////// ++ ++ L_SAVE_HWREG: ++ // HWREG SR memory offset : size(VGPR)+size(SGPR) ++ get_vgpr_size_bytes(s_save_mem_offset) ++ get_sgpr_size_bytes(s_save_tmp) ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp ++ ++ ++ s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes ++ if (SWIZZLE_EN) ++ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
++ else ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ ++ write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0 ++ ++ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) ++ s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 ++ s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over ++ s_mov_b32 tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO ++ s_mov_b32 tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI ++ end ++ ++ write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC ++ write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) ++ write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC ++ write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) ++ write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS ++ ++ //s_save_trapsts conflicts with s_save_alloc_size ++ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) ++ write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS ++ ++ write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO ++ write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI ++ ++ //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 ++ s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE ++ write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) ++ write_hwreg_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset) //TBA_LO ++ write_hwreg_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset) //TBA_HI ++ ++ ++ ++ /* the first wave in the threadgroup */ ++ // save fist_wave bits in tba_hi unused bit.26 ++ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract fisrt wave bit ++ //s_or_b32 tba_hi, s_save_tmp, tba_hi // save first wave bit to tba_hi.bits[26] ++ s_mov_b32 s_save_exec_hi, 0x0 ++ s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26] ++ ++ ++ /* save SGPRs */ ++ // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... ++ ////////////////////////////// ++ ++ // SGPR SR memory offset : size(VGPR) ++ get_vgpr_size_bytes(s_save_mem_offset) ++ // TODO, change RSRC word to rearrange memory layout for SGPRS ++ ++ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size ++ s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 ++ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) ++ ++ if (SGPR_SAVE_USE_SQC) ++ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes ++ else ++ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) ++ end ++ ++ if (SWIZZLE_EN) ++ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
++ else ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ ++ // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 ++ //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0 ++ s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0 ++ s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset ++ ++ s_mov_b32 m0, 0x0 //SGPR initial index value =0 ++ s_nop 0x0 //Manually inserted wait states ++ L_SAVE_SGPR_LOOP: ++ // SGPR is allocated in 16 SGPR granularity ++ s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] ++ s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] ++ s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] ++ s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] ++ s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] ++ s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] ++ s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] ++ s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] ++ ++ write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4 ++ s_add_u32 m0, m0, 16 //next sgpr index ++ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 ++ s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete? ++ // restore s_save_buf_rsrc0,1 ++ //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo ++ s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo ++ ++ ++ ++ ++ /* save first 4 VGPR, then LDS save could use */ ++ // each wave will alloc 4 vgprs at least... ++ ///////////////////////////////////////////////////////////////////////////////////// ++ ++ s_mov_b32 s_save_mem_offset, 0 ++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on ++ s_mov_b32 exec_hi, 0xFFFFFFFF ++ ++ if (SWIZZLE_EN) ++ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ ++ // VGPR Allocated in 4-GPR granularity ++ ++if G8SR_VGPR_SR_IN_DWX4 ++ // the const stride for DWx4 is 4*4 bytes ++ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes ++ ++ buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 ++ ++ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes ++else ++ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 ++ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 ++ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 ++ buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 ++end ++ ++ ++ ++ /* save LDS */ ++ ////////////////////////////// ++ ++ L_SAVE_LDS: ++ ++ // Change EXEC to all threads... ++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on ++ s_mov_b32 exec_hi, 0xFFFFFFFF ++ ++ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size ++ s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? ++ s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE ++ ++ s_barrier //LDS is used? 
wait for other waves in the same TG ++ //s_and_b32 s_save_tmp, tba_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here ++ s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here ++ s_cbranch_scc0 L_SAVE_LDS_DONE ++ ++ // first wave do LDS save; ++ ++ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw ++ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes ++ s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes ++ ++ // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) ++ // ++ get_vgpr_size_bytes(s_save_mem_offset) ++ get_sgpr_size_bytes(s_save_tmp) ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() ++ ++ ++ if (SWIZZLE_EN) ++ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ s_mov_b32 m0, 0x0 //lds_offset initial value = 0 ++ ++ ++var LDS_DMA_ENABLE = 0 ++var UNROLL = 0 ++if UNROLL==0 && LDS_DMA_ENABLE==1 ++ s_mov_b32 s3, 256*2 ++ s_nop 0 ++ s_nop 0 ++ s_nop 0 ++ L_SAVE_LDS_LOOP: ++ //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.??? ++ if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity ++ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW ++ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW ++ end ++ ++ s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes ++ s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 ++ s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete? ++ ++elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL , has ichace miss ++ // store from higest LDS address to lowest ++ s_mov_b32 s3, 256*2 ++ s_sub_u32 m0, s_save_alloc_size, s3 ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, m0 ++ s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128 trunks... ++ s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from higheset addr to lowest ++ s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction ++ s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2is the below 2 inst...//s_addc and s_setpc ++ s_nop 0 ++ s_nop 0 ++ s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes ++ s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved ++ s_add_u32 s0, s0,s_save_alloc_size ++ s_addc_u32 s1, s1, 0 ++ s_setpc_b64 s[0:1] ++ ++ ++ for var i =0; i< 128; i++ ++ // be careful to make here a 64Byte aligned address, which could improve performance... ++ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW ++ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW ++ ++ if i!=127 ++ s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e. 
pack more LDS_DMA inst to one Cacheline ++ s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3 ++ end ++ end ++ ++else // BUFFER_STORE ++ v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 ++ v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid ++ v_mul_i32_i24 v2, v3, 8 // tid*8 ++ v_mov_b32 v3, 256*2 ++ s_mov_b32 m0, 0x10000 ++ s_mov_b32 s0, s_save_buf_rsrc3 ++ s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid ++ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT ++ ++L_SAVE_LDS_LOOP_VECTOR: ++ ds_read_b64 v[0:1], v2 //x =LDS[a], byte address ++ s_waitcnt lgkmcnt(0) ++ buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1 ++// s_waitcnt vmcnt(0) ++// v_add_u32 v2, vcc[0:1], v2, v3 ++ v_add_u32 v2, v2, v3 ++ v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size ++ s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR ++ ++ // restore rsrc3 ++ s_mov_b32 s_save_buf_rsrc3, s0 ++ ++end ++ ++L_SAVE_LDS_DONE: ++ ++ ++ /* save VGPRs - set the Rest VGPRs */ ++ ////////////////////////////////////////////////////////////////////////////////////// ++ L_SAVE_VGPR: ++ // VGPR SR memory offset: 0 ++ // TODO rearrange the RSRC words to use swizzle for VGPR save... ++ ++ s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs ++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on ++ s_mov_b32 exec_hi, 0xFFFFFFFF ++ ++ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size ++ s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 ++ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible ++ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) ++ if (SWIZZLE_EN) ++ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ ++ // VGPR Allocated in 4-GPR granularity ++ ++if G8SR_VGPR_SR_IN_DWX4 ++ // the const stride for DWx4 is 4*4 bytes ++ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes ++ ++ s_mov_b32 m0, 4 // skip first 4 VGPRs ++ s_cmp_lt_u32 m0, s_save_alloc_size ++ s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs ++ ++ s_set_gpr_idx_on m0, 0x1 // This will change M0 ++ s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 ++L_SAVE_VGPR_LOOP: ++ v_mov_b32 v0, v0 // v0 = v[0+m0] ++ v_mov_b32 v1, v1 ++ v_mov_b32 v2, v2 ++ v_mov_b32 v3, v3 ++ ++ ++ buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 ++ s_add_u32 m0, m0, 4 ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 ++ s_cmp_lt_u32 m0, s_save_alloc_size ++ s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? 
++ s_set_gpr_idx_off ++L_SAVE_VGPR_LOOP_END: ++ ++ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes ++else ++ // VGPR store using dw burst ++ s_mov_b32 m0, 0x4 //VGPR initial index value =0 ++ s_cmp_lt_u32 m0, s_save_alloc_size ++ s_cbranch_scc0 L_SAVE_VGPR_END ++ ++ ++ s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 ++ s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later ++ ++ L_SAVE_VGPR_LOOP: ++ v_mov_b32 v0, v0 //v0 = v[0+m0] ++ v_mov_b32 v1, v1 //v0 = v[0+m0] ++ v_mov_b32 v2, v2 //v0 = v[0+m0] ++ v_mov_b32 v3, v3 //v0 = v[0+m0] ++ ++ if(USE_MTBUF_INSTEAD_OF_MUBUF) ++ tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 ++ else ++ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 ++ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 ++ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 ++ buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 ++ end ++ ++ s_add_u32 m0, m0, 4 //next vgpr index ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes ++ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 ++ s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? ++ s_set_gpr_idx_off ++end ++ ++L_SAVE_VGPR_END: ++ ++ ++ ++ ++ ++ ++ /* S_PGM_END_SAVED */ //FIXME graphics ONLY ++ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) ++ s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] ++ s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 ++ s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over ++ s_rfe_b64 s_save_pc_lo //Return to the main shader program ++ else ++ end ++ ++// Save Done timestamp ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_save_d ++ // SGPR SR memory offset : size(VGPR) ++ get_vgpr_size_bytes(s_save_mem_offset) ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET ++ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? ++ // Need reset rsrc2?? ++ s_mov_b32 m0, s_save_mem_offset ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1 ++end ++ ++ ++ s_branch L_END_PGM ++ ++ ++ ++/**************************************************************************/ ++/* restore routine */ ++/**************************************************************************/ ++ ++L_RESTORE: ++ /* Setup Resource Contants */ ++ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) ++ //calculate wd_addr using absolute thread id ++ v_readlane_b32 s_restore_tmp, v9, 0 ++ s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 ++ s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE ++ s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO ++ s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI ++ s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL ++ else ++ end ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_restore_s ++ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? ++ // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case... 
++ s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0] ++ s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored.. ++end ++ ++ ++ ++ s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo ++ s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi ++ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE ++ s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) ++ s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC ++ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK ++ s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position ++ s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC ++ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK ++ s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position ++ s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE ++ ++ /* global mem offset */ ++// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0 ++ ++ /* the first wave in the threadgroup */ ++ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK ++ s_cbranch_scc0 L_RESTORE_VGPR ++ ++ /* restore LDS */ ++ ////////////////////////////// ++ L_RESTORE_LDS: ++ ++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead ++ s_mov_b32 exec_hi, 0xFFFFFFFF ++ ++ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size ++ s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? ++ s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR ++ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw ++ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes ++ s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes ++ ++ // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) ++ // ++ get_vgpr_size_bytes(s_restore_mem_offset) ++ get_sgpr_size_bytes(s_restore_tmp) ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? ++ ++ ++ if (SWIZZLE_EN) ++ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ s_mov_b32 m0, 0x0 //lds_offset initial value = 0 ++ ++ L_RESTORE_LDS_LOOP: ++ if (SAVE_LDS) ++ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW ++ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW ++ end ++ s_add_u32 m0, m0, 256*2 // 128 DW ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW ++ s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 ++ s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? 
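At this point the restore path has consumed the LDS portion of the context image and moves on to VGPRs. Both the save and restore routines recompute their buffer offsets from the same fixed layout: VGPRs at offset 0, then SGPRs, then a 128-byte HWREG block, then (for the first wave of a threadgroup only) LDS. Below is a small C sketch of that offset arithmetic, using the shift amounts from the get_vgpr_size_bytes/get_sgpr_size_bytes/get_hwreg_size_bytes helpers defined near the end of this shader; the HW_REG_GPR_ALLOC/HW_REG_LDS_ALLOC field values plugged in are made up for illustration.

#include <stdint.h>
#include <stdio.h>

/* Granularities encoded in the trap handler's helper functions:
 *   VGPR bytes  = (vgpr_size + 1) * 4 regs * 64 lanes * 4 bytes  -> << (2+8)
 *   SGPR bytes  = (sgpr_size + 1) * 16 regs * 4 bytes            -> << 6
 *   HWREG block = fixed 128 bytes
 *   LDS bytes   = lds_size * 64 DW * 4 bytes                     -> << 8
 */
static uint32_t vgpr_size_bytes(uint32_t vgpr_size)
{
	return (vgpr_size + 1) << (2 + 8);
}

static uint32_t sgpr_size_bytes(uint32_t sgpr_size)
{
	return (sgpr_size + 1) << 6;
}

int main(void)
{
	/* Illustrative register-field values, not real hardware state */
	uint32_t vgpr_size = 3, sgpr_size = 6, lds_size = 16;

	uint32_t sgpr_off  = vgpr_size_bytes(vgpr_size);	/* size(VGPR) */
	uint32_t hwreg_off = sgpr_off + sgpr_size_bytes(sgpr_size);
	uint32_t lds_off   = hwreg_off + 128;	/* get_hwreg_size_bytes() */
	uint32_t lds_bytes = lds_size << 8;

	printf("VGPR @ 0x0, SGPR @ 0x%x, HWREG @ 0x%x, LDS @ 0x%x (%u bytes)\n",
	       sgpr_off, hwreg_off, lds_off, lds_bytes);
	return 0;
}

This mirrors the offset comments in the shader itself ("HWREG SR memory offset : size(VGPR)+size(SGPR)" and "LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)"), which is why the VGPR restore that follows can simply start from memory offset 0.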
++ ++ ++ /* restore VGPRs */ ++ ////////////////////////////// ++ L_RESTORE_VGPR: ++ // VGPR SR memory offset : 0 ++ s_mov_b32 s_restore_mem_offset, 0x0 ++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead ++ s_mov_b32 exec_hi, 0xFFFFFFFF ++ ++ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size ++ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 ++ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) ++ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) ++ if (SWIZZLE_EN) ++ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++if G8SR_VGPR_SR_IN_DWX4 ++ get_vgpr_size_bytes(s_restore_mem_offset) ++ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 ++ ++ // the const stride for DWx4 is 4*4 bytes ++ s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes ++ ++ s_mov_b32 m0, s_restore_alloc_size ++ s_set_gpr_idx_on m0, 0x8 // Note.. This will change m0 ++ ++L_RESTORE_VGPR_LOOP: ++ buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 ++ s_waitcnt vmcnt(0) ++ s_sub_u32 m0, m0, 4 ++ v_mov_b32 v0, v0 // v[0+m0] = v0 ++ v_mov_b32 v1, v1 ++ v_mov_b32 v2, v2 ++ v_mov_b32 v3, v3 ++ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 ++ s_cmp_eq_u32 m0, 0x8000 ++ s_cbranch_scc0 L_RESTORE_VGPR_LOOP ++ s_set_gpr_idx_off ++ ++ s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes ++ ++else ++ // VGPR load using dw burst ++ s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 ++ s_mov_b32 m0, 4 //VGPR initial index value = 1 ++ s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 ++ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later ++ ++ L_RESTORE_VGPR_LOOP: ++ if(USE_MTBUF_INSTEAD_OF_MUBUF) ++ tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 ++ else ++ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 ++ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 ++ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 ++ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 ++ end ++ s_waitcnt vmcnt(0) //ensure data ready ++ v_mov_b32 v0, v0 //v[0+m0] = v0 ++ v_mov_b32 v1, v1 ++ v_mov_b32 v2, v2 ++ v_mov_b32 v3, v3 ++ s_add_u32 m0, m0, 4 //next vgpr index ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes ++ s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 ++ s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? 
++ s_set_gpr_idx_off ++ /* VGPR restore on v0 */ ++ if(USE_MTBUF_INSTEAD_OF_MUBUF) ++ tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 ++ else ++ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 ++ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 ++ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 ++ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 ++ end ++ ++end ++ ++ /* restore SGPRs */ ++ ////////////////////////////// ++ ++ // SGPR SR memory offset : size(VGPR) ++ get_vgpr_size_bytes(s_restore_mem_offset) ++ get_sgpr_size_bytes(s_restore_tmp) ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp ++ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group ++ // TODO, change RSRC word to rearrange memory layout for SGPRS ++ ++ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size ++ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 ++ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) ++ ++ if (SGPR_SAVE_USE_SQC) ++ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes ++ else ++ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) ++ end ++ if (SWIZZLE_EN) ++ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ /* If 112 SGPRs ar allocated, 4 sgprs are not used TBA(108,109),TMA(110,111), ++ However, we are safe to restore these 4 SGPRs anyway, since TBA,TMA will later be restored by HWREG ++ */ ++ s_mov_b32 m0, s_restore_alloc_size ++ ++ L_RESTORE_SGPR_LOOP: ++ read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made ++ s_waitcnt lgkmcnt(0) //ensure data ready ++ ++ s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] ++ ++ s_movreld_b64 s0, s0 //s[0+m0] = s0 ++ s_movreld_b64 s2, s2 ++ s_movreld_b64 s4, s4 ++ s_movreld_b64 s6, s6 ++ s_movreld_b64 s8, s8 ++ s_movreld_b64 s10, s10 ++ s_movreld_b64 s12, s12 ++ s_movreld_b64 s14, s14 ++ ++ s_cmp_eq_u32 m0, 0 //scc = (m0 < s_restore_alloc_size) ? 1 : 0 ++ s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete? ++ ++ /* restore HW registers */ ++ ////////////////////////////// ++ L_RESTORE_HWREG: ++ ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo ++ s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi ++end ++ ++ // HWREG SR memory offset : size(VGPR)+size(SGPR) ++ get_vgpr_size_bytes(s_restore_mem_offset) ++ get_sgpr_size_bytes(s_restore_tmp) ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp ++ ++ ++ s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes ++ if (SWIZZLE_EN) ++ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
++ else ++ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 ++ read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC ++ read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) ++ read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC ++ read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) ++ read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS ++ read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS ++ read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO ++ read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI ++ read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE ++ read_hwreg_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_LO ++ read_hwreg_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_HI ++ ++ s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS ++ ++ s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS ++ ++ //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: ++ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) ++ s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) ++ s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over ++ end ++ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) ++ s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal ++ s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over ++ end ++ ++ s_mov_b32 m0, s_restore_m0 ++ s_mov_b32 exec_lo, s_restore_exec_lo ++ s_mov_b32 exec_hi, s_restore_exec_hi ++ ++ s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts ++ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 ++ s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts ++ s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT ++ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 ++ //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore ++ s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode ++ //reuse s_restore_m0 as a temp register ++ s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK ++ s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT ++ s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT ++ s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero ++ s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 ++ s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK ++ s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT ++ s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT ++ s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 ++ s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK ++ s_lshr_b32 s_restore_m0, s_restore_m0, 
SQ_WAVE_STATUS_INST_ATC_SHIFT ++ s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp ++ ++ s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 ++ s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 ++ s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu ++ ++ s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_restore_d ++ s_waitcnt lgkmcnt(0) ++end ++ ++// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution ++ s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc ++ ++ ++/**************************************************************************/ ++/* the END */ ++/**************************************************************************/ ++L_END_PGM: ++ s_endpgm ++ ++end ++ ++ ++/**************************************************************************/ ++/* the helper functions */ ++/**************************************************************************/ ++ ++//Only for save hwreg to mem ++function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) ++ s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on ++ s_mov_b32 m0, s_mem_offset ++ s_buffer_store_dword s, s_rsrc, m0 glc:1 ++ s_add_u32 s_mem_offset, s_mem_offset, 4 ++ s_mov_b32 m0, exec_lo ++end ++ ++ ++// HWREG are saved before SGPRs, so all HWREG could be use. ++function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) ++ ++ s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 ++ s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 ++ s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 ++ s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 ++ s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 ++ s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc ++end ++ ++ ++function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) ++ s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 ++ s_add_u32 s_mem_offset, s_mem_offset, 4 ++end ++ ++function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) ++ s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 ++ s_sub_u32 s_mem_offset, s_mem_offset, 4*16 ++end ++ ++ ++ ++function get_lds_size_bytes(s_lds_size_byte) ++ // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW ++ s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size ++ s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW ++end ++ ++function get_vgpr_size_bytes(s_vgpr_size_byte) ++ s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size ++ s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 ++ s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible ++end ++ ++function get_sgpr_size_bytes(s_sgpr_size_byte) ++ s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size ++ s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1 ++ s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value) ++end ++ ++function get_hwreg_size_bytes ++ return 128 //HWREG size 128 bytes ++end ++ ++ ++#endif ++ ++static const 
uint32_t cwsr_trap_gfx9_hex[] = { ++ 0xbf820001, 0xbf820125, ++ 0xb8f0f802, 0x89708670, ++ 0xb8f1f803, 0x8671ff71, ++ 0x00000400, 0xbf850013, ++ 0xb8faf812, 0xb8fbf813, ++ 0xc00a1e3d, 0x00000000, ++ 0xbf8cc07f, 0x87777978, ++ 0xbf840002, 0xb970f802, ++ 0xbe801d78, 0xb8f1f803, ++ 0x8671ff71, 0x000001ff, ++ 0xbf850002, 0x806c846c, ++ 0x826d806d, 0x866dff6d, ++ 0x0000ffff, 0xb970f802, ++ 0xbe801f6c, 0xb8f1f803, ++ 0x8671ff71, 0x00000100, ++ 0xbf840006, 0xbef60080, ++ 0xb9760203, 0x866dff6d, ++ 0x0000ffff, 0x80ec886c, ++ 0x82ed806d, 0xbef60080, ++ 0xb9760283, 0xbef20068, ++ 0xbef30069, 0xb8f62407, ++ 0x8e769c76, 0x876d766d, ++ 0xb8f603c7, 0x8e769b76, ++ 0x876d766d, 0xb8f6f807, ++ 0x8676ff76, 0x00007fff, ++ 0xb976f807, 0xbeee007e, ++ 0xbeef007f, 0xbefe0180, ++ 0xbf900004, 0xbf8e0002, ++ 0xbf88fffe, 0xbef4007e, ++ 0x8675ff7f, 0x0000ffff, ++ 0x8775ff75, 0x00040000, ++ 0xbef60080, 0xbef700ff, ++ 0x00807fac, 0x8676ff7f, ++ 0x08000000, 0x8f768376, ++ 0x87777677, 0x8676ff7f, ++ 0x70000000, 0x8f768176, ++ 0x87777677, 0xbefb007c, ++ 0xbefa0080, 0xb8fa2a05, ++ 0x807a817a, 0x8e7a8a7a, ++ 0xb8f61605, 0x80768176, ++ 0x8e768676, 0x807a767a, ++ 0xbef60084, 0xbef600ff, ++ 0x01000000, 0xbefe007c, ++ 0xbefc007a, 0xc0611efa, ++ 0x0000007c, 0x807a847a, ++ 0xbefc007e, 0xbefe007c, ++ 0xbefc007a, 0xc0611b3a, ++ 0x0000007c, 0x807a847a, ++ 0xbefc007e, 0xbefe007c, ++ 0xbefc007a, 0xc0611b7a, ++ 0x0000007c, 0x807a847a, ++ 0xbefc007e, 0xbefe007c, ++ 0xbefc007a, 0xc0611bba, ++ 0x0000007c, 0x807a847a, ++ 0xbefc007e, 0xbefe007c, ++ 0xbefc007a, 0xc0611bfa, ++ 0x0000007c, 0x807a847a, ++ 0xbefc007e, 0xbefe007c, ++ 0xbefc007a, 0xc0611c3a, ++ 0x0000007c, 0x807a847a, ++ 0xbefc007e, 0xb8f1f803, ++ 0xbefe007c, 0xbefc007a, ++ 0xc0611c7a, 0x0000007c, ++ 0x807a847a, 0xbefc007e, ++ 0xbefe007c, 0xbefc007a, ++ 0xc0611cba, 0x0000007c, ++ 0x807a847a, 0xbefc007e, ++ 0xbefe007c, 0xbefc007a, ++ 0xc0611cfa, 0x0000007c, ++ 0x807a847a, 0xbefc007e, ++ 0xb8fbf801, 0xbefe007c, ++ 0xbefc007a, 0xc0611efa, ++ 0x0000007c, 0x807a847a, ++ 0xbefc007e, 0xbefe007c, ++ 0xbefc007a, 0xc0611e3a, ++ 0x0000007c, 0x807a847a, ++ 0xbefc007e, 0xbefe007c, ++ 0xbefc007a, 0xc0611e7a, ++ 0x0000007c, 0x807a847a, ++ 0xbefc007e, 0x8676ff7f, ++ 0x04000000, 0xbeef0080, ++ 0x876f6f76, 0xb8fa2a05, ++ 0x807a817a, 0x8e7a8a7a, ++ 0xb8f11605, 0x80718171, ++ 0x8e718471, 0x8e768271, ++ 0xbef600ff, 0x01000000, ++ 0xbef20174, 0x80747a74, ++ 0xbefc0080, 0xbf800000, ++ 0xbe802b00, 0xbe822b02, ++ 0xbe842b04, 0xbe862b06, ++ 0xbe882b08, 0xbe8a2b0a, ++ 0xbe8c2b0c, 0xbe8e2b0e, ++ 0xc06b003a, 0x00000000, ++ 0xc06b013a, 0x00000010, ++ 0xc06b023a, 0x00000020, ++ 0xc06b033a, 0x00000030, ++ 0x8074c074, 0x82758075, ++ 0x807c907c, 0xbf0a717c, ++ 0xbf85ffeb, 0xbef40172, ++ 0xbefa0080, 0xbefe00c1, ++ 0xbeff00c1, 0xbef600ff, ++ 0x01000000, 0xe0724000, ++ 0x7a1d0000, 0xe0724100, ++ 0x7a1d0100, 0xe0724200, ++ 0x7a1d0200, 0xe0724300, ++ 0x7a1d0300, 0xbefe00c1, ++ 0xbeff00c1, 0xb8f14306, ++ 0x8671c171, 0xbf84002c, ++ 0xbf8a0000, 0x8676ff6f, ++ 0x04000000, 0xbf840028, ++ 0x8e718671, 0x8e718271, ++ 0xbef60071, 0xb8fa2a05, ++ 0x807a817a, 0x8e7a8a7a, ++ 0xb8f61605, 0x80768176, ++ 0x8e768676, 0x807a767a, ++ 0x807aff7a, 0x00000080, ++ 0xbef600ff, 0x01000000, ++ 0xbefc0080, 0xd28c0002, ++ 0x000100c1, 0xd28d0003, ++ 0x000204c1, 0xd1060002, ++ 0x00011103, 0x7e0602ff, ++ 0x00000200, 0xbefc00ff, ++ 0x00010000, 0xbe800077, ++ 0x8677ff77, 0xff7fffff, ++ 0x8777ff77, 0x00058000, ++ 0xd8ec0000, 0x00000002, ++ 0xbf8cc07f, 0xe0765000, ++ 0x7a1d0002, 0x68040702, ++ 0xd0c9006a, 0x0000e302, ++ 0xbf87fff7, 0xbef70000, ++ 
0xbefa00ff, 0x00000400, ++ 0xbefe00c1, 0xbeff00c1, ++ 0xb8f12a05, 0x80718171, ++ 0x8e718271, 0x8e768871, ++ 0xbef600ff, 0x01000000, ++ 0xbefc0084, 0xbf0a717c, ++ 0xbf840015, 0xbf11017c, ++ 0x8071ff71, 0x00001000, ++ 0x7e000300, 0x7e020301, ++ 0x7e040302, 0x7e060303, ++ 0xe0724000, 0x7a1d0000, ++ 0xe0724100, 0x7a1d0100, ++ 0xe0724200, 0x7a1d0200, ++ 0xe0724300, 0x7a1d0300, ++ 0x807c847c, 0x807aff7a, ++ 0x00000400, 0xbf0a717c, ++ 0xbf85ffef, 0xbf9c0000, ++ 0xbf8200ca, 0xbef4007e, ++ 0x8675ff7f, 0x0000ffff, ++ 0x8775ff75, 0x00040000, ++ 0xbef60080, 0xbef700ff, ++ 0x00807fac, 0x8672ff7f, ++ 0x08000000, 0x8f728372, ++ 0x87777277, 0x8672ff7f, ++ 0x70000000, 0x8f728172, ++ 0x87777277, 0x8672ff7f, ++ 0x04000000, 0xbf84001e, ++ 0xbefe00c1, 0xbeff00c1, ++ 0xb8ef4306, 0x866fc16f, ++ 0xbf840019, 0x8e6f866f, ++ 0x8e6f826f, 0xbef6006f, ++ 0xb8ee2a05, 0x806e816e, ++ 0x8e6e8a6e, 0xb8f21605, ++ 0x80728172, 0x8e728672, ++ 0x806e726e, 0x806eff6e, ++ 0x00000080, 0xbef600ff, ++ 0x01000000, 0xbefc0080, ++ 0xe0510000, 0x6e1d0000, ++ 0xe0510100, 0x6e1d0000, ++ 0x807cff7c, 0x00000200, ++ 0x806eff6e, 0x00000200, ++ 0xbf0a6f7c, 0xbf85fff6, ++ 0xbeee0080, 0xbefe00c1, ++ 0xbeff00c1, 0xb8ef2a05, ++ 0x806f816f, 0x8e6f826f, ++ 0x8e76886f, 0xbef600ff, ++ 0x01000000, 0xbef2006e, ++ 0x806eff6e, 0x00000400, ++ 0xbefc0084, 0xbf11087c, ++ 0x806fff6f, 0x00008000, ++ 0xe0524000, 0x6e1d0000, ++ 0xe0524100, 0x6e1d0100, ++ 0xe0524200, 0x6e1d0200, ++ 0xe0524300, 0x6e1d0300, ++ 0xbf8c0f70, 0x7e000300, ++ 0x7e020301, 0x7e040302, ++ 0x7e060303, 0x807c847c, ++ 0x806eff6e, 0x00000400, ++ 0xbf0a6f7c, 0xbf85ffee, ++ 0xbf9c0000, 0xe0524000, ++ 0x721d0000, 0xe0524100, ++ 0x721d0100, 0xe0524200, ++ 0x721d0200, 0xe0524300, ++ 0x721d0300, 0xb8ee2a05, ++ 0x806e816e, 0x8e6e8a6e, ++ 0xb8f21605, 0x80728172, ++ 0x8e728672, 0x806e726e, ++ 0x80eec06e, 0xb8ef1605, ++ 0x806f816f, 0x8e6f846f, ++ 0x8e76826f, 0xbef600ff, ++ 0x01000000, 0xbefc006f, ++ 0xc031003a, 0x0000006e, ++ 0x80eec06e, 0xbf8cc07f, ++ 0x80fc907c, 0xbe802d00, ++ 0xbe822d02, 0xbe842d04, ++ 0xbe862d06, 0xbe882d08, ++ 0xbe8a2d0a, 0xbe8c2d0c, ++ 0xbe8e2d0e, 0xbf06807c, ++ 0xbf84fff1, 0xb8ee2a05, ++ 0x806e816e, 0x8e6e8a6e, ++ 0xb8f21605, 0x80728172, ++ 0x8e728672, 0x806e726e, ++ 0xbef60084, 0xbef600ff, ++ 0x01000000, 0xc0211bfa, ++ 0x0000006e, 0x806e846e, ++ 0xc0211b3a, 0x0000006e, ++ 0x806e846e, 0xc0211b7a, ++ 0x0000006e, 0x806e846e, ++ 0xc0211eba, 0x0000006e, ++ 0x806e846e, 0xc0211efa, ++ 0x0000006e, 0x806e846e, ++ 0xc0211c3a, 0x0000006e, ++ 0x806e846e, 0xc0211c7a, ++ 0x0000006e, 0x806e846e, ++ 0xc0211a3a, 0x0000006e, ++ 0x806e846e, 0xc0211a7a, ++ 0x0000006e, 0x806e846e, ++ 0xc0211cfa, 0x0000006e, ++ 0x806e846e, 0xc0211e3a, ++ 0x0000006e, 0x806e846e, ++ 0xc0211e7a, 0x0000006e, ++ 0x806e846e, 0xbf8cc07f, ++ 0x866dff6d, 0x0000ffff, ++ 0xbefc006f, 0xbefe007a, ++ 0xbeff007b, 0x866f71ff, ++ 0x000003ff, 0xb96f4803, ++ 0x866f71ff, 0xfffff800, ++ 0x8f6f8b6f, 0xb96fa2c3, ++ 0xb973f801, 0x866fff6d, ++ 0xf0000000, 0x8f6f9c6f, ++ 0x8e6f906f, 0xbef20080, ++ 0x87726f72, 0x866fff6d, ++ 0x08000000, 0x8f6f9b6f, ++ 0x8e6f8f6f, 0x87726f72, ++ 0x866fff70, 0x00800000, ++ 0x8f6f976f, 0xb972f807, ++ 0x86fe7e7e, 0x86ea6a6a, ++ 0xb970f802, 0xbf8a0000, ++ 0x95806f6c, 0xbf810000, ++ 0x00000000, 0x00000000, ++}; ++ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +index b13dcc3..b8e436c 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +@@ -313,8 +313,16 @@ static int kfd_ioctl_create_queue(struct file *filep, struct 
kfd_process *p, + + + /* Return gpu_id as doorbell offset for mmap usage */ +- args->doorbell_offset = (KFD_MMAP_TYPE_DOORBELL | args->gpu_id); ++ args->doorbell_offset = KFD_MMAP_TYPE_DOORBELL; ++ args->doorbell_offset |= KFD_MMAP_GPU_ID(args->gpu_id); + args->doorbell_offset <<= PAGE_SHIFT; ++ if (KFD_IS_SOC15(dev->device_info->asic_family)) ++ /* On SOC15 ASICs, doorbell allocation must be ++ * per-device, and independent from the per-process ++ * queue_id. Return the doorbell offset within the ++ * doorbell aperture to user mode. ++ */ ++ args->doorbell_offset |= q_properties.doorbell_off; + + up_write(&p->lock); + +@@ -1279,6 +1287,8 @@ static uint32_t kfd_convert_user_mem_alloction_flags( + out: + if (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_AQL_QUEUE_MEM) + kernel_allocation_flags |= ALLOC_MEM_FLAGS_AQL_QUEUE_MEM; ++ if (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT) ++ kernel_allocation_flags |= ALLOC_MEM_FLAGS_COHERENT; + /* Current HW doesn't support non paged memory */ + kernel_allocation_flags |= ALLOC_MEM_FLAGS_NONPAGED; + /* +@@ -1320,7 +1330,7 @@ static int kfd_ioctl_alloc_memory_of_gpu_new(struct file *filep, + return PTR_ERR(pdd); + + if (args->flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) { +- if (args->size != kfd_doorbell_process_slice()) ++ if (args->size != kfd_doorbell_process_slice(dev)) + return -EINVAL; + offset = kfd_get_process_doorbells(dev, p); + } else +@@ -2361,7 +2371,10 @@ static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) + + switch (vm_pgoff & KFD_MMAP_TYPE_MASK) { + case KFD_MMAP_TYPE_DOORBELL: +- return kfd_doorbell_mmap(process, vma); ++ kfd = kfd_device_by_id(KFD_MMAP_GPU_ID_GET(vm_pgoff)); ++ if (!kfd) ++ return -EFAULT; ++ return kfd_doorbell_mmap(kfd, process, vma); + + case KFD_MMAP_TYPE_EVENTS: + return kfd_event_mmap(process, vma); +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +index 95ff6ec..9520298 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +@@ -110,6 +110,8 @@ static struct kfd_gpu_cache_info carrizo_cache_info[] = { + #define fiji_cache_info carrizo_cache_info + #define polaris10_cache_info carrizo_cache_info + #define polaris11_cache_info carrizo_cache_info ++/* TODO - check & update Vega10 cache details */ ++#define vega10_cache_info carrizo_cache_info + + static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, + struct crat_subtype_computeunit *cu) +@@ -591,6 +593,10 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, + pcache_info = polaris11_cache_info; + num_of_cache_types = ARRAY_SIZE(polaris11_cache_info); + break; ++ case CHIP_VEGA10: ++ pcache_info = vega10_cache_info; ++ num_of_cache_types = ARRAY_SIZE(vega10_cache_info); ++ break; + default: + return -EINVAL; + } +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c +index 9de73ce..5fea0d3 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c +@@ -29,7 +29,7 @@ + #include <linux/mutex.h> + #include <linux/device.h> + +-#include "kfd_pm4_headers.h" ++#include "kfd_pm4_headers_vi.h" + #include "kfd_pm4_headers_diq.h" + #include "kfd_kernel_queue.h" + #include "kfd_priv.h" +@@ -52,7 +52,7 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, + { + int status = 0; + unsigned int *ib_packet_buff = NULL; +- struct pm4__release_mem *rm_packet; ++ struct pm4_mec_release_mem *rm_packet; + struct pm4__indirect_buffer_pasid *ib_packet; + struct 
kernel_queue *kq = dbgdev->kq; + size_t pq_packets_size_in_bytes = +@@ -78,7 +78,7 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, + */ + if (sync) + pq_packets_size_in_bytes += +- sizeof(struct pm4__release_mem); ++ sizeof(struct pm4_mec_release_mem); + status = kq->ops.acquire_packet_buffer(kq, pq_packets_size_in_bytes / sizeof(uint32_t), &ib_packet_buff); + if (status != 0) { + pr_debug("Error! kfd: In func %s >> acquire_packet_buffer failed\n", __func__); +@@ -116,7 +116,7 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, + * (a) Sync with HW + * (b) Sync var is written by CP to mem. + */ +- rm_packet = (struct pm4__release_mem *) (ib_packet_buff + ++ rm_packet = (struct pm4_mec_release_mem *) (ib_packet_buff + + (sizeof(struct pm4__indirect_buffer_pasid) / sizeof(unsigned int))); + + status = kfd_gtt_sa_allocate(dbgdev->dev, sizeof(uint64_t), +@@ -130,7 +130,7 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, + + rm_packet->header.opcode = IT_RELEASE_MEM; + rm_packet->header.type = PM4_TYPE_3; +- rm_packet->header.count = sizeof(struct pm4__release_mem) / sizeof(unsigned int) - 2; ++ rm_packet->header.count = sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int) - 2; + + rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; + rm_packet->bitfields2.event_index = event_index___release_mem__end_of_pipe; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c +index 6874eb5..4eda7c5 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c +@@ -29,8 +29,9 @@ + #include <linux/fence.h> + #include "kfd_priv.h" + #include "kfd_device_queue_manager.h" +-#include "kfd_pm4_headers.h" ++#include "kfd_pm4_headers_vi.h" + #include "cwsr_trap_handler_carrizo.h" ++#include "cwsr_trap_handler_gfx9.asm" + + #define MQD_SIZE_ALIGNED 768 + +@@ -39,6 +40,7 @@ static const struct kfd_device_info kaveri_device_info = { + .max_pasid_bits = 16, + /* max num of queues for KV.TODO should be a dynamic value */ + .max_no_of_hqd = 24, ++ .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, +@@ -53,6 +55,7 @@ static const struct kfd_device_info hawaii_device_info = { + .max_pasid_bits = 16, + /* max num of queues for KV.TODO should be a dynamic value */ + .max_no_of_hqd = 24, ++ .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, +@@ -67,6 +70,7 @@ static const struct kfd_device_info carrizo_device_info = { + .max_pasid_bits = 16, + /* max num of queues for CZ.TODO should be a dynamic value */ + .max_no_of_hqd = 24, ++ .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, +@@ -80,6 +84,7 @@ static const struct kfd_device_info tonga_device_info = { + .asic_family = CHIP_TONGA, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, ++ .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, +@@ -93,6 +98,7 @@ static const struct kfd_device_info fiji_device_info = { + .asic_family = CHIP_FIJI, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, ++ .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, +@@ -106,6 +112,7 @@ static 
const struct kfd_device_info polaris10_device_info = { + .asic_family = CHIP_POLARIS10, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, ++ .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, +@@ -119,6 +126,7 @@ static const struct kfd_device_info polaris11_device_info = { + .asic_family = CHIP_POLARIS11, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, ++ .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, +@@ -128,6 +136,19 @@ static const struct kfd_device_info polaris11_device_info = { + .needs_pci_atomics = true, + }; + ++static const struct kfd_device_info vega10_device_info = { ++ .asic_family = CHIP_VEGA10, ++ .max_pasid_bits = 16, ++ .max_no_of_hqd = 24, ++ .doorbell_size = 8, ++ .ih_ring_entry_size = 8 * sizeof(uint32_t), ++ .event_interrupt_class = &event_interrupt_class_v9, ++ .num_of_watch_points = 4, ++ .mqd_size_aligned = MQD_SIZE_ALIGNED, ++ .is_need_iommu_device = false, ++ .supports_cwsr = true, ++}; ++ + struct kfd_deviceid { + unsigned short did; + const struct kfd_device_info *device_info; +@@ -215,7 +236,13 @@ static const struct kfd_deviceid supported_devices[] = { + { 0x67E9, &polaris11_device_info }, /* Polaris11 */ + { 0x67EB, &polaris11_device_info }, /* Polaris11 */ + { 0x67EF, &polaris11_device_info }, /* Polaris11 */ +- { 0x67FF, &polaris11_device_info } /* Polaris11 */ ++ { 0x67FF, &polaris11_device_info }, /* Polaris11 */ ++ { 0x6860, &vega10_device_info }, /* Vega10 */ ++ { 0x6861, &vega10_device_info }, /* Vega10 */ ++ { 0x6863, &vega10_device_info }, /* Vega10 */ ++ { 0x6867, &vega10_device_info }, /* Vega10 */ ++ { 0x686C, &vega10_device_info }, /* Vega10 */ ++ { 0x687F, &vega10_device_info } /* Vega10 */ + }; + + static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, +@@ -370,8 +397,17 @@ static int kfd_cwsr_init(struct kfd_dev *kfd) + * Initialize the CWSR required memory for TBA and TMA + */ + if (cwsr_enable && kfd->device_info->supports_cwsr) { ++ const uint32_t *cwsr_hex; + void *cwsr_addr = NULL; +- unsigned int size = sizeof(cwsr_trap_carrizo_hex); ++ unsigned int size; ++ ++ if (kfd->device_info->asic_family < CHIP_VEGA10) { ++ cwsr_hex = cwsr_trap_carrizo_hex; ++ size = sizeof(cwsr_trap_carrizo_hex); ++ } else { ++ cwsr_hex = cwsr_trap_gfx9_hex; ++ size = sizeof(cwsr_trap_gfx9_hex); ++ } + + if (size > PAGE_SIZE) { + pr_err("amdkfd: wrong CWSR ISA size.\n"); +@@ -388,7 +424,7 @@ static int kfd_cwsr_init(struct kfd_dev *kfd) + /*Only first page used for cwsr ISA code */ + cwsr_addr = kmap(kfd->cwsr_pages); + memset(cwsr_addr, 0, PAGE_SIZE); +- memcpy(cwsr_addr, cwsr_trap_carrizo_hex, size); ++ memcpy(cwsr_addr, cwsr_hex, size); + kunmap(kfd->cwsr_pages); + kfd->tma_offset = ALIGN(size, PAGE_SIZE); + kfd->cwsr_enabled = true; +@@ -460,9 +496,9 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, + * calculate max size of runlist packet. 
+ * There can be only 2 packets at once + */ +- size += (KFD_MAX_NUM_OF_PROCESSES * sizeof(struct pm4_map_process) + +- max_num_of_queues_per_device * +- sizeof(struct pm4_map_queues) + sizeof(struct pm4_runlist)) * 2; ++ size += (KFD_MAX_NUM_OF_PROCESSES * sizeof(struct pm4_mes_map_process) + ++ max_num_of_queues_per_device * sizeof(struct pm4_mes_map_queues) ++ + sizeof(struct pm4_mes_runlist)) * 2; + + /* Add size of HIQ & DIQ */ + size += KFD_KERNEL_QUEUE_SIZE * 2; +@@ -989,7 +1025,7 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, + if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size) + return -ENOMEM; + +- *mem_obj = kmalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); ++ *mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); + if ((*mem_obj) == NULL) + return -ENOMEM; + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +index 1e28bb7..dcdc380 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +@@ -111,6 +111,83 @@ void program_sh_mem_settings(struct device_queue_manager *dqm, + qpd->sh_mem_bases); + } + ++static int init_doorbell_bitmap(struct device_queue_manager *dqm) ++{ ++ unsigned int i; ++ ++ if (!KFD_IS_SOC15(dqm->dev->device_info->asic_family)) ++ return 0; ++ ++ dqm->doorbell_bitmap = ++ kzalloc(DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, ++ BITS_PER_BYTE), GFP_KERNEL); ++ if (dqm->doorbell_bitmap == NULL) ++ return -ENOMEM; ++ ++ /* Mask out any reserved doorbells */ ++ for (i = 0; i < KFD_MAX_NUM_OF_QUEUES_PER_PROCESS; i++) ++ if ((dqm->dev->shared_resources.reserved_doorbell_mask & i) == ++ dqm->dev->shared_resources.reserved_doorbell_val) { ++ set_bit(i, dqm->doorbell_bitmap); ++ pr_debug("reserved doorbell 0x%03x\n", i); ++ } ++ ++ return 0; ++} ++ ++static void uninit_doorbell_bitmap(struct device_queue_manager *dqm) ++{ ++ kfree(dqm->doorbell_bitmap); ++} ++ ++static int allocate_doorbell(struct device_queue_manager *dqm, struct queue *q) ++{ ++ if (!KFD_IS_SOC15(dqm->dev->device_info->asic_family)) { ++ /* On pre-SOC15 chips we need to use the queue ID to ++ * preserve the user mode ABI. ++ */ ++ q->doorbell_id = q->properties.queue_id; ++ } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { ++ /* For SDMA queues on SOC15, use static doorbell ++ * assignments based on the engine and queue. 
++ */ ++ q->doorbell_id = dqm->dev->shared_resources.sdma_doorbell ++ [q->properties.sdma_engine_id] ++ [q->properties.sdma_queue_id]; ++ } else { ++ /* For CP queues on SOC15 reserve a free doorbell ID */ ++ unsigned int found; ++ ++ found = find_first_zero_bit(dqm->doorbell_bitmap, ++ KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); ++ if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) { ++ pr_debug("amdkfd: No doorbells available"); ++ return -EBUSY; ++ } ++ set_bit(found, dqm->doorbell_bitmap); ++ q->doorbell_id = found; ++ } ++ ++ q->properties.doorbell_off = ++ kfd_doorbell_id_to_offset(dqm->dev, q->process, ++ q->doorbell_id); ++ ++ return 0; ++} ++ ++static void deallocate_doorbell(struct device_queue_manager *dqm, ++ struct queue *q) ++{ ++ unsigned int old; ++ ++ if (!KFD_IS_SOC15(dqm->dev->device_info->asic_family) || ++ q->properties.type == KFD_QUEUE_TYPE_SDMA) ++ return; ++ ++ old = test_and_clear_bit(q->doorbell_id, dqm->doorbell_bitmap); ++ WARN_ON(!old); ++} ++ + static int allocate_vmid(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) +@@ -152,7 +229,8 @@ static int flush_texture_cache_nocpsch(struct kfd_dev *kdev, + if (!qpd->ib_kaddr) + return -ENOMEM; + +- len = pm_create_release_mem(qpd->ib_base, (uint32_t *)qpd->ib_kaddr); ++ len = qpd->dqm->packets.pmf->release_mem(qpd->ib_base, ++ (uint32_t *)qpd->ib_kaddr); + + return kdev->kfd2kgd->submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid, + qpd->ib_base, (uint32_t *)qpd->ib_kaddr, len); +@@ -313,12 +391,14 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, + if (retval != 0) + return retval; + ++ retval = allocate_doorbell(dqm, q); ++ if (retval) ++ goto out_deallocate_hqd; ++ + retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); +- if (retval != 0) { +- deallocate_hqd(dqm, q); +- return retval; +- } ++ if (retval != 0) ++ goto out_deallocate_doorbell; + + pr_debug("kfd: loading mqd to hqd on pipe (%d) queue (%d)\n", + q->pipe, +@@ -332,13 +412,19 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, + + retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue, &q->properties, + q->process->mm); +- if (retval != 0) { +- deallocate_hqd(dqm, q); +- mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); +- return retval; +- } ++ if (retval != 0) ++ goto out_uninit_mqd; + + return 0; ++ ++out_uninit_mqd: ++ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); ++out_deallocate_doorbell: ++ deallocate_doorbell(dqm, q); ++out_deallocate_hqd: ++ deallocate_hqd(dqm, q); ++ ++ return retval; + } + + /* Access to DQM has to be locked before calling destroy_queue_nocpsch_locked +@@ -360,6 +446,8 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, + goto out; + } + ++ deallocate_doorbell(dqm, q); ++ + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) + deallocate_hqd(dqm, q); + else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { +@@ -741,24 +829,29 @@ static int init_scheduler(struct device_queue_manager *dqm) + + static int initialize_nocpsch(struct device_queue_manager *dqm) + { +- int i; ++ int i, ret; + + BUG_ON(!dqm); + + pr_debug("kfd: In func %s num of pipes: %d\n", + __func__, get_pipes_per_mec(dqm)); + +- mutex_init(&dqm->lock); +- INIT_LIST_HEAD(&dqm->queues); +- dqm->queue_count = dqm->next_pipe_to_allocate = 0; +- dqm->sdma_queue_count = 0; + dqm->allocated_queues = kcalloc(get_pipes_per_mec(dqm), + sizeof(unsigned int), GFP_KERNEL); +- if (!dqm->allocated_queues) { +- mutex_destroy(&dqm->lock); ++ if 
(!dqm->allocated_queues) + return -ENOMEM; ++ ++ ret = init_doorbell_bitmap(dqm); ++ if (ret) { ++ kfree(dqm->allocated_queues); ++ return ret; + } + ++ mutex_init(&dqm->lock); ++ INIT_LIST_HEAD(&dqm->queues); ++ dqm->queue_count = dqm->next_pipe_to_allocate = 0; ++ dqm->sdma_queue_count = 0; ++ + for (i = 0; i < get_pipes_per_mec(dqm); i++) + dqm->allocated_queues[i] = (1 << get_queues_per_pipe(dqm)) - 1; + +@@ -777,6 +870,7 @@ static void uninitialize_nocpsch(struct device_queue_manager *dqm) + + BUG_ON(dqm->queue_count > 0 || dqm->processes_count > 0); + ++ uninit_doorbell_bitmap(dqm); + kfree(dqm->allocated_queues); + for (i = 0 ; i < KFD_MQD_TYPE_MAX ; i++) + kfree(dqm->mqds[i]); +@@ -839,6 +933,10 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, + q->properties.sdma_queue_id = q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; + q->properties.sdma_engine_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; + ++ retval = allocate_doorbell(dqm, q); ++ if (retval) ++ goto out_deallocate_sdma_queue; ++ + pr_debug("kfd: sdma id is: %d\n", q->sdma_id); + pr_debug(" sdma queue id: %d\n", q->properties.sdma_queue_id); + pr_debug(" sdma engine id: %d\n", q->properties.sdma_engine_id); +@@ -846,19 +944,23 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, + dqm->asic_ops.init_sdma_vm(dqm, q, qpd); + retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); +- if (retval != 0) { +- deallocate_sdma_queue(dqm, q->sdma_id); +- return retval; +- } ++ if (retval != 0) ++ goto out_deallocate_doorbell; + + retval = mqd->load_mqd(mqd, q->mqd, 0, 0, &q->properties, NULL); +- if (retval != 0) { +- deallocate_sdma_queue(dqm, q->sdma_id); +- mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); +- return retval; +- } ++ if (retval != 0) ++ goto out_uninit_mqd; + + return 0; ++ ++out_uninit_mqd: ++ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); ++out_deallocate_doorbell: ++ deallocate_doorbell(dqm, q); ++out_deallocate_sdma_queue: ++ deallocate_sdma_queue(dqm, q->sdma_id); ++ ++ return retval; + } + + /* +@@ -918,6 +1020,10 @@ static int initialize_cpsch(struct device_queue_manager *dqm) + pr_debug("kfd: In func %s num of pipes: %d\n", + __func__, get_pipes_per_mec(dqm)); + ++ retval = init_doorbell_bitmap(dqm); ++ if (retval) ++ return retval; ++ + mutex_init(&dqm->lock); + INIT_LIST_HEAD(&dqm->queues); + dqm->queue_count = dqm->processes_count = 0; +@@ -931,6 +1037,7 @@ static int initialize_cpsch(struct device_queue_manager *dqm) + return 0; + + fail_init_pipelines: ++ uninit_doorbell_bitmap(dqm); + mutex_destroy(&dqm->lock); + return retval; + } +@@ -1069,24 +1176,29 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, + pr_warn("amdkfd: Can't create new usermode queue because %d queues were already created\n", + dqm->total_queue_count); + retval = -EPERM; +- goto out; ++ goto out_unlock; + } + + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + retval = allocate_sdma_queue(dqm, &q->sdma_id); + if (retval != 0) +- goto out; ++ goto out_unlock; + q->properties.sdma_queue_id = + q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; + q->properties.sdma_engine_id = + q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; + } ++ ++ retval = allocate_doorbell(dqm, q); ++ if (retval) ++ goto out_deallocate_sdma_queue; ++ + mqd = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + + if (mqd == NULL) { +- mutex_unlock(&dqm->lock); +- return -ENOMEM; ++ retval = -ENOMEM; ++ goto out_deallocate_doorbell; + } + /* + * 
Eviction state logic: we only mark active queues as evicted +@@ -1104,7 +1216,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, + retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); + if (retval != 0) +- goto out; ++ goto out_deallocate_doorbell; + + list_add(&q->list, &qpd->queues_list); + if (q->properties.is_active) { +@@ -1123,9 +1235,18 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, + pr_debug("Total of %d queues are accountable so far\n", + dqm->total_queue_count); + +-out: + mutex_unlock(&dqm->lock); + return retval; ++ ++out_deallocate_doorbell: ++ deallocate_doorbell(dqm, q); ++out_deallocate_sdma_queue: ++ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) ++ deallocate_sdma_queue(dqm, q->sdma_id); ++out_unlock: ++ mutex_unlock(&dqm->lock); ++ ++ return retval; + } + + int amdkfd_fence_wait_timeout(unsigned int *fence_addr, +@@ -1286,6 +1407,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, + goto failed; + } + ++ deallocate_doorbell(dqm, q); ++ + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + dqm->sdma_queue_count--; + deallocate_sdma_queue(dqm, q->sdma_id); +@@ -1333,10 +1456,13 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size) + { +- bool retval; ++ bool retval = true; + + pr_debug("kfd: In func %s\n", __func__); + ++ if (!dqm->asic_ops.set_cache_memory_policy) ++ return retval; ++ + mutex_lock(&dqm->lock); + + if (alternate_aperture_size == 0) { +@@ -1590,6 +1716,10 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) + case CHIP_POLARIS11: + device_queue_manager_init_vi_tonga(&dqm->asic_ops); + break; ++ ++ case CHIP_VEGA10: ++ device_queue_manager_init_v9_vega10(&dqm->asic_ops); ++ break; + } + + if (dqm->ops.initialize(dqm) != 0) { +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +index 05d0cc8..c269e5e 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +@@ -182,6 +182,7 @@ struct device_queue_manager { + unsigned int *allocated_queues; + unsigned int sdma_bitmap; + unsigned int vmid_bitmap; ++ unsigned long *doorbell_bitmap; + uint64_t pipelines_addr; + struct kfd_mem_obj *pipeline_mem; + uint64_t fence_gpu_addr; +@@ -199,6 +200,8 @@ void device_queue_manager_init_vi( + struct device_queue_manager_asic_ops *asic_ops); + void device_queue_manager_init_vi_tonga( + struct device_queue_manager_asic_ops *asic_ops); ++void device_queue_manager_init_v9_vega10( ++ struct device_queue_manager_asic_ops *asic_ops); + void program_sh_mem_settings(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); + unsigned int get_queues_num(struct device_queue_manager *dqm); +@@ -216,6 +219,7 @@ static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) + return (pdd->lds_base >> 16) & 0xFF; + } + ++/* This function is only useful for GFXv7 and v8 */ + static inline unsigned int + get_sh_mem_bases_nybble_64(struct kfd_process_device *pdd) + { +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c +new file mode 100644 +index 0000000..2d81e2b +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c +@@ -0,0 +1,87 @@ ++/* ++ * Copyright 2016 Advanced Micro Devices, 
Inc. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. ++ * ++ */ ++ ++#include "kfd_device_queue_manager.h" ++#include "vega10/vega10_enum.h" ++#include "vega10/GC/gc_9_0_offset.h" ++#include "vega10/GC/gc_9_0_sh_mask.h" ++#include "vega10/SDMA0/sdma0_4_0_sh_mask.h" ++ ++static int update_qpd_v9(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd); ++static int initialize_cpsch_v9(struct device_queue_manager *dqm); ++static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, ++ struct qcm_process_device *qpd); ++ ++void device_queue_manager_init_v9_vega10( ++ struct device_queue_manager_asic_ops *asic_ops) ++{ ++ asic_ops->update_qpd = update_qpd_v9; ++ asic_ops->init_cpsch = initialize_cpsch_v9; ++ asic_ops->init_sdma_vm = init_sdma_vm_v9; ++} ++ ++static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) ++{ ++ uint32_t shared_base = pdd->lds_base >> 48; ++ uint32_t private_base = pdd->scratch_base >> 48; ++ ++ return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) | ++ private_base; ++} ++ ++static int update_qpd_v9(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd) ++{ ++ struct kfd_process_device *pdd; ++ ++ pdd = qpd_to_pdd(qpd); ++ ++ /* check if sh_mem_config register already configured */ ++ if (qpd->sh_mem_config == 0) { ++ qpd->sh_mem_config = ++ SH_MEM_ALIGNMENT_MODE_UNALIGNED << ++ SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT; ++ ++ qpd->sh_mem_ape1_limit = 0; ++ qpd->sh_mem_ape1_base = 0; ++ } ++ ++ qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd); ++ ++ pr_debug("kfd: sh_mem_bases 0x%X\n", qpd->sh_mem_bases); ++ ++ return 0; ++} ++ ++static int initialize_cpsch_v9(struct device_queue_manager *dqm) ++{ ++ return 0; ++} ++ ++static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, ++ struct qcm_process_device *qpd) ++{ ++ /* Not needed on SDMAv4 any more */ ++ q->properties.sdma_vm_addr = 0; ++} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +index 9387b1d..9a86b98 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +@@ -34,7 +34,6 @@ + */ + + #define KERNEL_DOORBELL_PASID 1 +-#define KFD_SIZE_OF_DOORBELL_IN_BYTES 4 + + /* + * Each device exposes a doorbell aperture, a PCI MMIO aperture that +@@ -51,9 +50,9 @@ + */ + + /* # of doorbell bytes allocated for each process. 
*/ +-size_t kfd_doorbell_process_slice(void) ++size_t kfd_doorbell_process_slice(struct kfd_dev *kfd) + { +- return roundup(KFD_SIZE_OF_DOORBELL_IN_BYTES * ++ return roundup(kfd->device_info->doorbell_size * + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, + PAGE_SIZE); + } +@@ -73,16 +72,16 @@ void kfd_doorbell_init(struct kfd_dev *kfd) + + doorbell_start_offset = + roundup(kfd->shared_resources.doorbell_start_offset, +- kfd_doorbell_process_slice()); ++ kfd_doorbell_process_slice(kfd)); + + doorbell_aperture_size = + rounddown(kfd->shared_resources.doorbell_aperture_size, +- kfd_doorbell_process_slice()); ++ kfd_doorbell_process_slice(kfd)); + + if (doorbell_aperture_size > doorbell_start_offset) + doorbell_process_limit = + (doorbell_aperture_size - doorbell_start_offset) / +- kfd_doorbell_process_slice(); ++ kfd_doorbell_process_slice(kfd); + else + doorbell_process_limit = 0; + +@@ -93,7 +92,7 @@ void kfd_doorbell_init(struct kfd_dev *kfd) + kfd->doorbell_process_limit = doorbell_process_limit - 1; + + kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base, +- kfd_doorbell_process_slice()); ++ kfd_doorbell_process_slice(kfd)); + + BUG_ON(!kfd->doorbell_kernel_ptr); + +@@ -117,21 +116,16 @@ void kfd_doorbell_init(struct kfd_dev *kfd) + (uintptr_t)kfd->doorbell_kernel_ptr); + } + +-int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma) ++int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, ++ struct vm_area_struct *vma) + { + phys_addr_t address; +- struct kfd_dev *dev; + + /* + * For simplicitly we only allow mapping of the entire doorbell + * allocation of a single device & process. + */ +- if (vma->vm_end - vma->vm_start != kfd_doorbell_process_slice()) +- return -EINVAL; +- +- /* Find kfd device according to gpu id */ +- dev = kfd_device_by_id(vma->vm_pgoff); +- if (dev == NULL) ++ if (vma->vm_end - vma->vm_start != kfd_doorbell_process_slice(dev)) + return -EINVAL; + + /* Calculate physical address of doorbell */ +@@ -148,19 +142,19 @@ int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma) + " vm_flags == 0x%04lX\n" + " size == 0x%04lX\n", + (unsigned long long) vma->vm_start, address, vma->vm_flags, +- kfd_doorbell_process_slice()); ++ kfd_doorbell_process_slice(dev)); + + + return io_remap_pfn_range(vma, + vma->vm_start, + address >> PAGE_SHIFT, +- kfd_doorbell_process_slice(), ++ kfd_doorbell_process_slice(dev), + vma->vm_page_prot); + } + + + /* get kernel iomem pointer for a doorbell */ +-u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, ++void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, + unsigned int *doorbell_off) + { + u32 inx; +@@ -177,12 +171,15 @@ u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, + if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) + return NULL; + ++ inx *= kfd->device_info->doorbell_size / sizeof(u32); ++ + /* + * Calculating the kernel doorbell offset using "faked" kernel +- * pasid that allocated for kernel queues only ++ * pasid that allocated for kernel queues only. Offset is in ++ * dword units regardless of the ASIC-dependent doorbell size. 
+ */ +- *doorbell_off = KERNEL_DOORBELL_PASID * (kfd_doorbell_process_slice() / +- sizeof(u32)) + inx; ++ *doorbell_off = KERNEL_DOORBELL_PASID * ++ (kfd_doorbell_process_slice(kfd) / sizeof(u32)) + inx; + + pr_debug("kfd: get kernel queue doorbell\n" + " doorbell offset == 0x%08X\n" +@@ -205,7 +202,7 @@ void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr) + mutex_unlock(&kfd->doorbell_mutex); + } + +-inline void write_kernel_doorbell(u32 __iomem *db, u32 value) ++void write_kernel_doorbell(void __iomem *db, u32 value) + { + if (db) { + writel(value, db); +@@ -213,29 +210,40 @@ inline void write_kernel_doorbell(u32 __iomem *db, u32 value) + } + } + ++void write_kernel_doorbell64(void __iomem *db, u64 value) ++{ ++ if (db) { ++ WARN(((unsigned long)db & 7) != 0, ++ "Unaligned 64-bit doorbell"); ++ writeq(value, (u64 __iomem *)db); ++ pr_debug("writing %llu to doorbell address 0x%p\n", value, db); ++ } ++} ++ + /* + * queue_ids are in the range [0,MAX_PROCESS_QUEUES) and are mapped 1:1 + * to doorbells with the process's doorbell page + */ +-unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, ++unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd, + struct kfd_process *process, +- unsigned int queue_id) ++ unsigned int doorbell_id) + { + /* + * doorbell_id_offset accounts for doorbells taken by KGD. +- * pasid * kfd_doorbell_process_slice/sizeof(u32) adjusts +- * to the process's doorbells ++ * pasid * kfd_doorbell_process_slice/sizeof(u32) adjusts to ++ * the process's doorbells. The offset returned is in dword ++ * units regardless of the ASIC-dependent doorbell size. + */ + return kfd->doorbell_id_offset + +- process->pasid * (kfd_doorbell_process_slice()/sizeof(u32)) + +- queue_id; ++ process->pasid * (kfd_doorbell_process_slice(kfd)/sizeof(u32)) + ++ doorbell_id * kfd->device_info->doorbell_size / sizeof(u32); + } + + uint64_t kfd_get_number_elems(struct kfd_dev *kfd) + { + uint64_t num_of_elems = (kfd->shared_resources.doorbell_aperture_size - + kfd->shared_resources.doorbell_start_offset) / +- kfd_doorbell_process_slice() + 1; ++ kfd_doorbell_process_slice(kfd) + 1; + + return num_of_elems; + +@@ -245,5 +253,5 @@ phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, + struct kfd_process *process) + { + return dev->doorbell_base + +- process->pasid * kfd_doorbell_process_slice(); ++ process->pasid * kfd_doorbell_process_slice(dev); + } +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +index 7d290bb..49a2a53 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +@@ -279,25 +279,39 @@ + * for FLAT_* / S_LOAD operations. + */ + +-#define MAKE_GPUVM_APP_BASE(gpu_num) \ ++#define MAKE_GPUVM_APP_BASE_VI(gpu_num) \ + (((uint64_t)(gpu_num) << 61) + 0x1000000000000L) + + #define MAKE_GPUVM_APP_LIMIT(base, size) \ + (((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1) + +-#define MAKE_SCRATCH_APP_BASE() \ ++#define MAKE_SCRATCH_APP_BASE_VI() \ + (((uint64_t)(0x1UL) << 61) + 0x100000000L) + + #define MAKE_SCRATCH_APP_LIMIT(base) \ + (((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) + +-#define MAKE_LDS_APP_BASE() \ ++#define MAKE_LDS_APP_BASE_VI() \ + (((uint64_t)(0x1UL) << 61) + 0x0) + + #define MAKE_LDS_APP_LIMIT(base) \ + (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) + ++/* On GFXv9 the LDS and scratch apertures are programmed independently ++ * using the high 16 bits of the 64-bit virtual address. 
They must be
++ * in the hole, which will be the case as long as the high 16 bits are
++ * not 0.
++ *
++ * The aperture sizes are still 4GB implicitly.
++ *
++ * A GPUVM aperture is not applicable on GFXv9.
++ */
++#define MAKE_LDS_APP_BASE_V9() ((uint64_t)(0x1UL) << 48)
++#define MAKE_SCRATCH_APP_BASE_V9() ((uint64_t)(0x2UL) << 48)
+
++/* Some VM address space reserved for kernel use (CWSR trap handlers
++ * and kernel IBs)
++ */
+ #define DGPU_VM_BASE_DEFAULT 0x100000
+ #define DGPU_IB_BASE_DEFAULT (DGPU_VM_BASE_DEFAULT - PAGE_SIZE)
+
+@@ -313,6 +327,32 @@ int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd,
+ return 0;
+ }
+
++void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id)
++{
++ /*
++ * node id couldn't be 0 - the three MSB bits of
++ * aperture shouldn't be 0
++ */
++ pdd->lds_base = MAKE_LDS_APP_BASE_VI();
++ pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
++
++ pdd->gpuvm_base = MAKE_GPUVM_APP_BASE_VI(id + 1);
++ pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT(
++ pdd->gpuvm_base, pdd->dev->shared_resources.gpuvm_size);
++
++ pdd->scratch_base = MAKE_SCRATCH_APP_BASE_VI();
++ pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
++}
++
++void kfd_init_apertures_v9(struct kfd_process_device *pdd, uint8_t id)
++{
++ pdd->lds_base = MAKE_LDS_APP_BASE_V9();
++ pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
++
++ pdd->scratch_base = MAKE_SCRATCH_APP_BASE_V9();
++ pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
++}
++
+ int kfd_init_apertures(struct kfd_process *process)
+ {
+ uint8_t id = 0;
+@@ -341,24 +381,23 @@ int kfd_init_apertures(struct kfd_process *process)
+ pdd->gpuvm_base = pdd->gpuvm_limit = 0;
+ pdd->scratch_base = pdd->scratch_limit = 0;
+ } else {
+- /*
+- * node id couldn't be 0 - the three MSB bits of
+- * aperture shoudn't be 0
+- */
+- pdd->lds_base = MAKE_LDS_APP_BASE();
+-
+- pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
+-
+- pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1);
+-
+- pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT(
+- pdd->gpuvm_base,
+- dev->shared_resources.gpuvm_size);
+-
+- pdd->scratch_base = MAKE_SCRATCH_APP_BASE();
+-
+- pdd->scratch_limit =
+- MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
++ switch (dev->device_info->asic_family) {
++ case CHIP_KAVERI:
++ case CHIP_HAWAII:
++ case CHIP_CARRIZO:
++ case CHIP_TONGA:
++ case CHIP_FIJI:
++ case CHIP_POLARIS10:
++ case CHIP_POLARIS11:
++ kfd_init_apertures_vi(pdd, id);
++ break;
++ case CHIP_VEGA10:
++ kfd_init_apertures_v9(pdd, id);
++ break;
++ default:
++ pr_err("Unknown chip in kfd_init_apertures\n");
++ goto err;
++ }
+
+ if (KFD_IS_DGPU(dev->device_info->asic_family)) {
+ pdd->qpd.cwsr_base = DGPU_VM_BASE_DEFAULT;
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+new file mode 100644
+index 0000000..a479820
+--- /dev/null
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+@@ -0,0 +1,130 @@
++/*
++ * Copyright 2016 Advanced Micro Devices, Inc.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
++ * OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#include "kfd_priv.h"
++#include "kfd_events.h"
++#include "soc15_int.h"
++
++
++static uint32_t kfd_get_pasid_from_vmid(struct kfd_dev *dev, uint8_t vmid)
++{
++ uint32_t pasid = 0;
++ const struct kfd2kgd_calls *f2g = dev->kfd2kgd;
++
++ if (f2g->get_atc_vmid_pasid_mapping_valid(dev->kgd, vmid))
++ pasid = f2g->get_atc_vmid_pasid_mapping_pasid(dev->kgd, vmid);
++
++ return pasid;
++}
++
++static bool event_interrupt_isr_v9(struct kfd_dev *dev,
++ const uint32_t *ih_ring_entry,
++ uint32_t *patched_ihre,
++ bool *patched_flag)
++{
++ uint16_t source_id, client_id, pasid, vmid;
++ bool result = false;
++
++ source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
++ client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
++ pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
++ vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
++
++ if (pasid) {
++ const uint32_t *data = ih_ring_entry;
++
++ pr_debug("client id 0x%x, source id %d, pasid 0x%x. raw data:\n",
++ client_id, source_id, pasid);
++ pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
++ data[0], data[1], data[2], data[3],
++ data[4], data[5], data[6], data[7]);
++ }
++
++ if ((vmid >= dev->vm_info.first_vmid_kfd &&
++ vmid <= dev->vm_info.last_vmid_kfd) &&
++ (source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
++ source_id == SOC15_INTSRC_SDMA_TRAP ||
++ source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
++ source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
++ client_id == SOC15_IH_CLIENTID_VMC)) {
++
++ /*
++ * KFD wants to handle this INT, but MEC firmware did
++ * not send pasid. Try to get it from vmid mapping
++ * and patch the ih entry. It's a temp workaround.
++ */
++ WARN_ONCE((!pasid), "Fix me.\n");
++ if (!pasid) {
++ uint32_t temp = le32_to_cpu(ih_ring_entry[3]);
++
++ pasid = kfd_get_pasid_from_vmid(dev, vmid);
++ memcpy(patched_ihre, ih_ring_entry,
++ dev->device_info->ih_ring_entry_size);
++ patched_ihre[3] = cpu_to_le32(temp | pasid);
++ *patched_flag = true;
++ }
++ result = pasid ? true : false;
++ }
++
++ /* Do not process in ISR, just request it to be forwarded to WQ.
*/ ++ return result; ++ ++} ++ ++static void event_interrupt_wq_v9(struct kfd_dev *dev, ++ const uint32_t *ih_ring_entry) ++{ ++ uint16_t source_id, client_id, pasid, vmid; ++ ++ source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); ++ client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); ++ pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); ++ vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); ++ ++ if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) ++ kfd_signal_event_interrupt(pasid, 0, 0); ++ else if (source_id == SOC15_INTSRC_SDMA_TRAP) ++ kfd_signal_event_interrupt(pasid, 0, 0); ++ else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) ++ kfd_signal_event_interrupt(pasid, 0, 0); /*todo */ ++ else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) ++ kfd_signal_hw_exception_event(pasid); ++ else if (client_id == SOC15_IH_CLIENTID_VMC) { ++ struct kfd_vm_fault_info info; ++ ++ memset(&info, 0, sizeof(info)); ++ dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info); ++ kfd_process_vm_fault(dev->dqm, pasid); ++ if (!info.page_addr && !info.status) ++ return; ++ ++ if (info.vmid == vmid) ++ kfd_signal_vm_fault_event(dev, pasid, &info); ++ else ++ kfd_signal_vm_fault_event(dev, pasid, NULL); ++ } ++} ++ ++const struct kfd_event_interrupt_class event_interrupt_class_v9 = { ++ .interrupt_isr = event_interrupt_isr_v9, ++ .interrupt_wq = event_interrupt_wq_v9, ++}; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +index 126d848..b826689 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +@@ -99,7 +99,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, + kq->rptr_kernel = kq->rptr_mem->cpu_ptr; + kq->rptr_gpu_addr = kq->rptr_mem->gpu_addr; + +- retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->wptr_kernel), ++ retval = kfd_gtt_sa_allocate(dev, dev->device_info->doorbell_size, + &kq->wptr_mem); + + if (retval != 0) +@@ -211,6 +211,7 @@ static int acquire_packet_buffer(struct kernel_queue *kq, + size_t available_size; + size_t queue_size_dwords; + uint32_t wptr, rptr; ++ uint64_t wptr64; + unsigned int *queue_address; + + BUG_ON(!kq || !buffer_ptr); +@@ -222,6 +223,7 @@ static int acquire_packet_buffer(struct kernel_queue *kq, + */ + rptr = *kq->rptr_kernel; + wptr = kq->pending_wptr; ++ wptr64 = kq->pending_wptr64; + queue_address = (unsigned int *)kq->pq_kernel_addr; + queue_size_dwords = kq->queue->properties.queue_size / sizeof(uint32_t); + +@@ -251,11 +253,13 @@ static int acquire_packet_buffer(struct kernel_queue *kq, + while (wptr > 0) { + queue_address[wptr] = kq->nop_packet; + wptr = (wptr + 1) % queue_size_dwords; ++ wptr64++; + } + } + + *buffer_ptr = &queue_address[wptr]; + kq->pending_wptr = wptr + packet_size_in_dwords; ++ kq->pending_wptr64 = wptr64 + packet_size_in_dwords; + + return 0; + } +@@ -310,9 +314,7 @@ static void submit_packet(struct kernel_queue *kq) + pr_debug("\n"); + #endif + +- *kq->wptr_kernel = kq->pending_wptr; +- write_kernel_doorbell(kq->queue->properties.doorbell_ptr, +- kq->pending_wptr); ++ kq->ops_asic_specific.submit_packet(kq); + } + + static void rollback_packet(struct kernel_queue *kq) +@@ -352,6 +354,10 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, + case CHIP_HAWAII: + kernel_queue_init_cik(&kq->ops_asic_specific); + break; ++ ++ case CHIP_VEGA10: ++ kernel_queue_init_v9(&kq->ops_asic_specific); ++ break; + } + + if (!kq->ops.initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE)) { +diff --git 
a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h +index a217f42..82c94a6 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h +@@ -82,6 +82,7 @@ struct kernel_queue { + struct kfd_dev *dev; + struct mqd_manager *mqd; + struct queue *queue; ++ uint64_t pending_wptr64; + uint32_t pending_wptr; + unsigned int nop_packet; + +@@ -89,7 +90,10 @@ struct kernel_queue { + uint32_t *rptr_kernel; + uint64_t rptr_gpu_addr; + struct kfd_mem_obj *wptr_mem; +- uint32_t *wptr_kernel; ++ union { ++ uint64_t *wptr64_kernel; ++ uint32_t *wptr_kernel; ++ }; + uint64_t wptr_gpu_addr; + struct kfd_mem_obj *pq; + uint64_t pq_gpu_addr; +@@ -107,5 +111,6 @@ struct kernel_queue { + + void kernel_queue_init_cik(struct kernel_queue_ops *ops); + void kernel_queue_init_vi(struct kernel_queue_ops *ops); ++void kernel_queue_init_v9(struct kernel_queue_ops *ops); + + #endif /* KFD_KERNEL_QUEUE_H_ */ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c +index a90eb44..8c69ea7 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c +@@ -22,15 +22,19 @@ + */ + + #include "kfd_kernel_queue.h" ++#include "kfd_pm4_headers.h" ++#include "kfd_pm4_opcodes.h" + + static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size); + static void uninitialize_cik(struct kernel_queue *kq); ++static void submit_packet_cik(struct kernel_queue *kq); + + void kernel_queue_init_cik(struct kernel_queue_ops *ops) + { + ops->initialize = initialize_cik; + ops->uninitialize = uninitialize_cik; ++ ops->submit_packet = submit_packet_cik; + } + + static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, +@@ -42,3 +46,124 @@ static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, + static void uninitialize_cik(struct kernel_queue *kq) + { + } ++ ++static void submit_packet_cik(struct kernel_queue *kq) ++{ ++ *kq->wptr_kernel = kq->pending_wptr; ++ write_kernel_doorbell(kq->queue->properties.doorbell_ptr, ++ kq->pending_wptr); ++} ++ ++static int pm_map_process_cik(struct packet_manager *pm, uint32_t *buffer, ++ struct qcm_process_device *qpd) ++{ ++ struct pm4_map_process *packet; ++ struct queue *cur; ++ uint32_t num_queues; ++ ++ packet = (struct pm4_map_process *)buffer; ++ ++ memset(buffer, 0, sizeof(struct pm4_map_process)); ++ ++ packet->header.u32all = pm_build_pm4_header(IT_MAP_PROCESS, ++ sizeof(struct pm4_map_process)); ++ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; ++ packet->bitfields2.process_quantum = 1; ++ packet->bitfields2.pasid = qpd->pqm->process->pasid; ++ packet->bitfields3.page_table_base = qpd->page_table_base; ++ packet->bitfields10.gds_size = qpd->gds_size; ++ packet->bitfields10.num_gws = qpd->num_gws; ++ packet->bitfields10.num_oac = qpd->num_oac; ++ num_queues = 0; ++ list_for_each_entry(cur, &qpd->queues_list, list) ++ num_queues++; ++ packet->bitfields10.num_queues = (qpd->is_debug) ? 
0 : num_queues; ++ ++ packet->sh_mem_config = qpd->sh_mem_config; ++ packet->sh_mem_bases = qpd->sh_mem_bases; ++ packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; ++ packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; ++ ++ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); ++ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); ++ ++ return 0; ++} ++ ++static int pm_map_process_scratch_cik(struct packet_manager *pm, ++ uint32_t *buffer, struct qcm_process_device *qpd) ++{ ++ struct pm4_map_process_scratch_kv *packet; ++ struct queue *cur; ++ uint32_t num_queues; ++ ++ packet = (struct pm4_map_process_scratch_kv *)buffer; ++ ++ pr_debug("kfd: In func %s\n", __func__); ++ ++ memset(buffer, 0, sizeof(struct pm4_map_process_scratch_kv)); ++ ++ packet->header.u32all = pm_build_pm4_header(IT_MAP_PROCESS, ++ sizeof(struct pm4_map_process_scratch_kv)); ++ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; ++ packet->bitfields2.process_quantum = 1; ++ packet->bitfields2.pasid = qpd->pqm->process->pasid; ++ packet->bitfields3.page_table_base = qpd->page_table_base; ++ packet->bitfields14.gds_size = qpd->gds_size; ++ packet->bitfields14.num_gws = qpd->num_gws; ++ packet->bitfields14.num_oac = qpd->num_oac; ++ num_queues = 0; ++ list_for_each_entry(cur, &qpd->queues_list, list) ++ num_queues++; ++ packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : num_queues; ++ ++ packet->sh_mem_config = qpd->sh_mem_config; ++ packet->sh_mem_bases = qpd->sh_mem_bases; ++ packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; ++ packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; ++ ++ packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; ++ ++ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); ++ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); ++ ++ return 0; ++} ++ ++static uint32_t pm_get_map_process_packet_size_cik(void) ++{ ++ return sizeof(struct pm4_map_process); ++} ++static uint32_t pm_get_map_process_scratch_packet_size_cik(void) ++{ ++ return sizeof(struct pm4_map_process_scratch_kv); ++} ++ ++ ++static struct packet_manager_funcs kfd_cik_pm_funcs = { ++ .map_process = pm_map_process_cik, ++ .runlist = pm_runlist_vi, ++ .set_resources = pm_set_resources_vi, ++ .map_queues = pm_map_queues_vi, ++ .unmap_queues = pm_unmap_queues_vi, ++ .query_status = pm_query_status_vi, ++ .release_mem = pm_release_mem_vi, ++ .get_map_process_packet_size = pm_get_map_process_packet_size_cik, ++ .get_runlist_packet_size = pm_get_runlist_packet_size_vi, ++ .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, ++ .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi, ++ .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi, ++ .get_query_status_packet_size = pm_get_query_status_packet_size_vi, ++ .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi, ++}; ++ ++ ++void kfd_pm_func_init_cik(struct packet_manager *pm, uint16_t fw_ver) ++{ ++ pm->pmf = &kfd_cik_pm_funcs; ++ if (fw_ver >= KFD_SCRATCH_KV_FW_VER) { ++ pm->pmf->map_process = pm_map_process_scratch_cik; ++ pm->pmf->get_map_process_packet_size = ++ pm_get_map_process_scratch_packet_size_cik; ++ } ++} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c +new file mode 100644 +index 0000000..89edf3c +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c +@@ -0,0 +1,385 @@ ++/* ++ * Copyright 2016 Advanced Micro Devices, Inc. 
++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. ++ * ++ */ ++ ++#include "kfd_kernel_queue.h" ++#include "kfd_device_queue_manager.h" ++#include "kfd_pm4_headers_ai.h" ++#include "kfd_pm4_opcodes.h" ++ ++static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev, ++ enum kfd_queue_type type, unsigned int queue_size); ++static void uninitialize_v9(struct kernel_queue *kq); ++static void submit_packet_v9(struct kernel_queue *kq); ++ ++void kernel_queue_init_v9(struct kernel_queue_ops *ops) ++{ ++ ops->initialize = initialize_v9; ++ ops->uninitialize = uninitialize_v9; ++ ops->submit_packet = submit_packet_v9; ++} ++ ++static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev, ++ enum kfd_queue_type type, unsigned int queue_size) ++{ ++ int retval; ++ ++ retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem); ++ if (retval != 0) ++ return false; ++ ++ kq->eop_gpu_addr = kq->eop_mem->gpu_addr; ++ kq->eop_kernel_addr = kq->eop_mem->cpu_ptr; ++ ++ memset(kq->eop_kernel_addr, 0, PAGE_SIZE); ++ ++ return true; ++} ++ ++static void uninitialize_v9(struct kernel_queue *kq) ++{ ++ kfd_gtt_sa_free(kq->dev, kq->eop_mem); ++} ++ ++static void submit_packet_v9(struct kernel_queue *kq) ++{ ++ *kq->wptr64_kernel = kq->pending_wptr64; ++ write_kernel_doorbell64(kq->queue->properties.doorbell_ptr, ++ kq->pending_wptr64); ++} ++ ++static int pm_map_process_v9(struct packet_manager *pm, ++ uint32_t *buffer, struct qcm_process_device *qpd) ++{ ++ struct pm4_mes_map_process *packet; ++ struct queue *cur; ++ uint32_t num_queues; ++ uint64_t vm_page_table_base_addr = ++ (uint64_t)(qpd->page_table_base) << 12; ++ ++ packet = (struct pm4_mes_map_process *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_mes_map_process)); ++ ++ packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, ++ sizeof(struct pm4_mes_map_process)); ++ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; ++ packet->bitfields2.process_quantum = 1; ++ packet->bitfields2.pasid = qpd->pqm->process->pasid; ++ packet->bitfields14.gds_size = qpd->gds_size; ++ packet->bitfields14.num_gws = qpd->num_gws; ++ packet->bitfields14.num_oac = qpd->num_oac; ++ packet->bitfields14.sdma_enable = 1; ++ ++ num_queues = 0; ++ list_for_each_entry(cur, &qpd->queues_list, list) ++ num_queues++; ++ packet->bitfields14.num_queues = (qpd->is_debug) ? 
0 : num_queues; ++ ++ packet->sh_mem_config = qpd->sh_mem_config; ++ packet->sh_mem_bases = qpd->sh_mem_bases; ++ packet->sq_shader_tba_lo = lower_32_bits(qpd->tba_addr >> 8); ++ packet->sq_shader_tba_hi = upper_32_bits(qpd->tba_addr >> 8); ++ packet->sq_shader_tma_lo = lower_32_bits(qpd->tma_addr >> 8); ++ packet->sq_shader_tma_hi = upper_32_bits(qpd->tma_addr >> 8); ++ ++ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); ++ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); ++ ++ packet->vm_context_page_table_base_addr_lo32 = ++ lower_32_bits(vm_page_table_base_addr); ++ packet->vm_context_page_table_base_addr_hi32 = ++ upper_32_bits(vm_page_table_base_addr); ++ ++ return 0; ++} ++ ++static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer, ++ uint64_t ib, size_t ib_size_in_dwords, bool chain) ++{ ++ struct pm4_mes_runlist *packet; ++ ++ int concurrent_proc_cnt = 0; ++ struct kfd_dev *kfd = pm->dqm->dev; ++ ++ /* Determine the number of processes to map together to HW: ++ * it can not exceed the number of VMIDs available to the ++ * scheduler, and it is determined by the smaller of the number ++ * of processes in the runlist and kfd module parameter ++ * hws_max_conc_proc. ++ * Note: the arbitration between the number of VMIDs and ++ * hws_max_conc_proc has been done in ++ * kgd2kfd_device_init(). ++ */ ++ concurrent_proc_cnt = min(pm->dqm->processes_count, ++ kfd->max_proc_per_quantum); ++ ++ ++ packet = (struct pm4_mes_runlist *)buffer; ++ ++ memset(buffer, 0, sizeof(struct pm4_mes_runlist)); ++ packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, ++ sizeof(struct pm4_mes_runlist)); ++ ++ packet->bitfields4.ib_size = ib_size_in_dwords; ++ packet->bitfields4.chain = chain ? 1 : 0; ++ packet->bitfields4.offload_polling = 0; ++ packet->bitfields4.valid = 1; ++ packet->bitfields4.process_cnt = concurrent_proc_cnt; ++ packet->ordinal2 = lower_32_bits(ib); ++ packet->ib_base_hi = upper_32_bits(ib); ++ ++ return 0; ++} ++ ++static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, ++ struct queue *q, bool is_static) ++{ ++ struct pm4_mes_map_queues *packet; ++ bool use_static = is_static; ++ ++ packet = (struct pm4_mes_map_queues *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); ++ ++ packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, ++ sizeof(struct pm4_mes_map_queues)); ++ packet->bitfields2.alloc_format = ++ alloc_format__mes_map_queues__one_per_pipe_vi; ++ packet->bitfields2.num_queues = 1; ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; ++ ++ packet->bitfields2.engine_sel = ++ engine_sel__mes_map_queues__compute_vi; ++ packet->bitfields2.queue_type = ++ queue_type__mes_map_queues__normal_compute_vi; ++ ++ switch (q->properties.type) { ++ case KFD_QUEUE_TYPE_COMPUTE: ++ if (use_static) ++ packet->bitfields2.queue_type = ++ queue_type__mes_map_queues__normal_latency_static_queue_vi; ++ break; ++ case KFD_QUEUE_TYPE_DIQ: ++ packet->bitfields2.queue_type = ++ queue_type__mes_map_queues__debug_interface_queue_vi; ++ break; ++ case KFD_QUEUE_TYPE_SDMA: ++ packet->bitfields2.engine_sel = q->properties.sdma_engine_id + ++ engine_sel__mes_map_queues__sdma0_vi; ++ use_static = false; /* no static queues under SDMA */ ++ break; ++ default: ++ WARN(1, "queue type %d\n", q->properties.type); ++ break; ++ } ++ packet->bitfields3.doorbell_offset = ++ q->properties.doorbell_off; ++ ++ packet->mqd_addr_lo = ++ lower_32_bits(q->gart_mqd_addr); ++ ++ packet->mqd_addr_hi = ++ 
upper_32_bits(q->gart_mqd_addr); ++ ++ packet->wptr_addr_lo = ++ lower_32_bits((uint64_t)q->properties.write_ptr); ++ ++ packet->wptr_addr_hi = ++ upper_32_bits((uint64_t)q->properties.write_ptr); ++ ++ return 0; ++} ++ ++static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer, ++ enum kfd_queue_type type, ++ enum kfd_unmap_queues_filter filter, ++ uint32_t filter_param, bool reset, ++ unsigned int sdma_engine) ++{ ++ struct pm4_mes_unmap_queues *packet; ++ ++ packet = (struct pm4_mes_unmap_queues *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); ++ ++ packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, ++ sizeof(struct pm4_mes_unmap_queues)); ++ switch (type) { ++ case KFD_QUEUE_TYPE_COMPUTE: ++ case KFD_QUEUE_TYPE_DIQ: ++ packet->bitfields2.engine_sel = ++ engine_sel__mes_unmap_queues__compute; ++ break; ++ case KFD_QUEUE_TYPE_SDMA: ++ packet->bitfields2.engine_sel = ++ engine_sel__mes_unmap_queues__sdma0 + sdma_engine; ++ break; ++ default: ++ WARN(1, "queue type %d\n", type); ++ break; ++ } ++ ++ if (reset) ++ packet->bitfields2.action = ++ action__mes_unmap_queues__reset_queues; ++ else ++ packet->bitfields2.action = ++ action__mes_unmap_queues__preempt_queues; ++ ++ switch (filter) { ++ case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_unmap_queues__perform_request_on_specified_queues; ++ packet->bitfields2.num_queues = 1; ++ packet->bitfields3b.doorbell_offset0 = filter_param; ++ break; ++ case KFD_UNMAP_QUEUES_FILTER_BY_PASID: ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; ++ packet->bitfields3a.pasid = filter_param; ++ break; ++ case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_unmap_queues__unmap_all_queues; ++ break; ++ case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: ++ /* in this case, we do not preempt static queues */ ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_unmap_queues__unmap_all_non_static_queues; ++ break; ++ default: ++ WARN(1, "filter %d\n", filter); ++ break; ++ } ++ ++ return 0; ++ ++} ++ ++static int pm_query_status_v9(struct packet_manager *pm, uint32_t *buffer, ++ uint64_t fence_address, uint32_t fence_value) ++{ ++ struct pm4_mes_query_status *packet; ++ ++ packet = (struct pm4_mes_query_status *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_mes_query_status)); ++ ++ ++ packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, ++ sizeof(struct pm4_mes_query_status)); ++ ++ packet->bitfields2.context_id = 0; ++ packet->bitfields2.interrupt_sel = ++ interrupt_sel__mes_query_status__completion_status; ++ packet->bitfields2.command = ++ command__mes_query_status__fence_only_after_write_ack; ++ ++ packet->addr_hi = upper_32_bits((uint64_t)fence_address); ++ packet->addr_lo = lower_32_bits((uint64_t)fence_address); ++ packet->data_hi = upper_32_bits((uint64_t)fence_value); ++ packet->data_lo = lower_32_bits((uint64_t)fence_value); ++ ++ return 0; ++} ++ ++ ++static uint32_t pm_release_mem_v9(uint64_t gpu_addr, uint32_t *buffer) ++{ ++ struct pm4_mec_release_mem *packet; ++ ++ WARN_ON(!buffer); ++ ++ packet = (struct pm4_mec_release_mem *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_mec_release_mem)); ++ ++ packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, ++ sizeof(struct pm4_mec_release_mem)); ++ ++ packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; ++ packet->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe; ++ 
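The RELEASE_MEM builder here packs the fence address as a dword-granular low half plus a full high half (the assignments appear just below). A minimal standalone sketch of that round trip — the helper name is illustrative, not from the patch; the shift widths follow pm_release_mem_v9:

#include <stdint.h>

/* address_lo_32b holds bits 31:2 of a 4-byte-aligned GPU address,
 * i.e. (gpu_addr & 0xffffffff) >> 2; address_hi holds bits 63:32.
 * Undoing the shifts recovers the original address. */
static inline uint64_t fence_addr_unpack(uint32_t address_lo_32b,
                                         uint32_t address_hi)
{
        return ((uint64_t)address_hi << 32) |
               ((uint64_t)address_lo_32b << 2);
}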
packet->bitfields2.tcl1_action_ena = 1; ++ packet->bitfields2.tc_action_ena = 1; ++ packet->bitfields2.cache_policy = cache_policy__mec_release_mem__lru; ++ ++ packet->bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low; ++ packet->bitfields3.int_sel = ++ int_sel__mec_release_mem__send_interrupt_after_write_confirm; ++ ++ packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; ++ packet->address_hi = upper_32_bits(gpu_addr); ++ ++ packet->data_lo = 0; ++ ++ return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int); ++} ++ ++static uint32_t pm_get_map_process_packet_size_v9(void) ++{ ++ return sizeof(struct pm4_mes_map_process); ++} ++ ++static uint32_t pm_get_runlist_packet_size_v9(void) ++{ ++ return sizeof(struct pm4_mes_runlist); ++} ++ ++static uint32_t pm_get_map_queues_packet_size_v9(void) ++{ ++ return sizeof(struct pm4_mes_map_queues); ++} ++ ++static uint32_t pm_get_unmap_queues_packet_size_v9(void) ++{ ++ return sizeof(struct pm4_mes_unmap_queues); ++} ++ ++static uint32_t pm_get_query_status_packet_size_v9(void) ++{ ++ return sizeof(struct pm4_mes_query_status); ++} ++ ++static uint32_t pm_get_release_mem_packet_size_v9(void) ++{ ++ return sizeof(struct pm4_mec_release_mem); ++} ++ ++static struct packet_manager_funcs kfd_v9_pm_funcs = { ++ .map_process = pm_map_process_v9, ++ .runlist = pm_runlist_v9, ++ .set_resources = pm_set_resources_vi, ++ .map_queues = pm_map_queues_v9, ++ .unmap_queues = pm_unmap_queues_v9, ++ .query_status = pm_query_status_v9, ++ .release_mem = pm_release_mem_v9, ++ .get_map_process_packet_size = pm_get_map_process_packet_size_v9, ++ .get_runlist_packet_size = pm_get_runlist_packet_size_v9, ++ .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, ++ .get_map_queues_packet_size = pm_get_map_queues_packet_size_v9, ++ .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_v9, ++ .get_query_status_packet_size = pm_get_query_status_packet_size_v9, ++ .get_release_mem_packet_size = pm_get_release_mem_packet_size_v9, ++}; ++ ++void kfd_pm_func_init_v9(struct packet_manager *pm, uint16_t fw_ver) ++{ ++ pm->pmf = &kfd_v9_pm_funcs; ++} ++ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c +index f1d4828..6f12fe0 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c +@@ -22,15 +22,20 @@ + */ + + #include "kfd_kernel_queue.h" ++#include "kfd_device_queue_manager.h" ++#include "kfd_pm4_headers_vi.h" ++#include "kfd_pm4_opcodes.h" + + static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size); + static void uninitialize_vi(struct kernel_queue *kq); ++static void submit_packet_vi(struct kernel_queue *kq); + + void kernel_queue_init_vi(struct kernel_queue_ops *ops) + { + ops->initialize = initialize_vi; + ops->uninitialize = uninitialize_vi; ++ ops->submit_packet = submit_packet_vi; + } + + static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, +@@ -54,3 +59,366 @@ static void uninitialize_vi(struct kernel_queue *kq) + { + kfd_gtt_sa_free(kq->dev, kq->eop_mem); + } ++ ++static void submit_packet_vi(struct kernel_queue *kq) ++{ ++ *kq->wptr_kernel = kq->pending_wptr; ++ write_kernel_doorbell(kq->queue->properties.doorbell_ptr, ++ kq->pending_wptr); ++} ++ ++static int pm_map_process_vi(struct packet_manager *pm, ++ uint32_t *buffer, struct qcm_process_device *qpd) ++{ ++ struct pm4_mes_map_process *packet; 
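Every packet builder in this file, and in the v9 file above, stamps ordinal 1 with pm_build_pm4_header(), defined a little further down. A self-contained sketch of the same type-3 encoding, with bit positions taken from the standard PM4 layout that the PM4_MES_TYPE_3_HEADER union describes (the mask widths here are illustrative):

#include <stdint.h>
#include <stddef.h>

/* PM4 type-3 header: type in bits 31:30, count in bits 29:16,
 * opcode in bits 15:8. count encodes total dwords minus two
 * (the header dword is not counted, and the body length is
 * stored minus one), so a 6-dword packet carries count = 4. */
static inline uint32_t pm4_type3_header(uint32_t opcode, size_t packet_bytes)
{
        uint32_t count = (uint32_t)(packet_bytes / sizeof(uint32_t)) - 2;

        return (3u << 30) | ((count & 0x3fffu) << 16) | ((opcode & 0xffu) << 8);
}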
++ struct queue *cur; ++ uint32_t num_queues; ++ ++ packet = (struct pm4_mes_map_process *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_mes_map_process)); ++ ++ packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, ++ sizeof(struct pm4_mes_map_process)); ++ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; ++ packet->bitfields2.process_quantum = 1; ++ packet->bitfields2.pasid = qpd->pqm->process->pasid; ++ packet->bitfields3.page_table_base = qpd->page_table_base; ++ packet->bitfields10.gds_size = qpd->gds_size; ++ packet->bitfields10.num_gws = qpd->num_gws; ++ packet->bitfields10.num_oac = qpd->num_oac; ++ num_queues = 0; ++ list_for_each_entry(cur, &qpd->queues_list, list) ++ num_queues++; ++ packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : num_queues; ++ ++ packet->sh_mem_config = qpd->sh_mem_config; ++ packet->sh_mem_bases = qpd->sh_mem_bases; ++ packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; ++ packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; ++ ++ packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; ++ ++ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); ++ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); ++ ++ return 0; ++} ++ ++ ++unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size) ++{ ++ union PM4_MES_TYPE_3_HEADER header; ++ ++ header.u32All = 0; ++ header.opcode = opcode; ++ header.count = packet_size/sizeof(uint32_t) - 2; ++ header.type = PM4_TYPE_3; ++ ++ return header.u32All; ++} ++ ++int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer, ++ uint64_t ib, size_t ib_size_in_dwords, bool chain) ++{ ++ struct pm4_mes_runlist *packet; ++ ++ int concurrent_proc_cnt = 0; ++ struct kfd_dev *kfd = pm->dqm->dev; ++ ++ /* Determine the number of processes to map together to HW: ++ * it can not exceed the number of VMIDs available to the ++ * scheduler, and it is determined by the smaller of the number ++ * of processes in the runlist and kfd module parameter ++ * hws_max_conc_proc. ++ * Note: the arbitration between the number of VMIDs and ++ * hws_max_conc_proc has been done in ++ * kgd2kfd_device_init(). ++ */ ++ concurrent_proc_cnt = min(pm->dqm->processes_count, ++ kfd->max_proc_per_quantum); ++ ++ ++ packet = (struct pm4_mes_runlist *)buffer; ++ ++ memset(buffer, 0, sizeof(struct pm4_mes_runlist)); ++ packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, ++ sizeof(struct pm4_mes_runlist)); ++ ++ packet->bitfields4.ib_size = ib_size_in_dwords; ++ packet->bitfields4.chain = chain ? 
1 : 0; ++ packet->bitfields4.offload_polling = 0; ++ packet->bitfields4.valid = 1; ++ packet->bitfields4.process_cnt = concurrent_proc_cnt; ++ packet->ordinal2 = lower_32_bits(ib); ++ packet->bitfields3.ib_base_hi = upper_32_bits(ib); ++ ++ return 0; ++} ++ ++int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, ++ struct queue *q, bool is_static) ++{ ++ struct pm4_mes_map_queues *packet; ++ bool use_static = is_static; ++ ++ packet = (struct pm4_mes_map_queues *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); ++ ++ packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, ++ sizeof(struct pm4_mes_map_queues)); ++ packet->bitfields2.alloc_format = ++ alloc_format__mes_map_queues__one_per_pipe_vi; ++ packet->bitfields2.num_queues = 1; ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; ++ ++ packet->bitfields2.engine_sel = ++ engine_sel__mes_map_queues__compute_vi; ++ packet->bitfields2.queue_type = ++ queue_type__mes_map_queues__normal_compute_vi; ++ ++ switch (q->properties.type) { ++ case KFD_QUEUE_TYPE_COMPUTE: ++ if (use_static) ++ packet->bitfields2.queue_type = ++ queue_type__mes_map_queues__normal_latency_static_queue_vi; ++ break; ++ case KFD_QUEUE_TYPE_DIQ: ++ packet->bitfields2.queue_type = ++ queue_type__mes_map_queues__debug_interface_queue_vi; ++ break; ++ case KFD_QUEUE_TYPE_SDMA: ++ packet->bitfields2.engine_sel = q->properties.sdma_engine_id + ++ engine_sel__mes_map_queues__sdma0_vi; ++ use_static = false; /* no static queues under SDMA */ ++ break; ++ default: ++ WARN(1, "queue type %d\n", q->properties.type); ++ break; ++ } ++ packet->bitfields3.doorbell_offset = ++ q->properties.doorbell_off; ++ ++ packet->mqd_addr_lo = ++ lower_32_bits(q->gart_mqd_addr); ++ ++ packet->mqd_addr_hi = ++ upper_32_bits(q->gart_mqd_addr); ++ ++ packet->wptr_addr_lo = ++ lower_32_bits((uint64_t)q->properties.write_ptr); ++ ++ packet->wptr_addr_hi = ++ upper_32_bits((uint64_t)q->properties.write_ptr); ++ ++ return 0; ++} ++ ++int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, ++ struct scheduling_resources *res) ++{ ++ struct pm4_mes_set_resources *packet; ++ ++ packet = (struct pm4_mes_set_resources *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_mes_set_resources)); ++ ++ packet->header.u32All = pm_build_pm4_header(IT_SET_RESOURCES, ++ sizeof(struct pm4_mes_set_resources)); ++ ++ packet->bitfields2.queue_type = ++ queue_type__mes_set_resources__hsa_interface_queue_hiq; ++ packet->bitfields2.vmid_mask = res->vmid_mask; ++ packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100; ++ packet->bitfields7.oac_mask = res->oac_mask; ++ packet->bitfields8.gds_heap_base = res->gds_heap_base; ++ packet->bitfields8.gds_heap_size = res->gds_heap_size; ++ ++ packet->gws_mask_lo = lower_32_bits(res->gws_mask); ++ packet->gws_mask_hi = upper_32_bits(res->gws_mask); ++ ++ packet->queue_mask_lo = lower_32_bits(res->queue_mask); ++ packet->queue_mask_hi = upper_32_bits(res->queue_mask); ++ ++ return 0; ++} ++ ++int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, ++ enum kfd_queue_type type, ++ enum kfd_unmap_queues_filter filter, ++ uint32_t filter_param, bool reset, ++ unsigned int sdma_engine) ++{ ++ struct pm4_mes_unmap_queues *packet; ++ ++ packet = (struct pm4_mes_unmap_queues *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); ++ ++ packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, ++ sizeof(struct pm4_mes_unmap_queues)); ++ switch (type) { ++ case 
KFD_QUEUE_TYPE_COMPUTE: ++ case KFD_QUEUE_TYPE_DIQ: ++ packet->bitfields2.engine_sel = ++ engine_sel__mes_unmap_queues__compute; ++ break; ++ case KFD_QUEUE_TYPE_SDMA: ++ packet->bitfields2.engine_sel = ++ engine_sel__mes_unmap_queues__sdma0 + sdma_engine; ++ break; ++ default: ++ WARN(1, "queue type %d\n", type); ++ break; ++ } ++ ++ if (reset) ++ packet->bitfields2.action = ++ action__mes_unmap_queues__reset_queues; ++ else ++ packet->bitfields2.action = ++ action__mes_unmap_queues__preempt_queues; ++ ++ switch (filter) { ++ case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_unmap_queues__perform_request_on_specified_queues; ++ packet->bitfields2.num_queues = 1; ++ packet->bitfields3b.doorbell_offset0 = filter_param; ++ break; ++ case KFD_UNMAP_QUEUES_FILTER_BY_PASID: ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; ++ packet->bitfields3a.pasid = filter_param; ++ break; ++ case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_unmap_queues__unmap_all_queues; ++ break; ++ case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: ++ /* in this case, we do not preempt static queues */ ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_unmap_queues__unmap_all_non_static_queues; ++ break; ++ default: ++ WARN(1, "filter %d\n", filter); ++ break; ++ } ++ ++ return 0; ++ ++} ++ ++int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer, ++ uint64_t fence_address, uint32_t fence_value) ++{ ++ struct pm4_mes_query_status *packet; ++ ++ packet = (struct pm4_mes_query_status *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_mes_query_status)); ++ ++ ++ packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, ++ sizeof(struct pm4_mes_query_status)); ++ ++ packet->bitfields2.context_id = 0; ++ packet->bitfields2.interrupt_sel = ++ interrupt_sel__mes_query_status__completion_status; ++ packet->bitfields2.command = ++ command__mes_query_status__fence_only_after_write_ack; ++ ++ packet->addr_hi = upper_32_bits((uint64_t)fence_address); ++ packet->addr_lo = lower_32_bits((uint64_t)fence_address); ++ packet->data_hi = upper_32_bits((uint64_t)fence_value); ++ packet->data_lo = lower_32_bits((uint64_t)fence_value); ++ ++ return 0; ++} ++ ++ ++uint32_t pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer) ++{ ++ struct pm4_mec_release_mem *packet; ++ ++ WARN_ON(!buffer); ++ ++ packet = (struct pm4_mec_release_mem *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_mec_release_mem)); ++ ++ packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, ++ sizeof(struct pm4_mec_release_mem)); ++ ++ packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; ++ packet->bitfields2.event_index = event_index___release_mem__end_of_pipe; ++ packet->bitfields2.tcl1_action_ena = 1; ++ packet->bitfields2.tc_action_ena = 1; ++ packet->bitfields2.cache_policy = cache_policy___release_mem__lru; ++ packet->bitfields2.atc = 0; ++ ++ packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low; ++ packet->bitfields3.int_sel = ++ int_sel___release_mem__send_interrupt_after_write_confirm; ++ ++ packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; ++ packet->address_hi = upper_32_bits(gpu_addr); ++ ++ packet->data_lo = 0; ++ ++ return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int); ++} ++ ++uint32_t pm_get_map_process_packet_size_vi(void) ++{ ++ return sizeof(struct pm4_mes_map_process); ++} ++ ++uint32_t pm_get_runlist_packet_size_vi(void) ++{ ++ return sizeof(struct 
pm4_mes_runlist); ++} ++ ++uint32_t pm_get_set_resources_packet_size_vi(void) ++{ ++ return sizeof(struct pm4_mes_set_resources); ++} ++ ++uint32_t pm_get_map_queues_packet_size_vi(void) ++{ ++ return sizeof(struct pm4_mes_map_queues); ++} ++ ++uint32_t pm_get_unmap_queues_packet_size_vi(void) ++{ ++ return sizeof(struct pm4_mes_unmap_queues); ++} ++ ++uint32_t pm_get_query_status_packet_size_vi(void) ++{ ++ return sizeof(struct pm4_mes_query_status); ++} ++ ++uint32_t pm_get_release_mem_packet_size_vi(void) ++{ ++ return sizeof(struct pm4_mec_release_mem); ++} ++ ++ ++static struct packet_manager_funcs kfd_vi_pm_funcs = { ++ .map_process = pm_map_process_vi, ++ .runlist = pm_runlist_vi, ++ .set_resources = pm_set_resources_vi, ++ .map_queues = pm_map_queues_vi, ++ .unmap_queues = pm_unmap_queues_vi, ++ .query_status = pm_query_status_vi, ++ .release_mem = pm_release_mem_vi, ++ .get_map_process_packet_size = pm_get_map_process_packet_size_vi, ++ .get_runlist_packet_size = pm_get_runlist_packet_size_vi, ++ .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, ++ .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi, ++ .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi, ++ .get_query_status_packet_size = pm_get_query_status_packet_size_vi, ++ .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi, ++}; ++ ++void kfd_pm_func_init_vi(struct packet_manager *pm, uint16_t fw_ver) ++{ ++ pm->pmf = &kfd_vi_pm_funcs; ++} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +index 0980995..046282a 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +@@ -80,6 +80,8 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, + case CHIP_POLARIS10: + case CHIP_POLARIS11: + return mqd_manager_init_vi_tonga(type, dev); ++ case CHIP_VEGA10: ++ return mqd_manager_init_v9(type, dev); + } + + return NULL; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +new file mode 100644 +index 0000000..3caeb6e +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +@@ -0,0 +1,509 @@ ++/* ++ * Copyright 2016 Advanced Micro Devices, Inc. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. 
++ * ++ */ ++ ++#include <linux/printk.h> ++#include <linux/slab.h> ++#include "kfd_priv.h" ++#include "kfd_mqd_manager.h" ++#include "v9_structs.h" ++#include "vega10/GC/gc_9_0_offset.h" ++#include "vega10/GC/gc_9_0_sh_mask.h" ++#include "vega10/SDMA0/sdma0_4_0_sh_mask.h" ++ ++static inline struct v9_mqd *get_mqd(void *mqd) ++{ ++ return (struct v9_mqd *)mqd; ++} ++ ++static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) ++{ ++ return (struct v9_sdma_mqd *)mqd; ++} ++ ++static void update_cu_mask(struct mqd_manager *mm, void *mqd, ++ struct queue_properties *q) ++{ ++ struct v9_mqd *m; ++ struct kfd_cu_info cu_info; ++ uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ ++ uint32_t cu_mask_count = q->cu_mask_count; ++ const uint32_t *cu_mask = q->cu_mask; ++ int se, cu_per_sh, cu_index, i; ++ ++ if (cu_mask_count == 0) ++ return; ++ ++ m = get_mqd(mqd); ++ m->compute_static_thread_mgmt_se0 = 0; ++ m->compute_static_thread_mgmt_se1 = 0; ++ m->compute_static_thread_mgmt_se2 = 0; ++ m->compute_static_thread_mgmt_se3 = 0; ++ ++ mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); ++ ++ /* If # CU mask bits > # CUs, set it to the # of CUs */ ++ if (cu_mask_count > cu_info.cu_active_number) ++ cu_mask_count = cu_info.cu_active_number; ++ ++ cu_index = 0; ++ for (se = 0; se < cu_info.num_shader_engines; se++) { ++ cu_per_sh = 0; ++ ++ /* Get the number of CUs on this Shader Engine */ ++ for (i = 0; i < 4; i++) ++ cu_per_sh += hweight32(cu_info.cu_bitmap[se][i]); ++ ++ se_mask[se] = cu_mask[cu_index / 32] >> (cu_index % 32); ++ if ((cu_per_sh + (cu_index % 32)) > 32) ++ se_mask[se] |= cu_mask[(cu_index / 32) + 1] ++ << (32 - (cu_index % 32)); ++ se_mask[se] &= (1 << cu_per_sh) - 1; ++ cu_index += cu_per_sh; ++ } ++ m->compute_static_thread_mgmt_se0 = se_mask[0]; ++ m->compute_static_thread_mgmt_se1 = se_mask[1]; ++ m->compute_static_thread_mgmt_se2 = se_mask[2]; ++ m->compute_static_thread_mgmt_se3 = se_mask[3]; ++ ++ pr_debug("kfd: update cu mask to %#x %#x %#x %#x\n", ++ m->compute_static_thread_mgmt_se0, ++ m->compute_static_thread_mgmt_se1, ++ m->compute_static_thread_mgmt_se2, ++ m->compute_static_thread_mgmt_se3); ++} ++ ++static int init_mqd(struct mqd_manager *mm, void **mqd, ++ struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, ++ struct queue_properties *q) ++{ ++ int retval; ++ uint64_t addr; ++ struct v9_mqd *m; ++ struct kfd_dev *kfd = mm->dev; ++ ++ /* From V9, for CWSR, the control stack is located on the next page ++ * boundary after the mqd, we will use the gtt allocation function ++ * instead of sub-allocation function. 
++ */ ++ if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) { ++ *mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); ++ retval = kfd->kfd2kgd->init_gtt_mem_allocation(kfd->kgd, ++ ALIGN(q->ctl_stack_size, PAGE_SIZE) + ++ ALIGN(sizeof(struct v9_mqd), PAGE_SIZE), ++ &((*mqd_mem_obj)->gtt_mem), ++ &((*mqd_mem_obj)->gpu_addr), ++ (void *)&((*mqd_mem_obj)->cpu_ptr)); ++ } else ++ retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd), ++ mqd_mem_obj); ++ if (retval != 0) ++ return -ENOMEM; ++ ++ m = (struct v9_mqd *) (*mqd_mem_obj)->cpu_ptr; ++ addr = (*mqd_mem_obj)->gpu_addr; ++ ++ memset(m, 0, sizeof(struct v9_mqd)); ++ ++ m->header = 0xC0310800; ++ m->compute_pipelinestat_enable = 1; ++ m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; ++ m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; ++ m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; ++ m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; ++ ++ m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | ++ 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; ++ ++ m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; ++ ++ m->cp_mqd_base_addr_lo = lower_32_bits(addr); ++ m->cp_mqd_base_addr_hi = upper_32_bits(addr); ++ ++ m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | ++ 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | ++ 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; ++ ++ m->cp_hqd_pipe_priority = 1; ++ m->cp_hqd_queue_priority = 15; ++ ++ if (q->format == KFD_QUEUE_FORMAT_AQL) { ++ m->cp_hqd_aql_control = ++ 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT | ++ 1 << CP_HQD_AQL_CONTROL__CONTROL0_EN__SHIFT; ++ } ++ ++ if (q->tba_addr) { ++ m->compute_pgm_rsrc2 |= ++ (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); ++ } ++ ++ if (mm->dev->cwsr_enabled) { ++ m->cp_hqd_persistent_state |= ++ (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); ++ m->cp_hqd_ctx_save_base_addr_lo = ++ lower_32_bits(q->ctx_save_restore_area_address); ++ m->cp_hqd_ctx_save_base_addr_hi = ++ upper_32_bits(q->ctx_save_restore_area_address); ++ m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; ++ m->cp_hqd_cntl_stack_size = q->ctl_stack_size; ++ m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; ++ m->cp_hqd_wg_state_offset = 0; ++ } ++ ++ *mqd = m; ++ if (gart_addr != NULL) ++ *gart_addr = addr; ++ retval = mm->update_mqd(mm, m, q); ++ ++ return retval; ++} ++ ++static int load_mqd(struct mqd_manager *mm, void *mqd, ++ uint32_t pipe_id, uint32_t queue_id, ++ struct queue_properties *p, struct mm_struct *mms) ++{ ++ /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ ++ uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 
4 : 0); ++ ++ return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, ++ (uint32_t __user *)p->write_ptr, ++ wptr_shift, 0, mms); ++} ++ ++static int update_mqd(struct mqd_manager *mm, void *mqd, ++ struct queue_properties *q) ++{ ++ struct v9_mqd *m; ++ ++ pr_debug("kfd: In func %s\n", __func__); ++ ++ m = get_mqd(mqd); ++ ++ m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; ++ m->cp_hqd_pq_control |= ++ ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; ++ pr_debug("kfd: cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); ++ ++ m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); ++ m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); ++ ++ m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); ++ m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); ++ ++ m->cp_hqd_pq_doorbell_control = ++ 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_EN__SHIFT | ++ q->doorbell_off << ++ CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; ++ pr_debug("kfd: cp_hqd_pq_doorbell_control 0x%x\n", ++ m->cp_hqd_pq_doorbell_control); ++ ++ m->cp_hqd_ib_control = 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT; ++ ++ /* ++ * HW does not clamp this field correctly. Maximum EOP queue size ++ * is constrained by per-SE EOP done signal count, which is 8-bit. ++ * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit ++ * more than (EOP entry count - 1) so a queue size of 0x800 dwords ++ * is safe, giving a maximum field value of 0xA. ++ */ ++ m->cp_hqd_eop_control = min(0xA, ++ ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1); ++ m->cp_hqd_eop_base_addr_lo = ++ lower_32_bits(q->eop_ring_buffer_address >> 8); ++ m->cp_hqd_eop_base_addr_hi = ++ upper_32_bits(q->eop_ring_buffer_address >> 8); ++ ++ m->cp_hqd_iq_timer = 0; ++ ++ m->cp_hqd_vmid = q->vmid; ++ ++ if (q->format == KFD_QUEUE_FORMAT_AQL) { ++ m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | ++ 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT | ++ 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT | ++ 1 << CP_HQD_PQ_CONTROL__WPP_CLAMP_EN__SHIFT; ++ m->cp_hqd_pq_doorbell_control |= ++ 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT; ++ } ++ if (mm->dev->cwsr_enabled) ++ m->cp_hqd_ctx_save_control = 0; ++ ++ update_cu_mask(mm, mqd, q); ++ ++ m->cp_hqd_active = 0; ++ q->is_active = false; ++ if (q->queue_size > 0 && ++ q->queue_address != 0 && ++ q->queue_percent > 0 && ++ !q->is_evicted) { ++ m->cp_hqd_active = 1; ++ q->is_active = true; ++ } ++ ++ return 0; ++} ++ ++ ++static int destroy_mqd(struct mqd_manager *mm, void *mqd, ++ enum kfd_preempt_type type, ++ unsigned int timeout, uint32_t pipe_id, ++ uint32_t queue_id) ++{ ++ return mm->dev->kfd2kgd->hqd_destroy ++ (mm->dev->kgd, type, timeout, ++ pipe_id, queue_id); ++} ++ ++static void uninit_mqd(struct mqd_manager *mm, void *mqd, ++ struct kfd_mem_obj *mqd_mem_obj) ++{ ++ struct kfd_dev *kfd = mm->dev; ++ ++ if (mqd_mem_obj->gtt_mem) { ++ kfd->kfd2kgd->free_gtt_mem(kfd->kgd, mqd_mem_obj->gtt_mem); ++ kfree(mqd_mem_obj); ++ } else { ++ kfd_gtt_sa_free(mm->dev, mqd_mem_obj); ++ } ++} ++ ++static bool is_occupied(struct mqd_manager *mm, void *mqd, ++ uint64_t queue_address, uint32_t pipe_id, ++ uint32_t queue_id) ++{ ++ return mm->dev->kfd2kgd->hqd_is_occupied( ++ mm->dev->kgd, queue_address, ++ pipe_id, queue_id); ++} ++ ++static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, ++ struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, ++ struct queue_properties 
*q) ++{ ++ struct v9_mqd *m; ++ int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); ++ ++ if (retval != 0) ++ return retval; ++ ++ m = get_mqd(*mqd); ++ ++ m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | ++ 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; ++ ++ return retval; ++} ++ ++static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, ++ struct queue_properties *q) ++{ ++ struct v9_mqd *m; ++ int retval = update_mqd(mm, mqd, q); ++ ++ if (retval != 0) ++ return retval; ++ ++ /* TODO: what's the point? update_mqd already does this. */ ++ m = get_mqd(mqd); ++ m->cp_hqd_vmid = q->vmid; ++ return retval; ++} ++ ++static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, ++ struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, ++ struct queue_properties *q) ++{ ++ int retval; ++ struct v9_sdma_mqd *m; ++ ++ ++ retval = kfd_gtt_sa_allocate(mm->dev, ++ sizeof(struct v9_sdma_mqd), ++ mqd_mem_obj); ++ ++ if (retval != 0) ++ return -ENOMEM; ++ ++ m = (struct v9_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; ++ ++ memset(m, 0, sizeof(struct v9_sdma_mqd)); ++ ++ *mqd = m; ++ if (gart_addr != NULL) ++ *gart_addr = (*mqd_mem_obj)->gpu_addr; ++ ++ retval = mm->update_mqd(mm, m, q); ++ ++ return retval; ++} ++ ++static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, ++ struct kfd_mem_obj *mqd_mem_obj) ++{ ++ kfd_gtt_sa_free(mm->dev, mqd_mem_obj); ++} ++ ++static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, ++ uint32_t pipe_id, uint32_t queue_id, ++ struct queue_properties *p, struct mm_struct *mms) ++{ ++ return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, ++ (uint32_t __user *)p->write_ptr, ++ mms); ++} ++ ++#define SDMA_RLC_DUMMY_DEFAULT 0xf ++ ++static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, ++ struct queue_properties *q) ++{ ++ struct v9_sdma_mqd *m; ++ ++ m = get_sdma_mqd(mqd); ++ m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) ++ << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | ++ q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | ++ 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | ++ 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; ++ ++ m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); ++ m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); ++ m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); ++ m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); ++ m->sdmax_rlcx_doorbell = 1 << SDMA0_RLC0_DOORBELL__ENABLE__SHIFT; ++ m->sdmax_rlcx_doorbell_offset = ++ q->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT; ++ ++ m->sdma_engine_id = q->sdma_engine_id; ++ m->sdma_queue_id = q->sdma_queue_id; ++ m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT; ++ ++ q->is_active = false; ++ if (q->queue_size > 0 && ++ q->queue_address != 0 && ++ q->queue_percent > 0 && ++ !q->is_evicted) { ++ m->sdmax_rlcx_rb_cntl |= ++ 1 << SDMA0_RLC0_RB_CNTL__RB_ENABLE__SHIFT; ++ ++ q->is_active = true; ++ } ++ ++ return 0; ++} ++ ++/* ++ * * preempt type here is ignored because there is only one way ++ * * to preempt sdma queue ++ */ ++static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, ++ enum kfd_preempt_type type, ++ unsigned int timeout, uint32_t pipe_id, ++ uint32_t queue_id) ++{ ++ return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); ++} ++ ++static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, ++ uint64_t queue_address, uint32_t pipe_id, ++ uint32_t queue_id) ++{ ++ return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); 
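Both the CP and SDMA update paths above derive their ring-size fields from ffs(), i.e. the log2 of the ring size in dwords. A short sketch of the arithmetic — the function name is illustrative; ffs() behaves as in the kernel, returning the 1-based index of the lowest set bit:

#include <stdint.h>
#include <strings.h>        /* ffs() */

/* For a power-of-two ring of `bytes` bytes, dwords = bytes / 4 and
 * ffs(dwords) - 1 = log2(dwords). update_mqd() subtracts one more
 * because the CP fields hold log2(dwords) - 1: an EOP buffer of
 * 0x800 dwords gives ffs(0x800) - 1 - 1 = 0xA, which is exactly the
 * clamp applied to cp_hqd_eop_control above. */
static inline int ring_order_dwords(uint32_t bytes)
{
        return ffs((int)(bytes / sizeof(uint32_t))) - 1;
}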
++} ++ ++#if defined(CONFIG_DEBUG_FS) ++ ++static int debugfs_show_mqd(struct seq_file *m, void *data) ++{ ++ seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, ++ data, sizeof(struct v9_mqd), false); ++ return 0; ++} ++ ++static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) ++{ ++ seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, ++ data, sizeof(struct v9_sdma_mqd), false); ++ return 0; ++} ++ ++#endif ++ ++struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, ++ struct kfd_dev *dev) ++{ ++ struct mqd_manager *mqd; ++ ++ if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) ++ return NULL; ++ ++ pr_debug("kfd: In func %s\n", __func__); ++ ++ mqd = kzalloc(sizeof(struct mqd_manager), GFP_KERNEL); ++ if (!mqd) ++ return NULL; ++ ++ mqd->dev = dev; ++ ++ switch (type) { ++ case KFD_MQD_TYPE_CP: ++ case KFD_MQD_TYPE_COMPUTE: ++ mqd->init_mqd = init_mqd; ++ mqd->uninit_mqd = uninit_mqd; ++ mqd->load_mqd = load_mqd; ++ mqd->update_mqd = update_mqd; ++ mqd->destroy_mqd = destroy_mqd; ++ mqd->is_occupied = is_occupied; ++#if defined(CONFIG_DEBUG_FS) ++ mqd->debugfs_show_mqd = debugfs_show_mqd; ++#endif ++ break; ++ case KFD_MQD_TYPE_HIQ: ++ mqd->init_mqd = init_mqd_hiq; ++ mqd->uninit_mqd = uninit_mqd; ++ mqd->load_mqd = load_mqd; ++ mqd->update_mqd = update_mqd_hiq; ++ mqd->destroy_mqd = destroy_mqd; ++ mqd->is_occupied = is_occupied; ++#if defined(CONFIG_DEBUG_FS) ++ mqd->debugfs_show_mqd = debugfs_show_mqd; ++#endif ++ break; ++ case KFD_MQD_TYPE_SDMA: ++ mqd->init_mqd = init_mqd_sdma; ++ mqd->uninit_mqd = uninit_mqd_sdma; ++ mqd->load_mqd = load_mqd_sdma; ++ mqd->update_mqd = update_mqd_sdma; ++ mqd->destroy_mqd = destroy_mqd_sdma; ++ mqd->is_occupied = is_occupied_sdma; ++#if defined(CONFIG_DEBUG_FS) ++ mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; ++#endif ++ break; ++ default: ++ kfree(mqd); ++ return NULL; ++ } ++ ++ return mqd; ++} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +index 6361c2e..f7c99ad 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +@@ -26,8 +26,6 @@ + #include "kfd_device_queue_manager.h" + #include "kfd_kernel_queue.h" + #include "kfd_priv.h" +-#include "kfd_pm4_headers.h" +-#include "kfd_pm4_headers_vi.h" + #include "kfd_pm4_opcodes.h" + + static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, +@@ -39,18 +37,6 @@ static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, + *wptr = temp; + } + +-static unsigned int build_pm4_header(unsigned int opcode, size_t packet_size) +-{ +- union PM4_MES_TYPE_3_HEADER header; +- +- header.u32all = 0; +- header.opcode = opcode; +- header.count = packet_size/sizeof(uint32_t) - 2; +- header.type = PM4_TYPE_3; +- +- return header.u32all; +-} +- + static void pm_calc_rlib_size(struct packet_manager *pm, + unsigned int *rlib_size, + bool *over_subscription) +@@ -84,9 +70,7 @@ static void pm_calc_rlib_size(struct packet_manager *pm, + pr_debug("kfd: over subscribed runlist\n"); + } + +- map_queue_size = KFD_IS_VI(pm->dqm->dev->device_info->asic_family) ? 
+- sizeof(struct pm4_mes_map_queues) : +- sizeof(struct pm4_map_queues); ++ map_queue_size = pm->pmf->get_map_queues_packet_size(); + /* calculate run list ib allocation size */ + *rlib_size = process_count * pm->pmf->get_map_process_packet_size() + + queue_count * map_queue_size; +@@ -96,7 +80,7 @@ static void pm_calc_rlib_size(struct packet_manager *pm, + * when over subscription + */ + if (*over_subscription) +- *rlib_size += sizeof(struct pm4_runlist); ++ *rlib_size += pm->pmf->get_runlist_packet_size(); + + pr_debug("kfd: runlist ib size %d\n", *rlib_size); + } +@@ -136,296 +120,6 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, + return retval; + } + +-static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer, +- uint64_t ib, size_t ib_size_in_dwords, bool chain) +-{ +- struct pm4_runlist *packet; +- int concurrent_proc_cnt = 0; +- struct kfd_dev *kfd = pm->dqm->dev; +- +- BUG_ON(!pm || !buffer || !ib); +- +- /* Determine the number of processes to map together to HW: +- * it can not exceed the number of VMIDs available to the +- * scheduler, and it is determined by the smaller of the number +- * of processes in the runlist and kfd module parameter +- * hws_max_conc_proc. +- * Note: the arbitration between the number of VMIDs and +- * hws_max_conc_proc has been done in +- * kgd2kfd_device_init(). +- */ +- concurrent_proc_cnt = min(pm->dqm->processes_count, +- kfd->max_proc_per_quantum); +- +- +- packet = (struct pm4_runlist *)buffer; +- +- memset(buffer, 0, sizeof(struct pm4_runlist)); +- packet->header.u32all = build_pm4_header(IT_RUN_LIST, +- sizeof(struct pm4_runlist)); +- +- packet->bitfields4.ib_size = ib_size_in_dwords; +- packet->bitfields4.chain = chain ? 1 : 0; +- packet->bitfields4.offload_polling = 0; +- packet->bitfields4.valid = 1; +- packet->bitfields4.process_cnt = concurrent_proc_cnt; +- packet->ordinal2 = lower_32_bits(ib); +- packet->bitfields3.ib_base_hi = upper_32_bits(ib); +- +- return 0; +-} +- +-static int pm_create_map_process(struct packet_manager *pm, uint32_t *buffer, +- struct qcm_process_device *qpd) +-{ +- struct pm4_map_process *packet; +- struct queue *cur; +- uint32_t num_queues; +- +- BUG_ON(!pm || !buffer || !qpd); +- +- packet = (struct pm4_map_process *)buffer; +- +- pr_debug("kfd: In func %s\n", __func__); +- +- memset(buffer, 0, sizeof(struct pm4_map_process)); +- +- packet->header.u32all = build_pm4_header(IT_MAP_PROCESS, +- sizeof(struct pm4_map_process)); +- packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; +- packet->bitfields2.process_quantum = 1; +- packet->bitfields2.pasid = qpd->pqm->process->pasid; +- packet->bitfields3.page_table_base = qpd->page_table_base; +- packet->bitfields10.gds_size = qpd->gds_size; +- packet->bitfields10.num_gws = qpd->num_gws; +- packet->bitfields10.num_oac = qpd->num_oac; +- num_queues = 0; +- list_for_each_entry(cur, &qpd->queues_list, list) +- num_queues++; +- packet->bitfields10.num_queues = (qpd->is_debug) ? 
0 : num_queues; +- +- packet->sh_mem_config = qpd->sh_mem_config; +- packet->sh_mem_bases = qpd->sh_mem_bases; +- packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; +- packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; +- +- packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); +- packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); +- +- return 0; +-} +- +-static int pm_create_map_process_scratch_kv(struct packet_manager *pm, +- uint32_t *buffer, struct qcm_process_device *qpd) +-{ +- struct pm4_map_process_scratch_kv *packet; +- struct queue *cur; +- uint32_t num_queues; +- +- BUG_ON(!pm || !buffer || !qpd); +- +- packet = (struct pm4_map_process_scratch_kv *)buffer; +- +- pr_debug("kfd: In func %s\n", __func__); +- +- memset(buffer, 0, sizeof(struct pm4_map_process_scratch_kv)); +- +- packet->header.u32all = build_pm4_header(IT_MAP_PROCESS, +- sizeof(struct pm4_map_process_scratch_kv)); +- packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; +- packet->bitfields2.process_quantum = 1; +- packet->bitfields2.pasid = qpd->pqm->process->pasid; +- packet->bitfields3.page_table_base = qpd->page_table_base; +- packet->bitfields14.gds_size = qpd->gds_size; +- packet->bitfields14.num_gws = qpd->num_gws; +- packet->bitfields14.num_oac = qpd->num_oac; +- num_queues = 0; +- list_for_each_entry(cur, &qpd->queues_list, list) +- num_queues++; +- packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : num_queues; +- +- packet->sh_mem_config = qpd->sh_mem_config; +- packet->sh_mem_bases = qpd->sh_mem_bases; +- packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; +- packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; +- +- packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; +- +- packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); +- packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); +- +- return 0; +-} +- +-static int pm_create_map_process_scratch(struct packet_manager *pm, +- uint32_t *buffer, struct qcm_process_device *qpd) +-{ +- struct pm4_map_process_scratch *packet; +- struct queue *cur; +- uint32_t num_queues; +- +- BUG_ON(!pm || !buffer || !qpd); +- +- packet = (struct pm4_map_process_scratch *)buffer; +- +- pr_debug("kfd: In func %s\n", __func__); +- +- memset(buffer, 0, sizeof(struct pm4_map_process_scratch)); +- +- packet->header.u32all = build_pm4_header(IT_MAP_PROCESS, +- sizeof(struct pm4_map_process_scratch)); +- packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; +- packet->bitfields2.process_quantum = 1; +- packet->bitfields2.pasid = qpd->pqm->process->pasid; +- packet->bitfields3.page_table_base = qpd->page_table_base; +- packet->bitfields10.gds_size = qpd->gds_size; +- packet->bitfields10.num_gws = qpd->num_gws; +- packet->bitfields10.num_oac = qpd->num_oac; +- num_queues = 0; +- list_for_each_entry(cur, &qpd->queues_list, list) +- num_queues++; +- packet->bitfields10.num_queues = (qpd->is_debug) ? 
0 : num_queues; +- +- packet->sh_mem_config = qpd->sh_mem_config; +- packet->sh_mem_bases = qpd->sh_mem_bases; +- packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; +- packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; +- +- packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; +- +- packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); +- packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); +- +- return 0; +-} +- +-static int pm_create_map_queue_vi(struct packet_manager *pm, uint32_t *buffer, +- struct queue *q, bool is_static) +-{ +- struct pm4_mes_map_queues *packet; +- bool use_static = is_static; +- +- BUG_ON(!pm || !buffer || !q); +- +- pr_debug("kfd: In func %s\n", __func__); +- +- packet = (struct pm4_mes_map_queues *)buffer; +- memset(buffer, 0, sizeof(struct pm4_map_queues)); +- +- packet->header.u32all = build_pm4_header(IT_MAP_QUEUES, +- sizeof(struct pm4_map_queues)); +- packet->bitfields2.alloc_format = +- alloc_format__mes_map_queues__one_per_pipe_vi; +- packet->bitfields2.num_queues = 1; +- packet->bitfields2.queue_sel = +- queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; +- +- packet->bitfields2.engine_sel = +- engine_sel__mes_map_queues__compute_vi; +- packet->bitfields2.queue_type = +- queue_type__mes_map_queues__normal_compute_vi; +- +- switch (q->properties.type) { +- case KFD_QUEUE_TYPE_COMPUTE: +- if (use_static) +- packet->bitfields2.queue_type = +- queue_type__mes_map_queues__normal_latency_static_queue_vi; +- break; +- case KFD_QUEUE_TYPE_DIQ: +- packet->bitfields2.queue_type = +- queue_type__mes_map_queues__debug_interface_queue_vi; +- break; +- case KFD_QUEUE_TYPE_SDMA: +- packet->bitfields2.engine_sel = q->properties.sdma_engine_id + +- engine_sel__mes_map_queues__sdma0_vi; +- use_static = false; /* no static queues under SDMA */ +- break; +- default: +- pr_err("kfd: in %s queue type %d\n", __func__, +- q->properties.type); +- BUG(); +- break; +- } +- packet->bitfields3.doorbell_offset = +- q->properties.doorbell_off; +- +- packet->mqd_addr_lo = +- lower_32_bits(q->gart_mqd_addr); +- +- packet->mqd_addr_hi = +- upper_32_bits(q->gart_mqd_addr); +- +- packet->wptr_addr_lo = +- lower_32_bits((uint64_t)q->properties.write_ptr); +- +- packet->wptr_addr_hi = +- upper_32_bits((uint64_t)q->properties.write_ptr); +- +- return 0; +-} +- +-static int pm_create_map_queue(struct packet_manager *pm, uint32_t *buffer, +- struct queue *q, bool is_static) +-{ +- struct pm4_map_queues *packet; +- bool use_static = is_static; +- +- BUG_ON(!pm || !buffer || !q); +- +- pr_debug("kfd: In func %s\n", __func__); +- +- packet = (struct pm4_map_queues *)buffer; +- memset(buffer, 0, sizeof(struct pm4_map_queues)); +- +- packet->header.u32all = build_pm4_header(IT_MAP_QUEUES, +- sizeof(struct pm4_map_queues)); +- packet->bitfields2.alloc_format = +- alloc_format__mes_map_queues__one_per_pipe; +- packet->bitfields2.num_queues = 1; +- packet->bitfields2.queue_sel = +- queue_sel__mes_map_queues__map_to_hws_determined_queue_slots; +- +- packet->bitfields2.vidmem = (q->properties.is_interop) ? 
+- vidmem__mes_map_queues__uses_video_memory : +- vidmem__mes_map_queues__uses_no_video_memory; +- +- switch (q->properties.type) { +- case KFD_QUEUE_TYPE_COMPUTE: +- case KFD_QUEUE_TYPE_DIQ: +- packet->bitfields2.engine_sel = +- engine_sel__mes_map_queues__compute; +- break; +- case KFD_QUEUE_TYPE_SDMA: +- packet->bitfields2.engine_sel = q->properties.sdma_engine_id + +- engine_sel__mes_map_queues__sdma0; +- use_static = false; /* no static queues under SDMA */ +- break; +- default: +- BUG(); +- break; +- } +- +- packet->mes_map_queues_ordinals[0].bitfields3.doorbell_offset = +- q->properties.doorbell_off; +- +- packet->mes_map_queues_ordinals[0].bitfields3.is_static = +- (use_static) ? 1 : 0; +- +- packet->mes_map_queues_ordinals[0].mqd_addr_lo = +- lower_32_bits(q->gart_mqd_addr); +- +- packet->mes_map_queues_ordinals[0].mqd_addr_hi = +- upper_32_bits(q->gart_mqd_addr); +- +- packet->mes_map_queues_ordinals[0].wptr_addr_lo = +- lower_32_bits((uint64_t)q->properties.write_ptr); +- +- packet->mes_map_queues_ordinals[0].wptr_addr_hi = +- upper_32_bits((uint64_t)q->properties.write_ptr); +- +- return 0; +-} +- + static int pm_create_runlist_ib(struct packet_manager *pm, + struct list_head *queues, + uint64_t *rl_gpu_addr, +@@ -481,13 +175,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, + pr_debug("kfd: static_queue, mapping kernel q %d, is debug status %d\n", + kq->queue->queue, qpd->is_debug); + +- if (KFD_IS_VI(pm->dqm->dev->device_info->asic_family)) +- retval = pm_create_map_queue_vi(pm, +- &rl_buffer[rl_wptr], +- kq->queue, +- qpd->is_debug); +- else +- retval = pm_create_map_queue(pm, ++ retval = pm->pmf->map_queues(pm, + &rl_buffer[rl_wptr], + kq->queue, + qpd->is_debug); +@@ -495,7 +183,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, + return retval; + + inc_wptr(&rl_wptr, +- sizeof(struct pm4_map_queues), ++ pm->pmf->get_map_queues_packet_size(), + alloc_size_bytes); + } + +@@ -506,22 +194,15 @@ static int pm_create_runlist_ib(struct packet_manager *pm, + pr_debug("kfd: static_queue, mapping user queue %d, is debug status %d\n", + q->queue, qpd->is_debug); + +- if (KFD_IS_VI(pm->dqm->dev->device_info->asic_family)) +- retval = pm_create_map_queue_vi(pm, +- &rl_buffer[rl_wptr], +- q, +- qpd->is_debug); +- else +- retval = pm_create_map_queue(pm, ++ retval = pm->pmf->map_queues(pm, + &rl_buffer[rl_wptr], + q, + qpd->is_debug); +- + if (retval != 0) + return retval; + + inc_wptr(&rl_wptr, +- sizeof(struct pm4_map_queues), ++ pm->pmf->get_map_queues_packet_size(), + alloc_size_bytes); + } + } +@@ -529,68 +210,17 @@ static int pm_create_runlist_ib(struct packet_manager *pm, + pr_debug("kfd: finished map process and queues to runlist\n"); + + if (is_over_subscription) +- pm_create_runlist(pm, &rl_buffer[rl_wptr], *rl_gpu_addr, ++ pm->pmf->runlist(pm, &rl_buffer[rl_wptr], *rl_gpu_addr, + alloc_size_bytes / sizeof(uint32_t), true); + + for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++) + pr_debug("0x%2X ", rl_buffer[i]); ++ + pr_debug("\n"); + + return 0; + } + +-static int get_map_process_packet_size(void) +-{ +- return sizeof(struct pm4_map_process); +-} +- +-static int get_map_process_packet_size_scratch_kv(void) +-{ +- return sizeof(struct pm4_map_process_scratch_kv); +-} +- +-static int get_map_process_packet_size_scratch(void) +-{ +- return sizeof(struct pm4_map_process_scratch); +-} +- +-/* pm_create_release_mem - Create a RELEASE_MEM packet and return the size +- * of this packet +- * @gpu_addr - GPU address of the packet. 
It's a virtual address. +- * @buffer - buffer to fill up with the packet. It's a CPU kernel pointer +- * Return - length of the packet +- */ +-uint32_t pm_create_release_mem(uint64_t gpu_addr, uint32_t *buffer) +-{ +- struct pm4__release_mem *packet; +- +- WARN_ON(!buffer); +- +- packet = (struct pm4__release_mem *)buffer; +- memset(buffer, 0, sizeof(struct pm4__release_mem)); +- +- packet->header.u32all = build_pm4_header(IT_RELEASE_MEM, +- sizeof(struct pm4__release_mem)); +- +- packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; +- packet->bitfields2.event_index = event_index___release_mem__end_of_pipe; +- packet->bitfields2.tcl1_action_ena = 1; +- packet->bitfields2.tc_action_ena = 1; +- packet->bitfields2.cache_policy = cache_policy___release_mem__lru; +- packet->bitfields2.atc = 0; +- +- packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low; +- packet->bitfields3.int_sel = +- int_sel___release_mem__send_interrupt_after_write_confirm; +- +- packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; +- packet->address_hi = upper_32_bits(gpu_addr); +- +- packet->data_lo = 0; +- +- return sizeof(struct pm4__release_mem) / sizeof(unsigned int); +-} +- + int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, + uint16_t fw_ver) + { +@@ -603,36 +233,23 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, + mutex_destroy(&pm->lock); + return -ENOMEM; + } +- pm->pmf = kzalloc(sizeof(struct packet_manager_firmware), GFP_KERNEL); ++ pm->pmf = kzalloc(sizeof(struct packet_manager_funcs), GFP_KERNEL); + pm->allocated = false; + + switch (pm->dqm->dev->device_info->asic_family) { + case CHIP_KAVERI: + case CHIP_HAWAII: +- if (fw_ver >= KFD_SCRATCH_KV_FW_VER) { +- pm->pmf->map_process = pm_create_map_process_scratch_kv; +- pm->pmf->get_map_process_packet_size = +- get_map_process_packet_size_scratch_kv; +- } else { +- pm->pmf->map_process = pm_create_map_process; +- pm->pmf->get_map_process_packet_size = +- get_map_process_packet_size; +- } ++ kfd_pm_func_init_cik(pm, fw_ver); + break; + case CHIP_CARRIZO: + case CHIP_TONGA: + case CHIP_FIJI: + case CHIP_POLARIS10: + case CHIP_POLARIS11: +- if (fw_ver >= KFD_SCRATCH_CZ_FW_VER) { +- pm->pmf->map_process = pm_create_map_process_scratch; +- pm->pmf->get_map_process_packet_size = +- get_map_process_packet_size_scratch; +- } else { +- pm->pmf->map_process = pm_create_map_process; +- pm->pmf->get_map_process_packet_size = +- get_map_process_packet_size; +- } ++ kfd_pm_func_init_vi(pm, fw_ver); ++ break; ++ case CHIP_VEGA10: ++ kfd_pm_func_init_v9(pm, fw_ver); + break; + + } +@@ -652,39 +269,22 @@ void pm_uninit(struct packet_manager *pm) + int pm_send_set_resources(struct packet_manager *pm, + struct scheduling_resources *res) + { +- struct pm4_set_resources *packet; +- +- BUG_ON(!pm || !res); ++ uint32_t *buffer, size; + + pr_debug("kfd: In func %s\n", __func__); + ++ size = pm->pmf->get_set_resources_packet_size(); + mutex_lock(&pm->lock); + pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, +- sizeof(*packet) / sizeof(uint32_t), +- (unsigned int **)&packet); +- if (packet == NULL) { ++ size / sizeof(uint32_t), ++ (unsigned int **)&buffer); ++ if (buffer == NULL) { + mutex_unlock(&pm->lock); + pr_err("kfd: failed to allocate buffer on kernel queue\n"); + return -ENOMEM; + } + +- memset(packet, 0, sizeof(struct pm4_set_resources)); +- packet->header.u32all = build_pm4_header(IT_SET_RESOURCES, +- sizeof(struct pm4_set_resources)); +- +- packet->bitfields2.queue_type = +- 
queue_type__mes_set_resources__hsa_interface_queue_hiq; +- packet->bitfields2.vmid_mask = res->vmid_mask; +- packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100; +- packet->bitfields7.oac_mask = res->oac_mask; +- packet->bitfields8.gds_heap_base = res->gds_heap_base; +- packet->bitfields8.gds_heap_size = res->gds_heap_size; +- +- packet->gws_mask_lo = lower_32_bits(res->gws_mask); +- packet->gws_mask_hi = upper_32_bits(res->gws_mask); +- +- packet->queue_mask_lo = lower_32_bits(res->queue_mask); +- packet->queue_mask_hi = upper_32_bits(res->queue_mask); ++ pm->pmf->set_resources(pm, buffer, res); + + pm->priv_queue->ops.submit_packet(pm->priv_queue); + +@@ -709,7 +309,8 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) + + pr_debug("kfd: runlist IB address: 0x%llX\n", rl_gpu_ib_addr); + +- packet_size_dwords = sizeof(struct pm4_runlist) / sizeof(uint32_t); ++ packet_size_dwords = pm->pmf->get_runlist_packet_size() / ++ sizeof(uint32_t); + mutex_lock(&pm->lock); + + retval = pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, +@@ -717,8 +318,8 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) + if (retval != 0) + goto fail_acquire_packet_buffer; + +- retval = pm_create_runlist(pm, rl_buffer, rl_gpu_ib_addr, +- rl_ib_size / sizeof(uint32_t), false); ++ retval = pm->pmf->runlist(pm, rl_buffer, rl_gpu_ib_addr, ++ rl_ib_size / sizeof(uint32_t), false); + if (retval != 0) + goto fail_create_runlist; + +@@ -741,41 +342,22 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) + int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, + uint32_t fence_value) + { +- int retval; +- struct pm4_query_status *packet; +- +- BUG_ON(!pm || !fence_address); ++ uint32_t *buffer, size; + ++ size = pm->pmf->get_query_status_packet_size(); + mutex_lock(&pm->lock); +- retval = pm->priv_queue->ops.acquire_packet_buffer( +- pm->priv_queue, +- sizeof(struct pm4_query_status) / sizeof(uint32_t), +- (unsigned int **)&packet); +- if (retval != 0) +- goto fail_acquire_packet_buffer; +- +- packet->header.u32all = build_pm4_header(IT_QUERY_STATUS, +- sizeof(struct pm4_query_status)); +- +- packet->bitfields2.context_id = 0; +- packet->bitfields2.interrupt_sel = +- interrupt_sel__mes_query_status__completion_status; +- packet->bitfields2.command = +- command__mes_query_status__fence_only_after_write_ack; +- +- packet->addr_hi = upper_32_bits((uint64_t)fence_address); +- packet->addr_lo = lower_32_bits((uint64_t)fence_address); +- packet->data_hi = upper_32_bits((uint64_t)fence_value); +- packet->data_lo = lower_32_bits((uint64_t)fence_value); +- ++ pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, ++ size / sizeof(uint32_t), (unsigned int **)&buffer); ++ if (buffer == NULL) { ++ mutex_unlock(&pm->lock); ++ pr_err("kfd: failed to allocate buffer on kernel queue\n"); ++ return -ENOMEM; ++ } ++ pm->pmf->query_status(pm, buffer, fence_address, fence_value); + pm->priv_queue->ops.submit_packet(pm->priv_queue); + mutex_unlock(&pm->lock); + + return 0; +- +-fail_acquire_packet_buffer: +- mutex_unlock(&pm->lock); +- return retval; + } + + int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, +@@ -783,82 +365,23 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, + uint32_t filter_param, bool reset, + unsigned int sdma_engine) + { +- int retval; +- uint32_t *buffer; +- struct pm4_unmap_queues *packet; +- +- BUG_ON(!pm); ++ uint32_t *buffer, size; + ++ 
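The converted pm_send_set_resources(), pm_send_query_status() and pm_send_unmap_queue() all follow one shape now: read the packet size from pm->pmf, reserve that many dwords on the kernel queue, let the per-ASIC callback fill them, then submit. A toy model of that dispatch, with simplified stand-in types for packet_manager_funcs (everything below is illustrative, not the kernel API):

#include <stdint.h>
#include <stddef.h>

/* One function table per ASIC generation, selected once in pm_init();
 * the common send path sizes and builds packets through it and never
 * switches on the chip again. */
struct pmf_model {
        uint32_t (*packet_size)(void);
        int (*build)(uint32_t *buffer);
};

static int send_one(const struct pmf_model *pmf,
                    uint32_t *(*reserve_dwords)(uint32_t n))
{
        uint32_t *buf = reserve_dwords(pmf->packet_size() / sizeof(uint32_t));

        if (!buf)
                return -1;        /* the kernel code returns -ENOMEM */
        return pmf->build(buf);
}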
size = pm->pmf->get_unmap_queues_packet_size(); + mutex_lock(&pm->lock); +- retval = pm->priv_queue->ops.acquire_packet_buffer( +- pm->priv_queue, +- sizeof(struct pm4_unmap_queues) / sizeof(uint32_t), +- &buffer); +- if (retval != 0) +- goto err_acquire_packet_buffer; +- +- packet = (struct pm4_unmap_queues *)buffer; +- memset(buffer, 0, sizeof(struct pm4_unmap_queues)); +- pr_debug("kfd: static_queue: unmapping queues: filter is %d , reset is %d , type is %d\n", +- filter, reset, type); +- packet->header.u32all = build_pm4_header(IT_UNMAP_QUEUES, +- sizeof(struct pm4_unmap_queues)); +- switch (type) { +- case KFD_QUEUE_TYPE_COMPUTE: +- case KFD_QUEUE_TYPE_DIQ: +- packet->bitfields2.engine_sel = +- engine_sel__mes_unmap_queues__compute; +- break; +- case KFD_QUEUE_TYPE_SDMA: +- packet->bitfields2.engine_sel = +- engine_sel__mes_unmap_queues__sdma0 + sdma_engine; +- break; +- default: +- BUG(); +- break; +- } +- +- if (reset) +- packet->bitfields2.action = +- action__mes_unmap_queues__reset_queues; +- else +- packet->bitfields2.action = +- action__mes_unmap_queues__preempt_queues; +- +- switch (filter) { +- case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: +- packet->bitfields2.queue_sel = +- queue_sel__mes_unmap_queues__perform_request_on_specified_queues; +- packet->bitfields2.num_queues = 1; +- packet->bitfields3b.doorbell_offset0 = filter_param; +- break; +- case KFD_UNMAP_QUEUES_FILTER_BY_PASID: +- packet->bitfields2.queue_sel = +- queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; +- packet->bitfields3a.pasid = filter_param; +- break; +- case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: +- packet->bitfields2.queue_sel = +- queue_sel__mes_unmap_queues__perform_request_on_all_active_queues; +- break; +- case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: +- /* in this case, we do not preempt static queues */ +- packet->bitfields2.queue_sel = +- queue_sel__mes_unmap_queues__perform_request_on_dynamic_queues_only; +- break; +- default: +- BUG(); +- break; ++ pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, ++ size / sizeof(uint32_t), (unsigned int **)&buffer); ++ if (buffer == NULL) { ++ mutex_unlock(&pm->lock); ++ pr_err("kfd: failed to allocate buffer on kernel queue\n"); ++ return -ENOMEM; + } +- ++ pm->pmf->unmap_queues(pm, buffer, type, filter, filter_param, reset, ++ sdma_engine); + pm->priv_queue->ops.submit_packet(pm->priv_queue); +- + mutex_unlock(&pm->lock); +- return 0; + +-err_acquire_packet_buffer: +- mutex_unlock(&pm->lock); +- return retval; ++ return 0; + } + + void pm_release_ib(struct packet_manager *pm) +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h +index 058ba1b..05e692b 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h +@@ -21,8 +21,8 @@ + * + */ + +-#ifndef KFD_PM4_HEADERS_H_ +-#define KFD_PM4_HEADERS_H_ ++#ifndef KFD_PM4_HEADERS_CIK_H_ ++#define KFD_PM4_HEADERS_CIK_H_ + + #ifndef PM4_MES_HEADER_DEFINED + #define PM4_MES_HEADER_DEFINED +@@ -41,100 +41,6 @@ union PM4_MES_TYPE_3_HEADER { + }; + #endif /* PM4_MES_HEADER_DEFINED */ + +-/* --------------------MES_SET_RESOURCES-------------------- */ +- +-#ifndef PM4_MES_SET_RESOURCES_DEFINED +-#define PM4_MES_SET_RESOURCES_DEFINED +-enum set_resources_queue_type_enum { +- queue_type__mes_set_resources__kernel_interface_queue_kiq = 0, +- queue_type__mes_set_resources__hsa_interface_queue_hiq = 1, +- queue_type__mes_set_resources__hsa_debug_interface_queue = 4 +-}; +- +-struct pm4_set_resources { +- union { 
+- union PM4_MES_TYPE_3_HEADER header; /* header */ +- uint32_t ordinal1; +- }; +- +- union { +- struct { +- uint32_t vmid_mask:16; +- uint32_t unmap_latency:8; +- uint32_t reserved1:5; +- enum set_resources_queue_type_enum queue_type:3; +- } bitfields2; +- uint32_t ordinal2; +- }; +- +- uint32_t queue_mask_lo; +- uint32_t queue_mask_hi; +- uint32_t gws_mask_lo; +- uint32_t gws_mask_hi; +- +- union { +- struct { +- uint32_t oac_mask:16; +- uint32_t reserved2:16; +- } bitfields7; +- uint32_t ordinal7; +- }; +- +- union { +- struct { +- uint32_t gds_heap_base:6; +- uint32_t reserved3:5; +- uint32_t gds_heap_size:6; +- uint32_t reserved4:15; +- } bitfields8; +- uint32_t ordinal8; +- }; +- +-}; +-#endif +- +-/*--------------------MES_RUN_LIST-------------------- */ +- +-#ifndef PM4_MES_RUN_LIST_DEFINED +-#define PM4_MES_RUN_LIST_DEFINED +- +-struct pm4_runlist { +- union { +- union PM4_MES_TYPE_3_HEADER header; /* header */ +- uint32_t ordinal1; +- }; +- +- union { +- struct { +- uint32_t reserved1:2; +- uint32_t ib_base_lo:30; +- } bitfields2; +- uint32_t ordinal2; +- }; +- +- union { +- struct { +- uint32_t ib_base_hi:16; +- uint32_t reserved2:16; +- } bitfields3; +- uint32_t ordinal3; +- }; +- +- union { +- struct { +- uint32_t ib_size:20; +- uint32_t chain:1; +- uint32_t offload_polling:1; +- uint32_t reserved3:1; +- uint32_t valid:1; +- uint32_t process_cnt:4; +- uint32_t reserved4:4; +- } bitfields4; +- uint32_t ordinal4; +- }; +- +-}; +-#endif + + /*--------------------MES_MAP_PROCESS-------------------- */ + +@@ -187,68 +93,6 @@ struct pm4_map_process { + }; + #endif + +-/*--------------------MES_MAP_PROCESS_SCRATCH-------------------- */ +- +-#ifndef PM4_MES_MAP_PROCESS_SCRATCH_DEFINED +-#define PM4_MES_MAP_PROCESS_SCRATCH_DEFINED +- +-struct pm4_map_process_scratch { +- union { +- union PM4_MES_TYPE_3_HEADER header; /* header */ +- uint32_t ordinal1; +- }; +- +- union { +- struct { +- uint32_t pasid:16; +- uint32_t reserved1:8; +- uint32_t diq_enable:1; +- uint32_t process_quantum:7; +- } bitfields2; +- uint32_t ordinal2; +- }; +- +- union { +- struct { +- uint32_t page_table_base:28; +- uint32_t reserved3:4; +- } bitfields3; +- uint32_t ordinal3; +- }; +- +- uint32_t reserved; +- +- uint32_t sh_mem_bases; +- uint32_t sh_mem_config; +- uint32_t sh_mem_ape1_base; +- uint32_t sh_mem_ape1_limit; +- +- uint32_t sh_hidden_private_base_vmid; +- +- uint32_t reserved2; +- uint32_t reserved3; +- +- uint32_t gds_addr_lo; +- uint32_t gds_addr_hi; +- +- union { +- struct { +- uint32_t num_gws:6; +- uint32_t reserved4:2; +- uint32_t num_oac:4; +- uint32_t reserved5:4; +- uint32_t gds_size:6; +- uint32_t num_queues:10; +- } bitfields10; +- uint32_t ordinal10; +- }; +- +- uint32_t completion_signal_lo; +- uint32_t completion_signal_hi; +- +-}; +-#endif +- + #ifndef PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH + #define PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH + +@@ -304,315 +148,6 @@ uint32_t completion_signal_hi32; + }; + #endif + +-/*--------------------MES_MAP_QUEUES--------------------*/ +- +-#ifndef PM4_MES_MAP_QUEUES_DEFINED +-#define PM4_MES_MAP_QUEUES_DEFINED +-enum map_queues_queue_sel_enum { +- queue_sel__mes_map_queues__map_to_specified_queue_slots = 0, +- queue_sel__mes_map_queues__map_to_hws_determined_queue_slots = 1, +- queue_sel__mes_map_queues__enable_process_queues = 2 +-}; +- +-enum map_queues_vidmem_enum { +- vidmem__mes_map_queues__uses_no_video_memory = 0, +- vidmem__mes_map_queues__uses_video_memory = 1 +-}; +- +-enum map_queues_alloc_format_enum { +- 
alloc_format__mes_map_queues__one_per_pipe = 0, +- alloc_format__mes_map_queues__all_on_one_pipe = 1 +-}; +- +-enum map_queues_engine_sel_enum { +- engine_sel__mes_map_queues__compute = 0, +- engine_sel__mes_map_queues__sdma0 = 2, +- engine_sel__mes_map_queues__sdma1 = 3 +-}; +- +-struct pm4_map_queues { +- union { +- union PM4_MES_TYPE_3_HEADER header; /* header */ +- uint32_t ordinal1; +- }; +- +- union { +- struct { +- uint32_t reserved1:4; +- enum map_queues_queue_sel_enum queue_sel:2; +- uint32_t reserved2:2; +- uint32_t vmid:4; +- uint32_t reserved3:4; +- enum map_queues_vidmem_enum vidmem:2; +- uint32_t reserved4:6; +- enum map_queues_alloc_format_enum alloc_format:2; +- enum map_queues_engine_sel_enum engine_sel:3; +- uint32_t num_queues:3; +- } bitfields2; +- uint32_t ordinal2; +- }; +- +- struct { +- union { +- struct { +- uint32_t is_static:1; +- uint32_t reserved5:1; +- uint32_t doorbell_offset:21; +- uint32_t reserved6:3; +- uint32_t queue:6; +- } bitfields3; +- uint32_t ordinal3; +- }; +- +- uint32_t mqd_addr_lo; +- uint32_t mqd_addr_hi; +- uint32_t wptr_addr_lo; +- uint32_t wptr_addr_hi; +- +- } mes_map_queues_ordinals[1]; /* 1..N of these ordinal groups */ +- +-}; +-#endif +- +-/*--------------------MES_QUERY_STATUS--------------------*/ +- +-#ifndef PM4_MES_QUERY_STATUS_DEFINED +-#define PM4_MES_QUERY_STATUS_DEFINED +-enum query_status_interrupt_sel_enum { +- interrupt_sel__mes_query_status__completion_status = 0, +- interrupt_sel__mes_query_status__process_status = 1, +- interrupt_sel__mes_query_status__queue_status = 2 +-}; +- +-enum query_status_command_enum { +- command__mes_query_status__interrupt_only = 0, +- command__mes_query_status__fence_only_immediate = 1, +- command__mes_query_status__fence_only_after_write_ack = 2, +- command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3 +-}; +- +-enum query_status_engine_sel_enum { +- engine_sel__mes_query_status__compute = 0, +- engine_sel__mes_query_status__sdma0_queue = 2, +- engine_sel__mes_query_status__sdma1_queue = 3 +-}; +- +-struct pm4_query_status { +- union { +- union PM4_MES_TYPE_3_HEADER header; /* header */ +- uint32_t ordinal1; +- }; +- +- union { +- struct { +- uint32_t context_id:28; +- enum query_status_interrupt_sel_enum interrupt_sel:2; +- enum query_status_command_enum command:2; +- } bitfields2; +- uint32_t ordinal2; +- }; +- +- union { +- struct { +- uint32_t pasid:16; +- uint32_t reserved1:16; +- } bitfields3a; +- struct { +- uint32_t reserved2:2; +- uint32_t doorbell_offset:21; +- uint32_t reserved3:3; +- enum query_status_engine_sel_enum engine_sel:3; +- uint32_t reserved4:3; +- } bitfields3b; +- uint32_t ordinal3; +- }; +- +- uint32_t addr_lo; +- uint32_t addr_hi; +- uint32_t data_lo; +- uint32_t data_hi; +-}; +-#endif +- +-/*--------------------MES_UNMAP_QUEUES--------------------*/ +- +-#ifndef PM4_MES_UNMAP_QUEUES_DEFINED +-#define PM4_MES_UNMAP_QUEUES_DEFINED +-enum unmap_queues_action_enum { +- action__mes_unmap_queues__preempt_queues = 0, +- action__mes_unmap_queues__reset_queues = 1, +- action__mes_unmap_queues__disable_process_queues = 2 +-}; +- +-enum unmap_queues_queue_sel_enum { +- queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0, +- queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1, +- queue_sel__mes_unmap_queues__perform_request_on_all_active_queues = 2, +- queue_sel__mes_unmap_queues__perform_request_on_dynamic_queues_only = 3 +-}; +- +-enum unmap_queues_engine_sel_enum { +- engine_sel__mes_unmap_queues__compute = 0, +- 
engine_sel__mes_unmap_queues__sdma0 = 2, +- engine_sel__mes_unmap_queues__sdma1 = 3 +-}; +- +-struct pm4_unmap_queues { +- union { +- union PM4_MES_TYPE_3_HEADER header; /* header */ +- uint32_t ordinal1; +- }; +- +- union { +- struct { +- enum unmap_queues_action_enum action:2; +- uint32_t reserved1:2; +- enum unmap_queues_queue_sel_enum queue_sel:2; +- uint32_t reserved2:20; +- enum unmap_queues_engine_sel_enum engine_sel:3; +- uint32_t num_queues:3; +- } bitfields2; +- uint32_t ordinal2; +- }; +- +- union { +- struct { +- uint32_t pasid:16; +- uint32_t reserved3:16; +- } bitfields3a; +- struct { +- uint32_t reserved4:2; +- uint32_t doorbell_offset0:21; +- uint32_t reserved5:9; +- } bitfields3b; +- uint32_t ordinal3; +- }; +- +- union { +- struct { +- uint32_t reserved6:2; +- uint32_t doorbell_offset1:21; +- uint32_t reserved7:9; +- } bitfields4; +- uint32_t ordinal4; +- }; +- +- union { +- struct { +- uint32_t reserved8:2; +- uint32_t doorbell_offset2:21; +- uint32_t reserved9:9; +- } bitfields5; +- uint32_t ordinal5; +- }; +- +- union { +- struct { +- uint32_t reserved10:2; +- uint32_t doorbell_offset3:21; +- uint32_t reserved11:9; +- } bitfields6; +- uint32_t ordinal6; +- }; +- +-}; +-#endif +- +-/*--------------------_RELEASE_MEM-------------------- */ +- +-#ifndef PM4__RELEASE_MEM_DEFINED +-#define PM4__RELEASE_MEM_DEFINED +-enum RELEASE_MEM_event_index_enum { +- event_index___release_mem__end_of_pipe = 5, +- event_index___release_mem__shader_done = 6 +-}; +- +-enum RELEASE_MEM_cache_policy_enum { +- cache_policy___release_mem__lru = 0, +- cache_policy___release_mem__stream = 1, +- cache_policy___release_mem__bypass = 2 +-}; +- +-enum RELEASE_MEM_dst_sel_enum { +- dst_sel___release_mem__memory_controller = 0, +- dst_sel___release_mem__tc_l2 = 1, +- dst_sel___release_mem__queue_write_pointer_register = 2, +- dst_sel___release_mem__queue_write_pointer_poll_mask_bit = 3 +-}; +- +-enum RELEASE_MEM_int_sel_enum { +- int_sel___release_mem__none = 0, +- int_sel___release_mem__send_interrupt_only = 1, +- int_sel___release_mem__send_interrupt_after_write_confirm = 2, +- int_sel___release_mem__send_data_after_write_confirm = 3 +-}; +- +-enum RELEASE_MEM_data_sel_enum { +- data_sel___release_mem__none = 0, +- data_sel___release_mem__send_32_bit_low = 1, +- data_sel___release_mem__send_64_bit_data = 2, +- data_sel___release_mem__send_gpu_clock_counter = 3, +- data_sel___release_mem__send_cp_perfcounter_hi_lo = 4, +- data_sel___release_mem__store_gds_data_to_memory = 5 +-}; +- +-struct pm4__release_mem { +- union { +- union PM4_MES_TYPE_3_HEADER header; /*header */ +- unsigned int ordinal1; +- }; +- +- union { +- struct { +- unsigned int event_type:6; +- unsigned int reserved1:2; +- enum RELEASE_MEM_event_index_enum event_index:4; +- unsigned int tcl1_vol_action_ena:1; +- unsigned int tc_vol_action_ena:1; +- unsigned int reserved2:1; +- unsigned int tc_wb_action_ena:1; +- unsigned int tcl1_action_ena:1; +- unsigned int tc_action_ena:1; +- unsigned int reserved3:6; +- unsigned int atc:1; +- enum RELEASE_MEM_cache_policy_enum cache_policy:2; +- unsigned int reserved4:5; +- } bitfields2; +- unsigned int ordinal2; +- }; +- +- union { +- struct { +- unsigned int reserved5:16; +- enum RELEASE_MEM_dst_sel_enum dst_sel:2; +- unsigned int reserved6:6; +- enum RELEASE_MEM_int_sel_enum int_sel:3; +- unsigned int reserved7:2; +- enum RELEASE_MEM_data_sel_enum data_sel:3; +- } bitfields3; +- unsigned int ordinal3; +- }; +- +- union { +- struct { +- unsigned int reserved8:2; +- unsigned int address_lo_32b:30; 
+- } bitfields4; +- struct { +- unsigned int reserved9:3; +- unsigned int address_lo_64b:29; +- } bitfields5; +- unsigned int ordinal4; +- }; +- +- unsigned int address_hi; +- +- unsigned int data_lo; +- +- unsigned int data_hi; +-}; +-#endif +- + enum { + CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 + }; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h +new file mode 100644 +index 0000000..ddad9be +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h +@@ -0,0 +1,583 @@ ++/* ++ * Copyright 2016 Advanced Micro Devices, Inc. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. ++ * ++ */ ++ ++#ifndef F32_MES_PM4_PACKETS_H ++#define F32_MES_PM4_PACKETS_H ++ ++#ifndef PM4_MES_HEADER_DEFINED ++#define PM4_MES_HEADER_DEFINED ++union PM4_MES_TYPE_3_HEADER { ++ struct { ++ uint32_t reserved1 : 8; /* < reserved */ ++ uint32_t opcode : 8; /* < IT opcode */ ++ uint32_t count : 14;/* < number of DWORDs - 1 in the ++ * information body. ++ */ ++ uint32_t type : 2; /* < packet identifier. 
++ * It should be 3 for type 3 packets ++ */ ++ }; ++ uint32_t u32All; ++}; ++#endif /* PM4_MES_HEADER_DEFINED */ ++ ++/*--------------------MES_SET_RESOURCES--------------------*/ ++ ++#ifndef PM4_MES_SET_RESOURCES_DEFINED ++#define PM4_MES_SET_RESOURCES_DEFINED ++enum mes_set_resources_queue_type_enum { ++ queue_type__mes_set_resources__kernel_interface_queue_kiq = 0, ++ queue_type__mes_set_resources__hsa_interface_queue_hiq = 1, ++ queue_type__mes_set_resources__hsa_debug_interface_queue = 4 ++}; ++ ++ ++struct pm4_mes_set_resources { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; ++ }; ++ ++ union { ++ struct { ++ uint32_t vmid_mask:16; ++ uint32_t unmap_latency:8; ++ uint32_t reserved1:5; ++ enum mes_set_resources_queue_type_enum queue_type:3; ++ } bitfields2; ++ uint32_t ordinal2; ++ }; ++ ++ uint32_t queue_mask_lo; ++ uint32_t queue_mask_hi; ++ uint32_t gws_mask_lo; ++ uint32_t gws_mask_hi; ++ ++ union { ++ struct { ++ uint32_t oac_mask:16; ++ uint32_t reserved2:16; ++ } bitfields7; ++ uint32_t ordinal7; ++ }; ++ ++ union { ++ struct { ++ uint32_t gds_heap_base:6; ++ uint32_t reserved3:5; ++ uint32_t gds_heap_size:6; ++ uint32_t reserved4:15; ++ } bitfields8; ++ uint32_t ordinal8; ++ }; ++ ++}; ++#endif ++ ++/*--------------------MES_RUN_LIST--------------------*/ ++ ++#ifndef PM4_MES_RUN_LIST_DEFINED ++#define PM4_MES_RUN_LIST_DEFINED ++ ++struct pm4_mes_runlist { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; ++ }; ++ ++ union { ++ struct { ++ uint32_t reserved1:2; ++ uint32_t ib_base_lo:30; ++ } bitfields2; ++ uint32_t ordinal2; ++ }; ++ ++ uint32_t ib_base_hi; ++ ++ union { ++ struct { ++ uint32_t ib_size:20; ++ uint32_t chain:1; ++ uint32_t offload_polling:1; ++ uint32_t reserved2:1; ++ uint32_t valid:1; ++ uint32_t process_cnt:4; ++ uint32_t reserved3:4; ++ } bitfields4; ++ uint32_t ordinal4; ++ }; ++ ++}; ++#endif ++ ++/*--------------------MES_MAP_PROCESS--------------------*/ ++ ++#ifndef PM4_MES_MAP_PROCESS_DEFINED ++#define PM4_MES_MAP_PROCESS_DEFINED ++ ++struct pm4_mes_map_process { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; ++ }; ++ ++ union { ++ struct { ++ uint32_t pasid:16; ++ uint32_t reserved1:8; ++ uint32_t diq_enable:1; ++ uint32_t process_quantum:7; ++ } bitfields2; ++ uint32_t ordinal2; ++ }; ++ ++ uint32_t vm_context_page_table_base_addr_lo32; ++ ++ uint32_t vm_context_page_table_base_addr_hi32; ++ ++ uint32_t sh_mem_bases; ++ ++ uint32_t sh_mem_config; ++ ++ uint32_t sq_shader_tba_lo; ++ ++ uint32_t sq_shader_tba_hi; ++ ++ uint32_t sq_shader_tma_lo; ++ ++ uint32_t sq_shader_tma_hi; ++ ++ uint32_t reserved6; ++ ++ uint32_t gds_addr_lo; ++ ++ uint32_t gds_addr_hi; ++ ++ union { ++ struct { ++ uint32_t num_gws:6; ++ uint32_t reserved7:1; ++ uint32_t sdma_enable:1; ++ uint32_t num_oac:4; ++ uint32_t reserved8:4; ++ uint32_t gds_size:6; ++ uint32_t num_queues:10; ++ } bitfields14; ++ uint32_t ordinal14; ++ }; ++ ++ uint32_t completion_signal_lo; ++ ++ uint32_t completion_signal_hi; ++ ++}; ++ ++#endif ++ ++/*--------------------MES_MAP_PROCESS_VM--------------------*/ ++ ++#ifndef PM4_MES_MAP_PROCESS_VM_DEFINED ++#define PM4_MES_MAP_PROCESS_VM_DEFINED ++ ++struct PM4_MES_MAP_PROCESS_VM { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; ++ }; ++ ++ uint32_t reserved1; ++ ++ uint32_t vm_context_cntl; ++ ++ uint32_t reserved2; ++ ++ uint32_t vm_context_page_table_end_addr_lo32; ++ ++ uint32_t 
vm_context_page_table_end_addr_hi32; ++ ++ uint32_t vm_context_page_table_start_addr_lo32; ++ ++ uint32_t vm_context_page_table_start_addr_hi32; ++ ++ uint32_t reserved3; ++ ++ uint32_t reserved4; ++ ++ uint32_t reserved5; ++ ++ uint32_t reserved6; ++ ++ uint32_t reserved7; ++ ++ uint32_t reserved8; ++ ++ uint32_t completion_signal_lo32; ++ ++ uint32_t completion_signal_hi32; ++ ++}; ++#endif ++ ++/*--------------------MES_MAP_QUEUES--------------------*/ ++ ++#ifndef PM4_MES_MAP_QUEUES_VI_DEFINED ++#define PM4_MES_MAP_QUEUES_VI_DEFINED ++enum mes_map_queues_queue_sel_enum { ++ queue_sel__mes_map_queues__map_to_specified_queue_slots_vi = 0, ++queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi = 1 ++}; ++ ++enum mes_map_queues_queue_type_enum { ++ queue_type__mes_map_queues__normal_compute_vi = 0, ++ queue_type__mes_map_queues__debug_interface_queue_vi = 1, ++ queue_type__mes_map_queues__normal_latency_static_queue_vi = 2, ++queue_type__mes_map_queues__low_latency_static_queue_vi = 3 ++}; ++ ++enum mes_map_queues_alloc_format_enum { ++ alloc_format__mes_map_queues__one_per_pipe_vi = 0, ++alloc_format__mes_map_queues__all_on_one_pipe_vi = 1 ++}; ++ ++enum mes_map_queues_engine_sel_enum { ++ engine_sel__mes_map_queues__compute_vi = 0, ++ engine_sel__mes_map_queues__sdma0_vi = 2, ++ engine_sel__mes_map_queues__sdma1_vi = 3 ++}; ++ ++ ++struct pm4_mes_map_queues { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; ++ }; ++ ++ union { ++ struct { ++ uint32_t reserved1:4; ++ enum mes_map_queues_queue_sel_enum queue_sel:2; ++ uint32_t reserved2:15; ++ enum mes_map_queues_queue_type_enum queue_type:3; ++ enum mes_map_queues_alloc_format_enum alloc_format:2; ++ enum mes_map_queues_engine_sel_enum engine_sel:3; ++ uint32_t num_queues:3; ++ } bitfields2; ++ uint32_t ordinal2; ++ }; ++ ++ union { ++ struct { ++ uint32_t reserved3:1; ++ uint32_t check_disable:1; ++ uint32_t doorbell_offset:26; ++ uint32_t reserved4:4; ++ } bitfields3; ++ uint32_t ordinal3; ++ }; ++ ++ uint32_t mqd_addr_lo; ++ uint32_t mqd_addr_hi; ++ uint32_t wptr_addr_lo; ++ uint32_t wptr_addr_hi; ++}; ++#endif ++ ++/*--------------------MES_QUERY_STATUS--------------------*/ ++ ++#ifndef PM4_MES_QUERY_STATUS_DEFINED ++#define PM4_MES_QUERY_STATUS_DEFINED ++enum mes_query_status_interrupt_sel_enum { ++ interrupt_sel__mes_query_status__completion_status = 0, ++ interrupt_sel__mes_query_status__process_status = 1, ++ interrupt_sel__mes_query_status__queue_status = 2 ++}; ++ ++enum mes_query_status_command_enum { ++ command__mes_query_status__interrupt_only = 0, ++ command__mes_query_status__fence_only_immediate = 1, ++ command__mes_query_status__fence_only_after_write_ack = 2, ++ command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3 ++}; ++ ++enum mes_query_status_engine_sel_enum { ++ engine_sel__mes_query_status__compute = 0, ++ engine_sel__mes_query_status__sdma0_queue = 2, ++ engine_sel__mes_query_status__sdma1_queue = 3 ++}; ++ ++struct pm4_mes_query_status { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; ++ }; ++ ++ union { ++ struct { ++ uint32_t context_id:28; ++ enum mes_query_status_interrupt_sel_enum interrupt_sel:2; ++ enum mes_query_status_command_enum command:2; ++ } bitfields2; ++ uint32_t ordinal2; ++ }; ++ ++ union { ++ struct { ++ uint32_t pasid:16; ++ uint32_t reserved1:16; ++ } bitfields3a; ++ struct { ++ uint32_t reserved2:2; ++ uint32_t doorbell_offset:26; ++ enum mes_query_status_engine_sel_enum engine_sel:3; ++ 
uint32_t reserved3:1; ++ } bitfields3b; ++ uint32_t ordinal3; ++ }; ++ ++ uint32_t addr_lo; ++ uint32_t addr_hi; ++ uint32_t data_lo; ++ uint32_t data_hi; ++}; ++#endif ++ ++/*--------------------MES_UNMAP_QUEUES--------------------*/ ++ ++#ifndef PM4_MES_UNMAP_QUEUES_DEFINED ++#define PM4_MES_UNMAP_QUEUES_DEFINED ++enum mes_unmap_queues_action_enum { ++ action__mes_unmap_queues__preempt_queues = 0, ++ action__mes_unmap_queues__reset_queues = 1, ++ action__mes_unmap_queues__disable_process_queues = 2, ++ action__mes_unmap_queues__reserved = 3 ++}; ++ ++enum mes_unmap_queues_queue_sel_enum { ++ queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0, ++ queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1, ++ queue_sel__mes_unmap_queues__unmap_all_queues = 2, ++ queue_sel__mes_unmap_queues__unmap_all_non_static_queues = 3 ++}; ++ ++enum mes_unmap_queues_engine_sel_enum { ++ engine_sel__mes_unmap_queues__compute = 0, ++ engine_sel__mes_unmap_queues__sdma0 = 2, ++ engine_sel__mes_unmap_queues__sdmal = 3 ++}; ++ ++struct pm4_mes_unmap_queues { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; ++ }; ++ ++ union { ++ struct { ++ enum mes_unmap_queues_action_enum action:2; ++ uint32_t reserved1:2; ++ enum mes_unmap_queues_queue_sel_enum queue_sel:2; ++ uint32_t reserved2:20; ++ enum mes_unmap_queues_engine_sel_enum engine_sel:3; ++ uint32_t num_queues:3; ++ } bitfields2; ++ uint32_t ordinal2; ++ }; ++ ++ union { ++ struct { ++ uint32_t pasid:16; ++ uint32_t reserved3:16; ++ } bitfields3a; ++ struct { ++ uint32_t reserved4:2; ++ uint32_t doorbell_offset0:26; ++ int32_t reserved5:4; ++ } bitfields3b; ++ uint32_t ordinal3; ++ }; ++ ++ union { ++ struct { ++ uint32_t reserved6:2; ++ uint32_t doorbell_offset1:26; ++ uint32_t reserved7:4; ++ } bitfields4; ++ uint32_t ordinal4; ++ }; ++ ++ union { ++ struct { ++ uint32_t reserved8:2; ++ uint32_t doorbell_offset2:26; ++ uint32_t reserved9:4; ++ } bitfields5; ++ uint32_t ordinal5; ++ }; ++ ++ union { ++ struct { ++ uint32_t reserved10:2; ++ uint32_t doorbell_offset3:26; ++ uint32_t reserved11:4; ++ } bitfields6; ++ uint32_t ordinal6; ++ }; ++}; ++#endif ++ ++#ifndef PM4_MEC_RELEASE_MEM_DEFINED ++#define PM4_MEC_RELEASE_MEM_DEFINED ++ ++enum mec_release_mem_event_index_enum { ++ event_index__mec_release_mem__end_of_pipe = 5, ++ event_index__mec_release_mem__shader_done = 6 ++}; ++ ++enum mec_release_mem_cache_policy_enum { ++ cache_policy__mec_release_mem__lru = 0, ++ cache_policy__mec_release_mem__stream = 1 ++}; ++ ++enum mec_release_mem_pq_exe_status_enum { ++ pq_exe_status__mec_release_mem__default = 0, ++ pq_exe_status__mec_release_mem__phase_update = 1 ++}; ++ ++enum mec_release_mem_dst_sel_enum { ++ dst_sel__mec_release_mem__memory_controller = 0, ++ dst_sel__mec_release_mem__tc_l2 = 1, ++ dst_sel__mec_release_mem__queue_write_pointer_register = 2, ++ dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit = 3 ++}; ++ ++enum mec_release_mem_int_sel_enum { ++ int_sel__mec_release_mem__none = 0, ++ int_sel__mec_release_mem__send_interrupt_only = 1, ++ int_sel__mec_release_mem__send_interrupt_after_write_confirm = 2, ++ int_sel__mec_release_mem__send_data_after_write_confirm = 3, ++ int_sel__mec_release_mem__unconditionally_send_int_ctxid = 4, ++ int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare = 5, ++ int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare = 6 ++}; ++ ++enum mec_release_mem_data_sel_enum { ++ 
data_sel__mec_release_mem__none = 0, ++ data_sel__mec_release_mem__send_32_bit_low = 1, ++ data_sel__mec_release_mem__send_64_bit_data = 2, ++ data_sel__mec_release_mem__send_gpu_clock_counter = 3, ++ data_sel__mec_release_mem__send_cp_perfcounter_hi_lo = 4, ++ data_sel__mec_release_mem__store_gds_data_to_memory = 5 ++}; ++ ++struct pm4_mec_release_mem { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /*header */ ++ unsigned int ordinal1; ++ }; ++ ++ union { ++ struct { ++ unsigned int event_type:6; ++ unsigned int reserved1:2; ++ enum mec_release_mem_event_index_enum event_index:4; ++ unsigned int tcl1_vol_action_ena:1; ++ unsigned int tc_vol_action_ena:1; ++ unsigned int reserved2:1; ++ unsigned int tc_wb_action_ena:1; ++ unsigned int tcl1_action_ena:1; ++ unsigned int tc_action_ena:1; ++ uint32_t reserved3:1; ++ uint32_t tc_nc_action_ena:1; ++ uint32_t tc_wc_action_ena:1; ++ uint32_t tc_md_action_ena:1; ++ uint32_t reserved4:3; ++ enum mec_release_mem_cache_policy_enum cache_policy:2; ++ uint32_t reserved5:2; ++ enum mec_release_mem_pq_exe_status_enum pq_exe_status:1; ++ uint32_t reserved6:2; ++ } bitfields2; ++ unsigned int ordinal2; ++ }; ++ ++ union { ++ struct { ++ uint32_t reserved7:16; ++ enum mec_release_mem_dst_sel_enum dst_sel:2; ++ uint32_t reserved8:6; ++ enum mec_release_mem_int_sel_enum int_sel:3; ++ uint32_t reserved9:2; ++ enum mec_release_mem_data_sel_enum data_sel:3; ++ } bitfields3; ++ unsigned int ordinal3; ++ }; ++ ++ union { ++ struct { ++ uint32_t reserved10:2; ++ unsigned int address_lo_32b:30; ++ } bitfields4; ++ struct { ++ uint32_t reserved11:3; ++ uint32_t address_lo_64b:29; ++ } bitfields4b; ++ uint32_t reserved12; ++ unsigned int ordinal4; ++ }; ++ ++ union { ++ uint32_t address_hi; ++ uint32_t reserved13; ++ uint32_t ordinal5; ++ }; ++ ++ union { ++ uint32_t data_lo; ++ uint32_t cmp_data_lo; ++ struct { ++ uint32_t dw_offset:16; ++ uint32_t num_dwords:16; ++ } bitfields6c; ++ uint32_t reserved14; ++ uint32_t ordinal6; ++ }; ++ ++ union { ++ uint32_t data_hi; ++ uint32_t cmp_data_hi; ++ uint32_t reserved15; ++ uint32_t reserved16; ++ uint32_t ordinal7; ++ }; ++ ++ uint32_t int_ctxid; ++ ++}; ++ ++#endif ++ ++enum { ++ CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 ++}; ++#endif ++ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h +index 08c7219..8cb3094 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h +@@ -124,9 +124,10 @@ struct pm4_mes_runlist { + uint32_t ib_size:20; + uint32_t chain:1; + uint32_t offload_polling:1; +- uint32_t reserved3:1; ++ uint32_t reserved2:1; + uint32_t valid:1; +- uint32_t reserved4:8; ++ uint32_t process_cnt:4; ++ uint32_t reserved3:4; + } bitfields4; + uint32_t ordinal4; + }; +@@ -141,8 +142,8 @@ struct pm4_mes_runlist { + + struct pm4_mes_map_process { + union { +- union PM4_MES_TYPE_3_HEADER header; /* header */ +- uint32_t ordinal1; ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; + }; + + union { +@@ -153,36 +154,48 @@ struct pm4_mes_map_process { + uint32_t process_quantum:7; + } bitfields2; + uint32_t ordinal2; +-}; ++ }; + + union { + struct { + uint32_t page_table_base:28; +- uint32_t reserved2:4; ++ uint32_t reserved3:4; + } bitfields3; + uint32_t ordinal3; + }; + ++ uint32_t reserved; ++ + uint32_t sh_mem_bases; ++ uint32_t sh_mem_config; + uint32_t sh_mem_ape1_base; + uint32_t sh_mem_ape1_limit; +- uint32_t sh_mem_config; ++ ++ uint32_t sh_hidden_private_base_vmid; ++ 
++ uint32_t reserved2; ++ uint32_t reserved3; ++ + uint32_t gds_addr_lo; + uint32_t gds_addr_hi; + + union { + struct { + uint32_t num_gws:6; +- uint32_t reserved3:2; ++ uint32_t reserved4:2; + uint32_t num_oac:4; +- uint32_t reserved4:4; ++ uint32_t reserved5:4; + uint32_t gds_size:6; + uint32_t num_queues:10; + } bitfields10; + uint32_t ordinal10; + }; + ++ uint32_t completion_signal_lo; ++ uint32_t completion_signal_hi; ++ + }; ++ + #endif + + /*--------------------MES_MAP_QUEUES--------------------*/ +@@ -335,7 +348,7 @@ enum mes_unmap_queues_engine_sel_enum { + engine_sel__mes_unmap_queues__sdmal = 3 + }; + +-struct PM4_MES_UNMAP_QUEUES { ++struct pm4_mes_unmap_queues { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; +@@ -395,4 +408,101 @@ struct PM4_MES_UNMAP_QUEUES { + }; + #endif + ++#ifndef PM4_MEC_RELEASE_MEM_DEFINED ++#define PM4_MEC_RELEASE_MEM_DEFINED ++enum RELEASE_MEM_event_index_enum { ++ event_index___release_mem__end_of_pipe = 5, ++ event_index___release_mem__shader_done = 6 ++}; ++ ++enum RELEASE_MEM_cache_policy_enum { ++ cache_policy___release_mem__lru = 0, ++ cache_policy___release_mem__stream = 1, ++ cache_policy___release_mem__bypass = 2 ++}; ++ ++enum RELEASE_MEM_dst_sel_enum { ++ dst_sel___release_mem__memory_controller = 0, ++ dst_sel___release_mem__tc_l2 = 1, ++ dst_sel___release_mem__queue_write_pointer_register = 2, ++ dst_sel___release_mem__queue_write_pointer_poll_mask_bit = 3 ++}; ++ ++enum RELEASE_MEM_int_sel_enum { ++ int_sel___release_mem__none = 0, ++ int_sel___release_mem__send_interrupt_only = 1, ++ int_sel___release_mem__send_interrupt_after_write_confirm = 2, ++ int_sel___release_mem__send_data_after_write_confirm = 3 ++}; ++ ++enum RELEASE_MEM_data_sel_enum { ++ data_sel___release_mem__none = 0, ++ data_sel___release_mem__send_32_bit_low = 1, ++ data_sel___release_mem__send_64_bit_data = 2, ++ data_sel___release_mem__send_gpu_clock_counter = 3, ++ data_sel___release_mem__send_cp_perfcounter_hi_lo = 4, ++ data_sel___release_mem__store_gds_data_to_memory = 5 ++}; ++ ++struct pm4_mec_release_mem { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /*header */ ++ unsigned int ordinal1; ++ }; ++ ++ union { ++ struct { ++ unsigned int event_type:6; ++ unsigned int reserved1:2; ++ enum RELEASE_MEM_event_index_enum event_index:4; ++ unsigned int tcl1_vol_action_ena:1; ++ unsigned int tc_vol_action_ena:1; ++ unsigned int reserved2:1; ++ unsigned int tc_wb_action_ena:1; ++ unsigned int tcl1_action_ena:1; ++ unsigned int tc_action_ena:1; ++ unsigned int reserved3:6; ++ unsigned int atc:1; ++ enum RELEASE_MEM_cache_policy_enum cache_policy:2; ++ unsigned int reserved4:5; ++ } bitfields2; ++ unsigned int ordinal2; ++ }; ++ ++ union { ++ struct { ++ unsigned int reserved5:16; ++ enum RELEASE_MEM_dst_sel_enum dst_sel:2; ++ unsigned int reserved6:6; ++ enum RELEASE_MEM_int_sel_enum int_sel:3; ++ unsigned int reserved7:2; ++ enum RELEASE_MEM_data_sel_enum data_sel:3; ++ } bitfields3; ++ unsigned int ordinal3; ++ }; ++ ++ union { ++ struct { ++ unsigned int reserved8:2; ++ unsigned int address_lo_32b:30; ++ } bitfields4; ++ struct { ++ unsigned int reserved9:3; ++ unsigned int address_lo_64b:29; ++ } bitfields5; ++ unsigned int ordinal4; ++ }; ++ ++ unsigned int address_hi; ++ ++ unsigned int data_lo; ++ ++ unsigned int data_hi; ++}; ++#endif ++ ++enum { ++ CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 ++}; ++ + #endif +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +index aa2133e..4962d7b 
100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +@@ -172,13 +172,15 @@ enum asic_family_type { + CHIP_TONGA, + CHIP_FIJI, + CHIP_POLARIS10, +- CHIP_POLARIS11 ++ CHIP_POLARIS11, ++ CHIP_VEGA10 + }; + + #define KFD_IS_VI(chip) ((chip) >= CHIP_CARRIZO && (chip) <= CHIP_POLARIS11) + #define KFD_IS_DGPU(chip) (((chip) >= CHIP_TONGA && \ +- (chip) <= CHIP_POLARIS11) || \ ++ (chip) <= CHIP_VEGA10) || \ + (chip) == CHIP_HAWAII) ++#define KFD_IS_SOC15(chip) ((chip) >= CHIP_VEGA10) + + struct kfd_event_interrupt_class { + bool (*interrupt_isr)(struct kfd_dev *dev, const uint32_t *ih_ring_entry, +@@ -191,6 +193,7 @@ struct kfd_device_info { + const struct kfd_event_interrupt_class *event_interrupt_class; + unsigned int max_pasid_bits; + unsigned int max_no_of_hqd; ++ unsigned int doorbell_size; + size_t ih_ring_entry_size; + uint8_t num_of_watch_points; + uint16_t mqd_size_aligned; +@@ -204,6 +207,7 @@ struct kfd_mem_obj { + uint32_t range_end; + uint64_t gpu_addr; + uint32_t *cpu_ptr; ++ void *gtt_mem; + }; + + struct kfd_vmid_info { +@@ -417,7 +421,7 @@ struct queue_properties { + uint32_t queue_percent; + uint32_t *read_ptr; + uint32_t *write_ptr; +- uint32_t __iomem *doorbell_ptr; ++ void __iomem *doorbell_ptr; + uint32_t doorbell_off; + bool is_interop; + bool is_evicted; /* true -> queue is evicted */ +@@ -482,6 +486,7 @@ struct queue { + uint32_t queue; + + unsigned int sdma_id; ++ unsigned int doorbell_id; + + struct kfd_process *process; + struct kfd_dev *device; +@@ -794,17 +799,19 @@ unsigned int kfd_pasid_alloc(void); + void kfd_pasid_free(unsigned int pasid); + + /* Doorbells */ +-size_t kfd_doorbell_process_slice(void); ++size_t kfd_doorbell_process_slice(struct kfd_dev *kfd); + void kfd_doorbell_init(struct kfd_dev *kfd); +-int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma); +-u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, ++int kfd_doorbell_mmap(struct kfd_dev *kfd, struct kfd_process *process, ++ struct vm_area_struct *vma); ++void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, + unsigned int *doorbell_off); + void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr); + u32 read_kernel_doorbell(u32 __iomem *db); +-void write_kernel_doorbell(u32 __iomem *db, u32 value); +-unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, ++void write_kernel_doorbell(void __iomem *db, u32 value); ++void write_kernel_doorbell64(void __iomem *db, u64 value); ++unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd, + struct kfd_process *process, +- unsigned int queue_id); ++ unsigned int doorbell_id); + + /* GTT Sub-Allocator */ + +@@ -865,6 +872,8 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); + struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); ++struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, ++ struct kfd_dev *dev); + struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); + void device_queue_manager_uninit(struct device_queue_manager *dqm); + struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, +@@ -904,7 +913,7 @@ int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm); + #define KFD_FENCE_COMPLETED (100) + #define KFD_FENCE_INIT (10) + +-struct packet_manager_firmware; ++struct packet_manager_func; + + struct packet_manager { + struct device_queue_manager *dqm; +@@ -914,17 +923,38 @@ struct packet_manager { + struct kfd_mem_obj 
*ib_buffer_obj; + unsigned ib_size_bytes; + +- struct packet_manager_firmware *pmf; ++ struct packet_manager_funcs *pmf; + }; + +-struct packet_manager_firmware { +- /* Support different firmware versions for map process packet */ ++struct packet_manager_funcs { ++ /* Support different firmware versions for PM4 packets */ + int (*map_process)(struct packet_manager *pm, uint32_t *buffer, +- struct qcm_process_device *qpd); +- int (*get_map_process_packet_size)(void); ++ struct qcm_process_device *qpd); ++ int (*runlist)(struct packet_manager *pm, uint32_t *buffer, ++ uint64_t ib, size_t ib_size_in_dwords, bool chain); ++ int (*set_resources)(struct packet_manager *pm, uint32_t *buffer, ++ struct scheduling_resources *res); ++ int (*map_queues)(struct packet_manager *pm, uint32_t *buffer, ++ struct queue *q, bool is_static); ++ int (*unmap_queues)(struct packet_manager *pm, uint32_t *buffer, ++ enum kfd_queue_type type, ++ enum kfd_unmap_queues_filter mode, ++ uint32_t filter_param, bool reset, ++ unsigned int sdma_engine); ++ int (*query_status)(struct packet_manager *pm, uint32_t *buffer, ++ uint64_t fence_address, uint32_t fence_value); ++ uint32_t (*release_mem)(uint64_t gpu_addr, uint32_t *buffer); ++ ++ uint32_t (*get_map_process_packet_size)(void); ++ uint32_t (*get_runlist_packet_size)(void); ++ uint32_t (*get_set_resources_packet_size)(void); ++ uint32_t (*get_map_queues_packet_size)(void); ++ uint32_t (*get_unmap_queues_packet_size)(void); ++ uint32_t (*get_query_status_packet_size)(void); ++ uint32_t (*get_release_mem_packet_size)(void); ++ + }; + +-uint32_t pm_create_release_mem(uint64_t gpu_addr, uint32_t *buffer); + int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, + uint16_t fw_ver); + void pm_uninit(struct packet_manager *pm); +@@ -941,6 +971,38 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, + + void pm_release_ib(struct packet_manager *pm); + ++/* Following PM funcs can be shared among KV and VI */ ++unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size); ++int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer, ++ uint64_t ib, size_t ib_size_in_dwords, bool chain); ++int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, ++ struct queue *q, bool is_static); ++int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, ++ struct scheduling_resources *res); ++int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, ++ enum kfd_queue_type type, ++ enum kfd_unmap_queues_filter filter, ++ uint32_t filter_param, bool reset, ++ unsigned int sdma_engine); ++int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer, ++ uint64_t fence_address, uint32_t fence_value); ++uint32_t pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer); ++ ++uint32_t pm_get_map_process_packet_size_vi(void); ++uint32_t pm_get_runlist_packet_size_vi(void); ++uint32_t pm_get_set_resources_packet_size_vi(void); ++uint32_t pm_get_map_queues_packet_size_vi(void); ++uint32_t pm_get_unmap_queues_packet_size_vi(void); ++uint32_t pm_get_query_status_packet_size_vi(void); ++uint32_t pm_get_release_mem_packet_size_vi(void); ++ ++ ++void kfd_pm_func_init_vi(struct packet_manager *pm, uint16_t fw_ver); ++void kfd_pm_func_init_cik(struct packet_manager *pm, uint16_t fw_ver); ++ ++void kfd_pm_func_init_v9(struct packet_manager *pm, uint16_t fw_ver); ++ ++ + uint64_t kfd_get_number_elems(struct kfd_dev *kfd); + phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, + struct kfd_process *process); 
+@@ -950,6 +1012,8 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr, + + /* Events */ + extern const struct kfd_event_interrupt_class event_interrupt_class_cik; ++extern const struct kfd_event_interrupt_class event_interrupt_class_v9; ++ + extern const struct kfd_device_global_init_class device_global_init_class_cik; + + enum kfd_event_wait_result { +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +index af90b0a..94e07ee 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +@@ -146,9 +146,6 @@ static int create_cp_queue(struct process_queue_manager *pqm, + /* Doorbell initialized in user space*/ + q_properties->doorbell_ptr = NULL; + +- q_properties->doorbell_off = +- kfd_queue_id_to_doorbell(dev, pqm->process, qid); +- + /* let DQM handle it*/ + q_properties->vmid = 0; + q_properties->queue_id = qid; +@@ -283,6 +280,15 @@ int pqm_create_queue(struct process_queue_manager *pqm, + goto err_create_queue; + } + ++ if (q) ++ /* Return the doorbell offset within the doorbell page ++ * to the caller so it can be passed up to user mode ++ * (in bytes). ++ */ ++ properties->doorbell_off = ++ (q->properties.doorbell_off * sizeof(uint32_t)) & ++ (kfd_doorbell_process_slice(dev) - 1); ++ + pr_debug("kfd: PQM After DQM create queue\n"); + + list_add(&pqn->process_queue_list, &pqm->queues); +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +index 36e2cba..7603967 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +@@ -1233,6 +1233,11 @@ int kfd_topology_add_device(struct kfd_dev *gpu) + HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & + HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); + break; ++ case CHIP_VEGA10: ++ dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 << ++ HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & ++ HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); ++ break; + } + + /* Fix errors in CZ CRAT. +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +index aa8056b..a968e58 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +@@ -45,6 +45,7 @@ + + #define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0 + #define HSA_CAP_DOORBELL_TYPE_1_0 0x1 ++#define HSA_CAP_DOORBELL_TYPE_2_0 0x2 + #define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00 + #define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8 + #define HSA_CAP_DOORBELL_PACKET_TYPE 0x00001000 +diff --git a/drivers/gpu/drm/amd/amdkfd/soc15_int.h b/drivers/gpu/drm/amd/amdkfd/soc15_int.h +new file mode 100644 +index 0000000..e00d03d +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/soc15_int.h +@@ -0,0 +1,84 @@ ++/* ++ * Copyright 2016 Advanced Micro Devices, Inc. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. 
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
++ * OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#ifndef HSA_SOC15_INT_H_INCLUDED
++#define HSA_SOC15_INT_H_INCLUDED
++/*
++ * vega10+ IH clients
++ */
++enum soc15_ih_client_id {
++ SOC15_IH_CLIENTID_IH = 0x00,
++ SOC15_IH_CLIENTID_ACP = 0x01,
++ SOC15_IH_CLIENTID_ATHUB = 0x02,
++ SOC15_IH_CLIENTID_BIF = 0x03,
++ SOC15_IH_CLIENTID_DCE = 0x04,
++ SOC15_IH_CLIENTID_ISP = 0x05,
++ SOC15_IH_CLIENTID_PCIE0 = 0x06,
++ SOC15_IH_CLIENTID_RLC = 0x07,
++ SOC15_IH_CLIENTID_SDMA0 = 0x08,
++ SOC15_IH_CLIENTID_SDMA1 = 0x09,
++ SOC15_IH_CLIENTID_SE0SH = 0x0a,
++ SOC15_IH_CLIENTID_SE1SH = 0x0b,
++ SOC15_IH_CLIENTID_SE2SH = 0x0c,
++ SOC15_IH_CLIENTID_SE3SH = 0x0d,
++ SOC15_IH_CLIENTID_SYSHUB = 0x0e,
++ SOC15_IH_CLIENTID_THM = 0x0f,
++ SOC15_IH_CLIENTID_UVD = 0x10,
++ SOC15_IH_CLIENTID_VCE0 = 0x11,
++ SOC15_IH_CLIENTID_VMC = 0x12,
++ SOC15_IH_CLIENTID_XDMA = 0x13,
++ SOC15_IH_CLIENTID_GRBM_CP = 0x14,
++ SOC15_IH_CLIENTID_ATS = 0x15,
++ SOC15_IH_CLIENTID_ROM_SMUIO = 0x16,
++ SOC15_IH_CLIENTID_DF = 0x17,
++ SOC15_IH_CLIENTID_VCE1 = 0x18,
++ SOC15_IH_CLIENTID_PWR = 0x19,
++ SOC15_IH_CLIENTID_UTCL2 = 0x1b,
++ SOC15_IH_CLIENTID_EA = 0x1c,
++ SOC15_IH_CLIENTID_UTCL2LOG = 0x1d,
++ SOC15_IH_CLIENTID_MP0 = 0x1e,
++ SOC15_IH_CLIENTID_MP1 = 0x1f,
++
++ SOC15_IH_CLIENTID_MAX
++};
++
++
++#define SOC15_INTSRC_CP_END_OF_PIPE 181
++#define SOC15_INTSRC_CP_BAD_OPCODE 183
++#define SOC15_INTSRC_SQ_INTERRUPT_MSG 239
++#define SOC15_INTSRC_VMC_FAULT 0
++#define SOC15_INTSRC_SDMA_TRAP 224
++
++
++#define SOC15_CLIENT_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) & 0xff)
++#define SOC15_SOURCE_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 8 & 0xff)
++#define SOC15_RING_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 16 & 0xff)
++#define SOC15_VMID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 24 & 0xf)
++#define SOC15_VMID_TYPE_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 31 & 0x1)
++#define SOC15_PASID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[3]) & 0xffff)
++#define SOC15_CONTEXT_ID0_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[4]))
++#define SOC15_CONTEXT_ID1_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[5]))
++#define SOC15_CONTEXT_ID2_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[6]))
++#define SOC15_CONTEXT_ID3_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[7]))
++
++#endif
++
+--
+2.7.4
+
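The SOC15_*_FROM_IH_ENTRY macros added in soc15_int.h above slice fixed bit-fields out of dwords 0 and 3-7 of an 8-dword SOC15 interrupt-ring entry. Below is a minimal user-space sketch of that decoding; it is not part of the patch, le32_to_cpu() is stubbed as an identity (valid on a little-endian host only), and the sample entry values are hypothetical.

/* Sketch only -- not part of the patch. le32_to_cpu() is a kernel helper
 * and is stubbed as an identity here (assumption: little-endian host).
 */
#include <stdint.h>
#include <stdio.h>

#define le32_to_cpu(x) (x)	/* assumption: little-endian host */

/* Same definitions as the new soc15_int.h */
#define SOC15_CLIENT_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) & 0xff)
#define SOC15_SOURCE_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 8 & 0xff)
#define SOC15_VMID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 24 & 0xf)
#define SOC15_PASID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[3]) & 0xffff)

int main(void)
{
	/* Hypothetical entry: client 0x14 (GRBM_CP), source 181
	 * (CP_END_OF_PIPE) and VMID 8 packed into dword 0; PASID 0x1234
	 * in dword 3. */
	uint32_t entry[8] = { 0x14u | (181u << 8) | (8u << 24), 0, 0, 0x1234,
			      0, 0, 0, 0 };

	printf("client=0x%x source=%u vmid=%u pasid=0x%x\n",
	       (unsigned)SOC15_CLIENT_ID_FROM_IH_ENTRY(entry),
	       (unsigned)SOC15_SOURCE_ID_FROM_IH_ENTRY(entry),
	       (unsigned)SOC15_VMID_FROM_IH_ENTRY(entry),
	       (unsigned)SOC15_PASID_FROM_IH_ENTRY(entry));
	return 0;
}

Because client id, source id, ring id, VMID and the VMID-type flag all live in dword 0, an interrupt handler can classify an entry with a couple of shifts and masks per field.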