Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1624-drm-amdkfd-Add-Vega10-support-for-KFD.patch')
-rw-r--r--  meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1624-drm-amdkfd-Add-Vega10-support-for-KFD.patch  6577
1 file changed, 0 insertions, 6577 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1624-drm-amdkfd-Add-Vega10-support-for-KFD.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1624-drm-amdkfd-Add-Vega10-support-for-KFD.patch deleted file mode 100644 index 1b2fa380..00000000 --- a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1624-drm-amdkfd-Add-Vega10-support-for-KFD.patch +++ /dev/null @@ -1,6577 +0,0 @@ -From 6cd5da3b1655f692cc68c402546fba401095b059 Mon Sep 17 00:00:00 2001 -From: Felix Kuehling <Felix.Kuehling@amd.com> -Date: Tue, 14 Mar 2017 23:38:24 -0400 -Subject: [PATCH 1624/4131] drm/amdkfd: Add Vega10 support for KFD - -Change-Id: Id024a9fed3bf233142a9e747e4c77659cf7ab7c1 -Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com> - - Conflicts: - drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c ---- - drivers/gpu/drm/amd/amdkfd/Makefile | 10 +- - .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 1392 ++++++++++++++++++++ - drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 19 +- - drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 6 + - drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c | 10 +- - drivers/gpu/drm/amd/amdkfd/kfd_device.c | 52 +- - .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 196 ++- - .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 4 + - .../drm/amd/amdkfd/kfd_device_queue_manager_v9.c | 87 ++ - drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 68 +- - drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 81 +- - drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 130 ++ - drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 14 +- - drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h | 7 +- - drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c | 125 ++ - drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c | 385 ++++++ - drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c | 368 ++++++ - drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 2 + - drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 509 +++++++ - drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 567 +------- - drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h | 469 +------ - drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h | 583 ++++++++ - drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h | 130 +- - drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 96 +- - .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 12 +- - drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 5 + - drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 1 + - drivers/gpu/drm/amd/amdkfd/soc15_int.h | 84 ++ - 28 files changed, 4285 insertions(+), 1127 deletions(-) - create mode 100644 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm - create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c - create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c - create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c - create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c - create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h - create mode 100644 drivers/gpu/drm/amd/amdkfd/soc15_int.h - -diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile -index 84646ed..fde693c 100644 ---- a/drivers/gpu/drm/amd/amdkfd/Makefile -+++ b/drivers/gpu/drm/amd/amdkfd/Makefile -@@ -11,11 +11,13 @@ amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \ - kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \ - kfd_process.o kfd_queue.o kfd_mqd_manager.o \ - kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \ -+ kfd_mqd_manager_v9.o \ - kfd_kernel_queue.o kfd_kernel_queue_cik.o \ -- kfd_kernel_queue_vi.o kfd_packet_manager.o \ -- 
kfd_process_queue_manager.o kfd_device_queue_manager.o \
-- kfd_device_queue_manager_cik.o kfd_device_queue_manager_vi.o \
-- kfd_interrupt.o kfd_events.o cik_event_interrupt.o \
-+ kfd_kernel_queue_vi.o kfd_kernel_queue_v9.o \
-+ kfd_packet_manager.o kfd_process_queue_manager.o \
-+ kfd_device_queue_manager.o kfd_device_queue_manager_cik.o \
-+ kfd_device_queue_manager_vi.o kfd_device_queue_manager_v9.o \
-+ kfd_interrupt.o kfd_events.o cik_event_interrupt.o kfd_int_process_v9.o \
- kfd_dbgdev.o kfd_dbgmgr.o kfd_flat_memory.o kfd_crat.o kfd_rdma.o \
- kfd_peerdirect.o kfd_ipc.o
-
-diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
-new file mode 100644
-index 0000000..0106e77
---- /dev/null
-+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
-@@ -0,0 +1,1392 @@
-+/*
-+ * Copyright 2016 Advanced Micro Devices, Inc.
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a
-+ * copy of this software and associated documentation files (the "Software"),
-+ * to deal in the Software without restriction, including without limitation
-+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
-+ * and/or sell copies of the Software, and to permit persons to whom the
-+ * Software is furnished to do so, subject to the following conditions:
-+ *
-+ * The above copyright notice and this permission notice shall be included in
-+ * all copies or substantial portions of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
-+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-+ * OTHER DEALINGS IN THE SOFTWARE.
-+ */
-+
-+#if 0
-+HW (GFX9) source code for CWSR trap handler
-+#Version 18 + multiple trap handler
-+
-+// this performance-optimal version was originally from Seven Xu at SRDC
-+
-+// Revision #18 --...
-+/* Rev History
-+** #1. Branch from gc dv. //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(merged, skipped - already fixed by PV)
-+** #4. SR Memory Layout:
-+** 1. VGPR-SGPR-HWREG-{LDS}
-+** 2. tba_hi.bits.26 - reconfigured as the first-wave-in-tg bit, to defer saving LDS for a threadgroup.. performance concern..
-+** #5. Update: 1. Accurate g8sr_ts_save_d timestamp
-+** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation)
-+** #7. Update: 1. don't barrier if noLDS
-+** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version
-+** 2. Fix SQ issue by s_sleep 2
-+** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last
-+** 2. optimize s_buffer save by burst 16sgprs...
-+** #10. Update 1. Optimize restore sgpr by burst 16 sgprs.
-+** #11. Update 1. Add 2 more timestamps for debug version
-+** #12. Update 1. Add VGPR SR using DWx4, some cases improve and some cases drop performance
-+** #13. Integ 1. Always use MUBUF for PV trap shader...
-+** #14. Update 1. s_buffer_store soft clause...
-+** #15. Update 1. PERF - scalar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot.
-+** #16. Update 1. PERF - UNROLL LDS_DMA got 2500cycle save in IP tree
-+** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part]
-+** 2. PERF - Save LDS before save VGPR to cover LDS save long latency...
-+** #18. Update 1. FUNC - Implicitly restore STATUS.VCCZ, which is not writable by s_setreg_b32
-+** 2. FUNC - Handle non-CWSR traps
-+*/
-+
-+var G8SR_WDMEM_HWREG_OFFSET = 0
-+var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes
-+
-+// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should be before any Save and after restore.
-+
-+var G8SR_DEBUG_TIMESTAMP = 0
-+var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset
-+var s_g8sr_ts_save_s = s[34:35] // save start
-+var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader sends SAVEWAVE msg to spi
-+var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI writes the sr address to SQ
-+var s_g8sr_ts_save_d = s[40:41] // save end
-+var s_g8sr_ts_restore_s = s[42:43] // restore start
-+var s_g8sr_ts_restore_d = s[44:45] // restore end
-+
-+var G8SR_VGPR_SR_IN_DWX4 = 0
-+var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes
-+var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4
-+
-+
-+/*************************************************************************/
-+/* control on how to run the shader */
-+/*************************************************************************/
-+//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run)
-+var EMU_RUN_HACK = 0
-+var EMU_RUN_HACK_RESTORE_NORMAL = 0
-+var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0
-+var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0
-+var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
-+var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
-+var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
-+var SAVE_LDS = 1
-+var WG_BASE_ADDR_LO = 0x9000a000
-+var WG_BASE_ADDR_HI = 0x0
-+var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem
-+var CTX_SAVE_CONTROL = 0x0
-+var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL
-+var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run)
-+var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write
-+var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes
-+var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing
-+
-+/**************************************************************************/
-+/* variables */
-+/**************************************************************************/
-+var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
-+var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
-+var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
-+
-+var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
-+var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
-+var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
-+var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
-+var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
-+var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
-+
-+var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
-+var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask
-+var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
-+var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
-+var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
-+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
-+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
-+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
-+var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
-+var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
-+var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
-+
-+var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME
-+var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME
-+var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME
-+
-+var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
-+var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27
-+
-+
-+/* Save */
-+var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes
-+var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
-+
-+var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
-+var S_SAVE_SPI_INIT_ATC_SHIFT = 27
-+var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
-+var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
-+var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
-+var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26
-+
-+var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
-+var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME
-+var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME
-+var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME
-+
-+var s_save_spi_init_lo = exec_lo
-+var s_save_spi_init_hi = exec_hi
-+
-+ //tba_lo and tba_hi need to be saved/restored
-+var tba_lo = ttmp12
-+var tba_hi = ttmp13
-+var tma_lo = ttmp14
-+var tma_hi = ttmp15
-+
-+var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0, pc_rewind[3:0], HT[0], trapID[7:0], PC[47:0]}
-+var s_save_pc_hi = ttmp1
-+var s_save_exec_lo = ttmp2
-+var s_save_exec_hi = ttmp3
-+var s_save_status = ttmp4
-+var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine
-+var s_save_xnack_mask_lo = ttmp6
-+var s_save_xnack_mask_hi = ttmp7
-+var s_save_buf_rsrc0 = ttmp8
-+var s_save_buf_rsrc1 = ttmp9
-+var s_save_buf_rsrc2 = ttmp10
-+var s_save_buf_rsrc3 = ttmp11
-+
-+var s_save_mem_offset = tma_lo
-+var s_save_alloc_size = s_save_trapsts //conflict
-+var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time)
-+var s_save_m0 = tma_hi
-+
-+/* Restore */
-+var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
-+var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC
-+
-+var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
-+var S_RESTORE_SPI_INIT_ATC_SHIFT = 27
-+var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
-+var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28
-+var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
-+var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
-+
-+var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT
-+var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK
-+var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
-+var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK
-+
-+var s_restore_spi_init_lo = exec_lo
-+var s_restore_spi_init_hi = exec_hi
-+
-+var s_restore_mem_offset = ttmp2
-+var s_restore_alloc_size = ttmp3
-+var s_restore_tmp = ttmp6 //tba_lo/hi need to be restored
-+var
s_restore_mem_offset_save = s_restore_tmp //no conflict -+ -+var s_restore_m0 = s_restore_alloc_size //no conflict -+ -+var s_restore_mode = ttmp7 -+ -+var s_restore_pc_lo = ttmp0 -+var s_restore_pc_hi = ttmp1 -+var s_restore_exec_lo = tma_lo //no conflict -+var s_restore_exec_hi = tma_hi //no conflict -+var s_restore_status = ttmp4 -+var s_restore_trapsts = ttmp5 -+var s_restore_xnack_mask_lo = xnack_mask_lo -+var s_restore_xnack_mask_hi = xnack_mask_hi -+var s_restore_buf_rsrc0 = ttmp8 -+var s_restore_buf_rsrc1 = ttmp9 -+var s_restore_buf_rsrc2 = ttmp10 -+var s_restore_buf_rsrc3 = ttmp11 -+ -+/**************************************************************************/ -+/* trap handler entry points */ -+/**************************************************************************/ -+/* Shader Main*/ -+ -+shader main -+ asic(GFX9) -+ type(CS) -+ -+ -+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore -+ //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC -+ s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC -+ s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. -+ s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE -+ //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE -+ s_branch L_SKIP_RESTORE //NOT restore, SAVE actually -+ else -+ s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save -+ end -+ -+L_JUMP_TO_RESTORE: -+ s_branch L_RESTORE //restore -+ -+L_SKIP_RESTORE: -+ -+ s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC -+ s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save -+ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) -+ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save -+ s_cbranch_scc1 L_SAVE //this is the operation for save -+ -+ // ********* Handle non-CWSR traps ******************* -+if (!EMU_RUN_HACK) -+ /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */ -+ s_getreg_b32 tma_lo,hwreg(HW_REG_SQ_SHADER_TMA_LO) -+ s_getreg_b32 tma_hi,hwreg(HW_REG_SQ_SHADER_TMA_HI) -+ s_load_dwordx4 [tba_lo,tba_hi,tma_lo, tma_hi], [tma_lo,tma_hi], 0 -+ s_waitcnt lgkmcnt(0) -+ s_or_b32 ttmp11, tba_lo, tba_hi -+ s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set -+ s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) -+ s_setpc_b64 [tba_lo,tba_hi] //jump to next level trap handler -+ -+L_NO_NEXT_TRAP: -+ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) -+ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception -+ s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. 
-+ s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 -+ s_addc_u32 ttmp1, ttmp1, 0 -+L_EXCP_CASE: -+ s_and_b32 ttmp1, ttmp1, 0xFFFF -+ s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) -+ s_rfe_b64 [ttmp0, ttmp1] -+end -+ // ********* End handling of non-CWSR traps ******************* -+ -+/**************************************************************************/ -+/* save routine */ -+/**************************************************************************/ -+ -+L_SAVE: -+ -+if G8SR_DEBUG_TIMESTAMP -+ s_memrealtime s_g8sr_ts_save_s -+ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? -+end -+ -+ //check whether there is mem_viol -+ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) -+ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK -+ s_cbranch_scc0 L_NO_PC_REWIND -+ -+ //if so, need rewind PC assuming GDS operation gets NACKed -+ s_mov_b32 s_save_tmp, 0 //clear mem_viol bit -+ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit -+ s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] -+ s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8 -+ s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc -+ -+L_NO_PC_REWIND: -+ s_mov_b32 s_save_tmp, 0 //clear saveCtx bit -+ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit -+ -+ s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK -+ s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //save XNACK must before any memory operation -+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT -+ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT -+ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp -+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY -+ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT -+ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp -+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS -+ s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG -+ -+ s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp -+ -+ /* inform SPI the readiness and wait for SPI's go signal */ -+ s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI -+ s_mov_b32 s_save_exec_hi, exec_hi -+ s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive -+ -+if G8SR_DEBUG_TIMESTAMP -+ s_memrealtime s_g8sr_ts_sq_save_msg -+ s_waitcnt lgkmcnt(0) -+end -+ -+ if (EMU_RUN_HACK) -+ -+ else -+ s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC -+ end -+ -+ L_SLEEP: -+ s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 -+ -+ if (EMU_RUN_HACK) -+ -+ else -+ s_cbranch_execz L_SLEEP -+ end -+ -+if G8SR_DEBUG_TIMESTAMP -+ s_memrealtime s_g8sr_ts_spi_wrexec -+ s_waitcnt lgkmcnt(0) -+end -+ -+ /* setup Resource Contants */ -+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) -+ //calculate wd_addr using absolute thread id -+ v_readlane_b32 s_save_tmp, v9, 0 -+ s_lshr_b32 s_save_tmp, s_save_tmp, 6 -+ s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE -+ s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO -+ s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI -+ s_and_b32 
s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL -+ else -+ end -+ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) -+ s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO -+ s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI -+ s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL -+ else -+ end -+ -+ -+ s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo -+ s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi -+ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE -+ s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited -+ s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC -+ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK -+ s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position -+ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC -+ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK -+ s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position -+ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE -+ -+ //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?) -+ s_mov_b32 s_save_m0, m0 //save M0 -+ -+ /* global mem offset */ -+ s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 -+ -+ -+ -+ -+ /* save HW registers */ -+ ////////////////////////////// -+ -+ L_SAVE_HWREG: -+ // HWREG SR memory offset : size(VGPR)+size(SGPR) -+ get_vgpr_size_bytes(s_save_mem_offset) -+ get_sgpr_size_bytes(s_save_tmp) -+ s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp -+ -+ -+ s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes -+ if (SWIZZLE_EN) -+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
-+ else -+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -+ end -+ -+ -+ write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0 -+ -+ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) -+ s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 -+ s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over -+ s_mov_b32 tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO -+ s_mov_b32 tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI -+ end -+ -+ write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC -+ write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) -+ write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC -+ write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) -+ write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS -+ -+ //s_save_trapsts conflicts with s_save_alloc_size -+ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) -+ write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS -+ -+ write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO -+ write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI -+ -+ //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 -+ s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE -+ write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) -+ write_hwreg_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset) //TBA_LO -+ write_hwreg_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset) //TBA_HI -+ -+ -+ -+ /* the first wave in the threadgroup */ -+ // save fist_wave bits in tba_hi unused bit.26 -+ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract fisrt wave bit -+ //s_or_b32 tba_hi, s_save_tmp, tba_hi // save first wave bit to tba_hi.bits[26] -+ s_mov_b32 s_save_exec_hi, 0x0 -+ s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26] -+ -+ -+ /* save SGPRs */ -+ // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... -+ ////////////////////////////// -+ -+ // SGPR SR memory offset : size(VGPR) -+ get_vgpr_size_bytes(s_save_mem_offset) -+ // TODO, change RSRC word to rearrange memory layout for SGPRS -+ -+ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size -+ s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 -+ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) -+ -+ if (SGPR_SAVE_USE_SQC) -+ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes -+ else -+ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) -+ end -+ -+ if (SWIZZLE_EN) -+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
-+ else -+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -+ end -+ -+ -+ // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 -+ //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0 -+ s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0 -+ s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset -+ -+ s_mov_b32 m0, 0x0 //SGPR initial index value =0 -+ s_nop 0x0 //Manually inserted wait states -+ L_SAVE_SGPR_LOOP: -+ // SGPR is allocated in 16 SGPR granularity -+ s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] -+ s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] -+ s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] -+ s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] -+ s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] -+ s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] -+ s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] -+ s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] -+ -+ write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4 -+ s_add_u32 m0, m0, 16 //next sgpr index -+ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 -+ s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete? -+ // restore s_save_buf_rsrc0,1 -+ //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo -+ s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo -+ -+ -+ -+ -+ /* save first 4 VGPR, then LDS save could use */ -+ // each wave will alloc 4 vgprs at least... -+ ///////////////////////////////////////////////////////////////////////////////////// -+ -+ s_mov_b32 s_save_mem_offset, 0 -+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on -+ s_mov_b32 exec_hi, 0xFFFFFFFF -+ -+ if (SWIZZLE_EN) -+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? -+ else -+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -+ end -+ -+ -+ // VGPR Allocated in 4-GPR granularity -+ -+if G8SR_VGPR_SR_IN_DWX4 -+ // the const stride for DWx4 is 4*4 bytes -+ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 -+ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes -+ -+ buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 -+ -+ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 -+ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes -+else -+ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 -+ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 -+ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 -+ buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 -+end -+ -+ -+ -+ /* save LDS */ -+ ////////////////////////////// -+ -+ L_SAVE_LDS: -+ -+ // Change EXEC to all threads... -+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on -+ s_mov_b32 exec_hi, 0xFFFFFFFF -+ -+ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size -+ s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? -+ s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE -+ -+ s_barrier //LDS is used? 
wait for other waves in the same TG
-+ //s_and_b32 s_save_tmp, tba_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
-+ s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
-+ s_cbranch_scc0 L_SAVE_LDS_DONE
-+
-+ // first wave does LDS save;
-+
-+ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
-+ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes
-+ s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes
-+
-+ // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
-+ //
-+ get_vgpr_size_bytes(s_save_mem_offset)
-+ get_sgpr_size_bytes(s_save_tmp)
-+ s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
-+ s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()
-+
-+
-+ if (SWIZZLE_EN)
-+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
-+ else
-+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-+ end
-+
-+ s_mov_b32 m0, 0x0 //lds_offset initial value = 0
-+
-+
-+var LDS_DMA_ENABLE = 0
-+var UNROLL = 0
-+if UNROLL==0 && LDS_DMA_ENABLE==1
-+ s_mov_b32 s3, 256*2
-+ s_nop 0
-+ s_nop 0
-+ s_nop 0
-+ L_SAVE_LDS_LOOP:
-+ //TODO: looks like the 2 buffer_store/load clauses for s/r will hurt performance???
-+ if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity
-+ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW
-+ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
-+ end
-+
-+ s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes
-+ s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes
-+ s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
-+ s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete?
-+
-+elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROLL, has icache miss
-+ // store from highest LDS address to lowest
-+ s_mov_b32 s3, 256*2
-+ s_sub_u32 m0, s_save_alloc_size, s3
-+ s_add_u32 s_save_mem_offset, s_save_mem_offset, m0
-+ s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128-DW chunks...
-+ s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from highest addr to lowest
-+ s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block costs 6*4 Bytes of instructions
-+ s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2 is the below 2 inst...//s_addc and s_setpc
-+ s_nop 0
-+ s_nop 0
-+ s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes
-+ s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved
-+ s_add_u32 s0, s0,s_save_alloc_size
-+ s_addc_u32 s1, s1, 0
-+ s_setpc_b64 s[0:1]
-+
-+
-+ for var i =0; i< 128; i++
-+ // be careful to make here a 64Byte aligned address, which could improve performance...
-+ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW
-+ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
-+
-+ if i!=127
-+ s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance, i.e.
pack more LDS_DMA inst to one Cacheline -+ s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3 -+ end -+ end -+ -+else // BUFFER_STORE -+ v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 -+ v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid -+ v_mul_i32_i24 v2, v3, 8 // tid*8 -+ v_mov_b32 v3, 256*2 -+ s_mov_b32 m0, 0x10000 -+ s_mov_b32 s0, s_save_buf_rsrc3 -+ s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid -+ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT -+ -+L_SAVE_LDS_LOOP_VECTOR: -+ ds_read_b64 v[0:1], v2 //x =LDS[a], byte address -+ s_waitcnt lgkmcnt(0) -+ buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1 -+// s_waitcnt vmcnt(0) -+// v_add_u32 v2, vcc[0:1], v2, v3 -+ v_add_u32 v2, v2, v3 -+ v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size -+ s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR -+ -+ // restore rsrc3 -+ s_mov_b32 s_save_buf_rsrc3, s0 -+ -+end -+ -+L_SAVE_LDS_DONE: -+ -+ -+ /* save VGPRs - set the Rest VGPRs */ -+ ////////////////////////////////////////////////////////////////////////////////////// -+ L_SAVE_VGPR: -+ // VGPR SR memory offset: 0 -+ // TODO rearrange the RSRC words to use swizzle for VGPR save... -+ -+ s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs -+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on -+ s_mov_b32 exec_hi, 0xFFFFFFFF -+ -+ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size -+ s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 -+ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible -+ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) -+ if (SWIZZLE_EN) -+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? -+ else -+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -+ end -+ -+ -+ // VGPR Allocated in 4-GPR granularity -+ -+if G8SR_VGPR_SR_IN_DWX4 -+ // the const stride for DWx4 is 4*4 bytes -+ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 -+ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes -+ -+ s_mov_b32 m0, 4 // skip first 4 VGPRs -+ s_cmp_lt_u32 m0, s_save_alloc_size -+ s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs -+ -+ s_set_gpr_idx_on m0, 0x1 // This will change M0 -+ s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 -+L_SAVE_VGPR_LOOP: -+ v_mov_b32 v0, v0 // v0 = v[0+m0] -+ v_mov_b32 v1, v1 -+ v_mov_b32 v2, v2 -+ v_mov_b32 v3, v3 -+ -+ -+ buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 -+ s_add_u32 m0, m0, 4 -+ s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 -+ s_cmp_lt_u32 m0, s_save_alloc_size -+ s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? 
-+ s_set_gpr_idx_off -+L_SAVE_VGPR_LOOP_END: -+ -+ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 -+ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes -+else -+ // VGPR store using dw burst -+ s_mov_b32 m0, 0x4 //VGPR initial index value =0 -+ s_cmp_lt_u32 m0, s_save_alloc_size -+ s_cbranch_scc0 L_SAVE_VGPR_END -+ -+ -+ s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 -+ s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later -+ -+ L_SAVE_VGPR_LOOP: -+ v_mov_b32 v0, v0 //v0 = v[0+m0] -+ v_mov_b32 v1, v1 //v0 = v[0+m0] -+ v_mov_b32 v2, v2 //v0 = v[0+m0] -+ v_mov_b32 v3, v3 //v0 = v[0+m0] -+ -+ if(USE_MTBUF_INSTEAD_OF_MUBUF) -+ tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 -+ else -+ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 -+ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 -+ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 -+ buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 -+ end -+ -+ s_add_u32 m0, m0, 4 //next vgpr index -+ s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes -+ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 -+ s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? -+ s_set_gpr_idx_off -+end -+ -+L_SAVE_VGPR_END: -+ -+ -+ -+ -+ -+ -+ /* S_PGM_END_SAVED */ //FIXME graphics ONLY -+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) -+ s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] -+ s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 -+ s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over -+ s_rfe_b64 s_save_pc_lo //Return to the main shader program -+ else -+ end -+ -+// Save Done timestamp -+if G8SR_DEBUG_TIMESTAMP -+ s_memrealtime s_g8sr_ts_save_d -+ // SGPR SR memory offset : size(VGPR) -+ get_vgpr_size_bytes(s_save_mem_offset) -+ s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET -+ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? -+ // Need reset rsrc2?? -+ s_mov_b32 m0, s_save_mem_offset -+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -+ s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1 -+end -+ -+ -+ s_branch L_END_PGM -+ -+ -+ -+/**************************************************************************/ -+/* restore routine */ -+/**************************************************************************/ -+ -+L_RESTORE: -+ /* Setup Resource Contants */ -+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) -+ //calculate wd_addr using absolute thread id -+ v_readlane_b32 s_restore_tmp, v9, 0 -+ s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 -+ s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE -+ s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO -+ s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI -+ s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL -+ else -+ end -+ -+if G8SR_DEBUG_TIMESTAMP -+ s_memrealtime s_g8sr_ts_restore_s -+ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? -+ // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case... 
-+ s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0] -+ s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored.. -+end -+ -+ -+ -+ s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo -+ s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi -+ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE -+ s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) -+ s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC -+ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK -+ s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position -+ s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC -+ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK -+ s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position -+ s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE -+ -+ /* global mem offset */ -+// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0 -+ -+ /* the first wave in the threadgroup */ -+ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK -+ s_cbranch_scc0 L_RESTORE_VGPR -+ -+ /* restore LDS */ -+ ////////////////////////////// -+ L_RESTORE_LDS: -+ -+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead -+ s_mov_b32 exec_hi, 0xFFFFFFFF -+ -+ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size -+ s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? -+ s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR -+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw -+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes -+ s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes -+ -+ // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) -+ // -+ get_vgpr_size_bytes(s_restore_mem_offset) -+ get_sgpr_size_bytes(s_restore_tmp) -+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp -+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? -+ -+ -+ if (SWIZZLE_EN) -+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? -+ else -+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -+ end -+ s_mov_b32 m0, 0x0 //lds_offset initial value = 0 -+ -+ L_RESTORE_LDS_LOOP: -+ if (SAVE_LDS) -+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW -+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW -+ end -+ s_add_u32 m0, m0, 256*2 // 128 DW -+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW -+ s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 -+ s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? 
-+ -+ -+ /* restore VGPRs */ -+ ////////////////////////////// -+ L_RESTORE_VGPR: -+ // VGPR SR memory offset : 0 -+ s_mov_b32 s_restore_mem_offset, 0x0 -+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead -+ s_mov_b32 exec_hi, 0xFFFFFFFF -+ -+ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size -+ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 -+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) -+ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) -+ if (SWIZZLE_EN) -+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? -+ else -+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -+ end -+ -+if G8SR_VGPR_SR_IN_DWX4 -+ get_vgpr_size_bytes(s_restore_mem_offset) -+ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 -+ -+ // the const stride for DWx4 is 4*4 bytes -+ s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 -+ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes -+ -+ s_mov_b32 m0, s_restore_alloc_size -+ s_set_gpr_idx_on m0, 0x8 // Note.. This will change m0 -+ -+L_RESTORE_VGPR_LOOP: -+ buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 -+ s_waitcnt vmcnt(0) -+ s_sub_u32 m0, m0, 4 -+ v_mov_b32 v0, v0 // v[0+m0] = v0 -+ v_mov_b32 v1, v1 -+ v_mov_b32 v2, v2 -+ v_mov_b32 v3, v3 -+ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 -+ s_cmp_eq_u32 m0, 0x8000 -+ s_cbranch_scc0 L_RESTORE_VGPR_LOOP -+ s_set_gpr_idx_off -+ -+ s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 -+ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes -+ -+else -+ // VGPR load using dw burst -+ s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last -+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 -+ s_mov_b32 m0, 4 //VGPR initial index value = 1 -+ s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 -+ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later -+ -+ L_RESTORE_VGPR_LOOP: -+ if(USE_MTBUF_INSTEAD_OF_MUBUF) -+ tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 -+ else -+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 -+ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 -+ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 -+ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 -+ end -+ s_waitcnt vmcnt(0) //ensure data ready -+ v_mov_b32 v0, v0 //v[0+m0] = v0 -+ v_mov_b32 v1, v1 -+ v_mov_b32 v2, v2 -+ v_mov_b32 v3, v3 -+ s_add_u32 m0, m0, 4 //next vgpr index -+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes -+ s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 -+ s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? 
-+ s_set_gpr_idx_off -+ /* VGPR restore on v0 */ -+ if(USE_MTBUF_INSTEAD_OF_MUBUF) -+ tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 -+ else -+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 -+ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 -+ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 -+ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 -+ end -+ -+end -+ -+ /* restore SGPRs */ -+ ////////////////////////////// -+ -+ // SGPR SR memory offset : size(VGPR) -+ get_vgpr_size_bytes(s_restore_mem_offset) -+ get_sgpr_size_bytes(s_restore_tmp) -+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp -+ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group -+ // TODO, change RSRC word to rearrange memory layout for SGPRS -+ -+ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size -+ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 -+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) -+ -+ if (SGPR_SAVE_USE_SQC) -+ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes -+ else -+ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) -+ end -+ if (SWIZZLE_EN) -+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? -+ else -+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -+ end -+ -+ /* If 112 SGPRs ar allocated, 4 sgprs are not used TBA(108,109),TMA(110,111), -+ However, we are safe to restore these 4 SGPRs anyway, since TBA,TMA will later be restored by HWREG -+ */ -+ s_mov_b32 m0, s_restore_alloc_size -+ -+ L_RESTORE_SGPR_LOOP: -+ read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made -+ s_waitcnt lgkmcnt(0) //ensure data ready -+ -+ s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] -+ -+ s_movreld_b64 s0, s0 //s[0+m0] = s0 -+ s_movreld_b64 s2, s2 -+ s_movreld_b64 s4, s4 -+ s_movreld_b64 s6, s6 -+ s_movreld_b64 s8, s8 -+ s_movreld_b64 s10, s10 -+ s_movreld_b64 s12, s12 -+ s_movreld_b64 s14, s14 -+ -+ s_cmp_eq_u32 m0, 0 //scc = (m0 < s_restore_alloc_size) ? 1 : 0 -+ s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete? -+ -+ /* restore HW registers */ -+ ////////////////////////////// -+ L_RESTORE_HWREG: -+ -+ -+if G8SR_DEBUG_TIMESTAMP -+ s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo -+ s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi -+end -+ -+ // HWREG SR memory offset : size(VGPR)+size(SGPR) -+ get_vgpr_size_bytes(s_restore_mem_offset) -+ get_sgpr_size_bytes(s_restore_tmp) -+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp -+ -+ -+ s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes -+ if (SWIZZLE_EN) -+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
-+ else -+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes -+ end -+ -+ read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 -+ read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC -+ read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) -+ read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC -+ read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) -+ read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS -+ read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS -+ read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO -+ read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI -+ read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE -+ read_hwreg_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_LO -+ read_hwreg_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_HI -+ -+ s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS -+ -+ s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS -+ -+ //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: -+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) -+ s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) -+ s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over -+ end -+ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) -+ s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal -+ s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over -+ end -+ -+ s_mov_b32 m0, s_restore_m0 -+ s_mov_b32 exec_lo, s_restore_exec_lo -+ s_mov_b32 exec_hi, s_restore_exec_hi -+ -+ s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts -+ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 -+ s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts -+ s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT -+ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 -+ //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore -+ s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode -+ //reuse s_restore_m0 as a temp register -+ s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK -+ s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT -+ s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT -+ s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero -+ s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 -+ s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK -+ s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT -+ s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT -+ s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 -+ s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK -+ s_lshr_b32 s_restore_m0, s_restore_m0, 
SQ_WAVE_STATUS_INST_ATC_SHIFT -+ s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp -+ -+ s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 -+ s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 -+ s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu -+ -+ s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time -+ -+if G8SR_DEBUG_TIMESTAMP -+ s_memrealtime s_g8sr_ts_restore_d -+ s_waitcnt lgkmcnt(0) -+end -+ -+// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution -+ s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc -+ -+ -+/**************************************************************************/ -+/* the END */ -+/**************************************************************************/ -+L_END_PGM: -+ s_endpgm -+ -+end -+ -+ -+/**************************************************************************/ -+/* the helper functions */ -+/**************************************************************************/ -+ -+//Only for save hwreg to mem -+function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) -+ s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on -+ s_mov_b32 m0, s_mem_offset -+ s_buffer_store_dword s, s_rsrc, m0 glc:1 -+ s_add_u32 s_mem_offset, s_mem_offset, 4 -+ s_mov_b32 m0, exec_lo -+end -+ -+ -+// HWREG are saved before SGPRs, so all HWREG could be use. -+function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) -+ -+ s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 -+ s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 -+ s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 -+ s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 -+ s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 -+ s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc -+end -+ -+ -+function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) -+ s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 -+ s_add_u32 s_mem_offset, s_mem_offset, 4 -+end -+ -+function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) -+ s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 -+ s_sub_u32 s_mem_offset, s_mem_offset, 4*16 -+end -+ -+ -+ -+function get_lds_size_bytes(s_lds_size_byte) -+ // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW -+ s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size -+ s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW -+end -+ -+function get_vgpr_size_bytes(s_vgpr_size_byte) -+ s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size -+ s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 -+ s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible -+end -+ -+function get_sgpr_size_bytes(s_sgpr_size_byte) -+ s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size -+ s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1 -+ s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value) -+end -+ -+function get_hwreg_size_bytes -+ return 128 //HWREG size 128 bytes -+end -+ -+ -+#endif -+ -+static const 
uint32_t cwsr_trap_gfx9_hex[] = { -+ 0xbf820001, 0xbf820125, -+ 0xb8f0f802, 0x89708670, -+ 0xb8f1f803, 0x8671ff71, -+ 0x00000400, 0xbf850013, -+ 0xb8faf812, 0xb8fbf813, -+ 0xc00a1e3d, 0x00000000, -+ 0xbf8cc07f, 0x87777978, -+ 0xbf840002, 0xb970f802, -+ 0xbe801d78, 0xb8f1f803, -+ 0x8671ff71, 0x000001ff, -+ 0xbf850002, 0x806c846c, -+ 0x826d806d, 0x866dff6d, -+ 0x0000ffff, 0xb970f802, -+ 0xbe801f6c, 0xb8f1f803, -+ 0x8671ff71, 0x00000100, -+ 0xbf840006, 0xbef60080, -+ 0xb9760203, 0x866dff6d, -+ 0x0000ffff, 0x80ec886c, -+ 0x82ed806d, 0xbef60080, -+ 0xb9760283, 0xbef20068, -+ 0xbef30069, 0xb8f62407, -+ 0x8e769c76, 0x876d766d, -+ 0xb8f603c7, 0x8e769b76, -+ 0x876d766d, 0xb8f6f807, -+ 0x8676ff76, 0x00007fff, -+ 0xb976f807, 0xbeee007e, -+ 0xbeef007f, 0xbefe0180, -+ 0xbf900004, 0xbf8e0002, -+ 0xbf88fffe, 0xbef4007e, -+ 0x8675ff7f, 0x0000ffff, -+ 0x8775ff75, 0x00040000, -+ 0xbef60080, 0xbef700ff, -+ 0x00807fac, 0x8676ff7f, -+ 0x08000000, 0x8f768376, -+ 0x87777677, 0x8676ff7f, -+ 0x70000000, 0x8f768176, -+ 0x87777677, 0xbefb007c, -+ 0xbefa0080, 0xb8fa2a05, -+ 0x807a817a, 0x8e7a8a7a, -+ 0xb8f61605, 0x80768176, -+ 0x8e768676, 0x807a767a, -+ 0xbef60084, 0xbef600ff, -+ 0x01000000, 0xbefe007c, -+ 0xbefc007a, 0xc0611efa, -+ 0x0000007c, 0x807a847a, -+ 0xbefc007e, 0xbefe007c, -+ 0xbefc007a, 0xc0611b3a, -+ 0x0000007c, 0x807a847a, -+ 0xbefc007e, 0xbefe007c, -+ 0xbefc007a, 0xc0611b7a, -+ 0x0000007c, 0x807a847a, -+ 0xbefc007e, 0xbefe007c, -+ 0xbefc007a, 0xc0611bba, -+ 0x0000007c, 0x807a847a, -+ 0xbefc007e, 0xbefe007c, -+ 0xbefc007a, 0xc0611bfa, -+ 0x0000007c, 0x807a847a, -+ 0xbefc007e, 0xbefe007c, -+ 0xbefc007a, 0xc0611c3a, -+ 0x0000007c, 0x807a847a, -+ 0xbefc007e, 0xb8f1f803, -+ 0xbefe007c, 0xbefc007a, -+ 0xc0611c7a, 0x0000007c, -+ 0x807a847a, 0xbefc007e, -+ 0xbefe007c, 0xbefc007a, -+ 0xc0611cba, 0x0000007c, -+ 0x807a847a, 0xbefc007e, -+ 0xbefe007c, 0xbefc007a, -+ 0xc0611cfa, 0x0000007c, -+ 0x807a847a, 0xbefc007e, -+ 0xb8fbf801, 0xbefe007c, -+ 0xbefc007a, 0xc0611efa, -+ 0x0000007c, 0x807a847a, -+ 0xbefc007e, 0xbefe007c, -+ 0xbefc007a, 0xc0611e3a, -+ 0x0000007c, 0x807a847a, -+ 0xbefc007e, 0xbefe007c, -+ 0xbefc007a, 0xc0611e7a, -+ 0x0000007c, 0x807a847a, -+ 0xbefc007e, 0x8676ff7f, -+ 0x04000000, 0xbeef0080, -+ 0x876f6f76, 0xb8fa2a05, -+ 0x807a817a, 0x8e7a8a7a, -+ 0xb8f11605, 0x80718171, -+ 0x8e718471, 0x8e768271, -+ 0xbef600ff, 0x01000000, -+ 0xbef20174, 0x80747a74, -+ 0xbefc0080, 0xbf800000, -+ 0xbe802b00, 0xbe822b02, -+ 0xbe842b04, 0xbe862b06, -+ 0xbe882b08, 0xbe8a2b0a, -+ 0xbe8c2b0c, 0xbe8e2b0e, -+ 0xc06b003a, 0x00000000, -+ 0xc06b013a, 0x00000010, -+ 0xc06b023a, 0x00000020, -+ 0xc06b033a, 0x00000030, -+ 0x8074c074, 0x82758075, -+ 0x807c907c, 0xbf0a717c, -+ 0xbf85ffeb, 0xbef40172, -+ 0xbefa0080, 0xbefe00c1, -+ 0xbeff00c1, 0xbef600ff, -+ 0x01000000, 0xe0724000, -+ 0x7a1d0000, 0xe0724100, -+ 0x7a1d0100, 0xe0724200, -+ 0x7a1d0200, 0xe0724300, -+ 0x7a1d0300, 0xbefe00c1, -+ 0xbeff00c1, 0xb8f14306, -+ 0x8671c171, 0xbf84002c, -+ 0xbf8a0000, 0x8676ff6f, -+ 0x04000000, 0xbf840028, -+ 0x8e718671, 0x8e718271, -+ 0xbef60071, 0xb8fa2a05, -+ 0x807a817a, 0x8e7a8a7a, -+ 0xb8f61605, 0x80768176, -+ 0x8e768676, 0x807a767a, -+ 0x807aff7a, 0x00000080, -+ 0xbef600ff, 0x01000000, -+ 0xbefc0080, 0xd28c0002, -+ 0x000100c1, 0xd28d0003, -+ 0x000204c1, 0xd1060002, -+ 0x00011103, 0x7e0602ff, -+ 0x00000200, 0xbefc00ff, -+ 0x00010000, 0xbe800077, -+ 0x8677ff77, 0xff7fffff, -+ 0x8777ff77, 0x00058000, -+ 0xd8ec0000, 0x00000002, -+ 0xbf8cc07f, 0xe0765000, -+ 0x7a1d0002, 0x68040702, -+ 0xd0c9006a, 0x0000e302, -+ 0xbf87fff7, 0xbef70000, -+ 
0xbefa00ff, 0x00000400, -+ 0xbefe00c1, 0xbeff00c1, -+ 0xb8f12a05, 0x80718171, -+ 0x8e718271, 0x8e768871, -+ 0xbef600ff, 0x01000000, -+ 0xbefc0084, 0xbf0a717c, -+ 0xbf840015, 0xbf11017c, -+ 0x8071ff71, 0x00001000, -+ 0x7e000300, 0x7e020301, -+ 0x7e040302, 0x7e060303, -+ 0xe0724000, 0x7a1d0000, -+ 0xe0724100, 0x7a1d0100, -+ 0xe0724200, 0x7a1d0200, -+ 0xe0724300, 0x7a1d0300, -+ 0x807c847c, 0x807aff7a, -+ 0x00000400, 0xbf0a717c, -+ 0xbf85ffef, 0xbf9c0000, -+ 0xbf8200ca, 0xbef4007e, -+ 0x8675ff7f, 0x0000ffff, -+ 0x8775ff75, 0x00040000, -+ 0xbef60080, 0xbef700ff, -+ 0x00807fac, 0x8672ff7f, -+ 0x08000000, 0x8f728372, -+ 0x87777277, 0x8672ff7f, -+ 0x70000000, 0x8f728172, -+ 0x87777277, 0x8672ff7f, -+ 0x04000000, 0xbf84001e, -+ 0xbefe00c1, 0xbeff00c1, -+ 0xb8ef4306, 0x866fc16f, -+ 0xbf840019, 0x8e6f866f, -+ 0x8e6f826f, 0xbef6006f, -+ 0xb8ee2a05, 0x806e816e, -+ 0x8e6e8a6e, 0xb8f21605, -+ 0x80728172, 0x8e728672, -+ 0x806e726e, 0x806eff6e, -+ 0x00000080, 0xbef600ff, -+ 0x01000000, 0xbefc0080, -+ 0xe0510000, 0x6e1d0000, -+ 0xe0510100, 0x6e1d0000, -+ 0x807cff7c, 0x00000200, -+ 0x806eff6e, 0x00000200, -+ 0xbf0a6f7c, 0xbf85fff6, -+ 0xbeee0080, 0xbefe00c1, -+ 0xbeff00c1, 0xb8ef2a05, -+ 0x806f816f, 0x8e6f826f, -+ 0x8e76886f, 0xbef600ff, -+ 0x01000000, 0xbef2006e, -+ 0x806eff6e, 0x00000400, -+ 0xbefc0084, 0xbf11087c, -+ 0x806fff6f, 0x00008000, -+ 0xe0524000, 0x6e1d0000, -+ 0xe0524100, 0x6e1d0100, -+ 0xe0524200, 0x6e1d0200, -+ 0xe0524300, 0x6e1d0300, -+ 0xbf8c0f70, 0x7e000300, -+ 0x7e020301, 0x7e040302, -+ 0x7e060303, 0x807c847c, -+ 0x806eff6e, 0x00000400, -+ 0xbf0a6f7c, 0xbf85ffee, -+ 0xbf9c0000, 0xe0524000, -+ 0x721d0000, 0xe0524100, -+ 0x721d0100, 0xe0524200, -+ 0x721d0200, 0xe0524300, -+ 0x721d0300, 0xb8ee2a05, -+ 0x806e816e, 0x8e6e8a6e, -+ 0xb8f21605, 0x80728172, -+ 0x8e728672, 0x806e726e, -+ 0x80eec06e, 0xb8ef1605, -+ 0x806f816f, 0x8e6f846f, -+ 0x8e76826f, 0xbef600ff, -+ 0x01000000, 0xbefc006f, -+ 0xc031003a, 0x0000006e, -+ 0x80eec06e, 0xbf8cc07f, -+ 0x80fc907c, 0xbe802d00, -+ 0xbe822d02, 0xbe842d04, -+ 0xbe862d06, 0xbe882d08, -+ 0xbe8a2d0a, 0xbe8c2d0c, -+ 0xbe8e2d0e, 0xbf06807c, -+ 0xbf84fff1, 0xb8ee2a05, -+ 0x806e816e, 0x8e6e8a6e, -+ 0xb8f21605, 0x80728172, -+ 0x8e728672, 0x806e726e, -+ 0xbef60084, 0xbef600ff, -+ 0x01000000, 0xc0211bfa, -+ 0x0000006e, 0x806e846e, -+ 0xc0211b3a, 0x0000006e, -+ 0x806e846e, 0xc0211b7a, -+ 0x0000006e, 0x806e846e, -+ 0xc0211eba, 0x0000006e, -+ 0x806e846e, 0xc0211efa, -+ 0x0000006e, 0x806e846e, -+ 0xc0211c3a, 0x0000006e, -+ 0x806e846e, 0xc0211c7a, -+ 0x0000006e, 0x806e846e, -+ 0xc0211a3a, 0x0000006e, -+ 0x806e846e, 0xc0211a7a, -+ 0x0000006e, 0x806e846e, -+ 0xc0211cfa, 0x0000006e, -+ 0x806e846e, 0xc0211e3a, -+ 0x0000006e, 0x806e846e, -+ 0xc0211e7a, 0x0000006e, -+ 0x806e846e, 0xbf8cc07f, -+ 0x866dff6d, 0x0000ffff, -+ 0xbefc006f, 0xbefe007a, -+ 0xbeff007b, 0x866f71ff, -+ 0x000003ff, 0xb96f4803, -+ 0x866f71ff, 0xfffff800, -+ 0x8f6f8b6f, 0xb96fa2c3, -+ 0xb973f801, 0x866fff6d, -+ 0xf0000000, 0x8f6f9c6f, -+ 0x8e6f906f, 0xbef20080, -+ 0x87726f72, 0x866fff6d, -+ 0x08000000, 0x8f6f9b6f, -+ 0x8e6f8f6f, 0x87726f72, -+ 0x866fff70, 0x00800000, -+ 0x8f6f976f, 0xb972f807, -+ 0x86fe7e7e, 0x86ea6a6a, -+ 0xb970f802, 0xbf8a0000, -+ 0x95806f6c, 0xbf810000, -+ 0x00000000, 0x00000000, -+}; -+ -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c -index b13dcc3..b8e436c 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c -@@ -313,8 +313,16 @@ static int kfd_ioctl_create_queue(struct file *filep, struct 
kfd_process *p, - - - /* Return gpu_id as doorbell offset for mmap usage */ -- args->doorbell_offset = (KFD_MMAP_TYPE_DOORBELL | args->gpu_id); -+ args->doorbell_offset = KFD_MMAP_TYPE_DOORBELL; -+ args->doorbell_offset |= KFD_MMAP_GPU_ID(args->gpu_id); - args->doorbell_offset <<= PAGE_SHIFT; -+ if (KFD_IS_SOC15(dev->device_info->asic_family)) -+ /* On SOC15 ASICs, doorbell allocation must be -+ * per-device, and independent from the per-process -+ * queue_id. Return the doorbell offset within the -+ * doorbell aperture to user mode. -+ */ -+ args->doorbell_offset |= q_properties.doorbell_off; - - up_write(&p->lock); - -@@ -1279,6 +1287,8 @@ static uint32_t kfd_convert_user_mem_alloction_flags( - out: - if (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_AQL_QUEUE_MEM) - kernel_allocation_flags |= ALLOC_MEM_FLAGS_AQL_QUEUE_MEM; -+ if (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT) -+ kernel_allocation_flags |= ALLOC_MEM_FLAGS_COHERENT; - /* Current HW doesn't support non paged memory */ - kernel_allocation_flags |= ALLOC_MEM_FLAGS_NONPAGED; - /* -@@ -1320,7 +1330,7 @@ static int kfd_ioctl_alloc_memory_of_gpu_new(struct file *filep, - return PTR_ERR(pdd); - - if (args->flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) { -- if (args->size != kfd_doorbell_process_slice()) -+ if (args->size != kfd_doorbell_process_slice(dev)) - return -EINVAL; - offset = kfd_get_process_doorbells(dev, p); - } else -@@ -2361,7 +2371,10 @@ static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) - - switch (vm_pgoff & KFD_MMAP_TYPE_MASK) { - case KFD_MMAP_TYPE_DOORBELL: -- return kfd_doorbell_mmap(process, vma); -+ kfd = kfd_device_by_id(KFD_MMAP_GPU_ID_GET(vm_pgoff)); -+ if (!kfd) -+ return -EFAULT; -+ return kfd_doorbell_mmap(kfd, process, vma); - - case KFD_MMAP_TYPE_EVENTS: - return kfd_event_mmap(process, vma); -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c -index 95ff6ec..9520298 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c -@@ -110,6 +110,8 @@ static struct kfd_gpu_cache_info carrizo_cache_info[] = { - #define fiji_cache_info carrizo_cache_info - #define polaris10_cache_info carrizo_cache_info - #define polaris11_cache_info carrizo_cache_info -+/* TODO - check & update Vega10 cache details */ -+#define vega10_cache_info carrizo_cache_info - - static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, - struct crat_subtype_computeunit *cu) -@@ -591,6 +593,10 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, - pcache_info = polaris11_cache_info; - num_of_cache_types = ARRAY_SIZE(polaris11_cache_info); - break; -+ case CHIP_VEGA10: -+ pcache_info = vega10_cache_info; -+ num_of_cache_types = ARRAY_SIZE(vega10_cache_info); -+ break; - default: - return -EINVAL; - } -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c -index 9de73ce..5fea0d3 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c -@@ -29,7 +29,7 @@ - #include <linux/mutex.h> - #include <linux/device.h> - --#include "kfd_pm4_headers.h" -+#include "kfd_pm4_headers_vi.h" - #include "kfd_pm4_headers_diq.h" - #include "kfd_kernel_queue.h" - #include "kfd_priv.h" -@@ -52,7 +52,7 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, - { - int status = 0; - unsigned int *ib_packet_buff = NULL; -- struct pm4__release_mem *rm_packet; -+ struct pm4_mec_release_mem *rm_packet; - struct pm4__indirect_buffer_pasid *ib_packet; - struct 
kernel_queue *kq = dbgdev->kq; - size_t pq_packets_size_in_bytes = -@@ -78,7 +78,7 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, - */ - if (sync) - pq_packets_size_in_bytes += -- sizeof(struct pm4__release_mem); -+ sizeof(struct pm4_mec_release_mem); - status = kq->ops.acquire_packet_buffer(kq, pq_packets_size_in_bytes / sizeof(uint32_t), &ib_packet_buff); - if (status != 0) { - pr_debug("Error! kfd: In func %s >> acquire_packet_buffer failed\n", __func__); -@@ -116,7 +116,7 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, - * (a) Sync with HW - * (b) Sync var is written by CP to mem. - */ -- rm_packet = (struct pm4__release_mem *) (ib_packet_buff + -+ rm_packet = (struct pm4_mec_release_mem *) (ib_packet_buff + - (sizeof(struct pm4__indirect_buffer_pasid) / sizeof(unsigned int))); - - status = kfd_gtt_sa_allocate(dbgdev->dev, sizeof(uint64_t), -@@ -130,7 +130,7 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, - - rm_packet->header.opcode = IT_RELEASE_MEM; - rm_packet->header.type = PM4_TYPE_3; -- rm_packet->header.count = sizeof(struct pm4__release_mem) / sizeof(unsigned int) - 2; -+ rm_packet->header.count = sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int) - 2; - - rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; - rm_packet->bitfields2.event_index = event_index___release_mem__end_of_pipe; -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c -index 6874eb5..4eda7c5 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c -@@ -29,8 +29,9 @@ - #include <linux/fence.h> - #include "kfd_priv.h" - #include "kfd_device_queue_manager.h" --#include "kfd_pm4_headers.h" -+#include "kfd_pm4_headers_vi.h" - #include "cwsr_trap_handler_carrizo.h" -+#include "cwsr_trap_handler_gfx9.asm" - - #define MQD_SIZE_ALIGNED 768 - -@@ -39,6 +40,7 @@ static const struct kfd_device_info kaveri_device_info = { - .max_pasid_bits = 16, - /* max num of queues for KV.TODO should be a dynamic value */ - .max_no_of_hqd = 24, -+ .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, -@@ -53,6 +55,7 @@ static const struct kfd_device_info hawaii_device_info = { - .max_pasid_bits = 16, - /* max num of queues for KV.TODO should be a dynamic value */ - .max_no_of_hqd = 24, -+ .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, -@@ -67,6 +70,7 @@ static const struct kfd_device_info carrizo_device_info = { - .max_pasid_bits = 16, - /* max num of queues for CZ.TODO should be a dynamic value */ - .max_no_of_hqd = 24, -+ .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, -@@ -80,6 +84,7 @@ static const struct kfd_device_info tonga_device_info = { - .asic_family = CHIP_TONGA, - .max_pasid_bits = 16, - .max_no_of_hqd = 24, -+ .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, -@@ -93,6 +98,7 @@ static const struct kfd_device_info fiji_device_info = { - .asic_family = CHIP_FIJI, - .max_pasid_bits = 16, - .max_no_of_hqd = 24, -+ .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, -@@ -106,6 +112,7 @@ static 
const struct kfd_device_info polaris10_device_info = { - .asic_family = CHIP_POLARIS10, - .max_pasid_bits = 16, - .max_no_of_hqd = 24, -+ .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, -@@ -119,6 +126,7 @@ static const struct kfd_device_info polaris11_device_info = { - .asic_family = CHIP_POLARIS11, - .max_pasid_bits = 16, - .max_no_of_hqd = 24, -+ .doorbell_size = 4, - .ih_ring_entry_size = 4 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_cik, - .num_of_watch_points = 4, -@@ -128,6 +136,19 @@ static const struct kfd_device_info polaris11_device_info = { - .needs_pci_atomics = true, - }; - -+static const struct kfd_device_info vega10_device_info = { -+ .asic_family = CHIP_VEGA10, -+ .max_pasid_bits = 16, -+ .max_no_of_hqd = 24, -+ .doorbell_size = 8, -+ .ih_ring_entry_size = 8 * sizeof(uint32_t), -+ .event_interrupt_class = &event_interrupt_class_v9, -+ .num_of_watch_points = 4, -+ .mqd_size_aligned = MQD_SIZE_ALIGNED, -+ .is_need_iommu_device = false, -+ .supports_cwsr = true, -+}; -+ - struct kfd_deviceid { - unsigned short did; - const struct kfd_device_info *device_info; -@@ -215,7 +236,13 @@ static const struct kfd_deviceid supported_devices[] = { - { 0x67E9, &polaris11_device_info }, /* Polaris11 */ - { 0x67EB, &polaris11_device_info }, /* Polaris11 */ - { 0x67EF, &polaris11_device_info }, /* Polaris11 */ -- { 0x67FF, &polaris11_device_info } /* Polaris11 */ -+ { 0x67FF, &polaris11_device_info }, /* Polaris11 */ -+ { 0x6860, &vega10_device_info }, /* Vega10 */ -+ { 0x6861, &vega10_device_info }, /* Vega10 */ -+ { 0x6863, &vega10_device_info }, /* Vega10 */ -+ { 0x6867, &vega10_device_info }, /* Vega10 */ -+ { 0x686C, &vega10_device_info }, /* Vega10 */ -+ { 0x687F, &vega10_device_info } /* Vega10 */ - }; - - static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, -@@ -370,8 +397,17 @@ static int kfd_cwsr_init(struct kfd_dev *kfd) - * Initialize the CWSR required memory for TBA and TMA - */ - if (cwsr_enable && kfd->device_info->supports_cwsr) { -+ const uint32_t *cwsr_hex; - void *cwsr_addr = NULL; -- unsigned int size = sizeof(cwsr_trap_carrizo_hex); -+ unsigned int size; -+ -+ if (kfd->device_info->asic_family < CHIP_VEGA10) { -+ cwsr_hex = cwsr_trap_carrizo_hex; -+ size = sizeof(cwsr_trap_carrizo_hex); -+ } else { -+ cwsr_hex = cwsr_trap_gfx9_hex; -+ size = sizeof(cwsr_trap_gfx9_hex); -+ } - - if (size > PAGE_SIZE) { - pr_err("amdkfd: wrong CWSR ISA size.\n"); -@@ -388,7 +424,7 @@ static int kfd_cwsr_init(struct kfd_dev *kfd) - /*Only first page used for cwsr ISA code */ - cwsr_addr = kmap(kfd->cwsr_pages); - memset(cwsr_addr, 0, PAGE_SIZE); -- memcpy(cwsr_addr, cwsr_trap_carrizo_hex, size); -+ memcpy(cwsr_addr, cwsr_hex, size); - kunmap(kfd->cwsr_pages); - kfd->tma_offset = ALIGN(size, PAGE_SIZE); - kfd->cwsr_enabled = true; -@@ -460,9 +496,9 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, - * calculate max size of runlist packet. 
- * There can be only 2 packets at once - */ -- size += (KFD_MAX_NUM_OF_PROCESSES * sizeof(struct pm4_map_process) + -- max_num_of_queues_per_device * -- sizeof(struct pm4_map_queues) + sizeof(struct pm4_runlist)) * 2; -+ size += (KFD_MAX_NUM_OF_PROCESSES * sizeof(struct pm4_mes_map_process) + -+ max_num_of_queues_per_device * sizeof(struct pm4_mes_map_queues) -+ + sizeof(struct pm4_mes_runlist)) * 2; - - /* Add size of HIQ & DIQ */ - size += KFD_KERNEL_QUEUE_SIZE * 2; -@@ -989,7 +1025,7 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, - if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size) - return -ENOMEM; - -- *mem_obj = kmalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); -+ *mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); - if ((*mem_obj) == NULL) - return -ENOMEM; - -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c -index 1e28bb7..dcdc380 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c -@@ -111,6 +111,83 @@ void program_sh_mem_settings(struct device_queue_manager *dqm, - qpd->sh_mem_bases); - } - -+static int init_doorbell_bitmap(struct device_queue_manager *dqm) -+{ -+ unsigned int i; -+ -+ if (!KFD_IS_SOC15(dqm->dev->device_info->asic_family)) -+ return 0; -+ -+ dqm->doorbell_bitmap = -+ kzalloc(DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, -+ BITS_PER_BYTE), GFP_KERNEL); -+ if (dqm->doorbell_bitmap == NULL) -+ return -ENOMEM; -+ -+ /* Mask out any reserved doorbells */ -+ for (i = 0; i < KFD_MAX_NUM_OF_QUEUES_PER_PROCESS; i++) -+ if ((dqm->dev->shared_resources.reserved_doorbell_mask & i) == -+ dqm->dev->shared_resources.reserved_doorbell_val) { -+ set_bit(i, dqm->doorbell_bitmap); -+ pr_debug("reserved doorbell 0x%03x\n", i); -+ } -+ -+ return 0; -+} -+ -+static void uninit_doorbell_bitmap(struct device_queue_manager *dqm) -+{ -+ kfree(dqm->doorbell_bitmap); -+} -+ -+static int allocate_doorbell(struct device_queue_manager *dqm, struct queue *q) -+{ -+ if (!KFD_IS_SOC15(dqm->dev->device_info->asic_family)) { -+ /* On pre-SOC15 chips we need to use the queue ID to -+ * preserve the user mode ABI. -+ */ -+ q->doorbell_id = q->properties.queue_id; -+ } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { -+ /* For SDMA queues on SOC15, use static doorbell -+ * assignments based on the engine and queue. 
-+ */ -+ q->doorbell_id = dqm->dev->shared_resources.sdma_doorbell -+ [q->properties.sdma_engine_id] -+ [q->properties.sdma_queue_id]; -+ } else { -+ /* For CP queues on SOC15 reserve a free doorbell ID */ -+ unsigned int found; -+ -+ found = find_first_zero_bit(dqm->doorbell_bitmap, -+ KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); -+ if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) { -+ pr_debug("amdkfd: No doorbells available"); -+ return -EBUSY; -+ } -+ set_bit(found, dqm->doorbell_bitmap); -+ q->doorbell_id = found; -+ } -+ -+ q->properties.doorbell_off = -+ kfd_doorbell_id_to_offset(dqm->dev, q->process, -+ q->doorbell_id); -+ -+ return 0; -+} -+ -+static void deallocate_doorbell(struct device_queue_manager *dqm, -+ struct queue *q) -+{ -+ unsigned int old; -+ -+ if (!KFD_IS_SOC15(dqm->dev->device_info->asic_family) || -+ q->properties.type == KFD_QUEUE_TYPE_SDMA) -+ return; -+ -+ old = test_and_clear_bit(q->doorbell_id, dqm->doorbell_bitmap); -+ WARN_ON(!old); -+} -+ - static int allocate_vmid(struct device_queue_manager *dqm, - struct qcm_process_device *qpd, - struct queue *q) -@@ -152,7 +229,8 @@ static int flush_texture_cache_nocpsch(struct kfd_dev *kdev, - if (!qpd->ib_kaddr) - return -ENOMEM; - -- len = pm_create_release_mem(qpd->ib_base, (uint32_t *)qpd->ib_kaddr); -+ len = qpd->dqm->packets.pmf->release_mem(qpd->ib_base, -+ (uint32_t *)qpd->ib_kaddr); - - return kdev->kfd2kgd->submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid, - qpd->ib_base, (uint32_t *)qpd->ib_kaddr, len); -@@ -313,12 +391,14 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, - if (retval != 0) - return retval; - -+ retval = allocate_doorbell(dqm, q); -+ if (retval) -+ goto out_deallocate_hqd; -+ - retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, - &q->gart_mqd_addr, &q->properties); -- if (retval != 0) { -- deallocate_hqd(dqm, q); -- return retval; -- } -+ if (retval != 0) -+ goto out_deallocate_doorbell; - - pr_debug("kfd: loading mqd to hqd on pipe (%d) queue (%d)\n", - q->pipe, -@@ -332,13 +412,19 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, - - retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue, &q->properties, - q->process->mm); -- if (retval != 0) { -- deallocate_hqd(dqm, q); -- mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); -- return retval; -- } -+ if (retval != 0) -+ goto out_uninit_mqd; - - return 0; -+ -+out_uninit_mqd: -+ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); -+out_deallocate_doorbell: -+ deallocate_doorbell(dqm, q); -+out_deallocate_hqd: -+ deallocate_hqd(dqm, q); -+ -+ return retval; - } - - /* Access to DQM has to be locked before calling destroy_queue_nocpsch_locked -@@ -360,6 +446,8 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, - goto out; - } - -+ deallocate_doorbell(dqm, q); -+ - if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) - deallocate_hqd(dqm, q); - else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { -@@ -741,24 +829,29 @@ static int init_scheduler(struct device_queue_manager *dqm) - - static int initialize_nocpsch(struct device_queue_manager *dqm) - { -- int i; -+ int i, ret; - - BUG_ON(!dqm); - - pr_debug("kfd: In func %s num of pipes: %d\n", - __func__, get_pipes_per_mec(dqm)); - -- mutex_init(&dqm->lock); -- INIT_LIST_HEAD(&dqm->queues); -- dqm->queue_count = dqm->next_pipe_to_allocate = 0; -- dqm->sdma_queue_count = 0; - dqm->allocated_queues = kcalloc(get_pipes_per_mec(dqm), - sizeof(unsigned int), GFP_KERNEL); -- if (!dqm->allocated_queues) { -- mutex_destroy(&dqm->lock); -+ if 
(!dqm->allocated_queues) - return -ENOMEM; -+ -+ ret = init_doorbell_bitmap(dqm); -+ if (ret) { -+ kfree(dqm->allocated_queues); -+ return ret; - } - -+ mutex_init(&dqm->lock); -+ INIT_LIST_HEAD(&dqm->queues); -+ dqm->queue_count = dqm->next_pipe_to_allocate = 0; -+ dqm->sdma_queue_count = 0; -+ - for (i = 0; i < get_pipes_per_mec(dqm); i++) - dqm->allocated_queues[i] = (1 << get_queues_per_pipe(dqm)) - 1; - -@@ -777,6 +870,7 @@ static void uninitialize_nocpsch(struct device_queue_manager *dqm) - - BUG_ON(dqm->queue_count > 0 || dqm->processes_count > 0); - -+ uninit_doorbell_bitmap(dqm); - kfree(dqm->allocated_queues); - for (i = 0 ; i < KFD_MQD_TYPE_MAX ; i++) - kfree(dqm->mqds[i]); -@@ -839,6 +933,10 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, - q->properties.sdma_queue_id = q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; - q->properties.sdma_engine_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; - -+ retval = allocate_doorbell(dqm, q); -+ if (retval) -+ goto out_deallocate_sdma_queue; -+ - pr_debug("kfd: sdma id is: %d\n", q->sdma_id); - pr_debug(" sdma queue id: %d\n", q->properties.sdma_queue_id); - pr_debug(" sdma engine id: %d\n", q->properties.sdma_engine_id); -@@ -846,19 +944,23 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, - dqm->asic_ops.init_sdma_vm(dqm, q, qpd); - retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, - &q->gart_mqd_addr, &q->properties); -- if (retval != 0) { -- deallocate_sdma_queue(dqm, q->sdma_id); -- return retval; -- } -+ if (retval != 0) -+ goto out_deallocate_doorbell; - - retval = mqd->load_mqd(mqd, q->mqd, 0, 0, &q->properties, NULL); -- if (retval != 0) { -- deallocate_sdma_queue(dqm, q->sdma_id); -- mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); -- return retval; -- } -+ if (retval != 0) -+ goto out_uninit_mqd; - - return 0; -+ -+out_uninit_mqd: -+ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); -+out_deallocate_doorbell: -+ deallocate_doorbell(dqm, q); -+out_deallocate_sdma_queue: -+ deallocate_sdma_queue(dqm, q->sdma_id); -+ -+ return retval; - } - - /* -@@ -918,6 +1020,10 @@ static int initialize_cpsch(struct device_queue_manager *dqm) - pr_debug("kfd: In func %s num of pipes: %d\n", - __func__, get_pipes_per_mec(dqm)); - -+ retval = init_doorbell_bitmap(dqm); -+ if (retval) -+ return retval; -+ - mutex_init(&dqm->lock); - INIT_LIST_HEAD(&dqm->queues); - dqm->queue_count = dqm->processes_count = 0; -@@ -931,6 +1037,7 @@ static int initialize_cpsch(struct device_queue_manager *dqm) - return 0; - - fail_init_pipelines: -+ uninit_doorbell_bitmap(dqm); - mutex_destroy(&dqm->lock); - return retval; - } -@@ -1069,24 +1176,29 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, - pr_warn("amdkfd: Can't create new usermode queue because %d queues were already created\n", - dqm->total_queue_count); - retval = -EPERM; -- goto out; -+ goto out_unlock; - } - - if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { - retval = allocate_sdma_queue(dqm, &q->sdma_id); - if (retval != 0) -- goto out; -+ goto out_unlock; - q->properties.sdma_queue_id = - q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; - q->properties.sdma_engine_id = - q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; - } -+ -+ retval = allocate_doorbell(dqm, q); -+ if (retval) -+ goto out_deallocate_sdma_queue; -+ - mqd = dqm->ops.get_mqd_manager(dqm, - get_mqd_type_from_queue_type(q->properties.type)); - - if (mqd == NULL) { -- mutex_unlock(&dqm->lock); -- return -ENOMEM; -+ retval = -ENOMEM; -+ goto out_deallocate_doorbell; - } - /* - * 
Eviction state logic: we only mark active queues as evicted -@@ -1104,7 +1216,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, - retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, - &q->gart_mqd_addr, &q->properties); - if (retval != 0) -- goto out; -+ goto out_deallocate_doorbell; - - list_add(&q->list, &qpd->queues_list); - if (q->properties.is_active) { -@@ -1123,9 +1235,18 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, - pr_debug("Total of %d queues are accountable so far\n", - dqm->total_queue_count); - --out: - mutex_unlock(&dqm->lock); - return retval; -+ -+out_deallocate_doorbell: -+ deallocate_doorbell(dqm, q); -+out_deallocate_sdma_queue: -+ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) -+ deallocate_sdma_queue(dqm, q->sdma_id); -+out_unlock: -+ mutex_unlock(&dqm->lock); -+ -+ return retval; - } - - int amdkfd_fence_wait_timeout(unsigned int *fence_addr, -@@ -1286,6 +1407,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, - goto failed; - } - -+ deallocate_doorbell(dqm, q); -+ - if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { - dqm->sdma_queue_count--; - deallocate_sdma_queue(dqm, q->sdma_id); -@@ -1333,10 +1456,13 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm, - void __user *alternate_aperture_base, - uint64_t alternate_aperture_size) - { -- bool retval; -+ bool retval = true; - - pr_debug("kfd: In func %s\n", __func__); - -+ if (!dqm->asic_ops.set_cache_memory_policy) -+ return retval; -+ - mutex_lock(&dqm->lock); - - if (alternate_aperture_size == 0) { -@@ -1590,6 +1716,10 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) - case CHIP_POLARIS11: - device_queue_manager_init_vi_tonga(&dqm->asic_ops); - break; -+ -+ case CHIP_VEGA10: -+ device_queue_manager_init_v9_vega10(&dqm->asic_ops); -+ break; - } - - if (dqm->ops.initialize(dqm) != 0) { -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h -index 05d0cc8..c269e5e 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h -@@ -182,6 +182,7 @@ struct device_queue_manager { - unsigned int *allocated_queues; - unsigned int sdma_bitmap; - unsigned int vmid_bitmap; -+ unsigned long *doorbell_bitmap; - uint64_t pipelines_addr; - struct kfd_mem_obj *pipeline_mem; - uint64_t fence_gpu_addr; -@@ -199,6 +200,8 @@ void device_queue_manager_init_vi( - struct device_queue_manager_asic_ops *asic_ops); - void device_queue_manager_init_vi_tonga( - struct device_queue_manager_asic_ops *asic_ops); -+void device_queue_manager_init_v9_vega10( -+ struct device_queue_manager_asic_ops *asic_ops); - void program_sh_mem_settings(struct device_queue_manager *dqm, - struct qcm_process_device *qpd); - unsigned int get_queues_num(struct device_queue_manager *dqm); -@@ -216,6 +219,7 @@ static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) - return (pdd->lds_base >> 16) & 0xFF; - } - -+/* This function is only useful for GFXv7 and v8 */ - static inline unsigned int - get_sh_mem_bases_nybble_64(struct kfd_process_device *pdd) - { -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c -new file mode 100644 -index 0000000..2d81e2b ---- /dev/null -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c -@@ -0,0 +1,87 @@ -+/* -+ * Copyright 2016 Advanced Micro Devices, 
Inc. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. -+ * -+ */ -+ -+#include "kfd_device_queue_manager.h" -+#include "vega10/vega10_enum.h" -+#include "vega10/GC/gc_9_0_offset.h" -+#include "vega10/GC/gc_9_0_sh_mask.h" -+#include "vega10/SDMA0/sdma0_4_0_sh_mask.h" -+ -+static int update_qpd_v9(struct device_queue_manager *dqm, -+ struct qcm_process_device *qpd); -+static int initialize_cpsch_v9(struct device_queue_manager *dqm); -+static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, -+ struct qcm_process_device *qpd); -+ -+void device_queue_manager_init_v9_vega10( -+ struct device_queue_manager_asic_ops *asic_ops) -+{ -+ asic_ops->update_qpd = update_qpd_v9; -+ asic_ops->init_cpsch = initialize_cpsch_v9; -+ asic_ops->init_sdma_vm = init_sdma_vm_v9; -+} -+ -+static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) -+{ -+ uint32_t shared_base = pdd->lds_base >> 48; -+ uint32_t private_base = pdd->scratch_base >> 48; -+ -+ return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) | -+ private_base; -+} -+ -+static int update_qpd_v9(struct device_queue_manager *dqm, -+ struct qcm_process_device *qpd) -+{ -+ struct kfd_process_device *pdd; -+ -+ pdd = qpd_to_pdd(qpd); -+ -+ /* check if sh_mem_config register already configured */ -+ if (qpd->sh_mem_config == 0) { -+ qpd->sh_mem_config = -+ SH_MEM_ALIGNMENT_MODE_UNALIGNED << -+ SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT; -+ -+ qpd->sh_mem_ape1_limit = 0; -+ qpd->sh_mem_ape1_base = 0; -+ } -+ -+ qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd); -+ -+ pr_debug("kfd: sh_mem_bases 0x%X\n", qpd->sh_mem_bases); -+ -+ return 0; -+} -+ -+static int initialize_cpsch_v9(struct device_queue_manager *dqm) -+{ -+ return 0; -+} -+ -+static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, -+ struct qcm_process_device *qpd) -+{ -+ /* Not needed on SDMAv4 any more */ -+ q->properties.sdma_vm_addr = 0; -+} -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c -index 9387b1d..9a86b98 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c -@@ -34,7 +34,6 @@ - */ - - #define KERNEL_DOORBELL_PASID 1 --#define KFD_SIZE_OF_DOORBELL_IN_BYTES 4 - - /* - * Each device exposes a doorbell aperture, a PCI MMIO aperture that -@@ -51,9 +50,9 @@ - */ - - /* # of doorbell bytes allocated for each process. 
*/
--size_t kfd_doorbell_process_slice(void)
-+size_t kfd_doorbell_process_slice(struct kfd_dev *kfd)
- {
-- return roundup(KFD_SIZE_OF_DOORBELL_IN_BYTES *
-+ return roundup(kfd->device_info->doorbell_size *
- KFD_MAX_NUM_OF_QUEUES_PER_PROCESS,
- PAGE_SIZE);
- }
-@@ -73,16 +72,16 @@ void kfd_doorbell_init(struct kfd_dev *kfd)
- 
- doorbell_start_offset =
- roundup(kfd->shared_resources.doorbell_start_offset,
-- kfd_doorbell_process_slice());
-+ kfd_doorbell_process_slice(kfd));
- 
- doorbell_aperture_size =
- rounddown(kfd->shared_resources.doorbell_aperture_size,
-- kfd_doorbell_process_slice());
-+ kfd_doorbell_process_slice(kfd));
- 
- if (doorbell_aperture_size > doorbell_start_offset)
- doorbell_process_limit =
- (doorbell_aperture_size - doorbell_start_offset) /
-- kfd_doorbell_process_slice();
-+ kfd_doorbell_process_slice(kfd);
- else
- doorbell_process_limit = 0;
- 
-@@ -93,7 +92,7 @@ void kfd_doorbell_init(struct kfd_dev *kfd)
- kfd->doorbell_process_limit = doorbell_process_limit - 1;
- 
- kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base,
-- kfd_doorbell_process_slice());
-+ kfd_doorbell_process_slice(kfd));
- 
- BUG_ON(!kfd->doorbell_kernel_ptr);
- 
-@@ -117,21 +116,16 @@ void kfd_doorbell_init(struct kfd_dev *kfd)
- (uintptr_t)kfd->doorbell_kernel_ptr);
- }
- 
--int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma)
-+int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process,
-+		      struct vm_area_struct *vma)
- {
- phys_addr_t address;
-- struct kfd_dev *dev;
- 
- /*
- * For simplicity we only allow mapping of the entire doorbell
- * allocation of a single device & process.
- */
-- if (vma->vm_end - vma->vm_start != kfd_doorbell_process_slice())
-- return -EINVAL;
--
-- /* Find kfd device according to gpu id */
-- dev = kfd_device_by_id(vma->vm_pgoff);
-- if (dev == NULL)
-+ if (vma->vm_end - vma->vm_start != kfd_doorbell_process_slice(dev))
- return -EINVAL;
- 
- /* Calculate physical address of doorbell */
-@@ -148,19 +142,19 @@ int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma)
- " vm_flags == 0x%04lX\n"
- " size == 0x%04lX\n",
- (unsigned long long) vma->vm_start, address, vma->vm_flags,
-- kfd_doorbell_process_slice());
-+ kfd_doorbell_process_slice(dev));
- 
- 
- return io_remap_pfn_range(vma,
- vma->vm_start,
- address >> PAGE_SHIFT,
-- kfd_doorbell_process_slice(),
-+ kfd_doorbell_process_slice(dev),
- vma->vm_page_prot);
- }
- 
- 
- /* get kernel iomem pointer for a doorbell */
---u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
-+void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
- unsigned int *doorbell_off)
- {
- u32 inx;
-@@ -177,12 +171,15 @@ u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
- if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS)
- return NULL;
- 
-+ inx *= kfd->device_info->doorbell_size / sizeof(u32);
-+
- /*
- * Calculating the kernel doorbell offset using a "faked" kernel
-- * pasid that is allocated for kernel queues only
-+ * pasid that is allocated for kernel queues only. Offset is in
-+ * dword units regardless of the ASIC-dependent doorbell size.
- */
-- *doorbell_off = KERNEL_DOORBELL_PASID * (kfd_doorbell_process_slice() /
-- sizeof(u32)) + inx;
-+ *doorbell_off = KERNEL_DOORBELL_PASID *
-+ (kfd_doorbell_process_slice(kfd) / sizeof(u32)) + inx;
- 
- pr_debug("kfd: get kernel queue doorbell\n"
- " doorbell offset == 0x%08X\n"
-@@ -205,7 +202,7 @@ void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr)
- mutex_unlock(&kfd->doorbell_mutex);
- }
- 
--inline void write_kernel_doorbell(u32 __iomem *db, u32 value)
-+void write_kernel_doorbell(void __iomem *db, u32 value)
- {
- if (db) {
- writel(value, db);
-@@ -213,29 +210,40 @@ inline void write_kernel_doorbell(u32 __iomem *db, u32 value)
- }
- }
- 
-+void write_kernel_doorbell64(void __iomem *db, u64 value)
-+{
-+ if (db) {
-+ WARN(((unsigned long)db & 7) != 0,
-+ "Unaligned 64-bit doorbell");
-+ writeq(value, (u64 __iomem *)db);
-+ pr_debug("writing %llu to doorbell address 0x%p\n", value, db);
-+ }
-+}
-+
- /*
- * queue_ids are in the range [0,MAX_PROCESS_QUEUES) and are mapped 1:1
- * to doorbells with the process's doorbell page
- */
---unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd,
-+unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd,
- struct kfd_process *process,
-- unsigned int queue_id)
-+ unsigned int doorbell_id)
- {
- /*
- * doorbell_id_offset accounts for doorbells taken by KGD.
-- * pasid * kfd_doorbell_process_slice/sizeof(u32) adjusts
-- * to the process's doorbells
-+ * pasid * kfd_doorbell_process_slice/sizeof(u32) adjusts to
-+ * the process's doorbells. The offset returned is in dword
-+ * units regardless of the ASIC-dependent doorbell size.
- */
- return kfd->doorbell_id_offset +
- process->pasid * (kfd_doorbell_process_slice(kfd)/sizeof(u32)) +
- doorbell_id * kfd->device_info->doorbell_size / sizeof(u32);
- }
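The dword-unit bookkeeping in kfd_doorbell_id_to_offset() above is easy to get wrong, so here is a minimal stand-alone sketch of the same arithmetic. It is not part of the patch: KFD_MAX_NUM_OF_QUEUES_PER_PROCESS = 1024 and PAGE_SIZE = 4096 are assumed values, and the helper names are made up for illustration.

    #include <stdint.h>
    #include <stdio.h>

    #define QUEUES_PER_PROCESS 1024u /* assumed KFD_MAX_NUM_OF_QUEUES_PER_PROCESS */
    #define PAGE_SZ 4096u            /* assumed PAGE_SIZE */

    /* Bytes of doorbell space per process: one slot per queue, rounded up
     * to a page, mirroring kfd_doorbell_process_slice() above. */
    static uint32_t process_slice(uint32_t doorbell_size)
    {
        return (doorbell_size * QUEUES_PER_PROCESS + PAGE_SZ - 1) / PAGE_SZ * PAGE_SZ;
    }

    /* Dword offset of one doorbell, mirroring kfd_doorbell_id_to_offset(). */
    static uint32_t doorbell_offset(uint32_t id_offset, uint32_t pasid,
                                    uint32_t doorbell_id, uint32_t doorbell_size)
    {
        return id_offset + pasid * (process_slice(doorbell_size) / 4) +
               doorbell_id * doorbell_size / 4;
    }

    int main(void)
    {
        /* 4-byte doorbells (pre-SOC15) vs. 8-byte doorbells (Vega10) */
        printf("VI:     0x%x\n", doorbell_offset(0, 8, 3, 4)); /* 0x2003 */
        printf("Vega10: 0x%x\n", doorbell_offset(0, 8, 3, 8)); /* 0x4006 */
        return 0;
    }

With 8-byte doorbells the per-process slice doubles from one page to two, so both the per-pasid stride and the per-doorbell stride double, which is exactly what the two printed offsets show.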
- 
- uint64_t kfd_get_number_elems(struct kfd_dev *kfd)
- {
- uint64_t num_of_elems = (kfd->shared_resources.doorbell_aperture_size -
- kfd->shared_resources.doorbell_start_offset) /
-- kfd_doorbell_process_slice() + 1;
-+ kfd_doorbell_process_slice(kfd) + 1;
- 
- return num_of_elems;
- 
-@@ -245,5 +253,5 @@ phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev,
- struct kfd_process *process)
- {
- return dev->doorbell_base +
-- process->pasid * kfd_doorbell_process_slice();
-+ process->pasid * kfd_doorbell_process_slice(dev);
- }
-diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
-index 7d290bb..49a2a53 100644
---- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
-+++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
-@@ -279,25 +279,39 @@
-  * for FLAT_* / S_LOAD operations.
-  */
- 
---#define MAKE_GPUVM_APP_BASE(gpu_num) \
-+#define MAKE_GPUVM_APP_BASE_VI(gpu_num) \
- (((uint64_t)(gpu_num) << 61) + 0x1000000000000L)
- 
- #define MAKE_GPUVM_APP_LIMIT(base, size) \
- (((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1)
- 
---#define MAKE_SCRATCH_APP_BASE() \
-+#define MAKE_SCRATCH_APP_BASE_VI() \
- (((uint64_t)(0x1UL) << 61) + 0x100000000L)
- 
- #define MAKE_SCRATCH_APP_LIMIT(base) \
- (((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF)
- 
---#define MAKE_LDS_APP_BASE() \
-+#define MAKE_LDS_APP_BASE_VI() \
- (((uint64_t)(0x1UL) << 61) + 0x0)
- 
- #define MAKE_LDS_APP_LIMIT(base) \
- (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF)
- 
-+/* On GFXv9 the LDS and scratch apertures are programmed independently
-+ * using the high 16 bits of the 64-bit virtual address. They must be
-+ * in the hole, which will be the case as long as the high 16 bits are
-+ * not 0.
-+ *
-+ * The aperture sizes are still 4GB implicitly.
-+ *
-+ * A GPUVM aperture is not applicable on GFXv9.
-+ */
-+#define MAKE_LDS_APP_BASE_V9() ((uint64_t)(0x1UL) << 48)
-+#define MAKE_SCRATCH_APP_BASE_V9() ((uint64_t)(0x2UL) << 48)
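As a quick sanity check on the GFXv9 aperture math above (an illustrative sketch, not part of the patch), the V9 base macros land in the canonical-address hole and the unchanged MAKE_*_APP_LIMIT macros carve out an implicit 4GB window:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Same expressions as MAKE_LDS_APP_BASE_V9()/MAKE_SCRATCH_APP_BASE_V9() */
        uint64_t lds_base     = (uint64_t)0x1UL << 48; /* 0x0001000000000000 */
        uint64_t scratch_base = (uint64_t)0x2UL << 48; /* 0x0002000000000000 */
        /* MAKE_LDS_APP_LIMIT(): keep the high 32 bits, fill the low 32 */
        uint64_t lds_limit = (lds_base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;

        /* lds_limit - lds_base + 1 == 1ULL << 32, i.e. the implicit 4GB aperture */
        printf("LDS:     0x%016llx..0x%016llx\n",
               (unsigned long long)lds_base, (unsigned long long)lds_limit);
        printf("scratch: 0x%016llx\n", (unsigned long long)scratch_base);
        return 0;
    }

Because the high 16 bits are 0x0001 and 0x0002 respectively, both bases fall outside the canonical 48-bit range, i.e. inside the hole, as the comment requires.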
- 
-+/* Some VM address space reserved for kernel use (CWSR trap handlers
-+ * and kernel IBs)
-+ */
- #define DGPU_VM_BASE_DEFAULT 0x100000
- #define DGPU_IB_BASE_DEFAULT (DGPU_VM_BASE_DEFAULT - PAGE_SIZE)
- 
-@@ -313,6 +327,32 @@ int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd,
- return 0;
- }
- 
-+void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id)
-+{
-+ /*
-+ * node id couldn't be 0 - the three MSB bits of
-+ * aperture shouldn't be 0
-+ */
-+ pdd->lds_base = MAKE_LDS_APP_BASE_VI();
-+ pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
-+
-+ pdd->gpuvm_base = MAKE_GPUVM_APP_BASE_VI(id + 1);
-+ pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT(
-+ pdd->gpuvm_base, pdd->dev->shared_resources.gpuvm_size);
-+
-+ pdd->scratch_base = MAKE_SCRATCH_APP_BASE_VI();
-+ pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
-+}
-+
-+void kfd_init_apertures_v9(struct kfd_process_device *pdd, uint8_t id)
-+{
-+ pdd->lds_base = MAKE_LDS_APP_BASE_V9();
-+ pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
-+
-+ pdd->scratch_base = MAKE_SCRATCH_APP_BASE_V9();
-+ pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
-+}
-+
- int kfd_init_apertures(struct kfd_process *process)
- {
- uint8_t id = 0;
-@@ -341,24 +381,23 @@ int kfd_init_apertures(struct kfd_process *process)
- pdd->gpuvm_base = pdd->gpuvm_limit = 0;
- pdd->scratch_base = pdd->scratch_limit = 0;
- } else {
-- /*
-- * node id couldn't be 0 - the three MSB bits of
-- * aperture shouldn't be 0
-- */
-- pdd->lds_base = MAKE_LDS_APP_BASE();
--
-- pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
--
-- pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1);
--
-- pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT(
-- pdd->gpuvm_base,
-- dev->shared_resources.gpuvm_size);
--
-- pdd->scratch_base = MAKE_SCRATCH_APP_BASE();
--
-- pdd->scratch_limit =
-- MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
-+ switch (dev->device_info->asic_family) {
-+ case CHIP_KAVERI:
-+ case CHIP_HAWAII:
-+ case CHIP_CARRIZO:
-+ case CHIP_TONGA:
-+ case CHIP_FIJI:
-+ case CHIP_POLARIS10:
-+ case CHIP_POLARIS11:
-+ kfd_init_apertures_vi(pdd, id);
-+ break;
-+ case CHIP_VEGA10:
-+ kfd_init_apertures_v9(pdd, id);
-+ break;
-+ default:
-+ pr_err("Unknown chip in kfd_init_apertures\n");
-+ goto err;
-+ }
- 
- if (KFD_IS_DGPU(dev->device_info->asic_family)) {
- pdd->qpd.cwsr_base = DGPU_VM_BASE_DEFAULT;
-diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
-new file mode 100644
-index 0000000..a479820
---- /dev/null
-+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
-@@ -0,0 +1,130 @@
-+/*
-+ * Copyright 2016 Advanced Micro Devices, Inc. 
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a
-+ * copy of this software and associated documentation files (the "Software"),
-+ * to deal in the Software without restriction, including without limitation
-+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
-+ * and/or sell copies of the Software, and to permit persons to whom the
-+ * Software is furnished to do so, subject to the following conditions:
-+ *
-+ * The above copyright notice and this permission notice shall be included in
-+ * all copies or substantial portions of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
-+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-+ * OTHER DEALINGS IN THE SOFTWARE.
-+ */
-+
-+#include "kfd_priv.h"
-+#include "kfd_events.h"
-+#include "soc15_int.h"
-+
-+
-+static uint32_t kfd_get_pasid_from_vmid(struct kfd_dev *dev, uint8_t vmid)
-+{
-+ uint32_t pasid = 0;
-+ const struct kfd2kgd_calls *f2g = dev->kfd2kgd;
-+
-+ if (f2g->get_atc_vmid_pasid_mapping_valid(dev->kgd, vmid))
-+ pasid = f2g->get_atc_vmid_pasid_mapping_pasid(dev->kgd, vmid);
-+
-+ return pasid;
-+}
-+
-+static bool event_interrupt_isr_v9(struct kfd_dev *dev,
-+					const uint32_t *ih_ring_entry,
-+					uint32_t *patched_ihre,
-+					bool *patched_flag)
-+{
-+ uint16_t source_id, client_id, pasid, vmid;
-+ bool result = false;
-+
-+ source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
-+ client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
-+ pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
-+ vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
-+
-+ if (pasid) {
-+ const uint32_t *data = ih_ring_entry;
-+
-+ pr_debug("client id 0x%x, source id %d, pasid 0x%x. raw data:\n",
-+ client_id, source_id, pasid);
-+ pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
-+ data[0], data[1], data[2], data[3],
-+ data[4], data[5], data[6], data[7]);
-+ }
-+
-+ if ((vmid >= dev->vm_info.first_vmid_kfd &&
-+ vmid <= dev->vm_info.last_vmid_kfd) &&
-+ (source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
-+ source_id == SOC15_INTSRC_SDMA_TRAP ||
-+ source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
-+ source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
-+ client_id == SOC15_IH_CLIENTID_VMC)) {
-+
-+ /*
-+ * KFD wants to handle this INT, but the MEC firmware did
-+ * not send a pasid. Try to get it from the vmid mapping
-+ * and patch the ih entry. It's a temporary workaround.
-+ */
-+ WARN_ONCE((!pasid), "Fix me.\n");
-+ if (!pasid) {
-+ uint32_t temp = le32_to_cpu(ih_ring_entry[3]);
-+
-+ pasid = kfd_get_pasid_from_vmid(dev, vmid);
-+ memcpy(patched_ihre, ih_ring_entry,
-+ dev->device_info->ih_ring_entry_size);
-+ patched_ihre[3] = cpu_to_le32(temp | pasid);
-+ *patched_flag = true;
-+ }
-+ result = pasid ? true : false;
-+ }
-+
-+ /* Do not process in ISR, just request it to be forwarded to WQ. 
*/ -+ return result; -+ -+} -+ -+static void event_interrupt_wq_v9(struct kfd_dev *dev, -+ const uint32_t *ih_ring_entry) -+{ -+ uint16_t source_id, client_id, pasid, vmid; -+ -+ source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); -+ client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); -+ pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); -+ vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); -+ -+ if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) -+ kfd_signal_event_interrupt(pasid, 0, 0); -+ else if (source_id == SOC15_INTSRC_SDMA_TRAP) -+ kfd_signal_event_interrupt(pasid, 0, 0); -+ else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) -+ kfd_signal_event_interrupt(pasid, 0, 0); /*todo */ -+ else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) -+ kfd_signal_hw_exception_event(pasid); -+ else if (client_id == SOC15_IH_CLIENTID_VMC) { -+ struct kfd_vm_fault_info info; -+ -+ memset(&info, 0, sizeof(info)); -+ dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info); -+ kfd_process_vm_fault(dev->dqm, pasid); -+ if (!info.page_addr && !info.status) -+ return; -+ -+ if (info.vmid == vmid) -+ kfd_signal_vm_fault_event(dev, pasid, &info); -+ else -+ kfd_signal_vm_fault_event(dev, pasid, NULL); -+ } -+} -+ -+const struct kfd_event_interrupt_class event_interrupt_class_v9 = { -+ .interrupt_isr = event_interrupt_isr_v9, -+ .interrupt_wq = event_interrupt_wq_v9, -+}; -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c -index 126d848..b826689 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c -@@ -99,7 +99,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, - kq->rptr_kernel = kq->rptr_mem->cpu_ptr; - kq->rptr_gpu_addr = kq->rptr_mem->gpu_addr; - -- retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->wptr_kernel), -+ retval = kfd_gtt_sa_allocate(dev, dev->device_info->doorbell_size, - &kq->wptr_mem); - - if (retval != 0) -@@ -211,6 +211,7 @@ static int acquire_packet_buffer(struct kernel_queue *kq, - size_t available_size; - size_t queue_size_dwords; - uint32_t wptr, rptr; -+ uint64_t wptr64; - unsigned int *queue_address; - - BUG_ON(!kq || !buffer_ptr); -@@ -222,6 +223,7 @@ static int acquire_packet_buffer(struct kernel_queue *kq, - */ - rptr = *kq->rptr_kernel; - wptr = kq->pending_wptr; -+ wptr64 = kq->pending_wptr64; - queue_address = (unsigned int *)kq->pq_kernel_addr; - queue_size_dwords = kq->queue->properties.queue_size / sizeof(uint32_t); - -@@ -251,11 +253,13 @@ static int acquire_packet_buffer(struct kernel_queue *kq, - while (wptr > 0) { - queue_address[wptr] = kq->nop_packet; - wptr = (wptr + 1) % queue_size_dwords; -+ wptr64++; - } - } - - *buffer_ptr = &queue_address[wptr]; - kq->pending_wptr = wptr + packet_size_in_dwords; -+ kq->pending_wptr64 = wptr64 + packet_size_in_dwords; - - return 0; - } -@@ -310,9 +314,7 @@ static void submit_packet(struct kernel_queue *kq) - pr_debug("\n"); - #endif - -- *kq->wptr_kernel = kq->pending_wptr; -- write_kernel_doorbell(kq->queue->properties.doorbell_ptr, -- kq->pending_wptr); -+ kq->ops_asic_specific.submit_packet(kq); - } - - static void rollback_packet(struct kernel_queue *kq) -@@ -352,6 +354,10 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, - case CHIP_HAWAII: - kernel_queue_init_cik(&kq->ops_asic_specific); - break; -+ -+ case CHIP_VEGA10: -+ kernel_queue_init_v9(&kq->ops_asic_specific); -+ break; - } - - if (!kq->ops.initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE)) { -diff --git 
a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h -index a217f42..82c94a6 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h -@@ -82,6 +82,7 @@ struct kernel_queue { - struct kfd_dev *dev; - struct mqd_manager *mqd; - struct queue *queue; -+ uint64_t pending_wptr64; - uint32_t pending_wptr; - unsigned int nop_packet; - -@@ -89,7 +90,10 @@ struct kernel_queue { - uint32_t *rptr_kernel; - uint64_t rptr_gpu_addr; - struct kfd_mem_obj *wptr_mem; -- uint32_t *wptr_kernel; -+ union { -+ uint64_t *wptr64_kernel; -+ uint32_t *wptr_kernel; -+ }; - uint64_t wptr_gpu_addr; - struct kfd_mem_obj *pq; - uint64_t pq_gpu_addr; -@@ -107,5 +111,6 @@ struct kernel_queue { - - void kernel_queue_init_cik(struct kernel_queue_ops *ops); - void kernel_queue_init_vi(struct kernel_queue_ops *ops); -+void kernel_queue_init_v9(struct kernel_queue_ops *ops); - - #endif /* KFD_KERNEL_QUEUE_H_ */ -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c -index a90eb44..8c69ea7 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c -@@ -22,15 +22,19 @@ - */ - - #include "kfd_kernel_queue.h" -+#include "kfd_pm4_headers.h" -+#include "kfd_pm4_opcodes.h" - - static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, - enum kfd_queue_type type, unsigned int queue_size); - static void uninitialize_cik(struct kernel_queue *kq); -+static void submit_packet_cik(struct kernel_queue *kq); - - void kernel_queue_init_cik(struct kernel_queue_ops *ops) - { - ops->initialize = initialize_cik; - ops->uninitialize = uninitialize_cik; -+ ops->submit_packet = submit_packet_cik; - } - - static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, -@@ -42,3 +46,124 @@ static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, - static void uninitialize_cik(struct kernel_queue *kq) - { - } -+ -+static void submit_packet_cik(struct kernel_queue *kq) -+{ -+ *kq->wptr_kernel = kq->pending_wptr; -+ write_kernel_doorbell(kq->queue->properties.doorbell_ptr, -+ kq->pending_wptr); -+} -+ -+static int pm_map_process_cik(struct packet_manager *pm, uint32_t *buffer, -+ struct qcm_process_device *qpd) -+{ -+ struct pm4_map_process *packet; -+ struct queue *cur; -+ uint32_t num_queues; -+ -+ packet = (struct pm4_map_process *)buffer; -+ -+ memset(buffer, 0, sizeof(struct pm4_map_process)); -+ -+ packet->header.u32all = pm_build_pm4_header(IT_MAP_PROCESS, -+ sizeof(struct pm4_map_process)); -+ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; -+ packet->bitfields2.process_quantum = 1; -+ packet->bitfields2.pasid = qpd->pqm->process->pasid; -+ packet->bitfields3.page_table_base = qpd->page_table_base; -+ packet->bitfields10.gds_size = qpd->gds_size; -+ packet->bitfields10.num_gws = qpd->num_gws; -+ packet->bitfields10.num_oac = qpd->num_oac; -+ num_queues = 0; -+ list_for_each_entry(cur, &qpd->queues_list, list) -+ num_queues++; -+ packet->bitfields10.num_queues = (qpd->is_debug) ? 
0 : num_queues; -+ -+ packet->sh_mem_config = qpd->sh_mem_config; -+ packet->sh_mem_bases = qpd->sh_mem_bases; -+ packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; -+ packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; -+ -+ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); -+ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); -+ -+ return 0; -+} -+ -+static int pm_map_process_scratch_cik(struct packet_manager *pm, -+ uint32_t *buffer, struct qcm_process_device *qpd) -+{ -+ struct pm4_map_process_scratch_kv *packet; -+ struct queue *cur; -+ uint32_t num_queues; -+ -+ packet = (struct pm4_map_process_scratch_kv *)buffer; -+ -+ pr_debug("kfd: In func %s\n", __func__); -+ -+ memset(buffer, 0, sizeof(struct pm4_map_process_scratch_kv)); -+ -+ packet->header.u32all = pm_build_pm4_header(IT_MAP_PROCESS, -+ sizeof(struct pm4_map_process_scratch_kv)); -+ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; -+ packet->bitfields2.process_quantum = 1; -+ packet->bitfields2.pasid = qpd->pqm->process->pasid; -+ packet->bitfields3.page_table_base = qpd->page_table_base; -+ packet->bitfields14.gds_size = qpd->gds_size; -+ packet->bitfields14.num_gws = qpd->num_gws; -+ packet->bitfields14.num_oac = qpd->num_oac; -+ num_queues = 0; -+ list_for_each_entry(cur, &qpd->queues_list, list) -+ num_queues++; -+ packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : num_queues; -+ -+ packet->sh_mem_config = qpd->sh_mem_config; -+ packet->sh_mem_bases = qpd->sh_mem_bases; -+ packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; -+ packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; -+ -+ packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; -+ -+ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); -+ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); -+ -+ return 0; -+} -+ -+static uint32_t pm_get_map_process_packet_size_cik(void) -+{ -+ return sizeof(struct pm4_map_process); -+} -+static uint32_t pm_get_map_process_scratch_packet_size_cik(void) -+{ -+ return sizeof(struct pm4_map_process_scratch_kv); -+} -+ -+ -+static struct packet_manager_funcs kfd_cik_pm_funcs = { -+ .map_process = pm_map_process_cik, -+ .runlist = pm_runlist_vi, -+ .set_resources = pm_set_resources_vi, -+ .map_queues = pm_map_queues_vi, -+ .unmap_queues = pm_unmap_queues_vi, -+ .query_status = pm_query_status_vi, -+ .release_mem = pm_release_mem_vi, -+ .get_map_process_packet_size = pm_get_map_process_packet_size_cik, -+ .get_runlist_packet_size = pm_get_runlist_packet_size_vi, -+ .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, -+ .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi, -+ .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi, -+ .get_query_status_packet_size = pm_get_query_status_packet_size_vi, -+ .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi, -+}; -+ -+ -+void kfd_pm_func_init_cik(struct packet_manager *pm, uint16_t fw_ver) -+{ -+ pm->pmf = &kfd_cik_pm_funcs; -+ if (fw_ver >= KFD_SCRATCH_KV_FW_VER) { -+ pm->pmf->map_process = pm_map_process_scratch_cik; -+ pm->pmf->get_map_process_packet_size = -+ pm_get_map_process_scratch_packet_size_cik; -+ } -+} -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c -new file mode 100644 -index 0000000..89edf3c ---- /dev/null -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c -@@ -0,0 +1,385 @@ -+/* -+ * Copyright 2016 Advanced Micro Devices, Inc. 
-+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. -+ * -+ */ -+ -+#include "kfd_kernel_queue.h" -+#include "kfd_device_queue_manager.h" -+#include "kfd_pm4_headers_ai.h" -+#include "kfd_pm4_opcodes.h" -+ -+static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev, -+ enum kfd_queue_type type, unsigned int queue_size); -+static void uninitialize_v9(struct kernel_queue *kq); -+static void submit_packet_v9(struct kernel_queue *kq); -+ -+void kernel_queue_init_v9(struct kernel_queue_ops *ops) -+{ -+ ops->initialize = initialize_v9; -+ ops->uninitialize = uninitialize_v9; -+ ops->submit_packet = submit_packet_v9; -+} -+ -+static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev, -+ enum kfd_queue_type type, unsigned int queue_size) -+{ -+ int retval; -+ -+ retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem); -+ if (retval != 0) -+ return false; -+ -+ kq->eop_gpu_addr = kq->eop_mem->gpu_addr; -+ kq->eop_kernel_addr = kq->eop_mem->cpu_ptr; -+ -+ memset(kq->eop_kernel_addr, 0, PAGE_SIZE); -+ -+ return true; -+} -+ -+static void uninitialize_v9(struct kernel_queue *kq) -+{ -+ kfd_gtt_sa_free(kq->dev, kq->eop_mem); -+} -+ -+static void submit_packet_v9(struct kernel_queue *kq) -+{ -+ *kq->wptr64_kernel = kq->pending_wptr64; -+ write_kernel_doorbell64(kq->queue->properties.doorbell_ptr, -+ kq->pending_wptr64); -+} -+ -+static int pm_map_process_v9(struct packet_manager *pm, -+ uint32_t *buffer, struct qcm_process_device *qpd) -+{ -+ struct pm4_mes_map_process *packet; -+ struct queue *cur; -+ uint32_t num_queues; -+ uint64_t vm_page_table_base_addr = -+ (uint64_t)(qpd->page_table_base) << 12; -+ -+ packet = (struct pm4_mes_map_process *)buffer; -+ memset(buffer, 0, sizeof(struct pm4_mes_map_process)); -+ -+ packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, -+ sizeof(struct pm4_mes_map_process)); -+ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; -+ packet->bitfields2.process_quantum = 1; -+ packet->bitfields2.pasid = qpd->pqm->process->pasid; -+ packet->bitfields14.gds_size = qpd->gds_size; -+ packet->bitfields14.num_gws = qpd->num_gws; -+ packet->bitfields14.num_oac = qpd->num_oac; -+ packet->bitfields14.sdma_enable = 1; -+ -+ num_queues = 0; -+ list_for_each_entry(cur, &qpd->queues_list, list) -+ num_queues++; -+ packet->bitfields14.num_queues = (qpd->is_debug) ? 
0 : num_queues; -+ -+ packet->sh_mem_config = qpd->sh_mem_config; -+ packet->sh_mem_bases = qpd->sh_mem_bases; -+ packet->sq_shader_tba_lo = lower_32_bits(qpd->tba_addr >> 8); -+ packet->sq_shader_tba_hi = upper_32_bits(qpd->tba_addr >> 8); -+ packet->sq_shader_tma_lo = lower_32_bits(qpd->tma_addr >> 8); -+ packet->sq_shader_tma_hi = upper_32_bits(qpd->tma_addr >> 8); -+ -+ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); -+ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); -+ -+ packet->vm_context_page_table_base_addr_lo32 = -+ lower_32_bits(vm_page_table_base_addr); -+ packet->vm_context_page_table_base_addr_hi32 = -+ upper_32_bits(vm_page_table_base_addr); -+ -+ return 0; -+} -+ -+static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer, -+ uint64_t ib, size_t ib_size_in_dwords, bool chain) -+{ -+ struct pm4_mes_runlist *packet; -+ -+ int concurrent_proc_cnt = 0; -+ struct kfd_dev *kfd = pm->dqm->dev; -+ -+ /* Determine the number of processes to map together to HW: -+ * it can not exceed the number of VMIDs available to the -+ * scheduler, and it is determined by the smaller of the number -+ * of processes in the runlist and kfd module parameter -+ * hws_max_conc_proc. -+ * Note: the arbitration between the number of VMIDs and -+ * hws_max_conc_proc has been done in -+ * kgd2kfd_device_init(). -+ */ -+ concurrent_proc_cnt = min(pm->dqm->processes_count, -+ kfd->max_proc_per_quantum); -+ -+ -+ packet = (struct pm4_mes_runlist *)buffer; -+ -+ memset(buffer, 0, sizeof(struct pm4_mes_runlist)); -+ packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, -+ sizeof(struct pm4_mes_runlist)); -+ -+ packet->bitfields4.ib_size = ib_size_in_dwords; -+ packet->bitfields4.chain = chain ? 1 : 0; -+ packet->bitfields4.offload_polling = 0; -+ packet->bitfields4.valid = 1; -+ packet->bitfields4.process_cnt = concurrent_proc_cnt; -+ packet->ordinal2 = lower_32_bits(ib); -+ packet->ib_base_hi = upper_32_bits(ib); -+ -+ return 0; -+} -+ -+static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, -+ struct queue *q, bool is_static) -+{ -+ struct pm4_mes_map_queues *packet; -+ bool use_static = is_static; -+ -+ packet = (struct pm4_mes_map_queues *)buffer; -+ memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); -+ -+ packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, -+ sizeof(struct pm4_mes_map_queues)); -+ packet->bitfields2.alloc_format = -+ alloc_format__mes_map_queues__one_per_pipe_vi; -+ packet->bitfields2.num_queues = 1; -+ packet->bitfields2.queue_sel = -+ queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; -+ -+ packet->bitfields2.engine_sel = -+ engine_sel__mes_map_queues__compute_vi; -+ packet->bitfields2.queue_type = -+ queue_type__mes_map_queues__normal_compute_vi; -+ -+ switch (q->properties.type) { -+ case KFD_QUEUE_TYPE_COMPUTE: -+ if (use_static) -+ packet->bitfields2.queue_type = -+ queue_type__mes_map_queues__normal_latency_static_queue_vi; -+ break; -+ case KFD_QUEUE_TYPE_DIQ: -+ packet->bitfields2.queue_type = -+ queue_type__mes_map_queues__debug_interface_queue_vi; -+ break; -+ case KFD_QUEUE_TYPE_SDMA: -+ packet->bitfields2.engine_sel = q->properties.sdma_engine_id + -+ engine_sel__mes_map_queues__sdma0_vi; -+ use_static = false; /* no static queues under SDMA */ -+ break; -+ default: -+ WARN(1, "queue type %d\n", q->properties.type); -+ break; -+ } -+ packet->bitfields3.doorbell_offset = -+ q->properties.doorbell_off; -+ -+ packet->mqd_addr_lo = -+ lower_32_bits(q->gart_mqd_addr); -+ -+ packet->mqd_addr_hi = -+ 
upper_32_bits(q->gart_mqd_addr); -+ -+ packet->wptr_addr_lo = -+ lower_32_bits((uint64_t)q->properties.write_ptr); -+ -+ packet->wptr_addr_hi = -+ upper_32_bits((uint64_t)q->properties.write_ptr); -+ -+ return 0; -+} -+ -+static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer, -+ enum kfd_queue_type type, -+ enum kfd_unmap_queues_filter filter, -+ uint32_t filter_param, bool reset, -+ unsigned int sdma_engine) -+{ -+ struct pm4_mes_unmap_queues *packet; -+ -+ packet = (struct pm4_mes_unmap_queues *)buffer; -+ memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); -+ -+ packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, -+ sizeof(struct pm4_mes_unmap_queues)); -+ switch (type) { -+ case KFD_QUEUE_TYPE_COMPUTE: -+ case KFD_QUEUE_TYPE_DIQ: -+ packet->bitfields2.engine_sel = -+ engine_sel__mes_unmap_queues__compute; -+ break; -+ case KFD_QUEUE_TYPE_SDMA: -+ packet->bitfields2.engine_sel = -+ engine_sel__mes_unmap_queues__sdma0 + sdma_engine; -+ break; -+ default: -+ WARN(1, "queue type %d\n", type); -+ break; -+ } -+ -+ if (reset) -+ packet->bitfields2.action = -+ action__mes_unmap_queues__reset_queues; -+ else -+ packet->bitfields2.action = -+ action__mes_unmap_queues__preempt_queues; -+ -+ switch (filter) { -+ case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: -+ packet->bitfields2.queue_sel = -+ queue_sel__mes_unmap_queues__perform_request_on_specified_queues; -+ packet->bitfields2.num_queues = 1; -+ packet->bitfields3b.doorbell_offset0 = filter_param; -+ break; -+ case KFD_UNMAP_QUEUES_FILTER_BY_PASID: -+ packet->bitfields2.queue_sel = -+ queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; -+ packet->bitfields3a.pasid = filter_param; -+ break; -+ case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: -+ packet->bitfields2.queue_sel = -+ queue_sel__mes_unmap_queues__unmap_all_queues; -+ break; -+ case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: -+ /* in this case, we do not preempt static queues */ -+ packet->bitfields2.queue_sel = -+ queue_sel__mes_unmap_queues__unmap_all_non_static_queues; -+ break; -+ default: -+ WARN(1, "filter %d\n", filter); -+ break; -+ } -+ -+ return 0; -+ -+} -+ -+static int pm_query_status_v9(struct packet_manager *pm, uint32_t *buffer, -+ uint64_t fence_address, uint32_t fence_value) -+{ -+ struct pm4_mes_query_status *packet; -+ -+ packet = (struct pm4_mes_query_status *)buffer; -+ memset(buffer, 0, sizeof(struct pm4_mes_query_status)); -+ -+ -+ packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, -+ sizeof(struct pm4_mes_query_status)); -+ -+ packet->bitfields2.context_id = 0; -+ packet->bitfields2.interrupt_sel = -+ interrupt_sel__mes_query_status__completion_status; -+ packet->bitfields2.command = -+ command__mes_query_status__fence_only_after_write_ack; -+ -+ packet->addr_hi = upper_32_bits((uint64_t)fence_address); -+ packet->addr_lo = lower_32_bits((uint64_t)fence_address); -+ packet->data_hi = upper_32_bits((uint64_t)fence_value); -+ packet->data_lo = lower_32_bits((uint64_t)fence_value); -+ -+ return 0; -+} -+ -+ -+static uint32_t pm_release_mem_v9(uint64_t gpu_addr, uint32_t *buffer) -+{ -+ struct pm4_mec_release_mem *packet; -+ -+ WARN_ON(!buffer); -+ -+ packet = (struct pm4_mec_release_mem *)buffer; -+ memset(buffer, 0, sizeof(struct pm4_mec_release_mem)); -+ -+ packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, -+ sizeof(struct pm4_mec_release_mem)); -+ -+ packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; -+ packet->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe; -+ 
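-+ /* flush and invalidate the TCL1/TC caches at end-of-pipe, so the
-+  * fence write below implies data visibility
-+  */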
packet->bitfields2.tcl1_action_ena = 1; -+ packet->bitfields2.tc_action_ena = 1; -+ packet->bitfields2.cache_policy = cache_policy__mec_release_mem__lru; -+ -+ packet->bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low; -+ packet->bitfields3.int_sel = -+ int_sel__mec_release_mem__send_interrupt_after_write_confirm; -+ -+ packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; -+ packet->address_hi = upper_32_bits(gpu_addr); -+ -+ packet->data_lo = 0; -+ -+ return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int); -+} -+ -+static uint32_t pm_get_map_process_packet_size_v9(void) -+{ -+ return sizeof(struct pm4_mes_map_process); -+} -+ -+static uint32_t pm_get_runlist_packet_size_v9(void) -+{ -+ return sizeof(struct pm4_mes_runlist); -+} -+ -+static uint32_t pm_get_map_queues_packet_size_v9(void) -+{ -+ return sizeof(struct pm4_mes_map_queues); -+} -+ -+static uint32_t pm_get_unmap_queues_packet_size_v9(void) -+{ -+ return sizeof(struct pm4_mes_unmap_queues); -+} -+ -+static uint32_t pm_get_query_status_packet_size_v9(void) -+{ -+ return sizeof(struct pm4_mes_query_status); -+} -+ -+static uint32_t pm_get_release_mem_packet_size_v9(void) -+{ -+ return sizeof(struct pm4_mec_release_mem); -+} -+ -+static struct packet_manager_funcs kfd_v9_pm_funcs = { -+ .map_process = pm_map_process_v9, -+ .runlist = pm_runlist_v9, -+ .set_resources = pm_set_resources_vi, -+ .map_queues = pm_map_queues_v9, -+ .unmap_queues = pm_unmap_queues_v9, -+ .query_status = pm_query_status_v9, -+ .release_mem = pm_release_mem_v9, -+ .get_map_process_packet_size = pm_get_map_process_packet_size_v9, -+ .get_runlist_packet_size = pm_get_runlist_packet_size_v9, -+ .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, -+ .get_map_queues_packet_size = pm_get_map_queues_packet_size_v9, -+ .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_v9, -+ .get_query_status_packet_size = pm_get_query_status_packet_size_v9, -+ .get_release_mem_packet_size = pm_get_release_mem_packet_size_v9, -+}; -+ -+void kfd_pm_func_init_v9(struct packet_manager *pm, uint16_t fw_ver) -+{ -+ pm->pmf = &kfd_v9_pm_funcs; -+} -+ -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c -index f1d4828..6f12fe0 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c -@@ -22,15 +22,20 @@ - */ - - #include "kfd_kernel_queue.h" -+#include "kfd_device_queue_manager.h" -+#include "kfd_pm4_headers_vi.h" -+#include "kfd_pm4_opcodes.h" - - static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, - enum kfd_queue_type type, unsigned int queue_size); - static void uninitialize_vi(struct kernel_queue *kq); -+static void submit_packet_vi(struct kernel_queue *kq); - - void kernel_queue_init_vi(struct kernel_queue_ops *ops) - { - ops->initialize = initialize_vi; - ops->uninitialize = uninitialize_vi; -+ ops->submit_packet = submit_packet_vi; - } - - static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, -@@ -54,3 +59,366 @@ static void uninitialize_vi(struct kernel_queue *kq) - { - kfd_gtt_sa_free(kq->dev, kq->eop_mem); - } -+ -+static void submit_packet_vi(struct kernel_queue *kq) -+{ -+ *kq->wptr_kernel = kq->pending_wptr; -+ write_kernel_doorbell(kq->queue->properties.doorbell_ptr, -+ kq->pending_wptr); -+} -+ -+static int pm_map_process_vi(struct packet_manager *pm, -+ uint32_t *buffer, struct qcm_process_device *qpd) -+{ -+ struct pm4_mes_map_process *packet; 
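-+ /* unlike the v9 packet above, VI passes page_table_base as a
-+  * bitfield and still programs the APE1 aperture registers
-+  */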
-+ struct queue *cur; -+ uint32_t num_queues; -+ -+ packet = (struct pm4_mes_map_process *)buffer; -+ memset(buffer, 0, sizeof(struct pm4_mes_map_process)); -+ -+ packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, -+ sizeof(struct pm4_mes_map_process)); -+ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; -+ packet->bitfields2.process_quantum = 1; -+ packet->bitfields2.pasid = qpd->pqm->process->pasid; -+ packet->bitfields3.page_table_base = qpd->page_table_base; -+ packet->bitfields10.gds_size = qpd->gds_size; -+ packet->bitfields10.num_gws = qpd->num_gws; -+ packet->bitfields10.num_oac = qpd->num_oac; -+ num_queues = 0; -+ list_for_each_entry(cur, &qpd->queues_list, list) -+ num_queues++; -+ packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : num_queues; -+ -+ packet->sh_mem_config = qpd->sh_mem_config; -+ packet->sh_mem_bases = qpd->sh_mem_bases; -+ packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; -+ packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; -+ -+ packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; -+ -+ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); -+ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); -+ -+ return 0; -+} -+ -+ -+unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size) -+{ -+ union PM4_MES_TYPE_3_HEADER header; -+ -+ header.u32All = 0; -+ header.opcode = opcode; -+ header.count = packet_size/sizeof(uint32_t) - 2; -+ header.type = PM4_TYPE_3; -+ -+ return header.u32All; -+} -+ -+int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer, -+ uint64_t ib, size_t ib_size_in_dwords, bool chain) -+{ -+ struct pm4_mes_runlist *packet; -+ -+ int concurrent_proc_cnt = 0; -+ struct kfd_dev *kfd = pm->dqm->dev; -+ -+ /* Determine the number of processes to map together to HW: -+ * it can not exceed the number of VMIDs available to the -+ * scheduler, and it is determined by the smaller of the number -+ * of processes in the runlist and kfd module parameter -+ * hws_max_conc_proc. -+ * Note: the arbitration between the number of VMIDs and -+ * hws_max_conc_proc has been done in -+ * kgd2kfd_device_init(). -+ */ -+ concurrent_proc_cnt = min(pm->dqm->processes_count, -+ kfd->max_proc_per_quantum); -+ -+ -+ packet = (struct pm4_mes_runlist *)buffer; -+ -+ memset(buffer, 0, sizeof(struct pm4_mes_runlist)); -+ packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, -+ sizeof(struct pm4_mes_runlist)); -+ -+ packet->bitfields4.ib_size = ib_size_in_dwords; -+ packet->bitfields4.chain = chain ? 
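-+ /* a chained runlist packet is appended when the runlist is
-+  * over-subscribed, looping the HWS back into this IB
-+  */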
1 : 0; -+ packet->bitfields4.offload_polling = 0; -+ packet->bitfields4.valid = 1; -+ packet->bitfields4.process_cnt = concurrent_proc_cnt; -+ packet->ordinal2 = lower_32_bits(ib); -+ packet->bitfields3.ib_base_hi = upper_32_bits(ib); -+ -+ return 0; -+} -+ -+int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, -+ struct queue *q, bool is_static) -+{ -+ struct pm4_mes_map_queues *packet; -+ bool use_static = is_static; -+ -+ packet = (struct pm4_mes_map_queues *)buffer; -+ memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); -+ -+ packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, -+ sizeof(struct pm4_mes_map_queues)); -+ packet->bitfields2.alloc_format = -+ alloc_format__mes_map_queues__one_per_pipe_vi; -+ packet->bitfields2.num_queues = 1; -+ packet->bitfields2.queue_sel = -+ queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; -+ -+ packet->bitfields2.engine_sel = -+ engine_sel__mes_map_queues__compute_vi; -+ packet->bitfields2.queue_type = -+ queue_type__mes_map_queues__normal_compute_vi; -+ -+ switch (q->properties.type) { -+ case KFD_QUEUE_TYPE_COMPUTE: -+ if (use_static) -+ packet->bitfields2.queue_type = -+ queue_type__mes_map_queues__normal_latency_static_queue_vi; -+ break; -+ case KFD_QUEUE_TYPE_DIQ: -+ packet->bitfields2.queue_type = -+ queue_type__mes_map_queues__debug_interface_queue_vi; -+ break; -+ case KFD_QUEUE_TYPE_SDMA: -+ packet->bitfields2.engine_sel = q->properties.sdma_engine_id + -+ engine_sel__mes_map_queues__sdma0_vi; -+ use_static = false; /* no static queues under SDMA */ -+ break; -+ default: -+ WARN(1, "queue type %d\n", q->properties.type); -+ break; -+ } -+ packet->bitfields3.doorbell_offset = -+ q->properties.doorbell_off; -+ -+ packet->mqd_addr_lo = -+ lower_32_bits(q->gart_mqd_addr); -+ -+ packet->mqd_addr_hi = -+ upper_32_bits(q->gart_mqd_addr); -+ -+ packet->wptr_addr_lo = -+ lower_32_bits((uint64_t)q->properties.write_ptr); -+ -+ packet->wptr_addr_hi = -+ upper_32_bits((uint64_t)q->properties.write_ptr); -+ -+ return 0; -+} -+ -+int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, -+ struct scheduling_resources *res) -+{ -+ struct pm4_mes_set_resources *packet; -+ -+ packet = (struct pm4_mes_set_resources *)buffer; -+ memset(buffer, 0, sizeof(struct pm4_mes_set_resources)); -+ -+ packet->header.u32All = pm_build_pm4_header(IT_SET_RESOURCES, -+ sizeof(struct pm4_mes_set_resources)); -+ -+ packet->bitfields2.queue_type = -+ queue_type__mes_set_resources__hsa_interface_queue_hiq; -+ packet->bitfields2.vmid_mask = res->vmid_mask; -+ packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100; -+ packet->bitfields7.oac_mask = res->oac_mask; -+ packet->bitfields8.gds_heap_base = res->gds_heap_base; -+ packet->bitfields8.gds_heap_size = res->gds_heap_size; -+ -+ packet->gws_mask_lo = lower_32_bits(res->gws_mask); -+ packet->gws_mask_hi = upper_32_bits(res->gws_mask); -+ -+ packet->queue_mask_lo = lower_32_bits(res->queue_mask); -+ packet->queue_mask_hi = upper_32_bits(res->queue_mask); -+ -+ return 0; -+} -+ -+int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, -+ enum kfd_queue_type type, -+ enum kfd_unmap_queues_filter filter, -+ uint32_t filter_param, bool reset, -+ unsigned int sdma_engine) -+{ -+ struct pm4_mes_unmap_queues *packet; -+ -+ packet = (struct pm4_mes_unmap_queues *)buffer; -+ memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); -+ -+ packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, -+ sizeof(struct pm4_mes_unmap_queues)); -+ switch (type) { -+ case 
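-+ /* compute and DIQ queues are unmapped via the compute engine */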
KFD_QUEUE_TYPE_COMPUTE: -+ case KFD_QUEUE_TYPE_DIQ: -+ packet->bitfields2.engine_sel = -+ engine_sel__mes_unmap_queues__compute; -+ break; -+ case KFD_QUEUE_TYPE_SDMA: -+ packet->bitfields2.engine_sel = -+ engine_sel__mes_unmap_queues__sdma0 + sdma_engine; -+ break; -+ default: -+ WARN(1, "queue type %d\n", type); -+ break; -+ } -+ -+ if (reset) -+ packet->bitfields2.action = -+ action__mes_unmap_queues__reset_queues; -+ else -+ packet->bitfields2.action = -+ action__mes_unmap_queues__preempt_queues; -+ -+ switch (filter) { -+ case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: -+ packet->bitfields2.queue_sel = -+ queue_sel__mes_unmap_queues__perform_request_on_specified_queues; -+ packet->bitfields2.num_queues = 1; -+ packet->bitfields3b.doorbell_offset0 = filter_param; -+ break; -+ case KFD_UNMAP_QUEUES_FILTER_BY_PASID: -+ packet->bitfields2.queue_sel = -+ queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; -+ packet->bitfields3a.pasid = filter_param; -+ break; -+ case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: -+ packet->bitfields2.queue_sel = -+ queue_sel__mes_unmap_queues__unmap_all_queues; -+ break; -+ case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: -+ /* in this case, we do not preempt static queues */ -+ packet->bitfields2.queue_sel = -+ queue_sel__mes_unmap_queues__unmap_all_non_static_queues; -+ break; -+ default: -+ WARN(1, "filter %d\n", filter); -+ break; -+ } -+ -+ return 0; -+ -+} -+ -+int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer, -+ uint64_t fence_address, uint32_t fence_value) -+{ -+ struct pm4_mes_query_status *packet; -+ -+ packet = (struct pm4_mes_query_status *)buffer; -+ memset(buffer, 0, sizeof(struct pm4_mes_query_status)); -+ -+ -+ packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, -+ sizeof(struct pm4_mes_query_status)); -+ -+ packet->bitfields2.context_id = 0; -+ packet->bitfields2.interrupt_sel = -+ interrupt_sel__mes_query_status__completion_status; -+ packet->bitfields2.command = -+ command__mes_query_status__fence_only_after_write_ack; -+ -+ packet->addr_hi = upper_32_bits((uint64_t)fence_address); -+ packet->addr_lo = lower_32_bits((uint64_t)fence_address); -+ packet->data_hi = upper_32_bits((uint64_t)fence_value); -+ packet->data_lo = lower_32_bits((uint64_t)fence_value); -+ -+ return 0; -+} -+ -+ -+uint32_t pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer) -+{ -+ struct pm4_mec_release_mem *packet; -+ -+ WARN_ON(!buffer); -+ -+ packet = (struct pm4_mec_release_mem *)buffer; -+ memset(buffer, 0, sizeof(struct pm4_mec_release_mem)); -+ -+ packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, -+ sizeof(struct pm4_mec_release_mem)); -+ -+ packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; -+ packet->bitfields2.event_index = event_index___release_mem__end_of_pipe; -+ packet->bitfields2.tcl1_action_ena = 1; -+ packet->bitfields2.tc_action_ena = 1; -+ packet->bitfields2.cache_policy = cache_policy___release_mem__lru; -+ packet->bitfields2.atc = 0; -+ -+ packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low; -+ packet->bitfields3.int_sel = -+ int_sel___release_mem__send_interrupt_after_write_confirm; -+ -+ packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; -+ packet->address_hi = upper_32_bits(gpu_addr); -+ -+ packet->data_lo = 0; -+ -+ return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int); -+} -+ -+uint32_t pm_get_map_process_packet_size_vi(void) -+{ -+ return sizeof(struct pm4_mes_map_process); -+} -+ -+uint32_t pm_get_runlist_packet_size_vi(void) -+{ -+ return sizeof(struct 
pm4_mes_runlist); -+} -+ -+uint32_t pm_get_set_resources_packet_size_vi(void) -+{ -+ return sizeof(struct pm4_mes_set_resources); -+} -+ -+uint32_t pm_get_map_queues_packet_size_vi(void) -+{ -+ return sizeof(struct pm4_mes_map_queues); -+} -+ -+uint32_t pm_get_unmap_queues_packet_size_vi(void) -+{ -+ return sizeof(struct pm4_mes_unmap_queues); -+} -+ -+uint32_t pm_get_query_status_packet_size_vi(void) -+{ -+ return sizeof(struct pm4_mes_query_status); -+} -+ -+uint32_t pm_get_release_mem_packet_size_vi(void) -+{ -+ return sizeof(struct pm4_mec_release_mem); -+} -+ -+ -+static struct packet_manager_funcs kfd_vi_pm_funcs = { -+ .map_process = pm_map_process_vi, -+ .runlist = pm_runlist_vi, -+ .set_resources = pm_set_resources_vi, -+ .map_queues = pm_map_queues_vi, -+ .unmap_queues = pm_unmap_queues_vi, -+ .query_status = pm_query_status_vi, -+ .release_mem = pm_release_mem_vi, -+ .get_map_process_packet_size = pm_get_map_process_packet_size_vi, -+ .get_runlist_packet_size = pm_get_runlist_packet_size_vi, -+ .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, -+ .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi, -+ .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi, -+ .get_query_status_packet_size = pm_get_query_status_packet_size_vi, -+ .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi, -+}; -+ -+void kfd_pm_func_init_vi(struct packet_manager *pm, uint16_t fw_ver) -+{ -+ pm->pmf = &kfd_vi_pm_funcs; -+} -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c -index 0980995..046282a 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c -@@ -80,6 +80,8 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, - case CHIP_POLARIS10: - case CHIP_POLARIS11: - return mqd_manager_init_vi_tonga(type, dev); -+ case CHIP_VEGA10: -+ return mqd_manager_init_v9(type, dev); - } - - return NULL; -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c -new file mode 100644 -index 0000000..3caeb6e ---- /dev/null -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c -@@ -0,0 +1,509 @@ -+/* -+ * Copyright 2016 Advanced Micro Devices, Inc. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. 
-+ * -+ */ -+ -+#include <linux/printk.h> -+#include <linux/slab.h> -+#include "kfd_priv.h" -+#include "kfd_mqd_manager.h" -+#include "v9_structs.h" -+#include "vega10/GC/gc_9_0_offset.h" -+#include "vega10/GC/gc_9_0_sh_mask.h" -+#include "vega10/SDMA0/sdma0_4_0_sh_mask.h" -+ -+static inline struct v9_mqd *get_mqd(void *mqd) -+{ -+ return (struct v9_mqd *)mqd; -+} -+ -+static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) -+{ -+ return (struct v9_sdma_mqd *)mqd; -+} -+ -+static void update_cu_mask(struct mqd_manager *mm, void *mqd, -+ struct queue_properties *q) -+{ -+ struct v9_mqd *m; -+ struct kfd_cu_info cu_info; -+ uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ -+ uint32_t cu_mask_count = q->cu_mask_count; -+ const uint32_t *cu_mask = q->cu_mask; -+ int se, cu_per_sh, cu_index, i; -+ -+ if (cu_mask_count == 0) -+ return; -+ -+ m = get_mqd(mqd); -+ m->compute_static_thread_mgmt_se0 = 0; -+ m->compute_static_thread_mgmt_se1 = 0; -+ m->compute_static_thread_mgmt_se2 = 0; -+ m->compute_static_thread_mgmt_se3 = 0; -+ -+ mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); -+ -+ /* If # CU mask bits > # CUs, set it to the # of CUs */ -+ if (cu_mask_count > cu_info.cu_active_number) -+ cu_mask_count = cu_info.cu_active_number; -+ -+ cu_index = 0; -+ for (se = 0; se < cu_info.num_shader_engines; se++) { -+ cu_per_sh = 0; -+ -+ /* Get the number of CUs on this Shader Engine */ -+ for (i = 0; i < 4; i++) -+ cu_per_sh += hweight32(cu_info.cu_bitmap[se][i]); -+ -+ se_mask[se] = cu_mask[cu_index / 32] >> (cu_index % 32); -+ if ((cu_per_sh + (cu_index % 32)) > 32) -+ se_mask[se] |= cu_mask[(cu_index / 32) + 1] -+ << (32 - (cu_index % 32)); -+ se_mask[se] &= (1 << cu_per_sh) - 1; -+ cu_index += cu_per_sh; -+ } -+ m->compute_static_thread_mgmt_se0 = se_mask[0]; -+ m->compute_static_thread_mgmt_se1 = se_mask[1]; -+ m->compute_static_thread_mgmt_se2 = se_mask[2]; -+ m->compute_static_thread_mgmt_se3 = se_mask[3]; -+ -+ pr_debug("kfd: update cu mask to %#x %#x %#x %#x\n", -+ m->compute_static_thread_mgmt_se0, -+ m->compute_static_thread_mgmt_se1, -+ m->compute_static_thread_mgmt_se2, -+ m->compute_static_thread_mgmt_se3); -+} -+ -+static int init_mqd(struct mqd_manager *mm, void **mqd, -+ struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, -+ struct queue_properties *q) -+{ -+ int retval; -+ uint64_t addr; -+ struct v9_mqd *m; -+ struct kfd_dev *kfd = mm->dev; -+ -+ /* From V9, for CWSR, the control stack is located on the next page -+ * boundary after the mqd, we will use the gtt allocation function -+ * instead of sub-allocation function. 
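-+ * The single GTT buffer allocated below therefore holds both
-+ * regions, each padded out to a page boundary.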
-+ */ -+ if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) { -+ *mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); -+ retval = kfd->kfd2kgd->init_gtt_mem_allocation(kfd->kgd, -+ ALIGN(q->ctl_stack_size, PAGE_SIZE) + -+ ALIGN(sizeof(struct v9_mqd), PAGE_SIZE), -+ &((*mqd_mem_obj)->gtt_mem), -+ &((*mqd_mem_obj)->gpu_addr), -+ (void *)&((*mqd_mem_obj)->cpu_ptr)); -+ } else -+ retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd), -+ mqd_mem_obj); -+ if (retval != 0) -+ return -ENOMEM; -+ -+ m = (struct v9_mqd *) (*mqd_mem_obj)->cpu_ptr; -+ addr = (*mqd_mem_obj)->gpu_addr; -+ -+ memset(m, 0, sizeof(struct v9_mqd)); -+ -+ m->header = 0xC0310800; -+ m->compute_pipelinestat_enable = 1; -+ m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; -+ m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; -+ m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; -+ m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; -+ -+ m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | -+ 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; -+ -+ m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; -+ -+ m->cp_mqd_base_addr_lo = lower_32_bits(addr); -+ m->cp_mqd_base_addr_hi = upper_32_bits(addr); -+ -+ m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | -+ 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | -+ 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; -+ -+ m->cp_hqd_pipe_priority = 1; -+ m->cp_hqd_queue_priority = 15; -+ -+ if (q->format == KFD_QUEUE_FORMAT_AQL) { -+ m->cp_hqd_aql_control = -+ 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT | -+ 1 << CP_HQD_AQL_CONTROL__CONTROL0_EN__SHIFT; -+ } -+ -+ if (q->tba_addr) { -+ m->compute_pgm_rsrc2 |= -+ (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); -+ } -+ -+ if (mm->dev->cwsr_enabled) { -+ m->cp_hqd_persistent_state |= -+ (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); -+ m->cp_hqd_ctx_save_base_addr_lo = -+ lower_32_bits(q->ctx_save_restore_area_address); -+ m->cp_hqd_ctx_save_base_addr_hi = -+ upper_32_bits(q->ctx_save_restore_area_address); -+ m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; -+ m->cp_hqd_cntl_stack_size = q->ctl_stack_size; -+ m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; -+ m->cp_hqd_wg_state_offset = 0; -+ } -+ -+ *mqd = m; -+ if (gart_addr != NULL) -+ *gart_addr = addr; -+ retval = mm->update_mqd(mm, m, q); -+ -+ return retval; -+} -+ -+static int load_mqd(struct mqd_manager *mm, void *mqd, -+ uint32_t pipe_id, uint32_t queue_id, -+ struct queue_properties *p, struct mm_struct *mms) -+{ -+ /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ -+ uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 
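-+ /* 2^4 dwords = 64 bytes, the size of one AQL packet */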
4 : 0); -+ -+ return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, -+ (uint32_t __user *)p->write_ptr, -+ wptr_shift, 0, mms); -+} -+ -+static int update_mqd(struct mqd_manager *mm, void *mqd, -+ struct queue_properties *q) -+{ -+ struct v9_mqd *m; -+ -+ pr_debug("kfd: In func %s\n", __func__); -+ -+ m = get_mqd(mqd); -+ -+ m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; -+ m->cp_hqd_pq_control |= -+ ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; -+ pr_debug("kfd: cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); -+ -+ m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); -+ m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); -+ -+ m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); -+ m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); -+ -+ m->cp_hqd_pq_doorbell_control = -+ 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_EN__SHIFT | -+ q->doorbell_off << -+ CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; -+ pr_debug("kfd: cp_hqd_pq_doorbell_control 0x%x\n", -+ m->cp_hqd_pq_doorbell_control); -+ -+ m->cp_hqd_ib_control = 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT; -+ -+ /* -+ * HW does not clamp this field correctly. Maximum EOP queue size -+ * is constrained by per-SE EOP done signal count, which is 8-bit. -+ * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit -+ * more than (EOP entry count - 1) so a queue size of 0x800 dwords -+ * is safe, giving a maximum field value of 0xA. -+ */ -+ m->cp_hqd_eop_control = min(0xA, -+ ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1); -+ m->cp_hqd_eop_base_addr_lo = -+ lower_32_bits(q->eop_ring_buffer_address >> 8); -+ m->cp_hqd_eop_base_addr_hi = -+ upper_32_bits(q->eop_ring_buffer_address >> 8); -+ -+ m->cp_hqd_iq_timer = 0; -+ -+ m->cp_hqd_vmid = q->vmid; -+ -+ if (q->format == KFD_QUEUE_FORMAT_AQL) { -+ m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | -+ 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT | -+ 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT | -+ 1 << CP_HQD_PQ_CONTROL__WPP_CLAMP_EN__SHIFT; -+ m->cp_hqd_pq_doorbell_control |= -+ 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT; -+ } -+ if (mm->dev->cwsr_enabled) -+ m->cp_hqd_ctx_save_control = 0; -+ -+ update_cu_mask(mm, mqd, q); -+ -+ m->cp_hqd_active = 0; -+ q->is_active = false; -+ if (q->queue_size > 0 && -+ q->queue_address != 0 && -+ q->queue_percent > 0 && -+ !q->is_evicted) { -+ m->cp_hqd_active = 1; -+ q->is_active = true; -+ } -+ -+ return 0; -+} -+ -+ -+static int destroy_mqd(struct mqd_manager *mm, void *mqd, -+ enum kfd_preempt_type type, -+ unsigned int timeout, uint32_t pipe_id, -+ uint32_t queue_id) -+{ -+ return mm->dev->kfd2kgd->hqd_destroy -+ (mm->dev->kgd, type, timeout, -+ pipe_id, queue_id); -+} -+ -+static void uninit_mqd(struct mqd_manager *mm, void *mqd, -+ struct kfd_mem_obj *mqd_mem_obj) -+{ -+ struct kfd_dev *kfd = mm->dev; -+ -+ if (mqd_mem_obj->gtt_mem) { -+ kfd->kfd2kgd->free_gtt_mem(kfd->kgd, mqd_mem_obj->gtt_mem); -+ kfree(mqd_mem_obj); -+ } else { -+ kfd_gtt_sa_free(mm->dev, mqd_mem_obj); -+ } -+} -+ -+static bool is_occupied(struct mqd_manager *mm, void *mqd, -+ uint64_t queue_address, uint32_t pipe_id, -+ uint32_t queue_id) -+{ -+ return mm->dev->kfd2kgd->hqd_is_occupied( -+ mm->dev->kgd, queue_address, -+ pipe_id, queue_id); -+} -+ -+static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, -+ struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, -+ struct queue_properties 
*q)
-+{
-+ struct v9_mqd *m;
-+ int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q);
-+
-+ if (retval != 0)
-+ return retval;
-+
-+ m = get_mqd(*mqd);
-+
-+ m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT |
-+ 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT;
-+
-+ return retval;
-+}
-+
-+static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
-+ struct queue_properties *q)
-+{
-+ struct v9_mqd *m;
-+ int retval = update_mqd(mm, mqd, q);
-+
-+ if (retval != 0)
-+ return retval;
-+
-+ /* TODO: what's the point? update_mqd already does this. */
-+ m = get_mqd(mqd);
-+ m->cp_hqd_vmid = q->vmid;
-+ return retval;
-+}
-+
-+static int init_mqd_sdma(struct mqd_manager *mm, void **mqd,
-+ struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
-+ struct queue_properties *q)
-+{
-+ int retval;
-+ struct v9_sdma_mqd *m;
-+
-+
-+ retval = kfd_gtt_sa_allocate(mm->dev,
-+ sizeof(struct v9_sdma_mqd),
-+ mqd_mem_obj);
-+
-+ if (retval != 0)
-+ return -ENOMEM;
-+
-+ m = (struct v9_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr;
-+
-+ memset(m, 0, sizeof(struct v9_sdma_mqd));
-+
-+ *mqd = m;
-+ if (gart_addr != NULL)
-+ *gart_addr = (*mqd_mem_obj)->gpu_addr;
-+
-+ retval = mm->update_mqd(mm, m, q);
-+
-+ return retval;
-+}
-+
-+static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd,
-+ struct kfd_mem_obj *mqd_mem_obj)
-+{
-+ kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
-+}
-+
-+static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
-+ uint32_t pipe_id, uint32_t queue_id,
-+ struct queue_properties *p, struct mm_struct *mms)
-+{
-+ return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd,
-+ (uint32_t __user *)p->write_ptr,
-+ mms);
-+}
-+
-+#define SDMA_RLC_DUMMY_DEFAULT 0xf
-+
-+static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
-+ struct queue_properties *q)
-+{
-+ struct v9_sdma_mqd *m;
-+
-+ m = get_sdma_mqd(mqd);
-+ m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1)
-+ << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT |
-+ q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT |
-+ 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT |
-+ 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT;
-+
-+ m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8);
-+ m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8);
-+ m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
-+ m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
-+ m->sdmax_rlcx_doorbell = 1 << SDMA0_RLC0_DOORBELL__ENABLE__SHIFT;
-+ m->sdmax_rlcx_doorbell_offset =
-+ q->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT;
-+
-+ m->sdma_engine_id = q->sdma_engine_id;
-+ m->sdma_queue_id = q->sdma_queue_id;
-+ m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT;
-+
-+ q->is_active = false;
-+ if (q->queue_size > 0 &&
-+ q->queue_address != 0 &&
-+ q->queue_percent > 0 &&
-+ !q->is_evicted) {
-+ m->sdmax_rlcx_rb_cntl |=
-+ 1 << SDMA0_RLC0_RB_CNTL__RB_ENABLE__SHIFT;
-+
-+ q->is_active = true;
-+ }
-+
-+ return 0;
-+}
-+
-+/*
-+ * The preempt type is ignored here because there is only one way
-+ * to preempt an SDMA queue.
-+ */
-+static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd,
-+ enum kfd_preempt_type type,
-+ unsigned int timeout, uint32_t pipe_id,
-+ uint32_t queue_id)
-+{
-+ return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout);
-+}
-+
-+static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd,
-+ uint64_t queue_address, uint32_t pipe_id,
-+ uint32_t queue_id)
-+{
-+ return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd);
-+} -+ -+#if defined(CONFIG_DEBUG_FS) -+ -+static int debugfs_show_mqd(struct seq_file *m, void *data) -+{ -+ seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, -+ data, sizeof(struct v9_mqd), false); -+ return 0; -+} -+ -+static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) -+{ -+ seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, -+ data, sizeof(struct v9_sdma_mqd), false); -+ return 0; -+} -+ -+#endif -+ -+struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, -+ struct kfd_dev *dev) -+{ -+ struct mqd_manager *mqd; -+ -+ if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) -+ return NULL; -+ -+ pr_debug("kfd: In func %s\n", __func__); -+ -+ mqd = kzalloc(sizeof(struct mqd_manager), GFP_KERNEL); -+ if (!mqd) -+ return NULL; -+ -+ mqd->dev = dev; -+ -+ switch (type) { -+ case KFD_MQD_TYPE_CP: -+ case KFD_MQD_TYPE_COMPUTE: -+ mqd->init_mqd = init_mqd; -+ mqd->uninit_mqd = uninit_mqd; -+ mqd->load_mqd = load_mqd; -+ mqd->update_mqd = update_mqd; -+ mqd->destroy_mqd = destroy_mqd; -+ mqd->is_occupied = is_occupied; -+#if defined(CONFIG_DEBUG_FS) -+ mqd->debugfs_show_mqd = debugfs_show_mqd; -+#endif -+ break; -+ case KFD_MQD_TYPE_HIQ: -+ mqd->init_mqd = init_mqd_hiq; -+ mqd->uninit_mqd = uninit_mqd; -+ mqd->load_mqd = load_mqd; -+ mqd->update_mqd = update_mqd_hiq; -+ mqd->destroy_mqd = destroy_mqd; -+ mqd->is_occupied = is_occupied; -+#if defined(CONFIG_DEBUG_FS) -+ mqd->debugfs_show_mqd = debugfs_show_mqd; -+#endif -+ break; -+ case KFD_MQD_TYPE_SDMA: -+ mqd->init_mqd = init_mqd_sdma; -+ mqd->uninit_mqd = uninit_mqd_sdma; -+ mqd->load_mqd = load_mqd_sdma; -+ mqd->update_mqd = update_mqd_sdma; -+ mqd->destroy_mqd = destroy_mqd_sdma; -+ mqd->is_occupied = is_occupied_sdma; -+#if defined(CONFIG_DEBUG_FS) -+ mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; -+#endif -+ break; -+ default: -+ kfree(mqd); -+ return NULL; -+ } -+ -+ return mqd; -+} -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c -index 6361c2e..f7c99ad 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c -@@ -26,8 +26,6 @@ - #include "kfd_device_queue_manager.h" - #include "kfd_kernel_queue.h" - #include "kfd_priv.h" --#include "kfd_pm4_headers.h" --#include "kfd_pm4_headers_vi.h" - #include "kfd_pm4_opcodes.h" - - static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, -@@ -39,18 +37,6 @@ static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, - *wptr = temp; - } - --static unsigned int build_pm4_header(unsigned int opcode, size_t packet_size) --{ -- union PM4_MES_TYPE_3_HEADER header; -- -- header.u32all = 0; -- header.opcode = opcode; -- header.count = packet_size/sizeof(uint32_t) - 2; -- header.type = PM4_TYPE_3; -- -- return header.u32all; --} -- - static void pm_calc_rlib_size(struct packet_manager *pm, - unsigned int *rlib_size, - bool *over_subscription) -@@ -84,9 +70,7 @@ static void pm_calc_rlib_size(struct packet_manager *pm, - pr_debug("kfd: over subscribed runlist\n"); - } - -- map_queue_size = KFD_IS_VI(pm->dqm->dev->device_info->asic_family) ? 
-- sizeof(struct pm4_mes_map_queues) : -- sizeof(struct pm4_map_queues); -+ map_queue_size = pm->pmf->get_map_queues_packet_size(); - /* calculate run list ib allocation size */ - *rlib_size = process_count * pm->pmf->get_map_process_packet_size() + - queue_count * map_queue_size; -@@ -96,7 +80,7 @@ static void pm_calc_rlib_size(struct packet_manager *pm, - * when over subscription - */ - if (*over_subscription) -- *rlib_size += sizeof(struct pm4_runlist); -+ *rlib_size += pm->pmf->get_runlist_packet_size(); - - pr_debug("kfd: runlist ib size %d\n", *rlib_size); - } -@@ -136,296 +120,6 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, - return retval; - } - --static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer, -- uint64_t ib, size_t ib_size_in_dwords, bool chain) --{ -- struct pm4_runlist *packet; -- int concurrent_proc_cnt = 0; -- struct kfd_dev *kfd = pm->dqm->dev; -- -- BUG_ON(!pm || !buffer || !ib); -- -- /* Determine the number of processes to map together to HW: -- * it can not exceed the number of VMIDs available to the -- * scheduler, and it is determined by the smaller of the number -- * of processes in the runlist and kfd module parameter -- * hws_max_conc_proc. -- * Note: the arbitration between the number of VMIDs and -- * hws_max_conc_proc has been done in -- * kgd2kfd_device_init(). -- */ -- concurrent_proc_cnt = min(pm->dqm->processes_count, -- kfd->max_proc_per_quantum); -- -- -- packet = (struct pm4_runlist *)buffer; -- -- memset(buffer, 0, sizeof(struct pm4_runlist)); -- packet->header.u32all = build_pm4_header(IT_RUN_LIST, -- sizeof(struct pm4_runlist)); -- -- packet->bitfields4.ib_size = ib_size_in_dwords; -- packet->bitfields4.chain = chain ? 1 : 0; -- packet->bitfields4.offload_polling = 0; -- packet->bitfields4.valid = 1; -- packet->bitfields4.process_cnt = concurrent_proc_cnt; -- packet->ordinal2 = lower_32_bits(ib); -- packet->bitfields3.ib_base_hi = upper_32_bits(ib); -- -- return 0; --} -- --static int pm_create_map_process(struct packet_manager *pm, uint32_t *buffer, -- struct qcm_process_device *qpd) --{ -- struct pm4_map_process *packet; -- struct queue *cur; -- uint32_t num_queues; -- -- BUG_ON(!pm || !buffer || !qpd); -- -- packet = (struct pm4_map_process *)buffer; -- -- pr_debug("kfd: In func %s\n", __func__); -- -- memset(buffer, 0, sizeof(struct pm4_map_process)); -- -- packet->header.u32all = build_pm4_header(IT_MAP_PROCESS, -- sizeof(struct pm4_map_process)); -- packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; -- packet->bitfields2.process_quantum = 1; -- packet->bitfields2.pasid = qpd->pqm->process->pasid; -- packet->bitfields3.page_table_base = qpd->page_table_base; -- packet->bitfields10.gds_size = qpd->gds_size; -- packet->bitfields10.num_gws = qpd->num_gws; -- packet->bitfields10.num_oac = qpd->num_oac; -- num_queues = 0; -- list_for_each_entry(cur, &qpd->queues_list, list) -- num_queues++; -- packet->bitfields10.num_queues = (qpd->is_debug) ? 
0 : num_queues; -- -- packet->sh_mem_config = qpd->sh_mem_config; -- packet->sh_mem_bases = qpd->sh_mem_bases; -- packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; -- packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; -- -- packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); -- packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); -- -- return 0; --} -- --static int pm_create_map_process_scratch_kv(struct packet_manager *pm, -- uint32_t *buffer, struct qcm_process_device *qpd) --{ -- struct pm4_map_process_scratch_kv *packet; -- struct queue *cur; -- uint32_t num_queues; -- -- BUG_ON(!pm || !buffer || !qpd); -- -- packet = (struct pm4_map_process_scratch_kv *)buffer; -- -- pr_debug("kfd: In func %s\n", __func__); -- -- memset(buffer, 0, sizeof(struct pm4_map_process_scratch_kv)); -- -- packet->header.u32all = build_pm4_header(IT_MAP_PROCESS, -- sizeof(struct pm4_map_process_scratch_kv)); -- packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; -- packet->bitfields2.process_quantum = 1; -- packet->bitfields2.pasid = qpd->pqm->process->pasid; -- packet->bitfields3.page_table_base = qpd->page_table_base; -- packet->bitfields14.gds_size = qpd->gds_size; -- packet->bitfields14.num_gws = qpd->num_gws; -- packet->bitfields14.num_oac = qpd->num_oac; -- num_queues = 0; -- list_for_each_entry(cur, &qpd->queues_list, list) -- num_queues++; -- packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : num_queues; -- -- packet->sh_mem_config = qpd->sh_mem_config; -- packet->sh_mem_bases = qpd->sh_mem_bases; -- packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; -- packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; -- -- packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; -- -- packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); -- packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); -- -- return 0; --} -- --static int pm_create_map_process_scratch(struct packet_manager *pm, -- uint32_t *buffer, struct qcm_process_device *qpd) --{ -- struct pm4_map_process_scratch *packet; -- struct queue *cur; -- uint32_t num_queues; -- -- BUG_ON(!pm || !buffer || !qpd); -- -- packet = (struct pm4_map_process_scratch *)buffer; -- -- pr_debug("kfd: In func %s\n", __func__); -- -- memset(buffer, 0, sizeof(struct pm4_map_process_scratch)); -- -- packet->header.u32all = build_pm4_header(IT_MAP_PROCESS, -- sizeof(struct pm4_map_process_scratch)); -- packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; -- packet->bitfields2.process_quantum = 1; -- packet->bitfields2.pasid = qpd->pqm->process->pasid; -- packet->bitfields3.page_table_base = qpd->page_table_base; -- packet->bitfields10.gds_size = qpd->gds_size; -- packet->bitfields10.num_gws = qpd->num_gws; -- packet->bitfields10.num_oac = qpd->num_oac; -- num_queues = 0; -- list_for_each_entry(cur, &qpd->queues_list, list) -- num_queues++; -- packet->bitfields10.num_queues = (qpd->is_debug) ? 
0 : num_queues; -- -- packet->sh_mem_config = qpd->sh_mem_config; -- packet->sh_mem_bases = qpd->sh_mem_bases; -- packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; -- packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; -- -- packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; -- -- packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); -- packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); -- -- return 0; --} -- --static int pm_create_map_queue_vi(struct packet_manager *pm, uint32_t *buffer, -- struct queue *q, bool is_static) --{ -- struct pm4_mes_map_queues *packet; -- bool use_static = is_static; -- -- BUG_ON(!pm || !buffer || !q); -- -- pr_debug("kfd: In func %s\n", __func__); -- -- packet = (struct pm4_mes_map_queues *)buffer; -- memset(buffer, 0, sizeof(struct pm4_map_queues)); -- -- packet->header.u32all = build_pm4_header(IT_MAP_QUEUES, -- sizeof(struct pm4_map_queues)); -- packet->bitfields2.alloc_format = -- alloc_format__mes_map_queues__one_per_pipe_vi; -- packet->bitfields2.num_queues = 1; -- packet->bitfields2.queue_sel = -- queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; -- -- packet->bitfields2.engine_sel = -- engine_sel__mes_map_queues__compute_vi; -- packet->bitfields2.queue_type = -- queue_type__mes_map_queues__normal_compute_vi; -- -- switch (q->properties.type) { -- case KFD_QUEUE_TYPE_COMPUTE: -- if (use_static) -- packet->bitfields2.queue_type = -- queue_type__mes_map_queues__normal_latency_static_queue_vi; -- break; -- case KFD_QUEUE_TYPE_DIQ: -- packet->bitfields2.queue_type = -- queue_type__mes_map_queues__debug_interface_queue_vi; -- break; -- case KFD_QUEUE_TYPE_SDMA: -- packet->bitfields2.engine_sel = q->properties.sdma_engine_id + -- engine_sel__mes_map_queues__sdma0_vi; -- use_static = false; /* no static queues under SDMA */ -- break; -- default: -- pr_err("kfd: in %s queue type %d\n", __func__, -- q->properties.type); -- BUG(); -- break; -- } -- packet->bitfields3.doorbell_offset = -- q->properties.doorbell_off; -- -- packet->mqd_addr_lo = -- lower_32_bits(q->gart_mqd_addr); -- -- packet->mqd_addr_hi = -- upper_32_bits(q->gart_mqd_addr); -- -- packet->wptr_addr_lo = -- lower_32_bits((uint64_t)q->properties.write_ptr); -- -- packet->wptr_addr_hi = -- upper_32_bits((uint64_t)q->properties.write_ptr); -- -- return 0; --} -- --static int pm_create_map_queue(struct packet_manager *pm, uint32_t *buffer, -- struct queue *q, bool is_static) --{ -- struct pm4_map_queues *packet; -- bool use_static = is_static; -- -- BUG_ON(!pm || !buffer || !q); -- -- pr_debug("kfd: In func %s\n", __func__); -- -- packet = (struct pm4_map_queues *)buffer; -- memset(buffer, 0, sizeof(struct pm4_map_queues)); -- -- packet->header.u32all = build_pm4_header(IT_MAP_QUEUES, -- sizeof(struct pm4_map_queues)); -- packet->bitfields2.alloc_format = -- alloc_format__mes_map_queues__one_per_pipe; -- packet->bitfields2.num_queues = 1; -- packet->bitfields2.queue_sel = -- queue_sel__mes_map_queues__map_to_hws_determined_queue_slots; -- -- packet->bitfields2.vidmem = (q->properties.is_interop) ? 
-- vidmem__mes_map_queues__uses_video_memory : -- vidmem__mes_map_queues__uses_no_video_memory; -- -- switch (q->properties.type) { -- case KFD_QUEUE_TYPE_COMPUTE: -- case KFD_QUEUE_TYPE_DIQ: -- packet->bitfields2.engine_sel = -- engine_sel__mes_map_queues__compute; -- break; -- case KFD_QUEUE_TYPE_SDMA: -- packet->bitfields2.engine_sel = q->properties.sdma_engine_id + -- engine_sel__mes_map_queues__sdma0; -- use_static = false; /* no static queues under SDMA */ -- break; -- default: -- BUG(); -- break; -- } -- -- packet->mes_map_queues_ordinals[0].bitfields3.doorbell_offset = -- q->properties.doorbell_off; -- -- packet->mes_map_queues_ordinals[0].bitfields3.is_static = -- (use_static) ? 1 : 0; -- -- packet->mes_map_queues_ordinals[0].mqd_addr_lo = -- lower_32_bits(q->gart_mqd_addr); -- -- packet->mes_map_queues_ordinals[0].mqd_addr_hi = -- upper_32_bits(q->gart_mqd_addr); -- -- packet->mes_map_queues_ordinals[0].wptr_addr_lo = -- lower_32_bits((uint64_t)q->properties.write_ptr); -- -- packet->mes_map_queues_ordinals[0].wptr_addr_hi = -- upper_32_bits((uint64_t)q->properties.write_ptr); -- -- return 0; --} -- - static int pm_create_runlist_ib(struct packet_manager *pm, - struct list_head *queues, - uint64_t *rl_gpu_addr, -@@ -481,13 +175,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, - pr_debug("kfd: static_queue, mapping kernel q %d, is debug status %d\n", - kq->queue->queue, qpd->is_debug); - -- if (KFD_IS_VI(pm->dqm->dev->device_info->asic_family)) -- retval = pm_create_map_queue_vi(pm, -- &rl_buffer[rl_wptr], -- kq->queue, -- qpd->is_debug); -- else -- retval = pm_create_map_queue(pm, -+ retval = pm->pmf->map_queues(pm, - &rl_buffer[rl_wptr], - kq->queue, - qpd->is_debug); -@@ -495,7 +183,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, - return retval; - - inc_wptr(&rl_wptr, -- sizeof(struct pm4_map_queues), -+ pm->pmf->get_map_queues_packet_size(), - alloc_size_bytes); - } - -@@ -506,22 +194,15 @@ static int pm_create_runlist_ib(struct packet_manager *pm, - pr_debug("kfd: static_queue, mapping user queue %d, is debug status %d\n", - q->queue, qpd->is_debug); - -- if (KFD_IS_VI(pm->dqm->dev->device_info->asic_family)) -- retval = pm_create_map_queue_vi(pm, -- &rl_buffer[rl_wptr], -- q, -- qpd->is_debug); -- else -- retval = pm_create_map_queue(pm, -+ retval = pm->pmf->map_queues(pm, - &rl_buffer[rl_wptr], - q, - qpd->is_debug); -- - if (retval != 0) - return retval; - - inc_wptr(&rl_wptr, -- sizeof(struct pm4_map_queues), -+ pm->pmf->get_map_queues_packet_size(), - alloc_size_bytes); - } - } -@@ -529,68 +210,17 @@ static int pm_create_runlist_ib(struct packet_manager *pm, - pr_debug("kfd: finished map process and queues to runlist\n"); - - if (is_over_subscription) -- pm_create_runlist(pm, &rl_buffer[rl_wptr], *rl_gpu_addr, -+ pm->pmf->runlist(pm, &rl_buffer[rl_wptr], *rl_gpu_addr, - alloc_size_bytes / sizeof(uint32_t), true); - - for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++) - pr_debug("0x%2X ", rl_buffer[i]); -+ - pr_debug("\n"); - - return 0; - } - --static int get_map_process_packet_size(void) --{ -- return sizeof(struct pm4_map_process); --} -- --static int get_map_process_packet_size_scratch_kv(void) --{ -- return sizeof(struct pm4_map_process_scratch_kv); --} -- --static int get_map_process_packet_size_scratch(void) --{ -- return sizeof(struct pm4_map_process_scratch); --} -- --/* pm_create_release_mem - Create a RELEASE_MEM packet and return the size -- * of this packet -- * @gpu_addr - GPU address of the packet. 
It's a virtual address. -- * @buffer - buffer to fill up with the packet. It's a CPU kernel pointer -- * Return - length of the packet -- */ --uint32_t pm_create_release_mem(uint64_t gpu_addr, uint32_t *buffer) --{ -- struct pm4__release_mem *packet; -- -- WARN_ON(!buffer); -- -- packet = (struct pm4__release_mem *)buffer; -- memset(buffer, 0, sizeof(struct pm4__release_mem)); -- -- packet->header.u32all = build_pm4_header(IT_RELEASE_MEM, -- sizeof(struct pm4__release_mem)); -- -- packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; -- packet->bitfields2.event_index = event_index___release_mem__end_of_pipe; -- packet->bitfields2.tcl1_action_ena = 1; -- packet->bitfields2.tc_action_ena = 1; -- packet->bitfields2.cache_policy = cache_policy___release_mem__lru; -- packet->bitfields2.atc = 0; -- -- packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low; -- packet->bitfields3.int_sel = -- int_sel___release_mem__send_interrupt_after_write_confirm; -- -- packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; -- packet->address_hi = upper_32_bits(gpu_addr); -- -- packet->data_lo = 0; -- -- return sizeof(struct pm4__release_mem) / sizeof(unsigned int); --} -- - int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, - uint16_t fw_ver) - { -@@ -603,36 +233,23 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, - mutex_destroy(&pm->lock); - return -ENOMEM; - } -- pm->pmf = kzalloc(sizeof(struct packet_manager_firmware), GFP_KERNEL); -+ pm->pmf = kzalloc(sizeof(struct packet_manager_funcs), GFP_KERNEL); - pm->allocated = false; - - switch (pm->dqm->dev->device_info->asic_family) { - case CHIP_KAVERI: - case CHIP_HAWAII: -- if (fw_ver >= KFD_SCRATCH_KV_FW_VER) { -- pm->pmf->map_process = pm_create_map_process_scratch_kv; -- pm->pmf->get_map_process_packet_size = -- get_map_process_packet_size_scratch_kv; -- } else { -- pm->pmf->map_process = pm_create_map_process; -- pm->pmf->get_map_process_packet_size = -- get_map_process_packet_size; -- } -+ kfd_pm_func_init_cik(pm, fw_ver); - break; - case CHIP_CARRIZO: - case CHIP_TONGA: - case CHIP_FIJI: - case CHIP_POLARIS10: - case CHIP_POLARIS11: -- if (fw_ver >= KFD_SCRATCH_CZ_FW_VER) { -- pm->pmf->map_process = pm_create_map_process_scratch; -- pm->pmf->get_map_process_packet_size = -- get_map_process_packet_size_scratch; -- } else { -- pm->pmf->map_process = pm_create_map_process; -- pm->pmf->get_map_process_packet_size = -- get_map_process_packet_size; -- } -+ kfd_pm_func_init_vi(pm, fw_ver); -+ break; -+ case CHIP_VEGA10: -+ kfd_pm_func_init_v9(pm, fw_ver); - break; - - } -@@ -652,39 +269,22 @@ void pm_uninit(struct packet_manager *pm) - int pm_send_set_resources(struct packet_manager *pm, - struct scheduling_resources *res) - { -- struct pm4_set_resources *packet; -- -- BUG_ON(!pm || !res); -+ uint32_t *buffer, size; - - pr_debug("kfd: In func %s\n", __func__); - -+ size = pm->pmf->get_set_resources_packet_size(); - mutex_lock(&pm->lock); - pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, -- sizeof(*packet) / sizeof(uint32_t), -- (unsigned int **)&packet); -- if (packet == NULL) { -+ size / sizeof(uint32_t), -+ (unsigned int **)&buffer); -+ if (buffer == NULL) { - mutex_unlock(&pm->lock); - pr_err("kfd: failed to allocate buffer on kernel queue\n"); - return -ENOMEM; - } - -- memset(packet, 0, sizeof(struct pm4_set_resources)); -- packet->header.u32all = build_pm4_header(IT_SET_RESOURCES, -- sizeof(struct pm4_set_resources)); -- -- packet->bitfields2.queue_type = -- 
queue_type__mes_set_resources__hsa_interface_queue_hiq; -- packet->bitfields2.vmid_mask = res->vmid_mask; -- packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100; -- packet->bitfields7.oac_mask = res->oac_mask; -- packet->bitfields8.gds_heap_base = res->gds_heap_base; -- packet->bitfields8.gds_heap_size = res->gds_heap_size; -- -- packet->gws_mask_lo = lower_32_bits(res->gws_mask); -- packet->gws_mask_hi = upper_32_bits(res->gws_mask); -- -- packet->queue_mask_lo = lower_32_bits(res->queue_mask); -- packet->queue_mask_hi = upper_32_bits(res->queue_mask); -+ pm->pmf->set_resources(pm, buffer, res); - - pm->priv_queue->ops.submit_packet(pm->priv_queue); - -@@ -709,7 +309,8 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) - - pr_debug("kfd: runlist IB address: 0x%llX\n", rl_gpu_ib_addr); - -- packet_size_dwords = sizeof(struct pm4_runlist) / sizeof(uint32_t); -+ packet_size_dwords = pm->pmf->get_runlist_packet_size() / -+ sizeof(uint32_t); - mutex_lock(&pm->lock); - - retval = pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, -@@ -717,8 +318,8 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) - if (retval != 0) - goto fail_acquire_packet_buffer; - -- retval = pm_create_runlist(pm, rl_buffer, rl_gpu_ib_addr, -- rl_ib_size / sizeof(uint32_t), false); -+ retval = pm->pmf->runlist(pm, rl_buffer, rl_gpu_ib_addr, -+ rl_ib_size / sizeof(uint32_t), false); - if (retval != 0) - goto fail_create_runlist; - -@@ -741,41 +342,22 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) - int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, - uint32_t fence_value) - { -- int retval; -- struct pm4_query_status *packet; -- -- BUG_ON(!pm || !fence_address); -+ uint32_t *buffer, size; - -+ size = pm->pmf->get_query_status_packet_size(); - mutex_lock(&pm->lock); -- retval = pm->priv_queue->ops.acquire_packet_buffer( -- pm->priv_queue, -- sizeof(struct pm4_query_status) / sizeof(uint32_t), -- (unsigned int **)&packet); -- if (retval != 0) -- goto fail_acquire_packet_buffer; -- -- packet->header.u32all = build_pm4_header(IT_QUERY_STATUS, -- sizeof(struct pm4_query_status)); -- -- packet->bitfields2.context_id = 0; -- packet->bitfields2.interrupt_sel = -- interrupt_sel__mes_query_status__completion_status; -- packet->bitfields2.command = -- command__mes_query_status__fence_only_after_write_ack; -- -- packet->addr_hi = upper_32_bits((uint64_t)fence_address); -- packet->addr_lo = lower_32_bits((uint64_t)fence_address); -- packet->data_hi = upper_32_bits((uint64_t)fence_value); -- packet->data_lo = lower_32_bits((uint64_t)fence_value); -- -+ pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, -+ size / sizeof(uint32_t), (unsigned int **)&buffer); -+ if (buffer == NULL) { -+ mutex_unlock(&pm->lock); -+ pr_err("kfd: failed to allocate buffer on kernel queue\n"); -+ return -ENOMEM; -+ } -+ pm->pmf->query_status(pm, buffer, fence_address, fence_value); - pm->priv_queue->ops.submit_packet(pm->priv_queue); - mutex_unlock(&pm->lock); - - return 0; -- --fail_acquire_packet_buffer: -- mutex_unlock(&pm->lock); -- return retval; - } - - int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, -@@ -783,82 +365,23 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, - uint32_t filter_param, bool reset, - unsigned int sdma_engine) - { -- int retval; -- uint32_t *buffer; -- struct pm4_unmap_queues *packet; -- -- BUG_ON(!pm); -+ uint32_t *buffer, size; - -+ 
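-+ /* the packet size now comes from the per-ASIC pmf table */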
size = pm->pmf->get_unmap_queues_packet_size(); - mutex_lock(&pm->lock); -- retval = pm->priv_queue->ops.acquire_packet_buffer( -- pm->priv_queue, -- sizeof(struct pm4_unmap_queues) / sizeof(uint32_t), -- &buffer); -- if (retval != 0) -- goto err_acquire_packet_buffer; -- -- packet = (struct pm4_unmap_queues *)buffer; -- memset(buffer, 0, sizeof(struct pm4_unmap_queues)); -- pr_debug("kfd: static_queue: unmapping queues: filter is %d , reset is %d , type is %d\n", -- filter, reset, type); -- packet->header.u32all = build_pm4_header(IT_UNMAP_QUEUES, -- sizeof(struct pm4_unmap_queues)); -- switch (type) { -- case KFD_QUEUE_TYPE_COMPUTE: -- case KFD_QUEUE_TYPE_DIQ: -- packet->bitfields2.engine_sel = -- engine_sel__mes_unmap_queues__compute; -- break; -- case KFD_QUEUE_TYPE_SDMA: -- packet->bitfields2.engine_sel = -- engine_sel__mes_unmap_queues__sdma0 + sdma_engine; -- break; -- default: -- BUG(); -- break; -- } -- -- if (reset) -- packet->bitfields2.action = -- action__mes_unmap_queues__reset_queues; -- else -- packet->bitfields2.action = -- action__mes_unmap_queues__preempt_queues; -- -- switch (filter) { -- case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: -- packet->bitfields2.queue_sel = -- queue_sel__mes_unmap_queues__perform_request_on_specified_queues; -- packet->bitfields2.num_queues = 1; -- packet->bitfields3b.doorbell_offset0 = filter_param; -- break; -- case KFD_UNMAP_QUEUES_FILTER_BY_PASID: -- packet->bitfields2.queue_sel = -- queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; -- packet->bitfields3a.pasid = filter_param; -- break; -- case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: -- packet->bitfields2.queue_sel = -- queue_sel__mes_unmap_queues__perform_request_on_all_active_queues; -- break; -- case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: -- /* in this case, we do not preempt static queues */ -- packet->bitfields2.queue_sel = -- queue_sel__mes_unmap_queues__perform_request_on_dynamic_queues_only; -- break; -- default: -- BUG(); -- break; -+ pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, -+ size / sizeof(uint32_t), (unsigned int **)&buffer); -+ if (buffer == NULL) { -+ mutex_unlock(&pm->lock); -+ pr_err("kfd: failed to allocate buffer on kernel queue\n"); -+ return -ENOMEM; - } -- -+ pm->pmf->unmap_queues(pm, buffer, type, filter, filter_param, reset, -+ sdma_engine); - pm->priv_queue->ops.submit_packet(pm->priv_queue); -- - mutex_unlock(&pm->lock); -- return 0; - --err_acquire_packet_buffer: -- mutex_unlock(&pm->lock); -- return retval; -+ return 0; - } - - void pm_release_ib(struct packet_manager *pm) -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h -index 058ba1b..05e692b 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h -@@ -21,8 +21,8 @@ - * - */ - --#ifndef KFD_PM4_HEADERS_H_ --#define KFD_PM4_HEADERS_H_ -+#ifndef KFD_PM4_HEADERS_CIK_H_ -+#define KFD_PM4_HEADERS_CIK_H_ - - #ifndef PM4_MES_HEADER_DEFINED - #define PM4_MES_HEADER_DEFINED -@@ -41,100 +41,6 @@ union PM4_MES_TYPE_3_HEADER { - }; - #endif /* PM4_MES_HEADER_DEFINED */ - --/* --------------------MES_SET_RESOURCES-------------------- */ -- --#ifndef PM4_MES_SET_RESOURCES_DEFINED --#define PM4_MES_SET_RESOURCES_DEFINED --enum set_resources_queue_type_enum { -- queue_type__mes_set_resources__kernel_interface_queue_kiq = 0, -- queue_type__mes_set_resources__hsa_interface_queue_hiq = 1, -- queue_type__mes_set_resources__hsa_debug_interface_queue = 4 --}; -- --struct pm4_set_resources { -- union { 
-- union PM4_MES_TYPE_3_HEADER header; /* header */ -- uint32_t ordinal1; -- }; -- -- union { -- struct { -- uint32_t vmid_mask:16; -- uint32_t unmap_latency:8; -- uint32_t reserved1:5; -- enum set_resources_queue_type_enum queue_type:3; -- } bitfields2; -- uint32_t ordinal2; -- }; -- -- uint32_t queue_mask_lo; -- uint32_t queue_mask_hi; -- uint32_t gws_mask_lo; -- uint32_t gws_mask_hi; -- -- union { -- struct { -- uint32_t oac_mask:16; -- uint32_t reserved2:16; -- } bitfields7; -- uint32_t ordinal7; -- }; -- -- union { -- struct { -- uint32_t gds_heap_base:6; -- uint32_t reserved3:5; -- uint32_t gds_heap_size:6; -- uint32_t reserved4:15; -- } bitfields8; -- uint32_t ordinal8; -- }; -- --}; --#endif -- --/*--------------------MES_RUN_LIST-------------------- */ -- --#ifndef PM4_MES_RUN_LIST_DEFINED --#define PM4_MES_RUN_LIST_DEFINED -- --struct pm4_runlist { -- union { -- union PM4_MES_TYPE_3_HEADER header; /* header */ -- uint32_t ordinal1; -- }; -- -- union { -- struct { -- uint32_t reserved1:2; -- uint32_t ib_base_lo:30; -- } bitfields2; -- uint32_t ordinal2; -- }; -- -- union { -- struct { -- uint32_t ib_base_hi:16; -- uint32_t reserved2:16; -- } bitfields3; -- uint32_t ordinal3; -- }; -- -- union { -- struct { -- uint32_t ib_size:20; -- uint32_t chain:1; -- uint32_t offload_polling:1; -- uint32_t reserved3:1; -- uint32_t valid:1; -- uint32_t process_cnt:4; -- uint32_t reserved4:4; -- } bitfields4; -- uint32_t ordinal4; -- }; -- --}; --#endif - - /*--------------------MES_MAP_PROCESS-------------------- */ - -@@ -187,68 +93,6 @@ struct pm4_map_process { - }; - #endif - --/*--------------------MES_MAP_PROCESS_SCRATCH-------------------- */ -- --#ifndef PM4_MES_MAP_PROCESS_SCRATCH_DEFINED --#define PM4_MES_MAP_PROCESS_SCRATCH_DEFINED -- --struct pm4_map_process_scratch { -- union { -- union PM4_MES_TYPE_3_HEADER header; /* header */ -- uint32_t ordinal1; -- }; -- -- union { -- struct { -- uint32_t pasid:16; -- uint32_t reserved1:8; -- uint32_t diq_enable:1; -- uint32_t process_quantum:7; -- } bitfields2; -- uint32_t ordinal2; -- }; -- -- union { -- struct { -- uint32_t page_table_base:28; -- uint32_t reserved3:4; -- } bitfields3; -- uint32_t ordinal3; -- }; -- -- uint32_t reserved; -- -- uint32_t sh_mem_bases; -- uint32_t sh_mem_config; -- uint32_t sh_mem_ape1_base; -- uint32_t sh_mem_ape1_limit; -- -- uint32_t sh_hidden_private_base_vmid; -- -- uint32_t reserved2; -- uint32_t reserved3; -- -- uint32_t gds_addr_lo; -- uint32_t gds_addr_hi; -- -- union { -- struct { -- uint32_t num_gws:6; -- uint32_t reserved4:2; -- uint32_t num_oac:4; -- uint32_t reserved5:4; -- uint32_t gds_size:6; -- uint32_t num_queues:10; -- } bitfields10; -- uint32_t ordinal10; -- }; -- -- uint32_t completion_signal_lo; -- uint32_t completion_signal_hi; -- --}; --#endif -- - #ifndef PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH - #define PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH - -@@ -304,315 +148,6 @@ uint32_t completion_signal_hi32; - }; - #endif - --/*--------------------MES_MAP_QUEUES--------------------*/ -- --#ifndef PM4_MES_MAP_QUEUES_DEFINED --#define PM4_MES_MAP_QUEUES_DEFINED --enum map_queues_queue_sel_enum { -- queue_sel__mes_map_queues__map_to_specified_queue_slots = 0, -- queue_sel__mes_map_queues__map_to_hws_determined_queue_slots = 1, -- queue_sel__mes_map_queues__enable_process_queues = 2 --}; -- --enum map_queues_vidmem_enum { -- vidmem__mes_map_queues__uses_no_video_memory = 0, -- vidmem__mes_map_queues__uses_video_memory = 1 --}; -- --enum map_queues_alloc_format_enum { -- 
alloc_format__mes_map_queues__one_per_pipe = 0, -- alloc_format__mes_map_queues__all_on_one_pipe = 1 --}; -- --enum map_queues_engine_sel_enum { -- engine_sel__mes_map_queues__compute = 0, -- engine_sel__mes_map_queues__sdma0 = 2, -- engine_sel__mes_map_queues__sdma1 = 3 --}; -- --struct pm4_map_queues { -- union { -- union PM4_MES_TYPE_3_HEADER header; /* header */ -- uint32_t ordinal1; -- }; -- -- union { -- struct { -- uint32_t reserved1:4; -- enum map_queues_queue_sel_enum queue_sel:2; -- uint32_t reserved2:2; -- uint32_t vmid:4; -- uint32_t reserved3:4; -- enum map_queues_vidmem_enum vidmem:2; -- uint32_t reserved4:6; -- enum map_queues_alloc_format_enum alloc_format:2; -- enum map_queues_engine_sel_enum engine_sel:3; -- uint32_t num_queues:3; -- } bitfields2; -- uint32_t ordinal2; -- }; -- -- struct { -- union { -- struct { -- uint32_t is_static:1; -- uint32_t reserved5:1; -- uint32_t doorbell_offset:21; -- uint32_t reserved6:3; -- uint32_t queue:6; -- } bitfields3; -- uint32_t ordinal3; -- }; -- -- uint32_t mqd_addr_lo; -- uint32_t mqd_addr_hi; -- uint32_t wptr_addr_lo; -- uint32_t wptr_addr_hi; -- -- } mes_map_queues_ordinals[1]; /* 1..N of these ordinal groups */ -- --}; --#endif -- --/*--------------------MES_QUERY_STATUS--------------------*/ -- --#ifndef PM4_MES_QUERY_STATUS_DEFINED --#define PM4_MES_QUERY_STATUS_DEFINED --enum query_status_interrupt_sel_enum { -- interrupt_sel__mes_query_status__completion_status = 0, -- interrupt_sel__mes_query_status__process_status = 1, -- interrupt_sel__mes_query_status__queue_status = 2 --}; -- --enum query_status_command_enum { -- command__mes_query_status__interrupt_only = 0, -- command__mes_query_status__fence_only_immediate = 1, -- command__mes_query_status__fence_only_after_write_ack = 2, -- command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3 --}; -- --enum query_status_engine_sel_enum { -- engine_sel__mes_query_status__compute = 0, -- engine_sel__mes_query_status__sdma0_queue = 2, -- engine_sel__mes_query_status__sdma1_queue = 3 --}; -- --struct pm4_query_status { -- union { -- union PM4_MES_TYPE_3_HEADER header; /* header */ -- uint32_t ordinal1; -- }; -- -- union { -- struct { -- uint32_t context_id:28; -- enum query_status_interrupt_sel_enum interrupt_sel:2; -- enum query_status_command_enum command:2; -- } bitfields2; -- uint32_t ordinal2; -- }; -- -- union { -- struct { -- uint32_t pasid:16; -- uint32_t reserved1:16; -- } bitfields3a; -- struct { -- uint32_t reserved2:2; -- uint32_t doorbell_offset:21; -- uint32_t reserved3:3; -- enum query_status_engine_sel_enum engine_sel:3; -- uint32_t reserved4:3; -- } bitfields3b; -- uint32_t ordinal3; -- }; -- -- uint32_t addr_lo; -- uint32_t addr_hi; -- uint32_t data_lo; -- uint32_t data_hi; --}; --#endif -- --/*--------------------MES_UNMAP_QUEUES--------------------*/ -- --#ifndef PM4_MES_UNMAP_QUEUES_DEFINED --#define PM4_MES_UNMAP_QUEUES_DEFINED --enum unmap_queues_action_enum { -- action__mes_unmap_queues__preempt_queues = 0, -- action__mes_unmap_queues__reset_queues = 1, -- action__mes_unmap_queues__disable_process_queues = 2 --}; -- --enum unmap_queues_queue_sel_enum { -- queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0, -- queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1, -- queue_sel__mes_unmap_queues__perform_request_on_all_active_queues = 2, -- queue_sel__mes_unmap_queues__perform_request_on_dynamic_queues_only = 3 --}; -- --enum unmap_queues_engine_sel_enum { -- engine_sel__mes_unmap_queues__compute = 0, -- 
engine_sel__mes_unmap_queues__sdma0 = 2, -- engine_sel__mes_unmap_queues__sdma1 = 3 --}; -- --struct pm4_unmap_queues { -- union { -- union PM4_MES_TYPE_3_HEADER header; /* header */ -- uint32_t ordinal1; -- }; -- -- union { -- struct { -- enum unmap_queues_action_enum action:2; -- uint32_t reserved1:2; -- enum unmap_queues_queue_sel_enum queue_sel:2; -- uint32_t reserved2:20; -- enum unmap_queues_engine_sel_enum engine_sel:3; -- uint32_t num_queues:3; -- } bitfields2; -- uint32_t ordinal2; -- }; -- -- union { -- struct { -- uint32_t pasid:16; -- uint32_t reserved3:16; -- } bitfields3a; -- struct { -- uint32_t reserved4:2; -- uint32_t doorbell_offset0:21; -- uint32_t reserved5:9; -- } bitfields3b; -- uint32_t ordinal3; -- }; -- -- union { -- struct { -- uint32_t reserved6:2; -- uint32_t doorbell_offset1:21; -- uint32_t reserved7:9; -- } bitfields4; -- uint32_t ordinal4; -- }; -- -- union { -- struct { -- uint32_t reserved8:2; -- uint32_t doorbell_offset2:21; -- uint32_t reserved9:9; -- } bitfields5; -- uint32_t ordinal5; -- }; -- -- union { -- struct { -- uint32_t reserved10:2; -- uint32_t doorbell_offset3:21; -- uint32_t reserved11:9; -- } bitfields6; -- uint32_t ordinal6; -- }; -- --}; --#endif -- --/*--------------------_RELEASE_MEM-------------------- */ -- --#ifndef PM4__RELEASE_MEM_DEFINED --#define PM4__RELEASE_MEM_DEFINED --enum RELEASE_MEM_event_index_enum { -- event_index___release_mem__end_of_pipe = 5, -- event_index___release_mem__shader_done = 6 --}; -- --enum RELEASE_MEM_cache_policy_enum { -- cache_policy___release_mem__lru = 0, -- cache_policy___release_mem__stream = 1, -- cache_policy___release_mem__bypass = 2 --}; -- --enum RELEASE_MEM_dst_sel_enum { -- dst_sel___release_mem__memory_controller = 0, -- dst_sel___release_mem__tc_l2 = 1, -- dst_sel___release_mem__queue_write_pointer_register = 2, -- dst_sel___release_mem__queue_write_pointer_poll_mask_bit = 3 --}; -- --enum RELEASE_MEM_int_sel_enum { -- int_sel___release_mem__none = 0, -- int_sel___release_mem__send_interrupt_only = 1, -- int_sel___release_mem__send_interrupt_after_write_confirm = 2, -- int_sel___release_mem__send_data_after_write_confirm = 3 --}; -- --enum RELEASE_MEM_data_sel_enum { -- data_sel___release_mem__none = 0, -- data_sel___release_mem__send_32_bit_low = 1, -- data_sel___release_mem__send_64_bit_data = 2, -- data_sel___release_mem__send_gpu_clock_counter = 3, -- data_sel___release_mem__send_cp_perfcounter_hi_lo = 4, -- data_sel___release_mem__store_gds_data_to_memory = 5 --}; -- --struct pm4__release_mem { -- union { -- union PM4_MES_TYPE_3_HEADER header; /*header */ -- unsigned int ordinal1; -- }; -- -- union { -- struct { -- unsigned int event_type:6; -- unsigned int reserved1:2; -- enum RELEASE_MEM_event_index_enum event_index:4; -- unsigned int tcl1_vol_action_ena:1; -- unsigned int tc_vol_action_ena:1; -- unsigned int reserved2:1; -- unsigned int tc_wb_action_ena:1; -- unsigned int tcl1_action_ena:1; -- unsigned int tc_action_ena:1; -- unsigned int reserved3:6; -- unsigned int atc:1; -- enum RELEASE_MEM_cache_policy_enum cache_policy:2; -- unsigned int reserved4:5; -- } bitfields2; -- unsigned int ordinal2; -- }; -- -- union { -- struct { -- unsigned int reserved5:16; -- enum RELEASE_MEM_dst_sel_enum dst_sel:2; -- unsigned int reserved6:6; -- enum RELEASE_MEM_int_sel_enum int_sel:3; -- unsigned int reserved7:2; -- enum RELEASE_MEM_data_sel_enum data_sel:3; -- } bitfields3; -- unsigned int ordinal3; -- }; -- -- union { -- struct { -- unsigned int reserved8:2; -- unsigned int address_lo_32b:30; 
-- } bitfields4; -- struct { -- unsigned int reserved9:3; -- unsigned int address_lo_64b:29; -- } bitfields5; -- unsigned int ordinal4; -- }; -- -- unsigned int address_hi; -- -- unsigned int data_lo; -- -- unsigned int data_hi; --}; --#endif -- - enum { - CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 - }; -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h -new file mode 100644 -index 0000000..ddad9be ---- /dev/null -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h -@@ -0,0 +1,583 @@ -+/* -+ * Copyright 2016 Advanced Micro Devices, Inc. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -+ * OTHER DEALINGS IN THE SOFTWARE. -+ * -+ */ -+ -+#ifndef F32_MES_PM4_PACKETS_H -+#define F32_MES_PM4_PACKETS_H -+ -+#ifndef PM4_MES_HEADER_DEFINED -+#define PM4_MES_HEADER_DEFINED -+union PM4_MES_TYPE_3_HEADER { -+ struct { -+ uint32_t reserved1 : 8; /* < reserved */ -+ uint32_t opcode : 8; /* < IT opcode */ -+ uint32_t count : 14;/* < number of DWORDs - 1 in the -+ * information body. -+ */ -+ uint32_t type : 2; /* < packet identifier. 
-+ * It should be 3 for type 3 packets -+ */ -+ }; -+ uint32_t u32All; -+}; -+#endif /* PM4_MES_HEADER_DEFINED */ -+ -+/*--------------------MES_SET_RESOURCES--------------------*/ -+ -+#ifndef PM4_MES_SET_RESOURCES_DEFINED -+#define PM4_MES_SET_RESOURCES_DEFINED -+enum mes_set_resources_queue_type_enum { -+ queue_type__mes_set_resources__kernel_interface_queue_kiq = 0, -+ queue_type__mes_set_resources__hsa_interface_queue_hiq = 1, -+ queue_type__mes_set_resources__hsa_debug_interface_queue = 4 -+}; -+ -+ -+struct pm4_mes_set_resources { -+ union { -+ union PM4_MES_TYPE_3_HEADER header; /* header */ -+ uint32_t ordinal1; -+ }; -+ -+ union { -+ struct { -+ uint32_t vmid_mask:16; -+ uint32_t unmap_latency:8; -+ uint32_t reserved1:5; -+ enum mes_set_resources_queue_type_enum queue_type:3; -+ } bitfields2; -+ uint32_t ordinal2; -+ }; -+ -+ uint32_t queue_mask_lo; -+ uint32_t queue_mask_hi; -+ uint32_t gws_mask_lo; -+ uint32_t gws_mask_hi; -+ -+ union { -+ struct { -+ uint32_t oac_mask:16; -+ uint32_t reserved2:16; -+ } bitfields7; -+ uint32_t ordinal7; -+ }; -+ -+ union { -+ struct { -+ uint32_t gds_heap_base:6; -+ uint32_t reserved3:5; -+ uint32_t gds_heap_size:6; -+ uint32_t reserved4:15; -+ } bitfields8; -+ uint32_t ordinal8; -+ }; -+ -+}; -+#endif -+ -+/*--------------------MES_RUN_LIST--------------------*/ -+ -+#ifndef PM4_MES_RUN_LIST_DEFINED -+#define PM4_MES_RUN_LIST_DEFINED -+ -+struct pm4_mes_runlist { -+ union { -+ union PM4_MES_TYPE_3_HEADER header; /* header */ -+ uint32_t ordinal1; -+ }; -+ -+ union { -+ struct { -+ uint32_t reserved1:2; -+ uint32_t ib_base_lo:30; -+ } bitfields2; -+ uint32_t ordinal2; -+ }; -+ -+ uint32_t ib_base_hi; -+ -+ union { -+ struct { -+ uint32_t ib_size:20; -+ uint32_t chain:1; -+ uint32_t offload_polling:1; -+ uint32_t reserved2:1; -+ uint32_t valid:1; -+ uint32_t process_cnt:4; -+ uint32_t reserved3:4; -+ } bitfields4; -+ uint32_t ordinal4; -+ }; -+ -+}; -+#endif -+ -+/*--------------------MES_MAP_PROCESS--------------------*/ -+ -+#ifndef PM4_MES_MAP_PROCESS_DEFINED -+#define PM4_MES_MAP_PROCESS_DEFINED -+ -+struct pm4_mes_map_process { -+ union { -+ union PM4_MES_TYPE_3_HEADER header; /* header */ -+ uint32_t ordinal1; -+ }; -+ -+ union { -+ struct { -+ uint32_t pasid:16; -+ uint32_t reserved1:8; -+ uint32_t diq_enable:1; -+ uint32_t process_quantum:7; -+ } bitfields2; -+ uint32_t ordinal2; -+ }; -+ -+ uint32_t vm_context_page_table_base_addr_lo32; -+ -+ uint32_t vm_context_page_table_base_addr_hi32; -+ -+ uint32_t sh_mem_bases; -+ -+ uint32_t sh_mem_config; -+ -+ uint32_t sq_shader_tba_lo; -+ -+ uint32_t sq_shader_tba_hi; -+ -+ uint32_t sq_shader_tma_lo; -+ -+ uint32_t sq_shader_tma_hi; -+ -+ uint32_t reserved6; -+ -+ uint32_t gds_addr_lo; -+ -+ uint32_t gds_addr_hi; -+ -+ union { -+ struct { -+ uint32_t num_gws:6; -+ uint32_t reserved7:1; -+ uint32_t sdma_enable:1; -+ uint32_t num_oac:4; -+ uint32_t reserved8:4; -+ uint32_t gds_size:6; -+ uint32_t num_queues:10; -+ } bitfields14; -+ uint32_t ordinal14; -+ }; -+ -+ uint32_t completion_signal_lo; -+ -+ uint32_t completion_signal_hi; -+ -+}; -+ -+#endif -+ -+/*--------------------MES_MAP_PROCESS_VM--------------------*/ -+ -+#ifndef PM4_MES_MAP_PROCESS_VM_DEFINED -+#define PM4_MES_MAP_PROCESS_VM_DEFINED -+ -+struct PM4_MES_MAP_PROCESS_VM { -+ union { -+ union PM4_MES_TYPE_3_HEADER header; /* header */ -+ uint32_t ordinal1; -+ }; -+ -+ uint32_t reserved1; -+ -+ uint32_t vm_context_cntl; -+ -+ uint32_t reserved2; -+ -+ uint32_t vm_context_page_table_end_addr_lo32; -+ -+ uint32_t 
vm_context_page_table_end_addr_hi32; -+ -+ uint32_t vm_context_page_table_start_addr_lo32; -+ -+ uint32_t vm_context_page_table_start_addr_hi32; -+ -+ uint32_t reserved3; -+ -+ uint32_t reserved4; -+ -+ uint32_t reserved5; -+ -+ uint32_t reserved6; -+ -+ uint32_t reserved7; -+ -+ uint32_t reserved8; -+ -+ uint32_t completion_signal_lo32; -+ -+ uint32_t completion_signal_hi32; -+ -+}; -+#endif -+ -+/*--------------------MES_MAP_QUEUES--------------------*/ -+ -+#ifndef PM4_MES_MAP_QUEUES_VI_DEFINED -+#define PM4_MES_MAP_QUEUES_VI_DEFINED -+enum mes_map_queues_queue_sel_enum { -+ queue_sel__mes_map_queues__map_to_specified_queue_slots_vi = 0, -+queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi = 1 -+}; -+ -+enum mes_map_queues_queue_type_enum { -+ queue_type__mes_map_queues__normal_compute_vi = 0, -+ queue_type__mes_map_queues__debug_interface_queue_vi = 1, -+ queue_type__mes_map_queues__normal_latency_static_queue_vi = 2, -+queue_type__mes_map_queues__low_latency_static_queue_vi = 3 -+}; -+ -+enum mes_map_queues_alloc_format_enum { -+ alloc_format__mes_map_queues__one_per_pipe_vi = 0, -+alloc_format__mes_map_queues__all_on_one_pipe_vi = 1 -+}; -+ -+enum mes_map_queues_engine_sel_enum { -+ engine_sel__mes_map_queues__compute_vi = 0, -+ engine_sel__mes_map_queues__sdma0_vi = 2, -+ engine_sel__mes_map_queues__sdma1_vi = 3 -+}; -+ -+ -+struct pm4_mes_map_queues { -+ union { -+ union PM4_MES_TYPE_3_HEADER header; /* header */ -+ uint32_t ordinal1; -+ }; -+ -+ union { -+ struct { -+ uint32_t reserved1:4; -+ enum mes_map_queues_queue_sel_enum queue_sel:2; -+ uint32_t reserved2:15; -+ enum mes_map_queues_queue_type_enum queue_type:3; -+ enum mes_map_queues_alloc_format_enum alloc_format:2; -+ enum mes_map_queues_engine_sel_enum engine_sel:3; -+ uint32_t num_queues:3; -+ } bitfields2; -+ uint32_t ordinal2; -+ }; -+ -+ union { -+ struct { -+ uint32_t reserved3:1; -+ uint32_t check_disable:1; -+ uint32_t doorbell_offset:26; -+ uint32_t reserved4:4; -+ } bitfields3; -+ uint32_t ordinal3; -+ }; -+ -+ uint32_t mqd_addr_lo; -+ uint32_t mqd_addr_hi; -+ uint32_t wptr_addr_lo; -+ uint32_t wptr_addr_hi; -+}; -+#endif -+ -+/*--------------------MES_QUERY_STATUS--------------------*/ -+ -+#ifndef PM4_MES_QUERY_STATUS_DEFINED -+#define PM4_MES_QUERY_STATUS_DEFINED -+enum mes_query_status_interrupt_sel_enum { -+ interrupt_sel__mes_query_status__completion_status = 0, -+ interrupt_sel__mes_query_status__process_status = 1, -+ interrupt_sel__mes_query_status__queue_status = 2 -+}; -+ -+enum mes_query_status_command_enum { -+ command__mes_query_status__interrupt_only = 0, -+ command__mes_query_status__fence_only_immediate = 1, -+ command__mes_query_status__fence_only_after_write_ack = 2, -+ command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3 -+}; -+ -+enum mes_query_status_engine_sel_enum { -+ engine_sel__mes_query_status__compute = 0, -+ engine_sel__mes_query_status__sdma0_queue = 2, -+ engine_sel__mes_query_status__sdma1_queue = 3 -+}; -+ -+struct pm4_mes_query_status { -+ union { -+ union PM4_MES_TYPE_3_HEADER header; /* header */ -+ uint32_t ordinal1; -+ }; -+ -+ union { -+ struct { -+ uint32_t context_id:28; -+ enum mes_query_status_interrupt_sel_enum interrupt_sel:2; -+ enum mes_query_status_command_enum command:2; -+ } bitfields2; -+ uint32_t ordinal2; -+ }; -+ -+ union { -+ struct { -+ uint32_t pasid:16; -+ uint32_t reserved1:16; -+ } bitfields3a; -+ struct { -+ uint32_t reserved2:2; -+ uint32_t doorbell_offset:26; -+ enum mes_query_status_engine_sel_enum engine_sel:3; -+ 
uint32_t reserved3:1; -+ } bitfields3b; -+ uint32_t ordinal3; -+ }; -+ -+ uint32_t addr_lo; -+ uint32_t addr_hi; -+ uint32_t data_lo; -+ uint32_t data_hi; -+}; -+#endif -+ -+/*--------------------MES_UNMAP_QUEUES--------------------*/ -+ -+#ifndef PM4_MES_UNMAP_QUEUES_DEFINED -+#define PM4_MES_UNMAP_QUEUES_DEFINED -+enum mes_unmap_queues_action_enum { -+ action__mes_unmap_queues__preempt_queues = 0, -+ action__mes_unmap_queues__reset_queues = 1, -+ action__mes_unmap_queues__disable_process_queues = 2, -+ action__mes_unmap_queues__reserved = 3 -+}; -+ -+enum mes_unmap_queues_queue_sel_enum { -+ queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0, -+ queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1, -+ queue_sel__mes_unmap_queues__unmap_all_queues = 2, -+ queue_sel__mes_unmap_queues__unmap_all_non_static_queues = 3 -+}; -+ -+enum mes_unmap_queues_engine_sel_enum { -+ engine_sel__mes_unmap_queues__compute = 0, -+ engine_sel__mes_unmap_queues__sdma0 = 2, -+ engine_sel__mes_unmap_queues__sdmal = 3 -+}; -+ -+struct pm4_mes_unmap_queues { -+ union { -+ union PM4_MES_TYPE_3_HEADER header; /* header */ -+ uint32_t ordinal1; -+ }; -+ -+ union { -+ struct { -+ enum mes_unmap_queues_action_enum action:2; -+ uint32_t reserved1:2; -+ enum mes_unmap_queues_queue_sel_enum queue_sel:2; -+ uint32_t reserved2:20; -+ enum mes_unmap_queues_engine_sel_enum engine_sel:3; -+ uint32_t num_queues:3; -+ } bitfields2; -+ uint32_t ordinal2; -+ }; -+ -+ union { -+ struct { -+ uint32_t pasid:16; -+ uint32_t reserved3:16; -+ } bitfields3a; -+ struct { -+ uint32_t reserved4:2; -+ uint32_t doorbell_offset0:26; -+ int32_t reserved5:4; -+ } bitfields3b; -+ uint32_t ordinal3; -+ }; -+ -+ union { -+ struct { -+ uint32_t reserved6:2; -+ uint32_t doorbell_offset1:26; -+ uint32_t reserved7:4; -+ } bitfields4; -+ uint32_t ordinal4; -+ }; -+ -+ union { -+ struct { -+ uint32_t reserved8:2; -+ uint32_t doorbell_offset2:26; -+ uint32_t reserved9:4; -+ } bitfields5; -+ uint32_t ordinal5; -+ }; -+ -+ union { -+ struct { -+ uint32_t reserved10:2; -+ uint32_t doorbell_offset3:26; -+ uint32_t reserved11:4; -+ } bitfields6; -+ uint32_t ordinal6; -+ }; -+}; -+#endif -+ -+#ifndef PM4_MEC_RELEASE_MEM_DEFINED -+#define PM4_MEC_RELEASE_MEM_DEFINED -+ -+enum mec_release_mem_event_index_enum { -+ event_index__mec_release_mem__end_of_pipe = 5, -+ event_index__mec_release_mem__shader_done = 6 -+}; -+ -+enum mec_release_mem_cache_policy_enum { -+ cache_policy__mec_release_mem__lru = 0, -+ cache_policy__mec_release_mem__stream = 1 -+}; -+ -+enum mec_release_mem_pq_exe_status_enum { -+ pq_exe_status__mec_release_mem__default = 0, -+ pq_exe_status__mec_release_mem__phase_update = 1 -+}; -+ -+enum mec_release_mem_dst_sel_enum { -+ dst_sel__mec_release_mem__memory_controller = 0, -+ dst_sel__mec_release_mem__tc_l2 = 1, -+ dst_sel__mec_release_mem__queue_write_pointer_register = 2, -+ dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit = 3 -+}; -+ -+enum mec_release_mem_int_sel_enum { -+ int_sel__mec_release_mem__none = 0, -+ int_sel__mec_release_mem__send_interrupt_only = 1, -+ int_sel__mec_release_mem__send_interrupt_after_write_confirm = 2, -+ int_sel__mec_release_mem__send_data_after_write_confirm = 3, -+ int_sel__mec_release_mem__unconditionally_send_int_ctxid = 4, -+ int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare = 5, -+ int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare = 6 -+}; -+ -+enum mec_release_mem_data_sel_enum { -+ 
data_sel__mec_release_mem__none = 0, -+ data_sel__mec_release_mem__send_32_bit_low = 1, -+ data_sel__mec_release_mem__send_64_bit_data = 2, -+ data_sel__mec_release_mem__send_gpu_clock_counter = 3, -+ data_sel__mec_release_mem__send_cp_perfcounter_hi_lo = 4, -+ data_sel__mec_release_mem__store_gds_data_to_memory = 5 -+}; -+ -+struct pm4_mec_release_mem { -+ union { -+ union PM4_MES_TYPE_3_HEADER header; /*header */ -+ unsigned int ordinal1; -+ }; -+ -+ union { -+ struct { -+ unsigned int event_type:6; -+ unsigned int reserved1:2; -+ enum mec_release_mem_event_index_enum event_index:4; -+ unsigned int tcl1_vol_action_ena:1; -+ unsigned int tc_vol_action_ena:1; -+ unsigned int reserved2:1; -+ unsigned int tc_wb_action_ena:1; -+ unsigned int tcl1_action_ena:1; -+ unsigned int tc_action_ena:1; -+ uint32_t reserved3:1; -+ uint32_t tc_nc_action_ena:1; -+ uint32_t tc_wc_action_ena:1; -+ uint32_t tc_md_action_ena:1; -+ uint32_t reserved4:3; -+ enum mec_release_mem_cache_policy_enum cache_policy:2; -+ uint32_t reserved5:2; -+ enum mec_release_mem_pq_exe_status_enum pq_exe_status:1; -+ uint32_t reserved6:2; -+ } bitfields2; -+ unsigned int ordinal2; -+ }; -+ -+ union { -+ struct { -+ uint32_t reserved7:16; -+ enum mec_release_mem_dst_sel_enum dst_sel:2; -+ uint32_t reserved8:6; -+ enum mec_release_mem_int_sel_enum int_sel:3; -+ uint32_t reserved9:2; -+ enum mec_release_mem_data_sel_enum data_sel:3; -+ } bitfields3; -+ unsigned int ordinal3; -+ }; -+ -+ union { -+ struct { -+ uint32_t reserved10:2; -+ unsigned int address_lo_32b:30; -+ } bitfields4; -+ struct { -+ uint32_t reserved11:3; -+ uint32_t address_lo_64b:29; -+ } bitfields4b; -+ uint32_t reserved12; -+ unsigned int ordinal4; -+ }; -+ -+ union { -+ uint32_t address_hi; -+ uint32_t reserved13; -+ uint32_t ordinal5; -+ }; -+ -+ union { -+ uint32_t data_lo; -+ uint32_t cmp_data_lo; -+ struct { -+ uint32_t dw_offset:16; -+ uint32_t num_dwords:16; -+ } bitfields6c; -+ uint32_t reserved14; -+ uint32_t ordinal6; -+ }; -+ -+ union { -+ uint32_t data_hi; -+ uint32_t cmp_data_hi; -+ uint32_t reserved15; -+ uint32_t reserved16; -+ uint32_t ordinal7; -+ }; -+ -+ uint32_t int_ctxid; -+ -+}; -+ -+#endif -+ -+enum { -+ CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 -+}; -+#endif -+ -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h -index 08c7219..8cb3094 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h -@@ -124,9 +124,10 @@ struct pm4_mes_runlist { - uint32_t ib_size:20; - uint32_t chain:1; - uint32_t offload_polling:1; -- uint32_t reserved3:1; -+ uint32_t reserved2:1; - uint32_t valid:1; -- uint32_t reserved4:8; -+ uint32_t process_cnt:4; -+ uint32_t reserved3:4; - } bitfields4; - uint32_t ordinal4; - }; -@@ -141,8 +142,8 @@ struct pm4_mes_runlist { - - struct pm4_mes_map_process { - union { -- union PM4_MES_TYPE_3_HEADER header; /* header */ -- uint32_t ordinal1; -+ union PM4_MES_TYPE_3_HEADER header; /* header */ -+ uint32_t ordinal1; - }; - - union { -@@ -153,36 +154,48 @@ struct pm4_mes_map_process { - uint32_t process_quantum:7; - } bitfields2; - uint32_t ordinal2; --}; -+ }; - - union { - struct { - uint32_t page_table_base:28; -- uint32_t reserved2:4; -+ uint32_t reserved3:4; - } bitfields3; - uint32_t ordinal3; - }; - -+ uint32_t reserved; -+ - uint32_t sh_mem_bases; -+ uint32_t sh_mem_config; - uint32_t sh_mem_ape1_base; - uint32_t sh_mem_ape1_limit; -- uint32_t sh_mem_config; -+ -+ uint32_t sh_hidden_private_base_vmid; -+ 
-+ uint32_t reserved2; -+ uint32_t reserved3; -+ - uint32_t gds_addr_lo; - uint32_t gds_addr_hi; - - union { - struct { - uint32_t num_gws:6; -- uint32_t reserved3:2; -+ uint32_t reserved4:2; - uint32_t num_oac:4; -- uint32_t reserved4:4; -+ uint32_t reserved5:4; - uint32_t gds_size:6; - uint32_t num_queues:10; - } bitfields10; - uint32_t ordinal10; - }; - -+ uint32_t completion_signal_lo; -+ uint32_t completion_signal_hi; -+ - }; -+ - #endif - - /*--------------------MES_MAP_QUEUES--------------------*/ -@@ -335,7 +348,7 @@ enum mes_unmap_queues_engine_sel_enum { - engine_sel__mes_unmap_queues__sdmal = 3 - }; - --struct PM4_MES_UNMAP_QUEUES { -+struct pm4_mes_unmap_queues { - union { - union PM4_MES_TYPE_3_HEADER header; /* header */ - uint32_t ordinal1; -@@ -395,4 +408,101 @@ struct PM4_MES_UNMAP_QUEUES { - }; - #endif - -+#ifndef PM4_MEC_RELEASE_MEM_DEFINED -+#define PM4_MEC_RELEASE_MEM_DEFINED -+enum RELEASE_MEM_event_index_enum { -+ event_index___release_mem__end_of_pipe = 5, -+ event_index___release_mem__shader_done = 6 -+}; -+ -+enum RELEASE_MEM_cache_policy_enum { -+ cache_policy___release_mem__lru = 0, -+ cache_policy___release_mem__stream = 1, -+ cache_policy___release_mem__bypass = 2 -+}; -+ -+enum RELEASE_MEM_dst_sel_enum { -+ dst_sel___release_mem__memory_controller = 0, -+ dst_sel___release_mem__tc_l2 = 1, -+ dst_sel___release_mem__queue_write_pointer_register = 2, -+ dst_sel___release_mem__queue_write_pointer_poll_mask_bit = 3 -+}; -+ -+enum RELEASE_MEM_int_sel_enum { -+ int_sel___release_mem__none = 0, -+ int_sel___release_mem__send_interrupt_only = 1, -+ int_sel___release_mem__send_interrupt_after_write_confirm = 2, -+ int_sel___release_mem__send_data_after_write_confirm = 3 -+}; -+ -+enum RELEASE_MEM_data_sel_enum { -+ data_sel___release_mem__none = 0, -+ data_sel___release_mem__send_32_bit_low = 1, -+ data_sel___release_mem__send_64_bit_data = 2, -+ data_sel___release_mem__send_gpu_clock_counter = 3, -+ data_sel___release_mem__send_cp_perfcounter_hi_lo = 4, -+ data_sel___release_mem__store_gds_data_to_memory = 5 -+}; -+ -+struct pm4_mec_release_mem { -+ union { -+ union PM4_MES_TYPE_3_HEADER header; /*header */ -+ unsigned int ordinal1; -+ }; -+ -+ union { -+ struct { -+ unsigned int event_type:6; -+ unsigned int reserved1:2; -+ enum RELEASE_MEM_event_index_enum event_index:4; -+ unsigned int tcl1_vol_action_ena:1; -+ unsigned int tc_vol_action_ena:1; -+ unsigned int reserved2:1; -+ unsigned int tc_wb_action_ena:1; -+ unsigned int tcl1_action_ena:1; -+ unsigned int tc_action_ena:1; -+ unsigned int reserved3:6; -+ unsigned int atc:1; -+ enum RELEASE_MEM_cache_policy_enum cache_policy:2; -+ unsigned int reserved4:5; -+ } bitfields2; -+ unsigned int ordinal2; -+ }; -+ -+ union { -+ struct { -+ unsigned int reserved5:16; -+ enum RELEASE_MEM_dst_sel_enum dst_sel:2; -+ unsigned int reserved6:6; -+ enum RELEASE_MEM_int_sel_enum int_sel:3; -+ unsigned int reserved7:2; -+ enum RELEASE_MEM_data_sel_enum data_sel:3; -+ } bitfields3; -+ unsigned int ordinal3; -+ }; -+ -+ union { -+ struct { -+ unsigned int reserved8:2; -+ unsigned int address_lo_32b:30; -+ } bitfields4; -+ struct { -+ unsigned int reserved9:3; -+ unsigned int address_lo_64b:29; -+ } bitfields5; -+ unsigned int ordinal4; -+ }; -+ -+ unsigned int address_hi; -+ -+ unsigned int data_lo; -+ -+ unsigned int data_hi; -+}; -+#endif -+ -+enum { -+ CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 -+}; -+ - #endif -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h -index aa2133e..4962d7b 
100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h -@@ -172,13 +172,15 @@ enum asic_family_type { - CHIP_TONGA, - CHIP_FIJI, - CHIP_POLARIS10, -- CHIP_POLARIS11 -+ CHIP_POLARIS11, -+ CHIP_VEGA10 - }; - - #define KFD_IS_VI(chip) ((chip) >= CHIP_CARRIZO && (chip) <= CHIP_POLARIS11) - #define KFD_IS_DGPU(chip) (((chip) >= CHIP_TONGA && \ -- (chip) <= CHIP_POLARIS11) || \ -+ (chip) <= CHIP_VEGA10) || \ - (chip) == CHIP_HAWAII) -+#define KFD_IS_SOC15(chip) ((chip) >= CHIP_VEGA10) - - struct kfd_event_interrupt_class { - bool (*interrupt_isr)(struct kfd_dev *dev, const uint32_t *ih_ring_entry, -@@ -191,6 +193,7 @@ struct kfd_device_info { - const struct kfd_event_interrupt_class *event_interrupt_class; - unsigned int max_pasid_bits; - unsigned int max_no_of_hqd; -+ unsigned int doorbell_size; - size_t ih_ring_entry_size; - uint8_t num_of_watch_points; - uint16_t mqd_size_aligned; -@@ -204,6 +207,7 @@ struct kfd_mem_obj { - uint32_t range_end; - uint64_t gpu_addr; - uint32_t *cpu_ptr; -+ void *gtt_mem; - }; - - struct kfd_vmid_info { -@@ -417,7 +421,7 @@ struct queue_properties { - uint32_t queue_percent; - uint32_t *read_ptr; - uint32_t *write_ptr; -- uint32_t __iomem *doorbell_ptr; -+ void __iomem *doorbell_ptr; - uint32_t doorbell_off; - bool is_interop; - bool is_evicted; /* true -> queue is evicted */ -@@ -482,6 +486,7 @@ struct queue { - uint32_t queue; - - unsigned int sdma_id; -+ unsigned int doorbell_id; - - struct kfd_process *process; - struct kfd_dev *device; -@@ -794,17 +799,19 @@ unsigned int kfd_pasid_alloc(void); - void kfd_pasid_free(unsigned int pasid); - - /* Doorbells */ --size_t kfd_doorbell_process_slice(void); -+size_t kfd_doorbell_process_slice(struct kfd_dev *kfd); - void kfd_doorbell_init(struct kfd_dev *kfd); --int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma); --u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, -+int kfd_doorbell_mmap(struct kfd_dev *kfd, struct kfd_process *process, -+ struct vm_area_struct *vma); -+void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, - unsigned int *doorbell_off); - void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr); - u32 read_kernel_doorbell(u32 __iomem *db); --void write_kernel_doorbell(u32 __iomem *db, u32 value); --unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, -+void write_kernel_doorbell(void __iomem *db, u32 value); -+void write_kernel_doorbell64(void __iomem *db, u64 value); -+unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd, - struct kfd_process *process, -- unsigned int queue_id); -+ unsigned int doorbell_id); - - /* GTT Sub-Allocator */ - -@@ -865,6 +872,8 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, - struct kfd_dev *dev); - struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, - struct kfd_dev *dev); -+struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, -+ struct kfd_dev *dev); - struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); - void device_queue_manager_uninit(struct device_queue_manager *dqm); - struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, -@@ -904,7 +913,7 @@ int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm); - #define KFD_FENCE_COMPLETED (100) - #define KFD_FENCE_INIT (10) - --struct packet_manager_firmware; -+struct packet_manager_func; - - struct packet_manager { - struct device_queue_manager *dqm; -@@ -914,17 +923,38 @@ struct packet_manager { - struct kfd_mem_obj 
*ib_buffer_obj; - unsigned ib_size_bytes; - -- struct packet_manager_firmware *pmf; -+ struct packet_manager_funcs *pmf; - }; - --struct packet_manager_firmware { -- /* Support different firmware versions for map process packet */ -+struct packet_manager_funcs { -+ /* Support different firmware versions for PM4 packets */ - int (*map_process)(struct packet_manager *pm, uint32_t *buffer, -- struct qcm_process_device *qpd); -- int (*get_map_process_packet_size)(void); -+ struct qcm_process_device *qpd); -+ int (*runlist)(struct packet_manager *pm, uint32_t *buffer, -+ uint64_t ib, size_t ib_size_in_dwords, bool chain); -+ int (*set_resources)(struct packet_manager *pm, uint32_t *buffer, -+ struct scheduling_resources *res); -+ int (*map_queues)(struct packet_manager *pm, uint32_t *buffer, -+ struct queue *q, bool is_static); -+ int (*unmap_queues)(struct packet_manager *pm, uint32_t *buffer, -+ enum kfd_queue_type type, -+ enum kfd_unmap_queues_filter mode, -+ uint32_t filter_param, bool reset, -+ unsigned int sdma_engine); -+ int (*query_status)(struct packet_manager *pm, uint32_t *buffer, -+ uint64_t fence_address, uint32_t fence_value); -+ uint32_t (*release_mem)(uint64_t gpu_addr, uint32_t *buffer); -+ -+ uint32_t (*get_map_process_packet_size)(void); -+ uint32_t (*get_runlist_packet_size)(void); -+ uint32_t (*get_set_resources_packet_size)(void); -+ uint32_t (*get_map_queues_packet_size)(void); -+ uint32_t (*get_unmap_queues_packet_size)(void); -+ uint32_t (*get_query_status_packet_size)(void); -+ uint32_t (*get_release_mem_packet_size)(void); -+ - }; - --uint32_t pm_create_release_mem(uint64_t gpu_addr, uint32_t *buffer); - int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, - uint16_t fw_ver); - void pm_uninit(struct packet_manager *pm); -@@ -941,6 +971,38 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, - - void pm_release_ib(struct packet_manager *pm); - -+/* Following PM funcs can be shared among KV and VI */ -+unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size); -+int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer, -+ uint64_t ib, size_t ib_size_in_dwords, bool chain); -+int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, -+ struct queue *q, bool is_static); -+int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, -+ struct scheduling_resources *res); -+int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, -+ enum kfd_queue_type type, -+ enum kfd_unmap_queues_filter filter, -+ uint32_t filter_param, bool reset, -+ unsigned int sdma_engine); -+int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer, -+ uint64_t fence_address, uint32_t fence_value); -+uint32_t pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer); -+ -+uint32_t pm_get_map_process_packet_size_vi(void); -+uint32_t pm_get_runlist_packet_size_vi(void); -+uint32_t pm_get_set_resources_packet_size_vi(void); -+uint32_t pm_get_map_queues_packet_size_vi(void); -+uint32_t pm_get_unmap_queues_packet_size_vi(void); -+uint32_t pm_get_query_status_packet_size_vi(void); -+uint32_t pm_get_release_mem_packet_size_vi(void); -+ -+ -+void kfd_pm_func_init_vi(struct packet_manager *pm, uint16_t fw_ver); -+void kfd_pm_func_init_cik(struct packet_manager *pm, uint16_t fw_ver); -+ -+void kfd_pm_func_init_v9(struct packet_manager *pm, uint16_t fw_ver); -+ -+ - uint64_t kfd_get_number_elems(struct kfd_dev *kfd); - phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, - struct kfd_process *process); 
-@@ -950,6 +1012,8 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr, - - /* Events */ - extern const struct kfd_event_interrupt_class event_interrupt_class_cik; -+extern const struct kfd_event_interrupt_class event_interrupt_class_v9; -+ - extern const struct kfd_device_global_init_class device_global_init_class_cik; - - enum kfd_event_wait_result { -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c -index af90b0a..94e07ee 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c -@@ -146,9 +146,6 @@ static int create_cp_queue(struct process_queue_manager *pqm, - /* Doorbell initialized in user space*/ - q_properties->doorbell_ptr = NULL; - -- q_properties->doorbell_off = -- kfd_queue_id_to_doorbell(dev, pqm->process, qid); -- - /* let DQM handle it*/ - q_properties->vmid = 0; - q_properties->queue_id = qid; -@@ -283,6 +280,15 @@ int pqm_create_queue(struct process_queue_manager *pqm, - goto err_create_queue; - } - -+ if (q) -+ /* Return the doorbell offset within the doorbell page -+ * to the caller so it can be passed up to user mode -+ * (in bytes). -+ */ -+ properties->doorbell_off = -+ (q->properties.doorbell_off * sizeof(uint32_t)) & -+ (kfd_doorbell_process_slice(dev) - 1); -+ - pr_debug("kfd: PQM After DQM create queue\n"); - - list_add(&pqn->process_queue_list, &pqm->queues); -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c -index 36e2cba..7603967 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c -@@ -1233,6 +1233,11 @@ int kfd_topology_add_device(struct kfd_dev *gpu) - HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & - HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); - break; -+ case CHIP_VEGA10: -+ dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 << -+ HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & -+ HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); -+ break; - } - - /* Fix errors in CZ CRAT. -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h -index aa8056b..a968e58 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h -@@ -45,6 +45,7 @@ - - #define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0 - #define HSA_CAP_DOORBELL_TYPE_1_0 0x1 -+#define HSA_CAP_DOORBELL_TYPE_2_0 0x2 - #define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00 - #define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8 - #define HSA_CAP_DOORBELL_PACKET_TYPE 0x00001000 -diff --git a/drivers/gpu/drm/amd/amdkfd/soc15_int.h b/drivers/gpu/drm/amd/amdkfd/soc15_int.h -new file mode 100644 -index 0000000..e00d03d ---- /dev/null -+++ b/drivers/gpu/drm/amd/amdkfd/soc15_int.h -@@ -0,0 +1,84 @@ -+/* -+ * Copyright 2016 Advanced Micro Devices, Inc. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. 
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
-+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-+ * OTHER DEALINGS IN THE SOFTWARE.
-+ */
-+
-+#ifndef HSA_SOC15_INT_H_INCLUDED
-+#define HSA_SOC15_INT_H_INCLUDED
-+/*
-+ * vega10+ IH clients
-+ */
-+enum soc15_ih_client_id {
-+	SOC15_IH_CLIENTID_IH		= 0x00,
-+	SOC15_IH_CLIENTID_ACP		= 0x01,
-+	SOC15_IH_CLIENTID_ATHUB		= 0x02,
-+	SOC15_IH_CLIENTID_BIF		= 0x03,
-+	SOC15_IH_CLIENTID_DCE		= 0x04,
-+	SOC15_IH_CLIENTID_ISP		= 0x05,
-+	SOC15_IH_CLIENTID_PCIE0		= 0x06,
-+	SOC15_IH_CLIENTID_RLC		= 0x07,
-+	SOC15_IH_CLIENTID_SDMA0		= 0x08,
-+	SOC15_IH_CLIENTID_SDMA1		= 0x09,
-+	SOC15_IH_CLIENTID_SE0SH		= 0x0a,
-+	SOC15_IH_CLIENTID_SE1SH		= 0x0b,
-+	SOC15_IH_CLIENTID_SE2SH		= 0x0c,
-+	SOC15_IH_CLIENTID_SE3SH		= 0x0d,
-+	SOC15_IH_CLIENTID_SYSHUB	= 0x0e,
-+	SOC15_IH_CLIENTID_THM		= 0x0f,
-+	SOC15_IH_CLIENTID_UVD		= 0x10,
-+	SOC15_IH_CLIENTID_VCE0		= 0x11,
-+	SOC15_IH_CLIENTID_VMC		= 0x12,
-+	SOC15_IH_CLIENTID_XDMA		= 0x13,
-+	SOC15_IH_CLIENTID_GRBM_CP	= 0x14,
-+	SOC15_IH_CLIENTID_ATS		= 0x15,
-+	SOC15_IH_CLIENTID_ROM_SMUIO	= 0x16,
-+	SOC15_IH_CLIENTID_DF		= 0x17,
-+	SOC15_IH_CLIENTID_VCE1		= 0x18,
-+	SOC15_IH_CLIENTID_PWR		= 0x19,
-+	SOC15_IH_CLIENTID_UTCL2		= 0x1b,
-+	SOC15_IH_CLIENTID_EA		= 0x1c,
-+	SOC15_IH_CLIENTID_UTCL2LOG	= 0x1d,
-+	SOC15_IH_CLIENTID_MP0		= 0x1e,
-+	SOC15_IH_CLIENTID_MP1		= 0x1f,
-+
-+	SOC15_IH_CLIENTID_MAX
-+};
-+
-+
-+#define SOC15_INTSRC_CP_END_OF_PIPE	181
-+#define SOC15_INTSRC_CP_BAD_OPCODE	183
-+#define SOC15_INTSRC_SQ_INTERRUPT_MSG	239
-+#define SOC15_INTSRC_VMC_FAULT		0
-+#define SOC15_INTSRC_SDMA_TRAP		224
-+
-+
-+#define SOC15_CLIENT_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) & 0xff)
-+#define SOC15_SOURCE_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 8 & 0xff)
-+#define SOC15_RING_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 16 & 0xff)
-+#define SOC15_VMID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 24 & 0xf)
-+#define SOC15_VMID_TYPE_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 31 & 0x1)
-+#define SOC15_PASID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[3]) & 0xffff)
-+#define SOC15_CONTEXT_ID0_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[4]))
-+#define SOC15_CONTEXT_ID1_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[5]))
-+#define SOC15_CONTEXT_ID2_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[6]))
-+#define SOC15_CONTEXT_ID3_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[7]))
-+
-+#endif
-+
---
-2.7.4
-
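
The new soc15_int.h shown at the end of this patch is a pure header: it only names the Vega10 interrupt-handler (IH) clients and the dword layout of an IH ring entry. As a reading aid, here is a minimal sketch (not part of the patch) of how an interrupt service routine might consume the SOC15_*_FROM_IH_ENTRY() accessors, assuming the eight-dword ring entry layout those macros encode; the function name is hypothetical.

/*
 * Hypothetical sketch: identify a CP end-of-pipe interrupt from a raw
 * SOC15 IH ring entry, using only identifiers defined in soc15_int.h.
 */
#include <linux/types.h>
#include <asm/byteorder.h>
#include "soc15_int.h"

static bool soc15_ih_entry_is_cp_eop(const uint32_t *ih_ring_entry)
{
	/* Dword 0 packs the client, source, ring and VMID fields. */
	unsigned int client = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
	unsigned int source = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);

	/* CP end-of-pipe interrupts arrive through the GRBM_CP client. */
	return client == SOC15_IH_CLIENTID_GRBM_CP &&
	       source == SOC15_INTSRC_CP_END_OF_PIPE;
}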
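
Earlier in this diff, pm_send_unmap_queue() is reworked to size and build its packet through the new struct packet_manager_funcs table (declared in kfd_priv.h above) instead of open-coding the VI packet layout. Below is a self-contained, hypothetical userspace sketch of that dispatch pattern: every identifier is invented for illustration, and the real table carries many more hooks (runlist, map_queues, query_status, and so on).

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* One size/builder pair per ASIC generation, mirroring pm->pmf. */
struct pkt_funcs {
	uint32_t (*get_unmap_size)(void);
	void (*build_unmap)(uint32_t *buf);
};

static uint32_t v9_unmap_size(void) { return 7 * sizeof(uint32_t); }

static void v9_build_unmap(uint32_t *buf)
{
	buf[0] = 0xC0000000; /* stand-in for a PM4 type-3 header dword */
}

static const struct pkt_funcs v9_funcs = {
	.get_unmap_size	= v9_unmap_size,
	.build_unmap	= v9_build_unmap,
};

int main(void)
{
	uint32_t buf[16] = { 0 };

	/* Generation-agnostic code sizes the buffer, then dispatches. */
	uint32_t dwords = v9_funcs.get_unmap_size() / sizeof(uint32_t);

	v9_funcs.build_unmap(buf);
	printf("built a %" PRIu32 "-dword packet, header 0x%08" PRIx32 "\n",
	       dwords, buf[0]);
	return 0;
}

This is the same design choice the patch makes: common packet-manager code acquires the buffer and submits it, while each generation (CIK/VI via kfd_pm_func_init_cik()/kfd_pm_func_init_vi(), SOC15 via kfd_pm_func_init_v9()) plugs in its own packet builders.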