From 2fc07e832225e21b66ab8518025b07004df8225d Mon Sep 17 00:00:00 2001 From: Shaoyun Liu Date: Tue, 21 Mar 2017 17:39:08 -0400 Subject: [PATCH 1630/4131] drm/amdkfd: Use ttmp10 and ttmp11 to store TMA info for second level trap handler Second level trap handler will return to ISA directly, so first level trap handler will not have chance to change back the correct TMA setting. This will cause problem when the same trap happens again. Change to use ttmp10 and ttmp11 for the TMA info which will keep the same interface for asics GFX8, GFX9 and up. Change-Id: I975baa25297355da6a02eb430ffaca954eb74b4b Signed-off-by: Shaoyun Liu --- .../gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h | 43 ++++++++++++++++++---- .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 15 ++++---- 2 files changed, 43 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h index 4e34083..48fcec5 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h @@ -22,7 +22,35 @@ #if 0 HW (VI) source code for CWSR trap handler -#Version 9 + multiple trap handler +#Version 18 + multiple trap handler + +// this performance-optimal version was originally from Seven Xu at SRDC + +// Revison #18 --... +/* Rev History +** #1. Branch from gc dv. //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) +** #4. SR Memory Layout: +** 1. VGPR-SGPR-HWREG-{LDS} +** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern.. +** #5. Update: 1. Accurate g8sr_ts_save_d timestamp +** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation) +** #7. Update: 1. don't barrier if noLDS +** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version +** 2. Fix SQ issue by s_sleep 2 +** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last +** 2. optimize s_buffer save by burst 16sgprs... +** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs. +** #11. Update 1. Add 2 more timestamp for debug version +** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance +** #13. Integ 1. Always use MUBUF for PV trap shader... +** #14. Update 1. s_buffer_store soft clause... +** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot. +** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree +** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part] +** 2. PERF - Save LDS before save VGPR to cover LDS save long latency... +** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32 +** 2. FUNC - Handle non-CWSR traps +*/ var G8SR_WDMEM_HWREG_OFFSET = 0 var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes @@ -186,7 +214,7 @@ var s_restore_buf_rsrc3 = ttmp11 /* Shader Main*/ shader main - asic(CARRIZO) + asic(VI) type(CS) @@ -219,8 +247,6 @@ if (!EMU_RUN_HACK) s_waitcnt lgkmcnt(0) s_or_b32 ttmp7, ttmp8, ttmp9 s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set - s_mov_b32 tma_lo, ttmp10 //set tma_lo/hi for next level trap handler - s_mov_b32 tma_hi, ttmp11 s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler @@ -1099,18 +1125,19 @@ end function get_hwreg_size_bytes return 128 //HWREG size 128 bytes +end + #endif static const uint32_t cwsr_trap_carrizo_hex[] = { - 0xbf820001, 0xbf820124, + 0xbf820001, 0xbf820122, 0xb8f4f802, 0x89748674, 0xb8f5f803, 0x8675ff75, - 0x00000400, 0xbf850013, + 0x00000400, 0xbf850011, 0xc00a1e37, 0x00000000, 0xbf8c007f, 0x87777978, - 0xbf840004, 0xbeee007a, - 0xbeef007b, 0xb974f802, + 0xbf840002, 0xb974f802, 0xbe801d78, 0xb8f5f803, 0x8675ff75, 0x000001ff, 0xbf850002, 0x80708470, diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm index 0106e77..661bd0a 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm @@ -28,7 +28,7 @@ HW (GFX9) source code for CWSR trap handler // Revison #18 --... /* Rev History -** #1. Branch from gc dv. //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) +** #1. Branch from gc dv. //gfxip/gfx9/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) ** #4. SR Memory Layout: ** 1. VGPR-SGPR-HWREG-{LDS} ** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern.. @@ -248,12 +248,12 @@ if (!EMU_RUN_HACK) /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */ s_getreg_b32 tma_lo,hwreg(HW_REG_SQ_SHADER_TMA_LO) s_getreg_b32 tma_hi,hwreg(HW_REG_SQ_SHADER_TMA_HI) - s_load_dwordx4 [tba_lo,tba_hi,tma_lo, tma_hi], [tma_lo,tma_hi], 0 + s_load_dwordx4 [ttmp8,ttmp9, ttmp10, ttmp11], [tma_lo,tma_hi], 0 s_waitcnt lgkmcnt(0) - s_or_b32 ttmp11, tba_lo, tba_hi + s_or_b32 ttmp7, ttmp8, ttmp9 s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) - s_setpc_b64 [tba_lo,tba_hi] //jump to next level trap handler + s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler L_NO_NEXT_TRAP: s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) @@ -1135,6 +1135,7 @@ function get_hwreg_size_bytes end + #endif static const uint32_t cwsr_trap_gfx9_hex[] = { @@ -1143,10 +1144,10 @@ static const uint32_t cwsr_trap_gfx9_hex[] = { 0xb8f1f803, 0x8671ff71, 0x00000400, 0xbf850013, 0xb8faf812, 0xb8fbf813, - 0xc00a1e3d, 0x00000000, - 0xbf8cc07f, 0x87777978, + 0xc00a1d3d, 0x00000000, + 0xbf8cc07f, 0x87737574, 0xbf840002, 0xb970f802, - 0xbe801d78, 0xb8f1f803, + 0xbe801d74, 0xb8f1f803, 0x8671ff71, 0x000001ff, 0xbf850002, 0x806c846c, 0x826d806d, 0x866dff6d, -- 2.7.4