diff options
Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.14.71/1244-drm-amdkfd-Use-ttmp10-and-ttmp11-to-store-TMA-info-f.patch')
-rw-r--r-- | common/recipes-kernel/linux/linux-yocto-4.14.71/1244-drm-amdkfd-Use-ttmp10-and-ttmp11-to-store-TMA-info-f.patch | 156 |
1 files changed, 156 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.14.71/1244-drm-amdkfd-Use-ttmp10-and-ttmp11-to-store-TMA-info-f.patch b/common/recipes-kernel/linux/linux-yocto-4.14.71/1244-drm-amdkfd-Use-ttmp10-and-ttmp11-to-store-TMA-info-f.patch new file mode 100644 index 00000000..4d9c8f40 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.14.71/1244-drm-amdkfd-Use-ttmp10-and-ttmp11-to-store-TMA-info-f.patch @@ -0,0 +1,156 @@ +From c9a5100c168cf7701615d72fd35c6c16c5fe7423 Mon Sep 17 00:00:00 2001 +From: Shaoyun Liu <Shaoyun.Liu@amd.com> +Date: Tue, 21 Mar 2017 17:39:08 -0400 +Subject: [PATCH 1244/4131] drm/amdkfd: Use ttmp10 and ttmp11 to store TMA info + for second level trap handler + +Second level trap handler will return to ISA directly, so first level trap +handler will not have chance to change back the correct TMA setting. +This will cause problem when the same trap happens again. +Change to use ttmp10 and ttmp11 for the TMA info which will keep the same +interface for asics GFX8, GFX9 and up. + +Change-Id: I975baa25297355da6a02eb430ffaca954eb74b4b +Signed-off-by: Shaoyun Liu <Shaoyun.Liu@amd.com> +--- + .../gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h | 43 ++++++++++++++++++---- + .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 15 ++++---- + 2 files changed, 43 insertions(+), 15 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h +index 4e34083..48fcec5 100644 +--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h ++++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h +@@ -22,7 +22,35 @@ + + #if 0 + HW (VI) source code for CWSR trap handler +-#Version 9 + multiple trap handler ++#Version 18 + multiple trap handler ++ ++// this performance-optimal version was originally from Seven Xu at SRDC ++ ++// Revison #18 --... ++/* Rev History ++** #1. Branch from gc dv. //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) ++** #4. SR Memory Layout: ++** 1. VGPR-SGPR-HWREG-{LDS} ++** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern.. ++** #5. Update: 1. Accurate g8sr_ts_save_d timestamp ++** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation) ++** #7. Update: 1. don't barrier if noLDS ++** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version ++** 2. Fix SQ issue by s_sleep 2 ++** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last ++** 2. optimize s_buffer save by burst 16sgprs... ++** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs. ++** #11. Update 1. Add 2 more timestamp for debug version ++** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance ++** #13. Integ 1. Always use MUBUF for PV trap shader... ++** #14. Update 1. s_buffer_store soft clause... ++** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot. ++** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree ++** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part] ++** 2. PERF - Save LDS before save VGPR to cover LDS save long latency... ++** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32 ++** 2. FUNC - Handle non-CWSR traps ++*/ + + var G8SR_WDMEM_HWREG_OFFSET = 0 + var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes +@@ -186,7 +214,7 @@ var s_restore_buf_rsrc3 = ttmp11 + /* Shader Main*/ + + shader main +- asic(CARRIZO) ++ asic(VI) + type(CS) + + +@@ -219,8 +247,6 @@ if (!EMU_RUN_HACK) + s_waitcnt lgkmcnt(0) + s_or_b32 ttmp7, ttmp8, ttmp9 + s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set +- s_mov_b32 tma_lo, ttmp10 //set tma_lo/hi for next level trap handler +- s_mov_b32 tma_hi, ttmp11 + s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) + s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler + +@@ -1099,18 +1125,19 @@ end + + function get_hwreg_size_bytes + return 128 //HWREG size 128 bytes ++end ++ + + #endif + + static const uint32_t cwsr_trap_carrizo_hex[] = { +- 0xbf820001, 0xbf820124, ++ 0xbf820001, 0xbf820122, + 0xb8f4f802, 0x89748674, + 0xb8f5f803, 0x8675ff75, +- 0x00000400, 0xbf850013, ++ 0x00000400, 0xbf850011, + 0xc00a1e37, 0x00000000, + 0xbf8c007f, 0x87777978, +- 0xbf840004, 0xbeee007a, +- 0xbeef007b, 0xb974f802, ++ 0xbf840002, 0xb974f802, + 0xbe801d78, 0xb8f5f803, + 0x8675ff75, 0x000001ff, + 0xbf850002, 0x80708470, +diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +index 0106e77..661bd0a 100644 +--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm ++++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +@@ -28,7 +28,7 @@ HW (GFX9) source code for CWSR trap handler + + // Revison #18 --... + /* Rev History +-** #1. Branch from gc dv. //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) ++** #1. Branch from gc dv. //gfxip/gfx9/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) + ** #4. SR Memory Layout: + ** 1. VGPR-SGPR-HWREG-{LDS} + ** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern.. +@@ -248,12 +248,12 @@ if (!EMU_RUN_HACK) + /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */ + s_getreg_b32 tma_lo,hwreg(HW_REG_SQ_SHADER_TMA_LO) + s_getreg_b32 tma_hi,hwreg(HW_REG_SQ_SHADER_TMA_HI) +- s_load_dwordx4 [tba_lo,tba_hi,tma_lo, tma_hi], [tma_lo,tma_hi], 0 ++ s_load_dwordx4 [ttmp8,ttmp9, ttmp10, ttmp11], [tma_lo,tma_hi], 0 + s_waitcnt lgkmcnt(0) +- s_or_b32 ttmp11, tba_lo, tba_hi ++ s_or_b32 ttmp7, ttmp8, ttmp9 + s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set + s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) +- s_setpc_b64 [tba_lo,tba_hi] //jump to next level trap handler ++ s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler + + L_NO_NEXT_TRAP: + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) +@@ -1135,6 +1135,7 @@ function get_hwreg_size_bytes + end + + ++ + #endif + + static const uint32_t cwsr_trap_gfx9_hex[] = { +@@ -1143,10 +1144,10 @@ static const uint32_t cwsr_trap_gfx9_hex[] = { + 0xb8f1f803, 0x8671ff71, + 0x00000400, 0xbf850013, + 0xb8faf812, 0xb8fbf813, +- 0xc00a1e3d, 0x00000000, +- 0xbf8cc07f, 0x87777978, ++ 0xc00a1d3d, 0x00000000, ++ 0xbf8cc07f, 0x87737574, + 0xbf840002, 0xb970f802, +- 0xbe801d78, 0xb8f1f803, ++ 0xbe801d74, 0xb8f1f803, + 0x8671ff71, 0x000001ff, + 0xbf850002, 0x806c846c, + 0x826d806d, 0x866dff6d, +-- +2.7.4 + |