aboutsummaryrefslogtreecommitdiffstats
path: root/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1630-drm-amdkfd-Use-ttmp10-and-ttmp11-to-store-TMA-info-f.patch
blob: 8c0bfc75687bd36c279defb20de0a0836c61a430 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
From 2fc07e832225e21b66ab8518025b07004df8225d Mon Sep 17 00:00:00 2001
From: Shaoyun Liu <Shaoyun.Liu@amd.com>
Date: Tue, 21 Mar 2017 17:39:08 -0400
Subject: [PATCH 1630/4131] drm/amdkfd: Use ttmp10 and ttmp11 to store TMA info
 for second level trap handler

The second-level trap handler returns to the ISA directly, so the
first-level trap handler has no chance to restore the correct TMA setting.
This causes a problem when the same trap happens again.
Use ttmp10 and ttmp11 to hold the TMA info instead, which keeps the same
interface across GFX8, GFX9 and later ASICs.

Change-Id: I975baa25297355da6a02eb430ffaca954eb74b4b
Signed-off-by: Shaoyun Liu <Shaoyun.Liu@amd.com>
---
 .../gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h | 43 ++++++++++++++++++----
 .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm  | 15 ++++----
 2 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h
index 4e34083..48fcec5 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h
@@ -22,7 +22,35 @@
 
 #if 0
 HW (VI) source code for CWSR trap handler
-#Version 9 + multiple trap handler
+#Version 18 + multiple trap handler
+
+// this performance-optimal version was originally from Seven Xu at SRDC
+
+// Revison #18   --...
+/* Rev History
+** #1. Branch from gc dv.   //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV)
+** #4. SR Memory Layout:
+**             1. VGPR-SGPR-HWREG-{LDS}
+**             2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern..
+** #5. Update: 1. Accurate g8sr_ts_save_d timestamp
+** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation)
+** #7. Update: 1. don't barrier if noLDS
+** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version
+**             2. Fix SQ issue by s_sleep 2
+** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last
+**             2. optimize s_buffer save by burst 16sgprs...
+** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs.
+** #11. Update 1. Add 2 more timestamp for debug version
+** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance
+** #13. Integ  1. Always use MUBUF for PV trap shader...
+** #14. Update 1. s_buffer_store soft clause...
+** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot.
+** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree
+** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part]
+**             2. PERF - Save LDS before save VGPR to cover LDS save long latency...
+** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32
+**             2. FUNC - Handle non-CWSR traps
+*/
 
 var G8SR_WDMEM_HWREG_OFFSET = 0
 var G8SR_WDMEM_SGPR_OFFSET  = 128  // in bytes
@@ -186,7 +214,7 @@ var	s_restore_buf_rsrc3		=	ttmp11
 /* Shader Main*/
 
 shader main
-  asic(CARRIZO)
+  asic(VI)
   type(CS)
 
 
@@ -219,8 +247,6 @@ if (!EMU_RUN_HACK)
 	s_waitcnt lgkmcnt(0)
 	s_or_b32        ttmp7, ttmp8, ttmp9
 	s_cbranch_scc0  L_NO_NEXT_TRAP //next level trap handler not been set
-	s_mov_b32       tma_lo, ttmp10  //set tma_lo/hi for next level trap handler
-	s_mov_b32       tma_hi, ttmp11 
 	s_setreg_b32    hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC)
 	s_setpc_b64     [ttmp8,ttmp9] //jump to next level trap handler 
 
@@ -1099,18 +1125,19 @@ end
 
 function get_hwreg_size_bytes
     return 128 //HWREG size 128 bytes
+end
+
 
 #endif
 
 static const uint32_t cwsr_trap_carrizo_hex[] = {
-	0xbf820001, 0xbf820124,
+	0xbf820001, 0xbf820122,
 	0xb8f4f802, 0x89748674,
 	0xb8f5f803, 0x8675ff75,
-	0x00000400, 0xbf850013,
+	0x00000400, 0xbf850011,
 	0xc00a1e37, 0x00000000,
 	0xbf8c007f, 0x87777978,
-	0xbf840004, 0xbeee007a,
-	0xbeef007b, 0xb974f802,
+	0xbf840002, 0xb974f802,
 	0xbe801d78, 0xb8f5f803,
 	0x8675ff75, 0x000001ff,
 	0xbf850002, 0x80708470,
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
index 0106e77..661bd0a 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
@@ -28,7 +28,7 @@ HW (GFX9) source code for CWSR trap handler
 
 // Revison #18	 --...
 /* Rev History
-** #1. Branch from gc dv.   //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV)
+** #1. Branch from gc dv.   //gfxip/gfx9/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV)
 ** #4. SR Memory Layout:
 **			 1. VGPR-SGPR-HWREG-{LDS}
 **			 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern..
@@ -248,12 +248,12 @@ if (!EMU_RUN_HACK)
     /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */
     s_getreg_b32    tma_lo,hwreg(HW_REG_SQ_SHADER_TMA_LO)
     s_getreg_b32    tma_hi,hwreg(HW_REG_SQ_SHADER_TMA_HI)
-    s_load_dwordx4  [tba_lo,tba_hi,tma_lo, tma_hi], [tma_lo,tma_hi], 0
+    s_load_dwordx4  [ttmp8,ttmp9, ttmp10, ttmp11], [tma_lo,tma_hi], 0
     s_waitcnt lgkmcnt(0)
-    s_or_b32	    ttmp11, tba_lo, tba_hi
+    s_or_b32	    ttmp7, ttmp8, ttmp9
     s_cbranch_scc0  L_NO_NEXT_TRAP //next level trap handler not been set
     s_setreg_b32    hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC)
-    s_setpc_b64	    [tba_lo,tba_hi] //jump to next level trap handler
+    s_setpc_b64	    [ttmp8,ttmp9] //jump to next level trap handler
 
 L_NO_NEXT_TRAP:
     s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
@@ -1135,6 +1135,7 @@ function get_hwreg_size_bytes
 end
 
 
+
 #endif
 
 static const uint32_t cwsr_trap_gfx9_hex[] = {
@@ -1143,10 +1144,10 @@ static const uint32_t cwsr_trap_gfx9_hex[] = {
 	0xb8f1f803, 0x8671ff71,
 	0x00000400, 0xbf850013,
 	0xb8faf812, 0xb8fbf813,
-	0xc00a1e3d, 0x00000000,
-	0xbf8cc07f, 0x87777978,
+	0xc00a1d3d, 0x00000000,
+	0xbf8cc07f, 0x87737574,
 	0xbf840002, 0xb970f802,
-	0xbe801d78, 0xb8f1f803,
+	0xbe801d74, 0xb8f1f803,
 	0x8671ff71, 0x000001ff,
 	0xbf850002, 0x806c846c,
 	0x826d806d, 0x866dff6d,
-- 
2.7.4